Enhance vec_pack_trunc for integral mode mask.

Message ID 20220118062642.48846-1-hongtao.liu@intel.com
State New
Headers
Series Enhance vec_pack_trunc for integral mode mask. |

Commit Message

Liu, Hongtao Jan. 18, 2022, 6:26 a.m. UTC
  For testcase in PR, the patch supports QI:4 -> HI:16 pack with
multi steps(first pack QI:4 -> QI:8 through vec_pack_sbool_trunc_qi,
then pack QI:8 -> HI:16 through vec_pack_trunc_hi).
Similar for QI:2 -> HI:16 which is test4 in mask-pack-prefer-128.c.

Bootstrapped both with and w/o '--with-arch=native --with-cpu=native'
on CLX.
Regtested for x86_64-pc-linux-gnu{-m32,} and
x86_64-pc-linux-gnu{-m32\ -march=cascadelake,\ -march=cascadelake} on CLX.

This patch can partially fix the regression in PR since it enable
vectorization for epilogue.
I'm also working on another patch to prevent mask pack for testcase in PR,
and there will be a separate patch.

Ok for trunk?

gcc/ChangeLog:

	PR target/103771
	* tree-vect-stmts.c (supportable_narrowing_operation): Enhance
	integral mode mask pack by multi steps which takes
	vec_pack_sbool_trunc_optab as start when elements number is
	less than BITS_PER_UNITS.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/mask-pack-prefer128.c: New test.
	* gcc.target/i386/mask-pack-prefer128.c: New test.
	* gcc.target/i386/pr103771.c: New test.
---
 .../gcc.target/i386/mask-pack-prefer128.c      |  8 ++++++++
 .../gcc.target/i386/mask-pack-prefer256.c      |  8 ++++++++
 gcc/testsuite/gcc.target/i386/pr103771.c       | 18 ++++++++++++++++++
 gcc/tree-vect-stmts.c                          | 11 ++++++++---
 4 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c
 create mode 100644 gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103771.c
  

Comments

Richard Biener Jan. 18, 2022, 9:23 a.m. UTC | #1
On Tue, Jan 18, 2022 at 7:26 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> For testcase in PR, the patch supports QI:4 -> HI:16 pack with
> multi steps(first pack QI:4 -> QI:8 through vec_pack_sbool_trunc_qi,
> then pack QI:8 -> HI:16 through vec_pack_trunc_hi).
> Similar for QI:2 -> HI:16 which is test4 in mask-pack-prefer-128.c.
>
> Bootstrapped both with and w/o '--with-arch=native --with-cpu=native'
> on CLX.
> Regtested for x86_64-pc-linux-gnu{-m32,} and
> x86_64-pc-linux-gnu{-m32\ -march=cascadelake,\ -march=cascadelake} on CLX.
>
> This patch can partially fix the regression in PR since it enable
> vectorization for epilogue.
> I'm also working on another patch to prevent mask pack for testcase in PR,
> and there will be a separate patch.
>
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/103771
>         * tree-vect-stmts.c (supportable_narrowing_operation): Enhance
>         integral mode mask pack by multi steps which takes
>         vec_pack_sbool_trunc_optab as start when elements number is
>         less than BITS_PER_UNITS.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/mask-pack-prefer128.c: New test.
>         * gcc.target/i386/mask-pack-prefer128.c: New test.
>         * gcc.target/i386/pr103771.c: New test.
> ---
>  .../gcc.target/i386/mask-pack-prefer128.c      |  8 ++++++++
>  .../gcc.target/i386/mask-pack-prefer256.c      |  8 ++++++++
>  gcc/testsuite/gcc.target/i386/pr103771.c       | 18 ++++++++++++++++++
>  gcc/tree-vect-stmts.c                          | 11 ++++++++---
>  4 files changed, 42 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr103771.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c
> new file mode 100644
> index 00000000000..c9ea37c7ed3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c
> @@ -0,0 +1,8 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd -fdump-tree-vect-details -mprefer-vector-width=128" } */
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */
> +/* { dg-final { scan-assembler-not "maskmov" } } */
> +
> +#include "mask-pack.c"
> diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c
> new file mode 100644
> index 00000000000..841f51b4041
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c
> @@ -0,0 +1,8 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */
> +/* { dg-final { scan-assembler-not "maskmov" } } */
> +
> +#include "mask-pack.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr103771.c b/gcc/testsuite/gcc.target/i386/pr103771.c
> new file mode 100644
> index 00000000000..a1a9952b6a8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103771.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=cascadelake -O3 -fdump-tree-vect-details -mprefer-vector-width=128" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +
> +typedef unsigned char uint8_t;
> +
> +static uint8_t x264_clip_uint8 (int x)
> +{
> +  return x & (~255) ? (-x) >> 31 : x;
> +}
> +
> +void
> +mc_weight (uint8_t* __restrict dst, uint8_t* __restrict src,
> +          int i_width,int i_scale)
> +{
> +  for(int x = 0; x < i_width; x++)
> +    dst[x] = x264_clip_uint8 (src[x] * i_scale);
> +}
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index f2625a2ff40..4baf5e36127 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -12111,6 +12111,7 @@ supportable_narrowing_operation (enum tree_code code,
>    tree intermediate_type, prev_type;
>    machine_mode intermediate_mode, prev_mode;
>    int i;
> +  unsigned HOST_WIDE_INT n_elts;
>    bool uns;
>
>    *multi_step_cvt = 0;
> @@ -12120,8 +12121,10 @@ supportable_narrowing_operation (enum tree_code code,
>        c1 = VEC_PACK_TRUNC_EXPR;
>        if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype)
>           && VECTOR_BOOLEAN_TYPE_P (vectype)
> -         && TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype)
> -         && SCALAR_INT_MODE_P (TYPE_MODE (vectype)))
> +         && SCALAR_INT_MODE_P (TYPE_MODE (vectype))
> +         && (TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype)
> +             || (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&n_elts)
> +                 && n_elts < BITS_PER_UNIT)))

your description above hints at that the actual modes involved in the
vec_pack_sbool_trunc are the same so the TYPE_MODE (narrow_vectype)
and TYPE_MODE (vectype) are not the actual modes participating.  I think
it would be way better to fix that.

I suppose that since we know TYPE_VECTOR_SUBPARTS is a power of two
it's always going to be only QImode that is of interest here so maybe a better
check would be TYPE_MODE (narrow_vectype) == QImode rather than
the equality check or elide the mode check completely and only retain
the TYPE_VECTOR_SUBPARTS check you add?

>         optab1 = vec_pack_sbool_trunc_optab;
>        else
>         optab1 = optab_for_tree_code (c1, vectype, optab_default);
> @@ -12213,7 +12216,9 @@ supportable_narrowing_operation (enum tree_code code,
>        if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
>           && VECTOR_BOOLEAN_TYPE_P (prev_type)
>           && intermediate_mode == prev_mode

Likewise here.

So I think the change is OK if you remove the mode equality checks.

Thanks,
Richard.

> -         && SCALAR_INT_MODE_P (prev_mode))
> +         && SCALAR_INT_MODE_P (prev_mode)
> +         && (TYPE_VECTOR_SUBPARTS (intermediate_type).is_constant (&n_elts)
> +             && n_elts < BITS_PER_UNIT))
>         interm_optab = vec_pack_sbool_trunc_optab;
>        else
>         interm_optab
> --
> 2.18.1
>
  

Patch

diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c
new file mode 100644
index 00000000000..c9ea37c7ed3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c
@@ -0,0 +1,8 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd -fdump-tree-vect-details -mprefer-vector-width=128" } */
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */
+/* { dg-final { scan-assembler-not "maskmov" } } */
+
+#include "mask-pack.c"
diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c
new file mode 100644
index 00000000000..841f51b4041
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c
@@ -0,0 +1,8 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */
+/* { dg-final { scan-assembler-not "maskmov" } } */
+
+#include "mask-pack.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr103771.c b/gcc/testsuite/gcc.target/i386/pr103771.c
new file mode 100644
index 00000000000..a1a9952b6a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103771.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=cascadelake -O3 -fdump-tree-vect-details -mprefer-vector-width=128" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+
+typedef unsigned char uint8_t;
+
+static uint8_t x264_clip_uint8 (int x)
+{
+  return x & (~255) ? (-x) >> 31 : x;
+}
+
+void
+mc_weight (uint8_t* __restrict dst, uint8_t* __restrict src,
+	   int i_width,int i_scale)
+{
+  for(int x = 0; x < i_width; x++)
+    dst[x] = x264_clip_uint8 (src[x] * i_scale);
+}
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index f2625a2ff40..4baf5e36127 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -12111,6 +12111,7 @@  supportable_narrowing_operation (enum tree_code code,
   tree intermediate_type, prev_type;
   machine_mode intermediate_mode, prev_mode;
   int i;
+  unsigned HOST_WIDE_INT n_elts;
   bool uns;
 
   *multi_step_cvt = 0;
@@ -12120,8 +12121,10 @@  supportable_narrowing_operation (enum tree_code code,
       c1 = VEC_PACK_TRUNC_EXPR;
       if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype)
 	  && VECTOR_BOOLEAN_TYPE_P (vectype)
-	  && TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype)
-	  && SCALAR_INT_MODE_P (TYPE_MODE (vectype)))
+	  && SCALAR_INT_MODE_P (TYPE_MODE (vectype))
+	  && (TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype)
+	      || (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&n_elts)
+		  && n_elts < BITS_PER_UNIT)))
 	optab1 = vec_pack_sbool_trunc_optab;
       else
 	optab1 = optab_for_tree_code (c1, vectype, optab_default);
@@ -12213,7 +12216,9 @@  supportable_narrowing_operation (enum tree_code code,
       if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
 	  && VECTOR_BOOLEAN_TYPE_P (prev_type)
 	  && intermediate_mode == prev_mode
-	  && SCALAR_INT_MODE_P (prev_mode))
+	  && SCALAR_INT_MODE_P (prev_mode)
+	  && (TYPE_VECTOR_SUBPARTS (intermediate_type).is_constant (&n_elts)
+	      && n_elts < BITS_PER_UNIT))
 	interm_optab = vec_pack_sbool_trunc_optab;
       else
 	interm_optab