[3/8] middle-end: Support extractions of subvectors from arbitrary element position inside a vector

Message ID Y1+4Nu1ryQIKoOQA@arm.com
State Dropped
Headers
Series [1/8] middle-end: Recognize scalar reductions from bitfields and array_refs |

Commit Message

Tamar Christina Oct. 31, 2022, 11:57 a.m. UTC
  Hi All,

The current vector extract pattern can only extract from a vector when the
position to extract is a multiple of the vector bitsize as a whole.

That means extract something like a V2SI from a V4SI vector from position 32
isn't possible as 32 is not a multiple of 64.  Ideally this optab should have
worked on multiple of the element size, but too many targets rely on this
semantic now.

So instead add a new case which allows any extraction as long as the bit pos
is a multiple of the element size.  We use a VEC_PERM to shuffle the elements
into the bottom parts of the vector and then use a subreg to extract the values
out.  This now allows various vector operations that before were being
decomposed into very inefficient scalar operations.

NOTE: I added 3 testcases, I only fixed the 3rd one.

The 1st one missed because we don't optimize VEC_PERM expressions into
bitfields.  The 2nd one is missed because extract_bit_field only works on
vector modes.  In this case the intermediate extract is DImode.

On targets where the scalar mode is tiable to vector modes the extract should
work fine.

However I ran out of time to fix the first two and so will do so in GCC 14.
For now this catches the case that my pattern now introduces more easily.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* expmed.cc (extract_bit_field_1): Add support for vector element
	extracts.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/ext_1.c: New.

--- inline copy of patch -- 
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index bab020c07222afa38305ef8d7333f271b1965b78..ffdf65210d17580a216477cfe4ac1598941ac9e4 100644




--
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index bab020c07222afa38305ef8d7333f271b1965b78..ffdf65210d17580a216477cfe4ac1598941ac9e4 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -1718,6 +1718,45 @@ extract_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, poly_uint64 bitnum,
 	      return target;
 	    }
 	}
+      else if (!known_eq (bitnum, 0U)
+	       && multiple_p (GET_MODE_UNIT_BITSIZE (tmode), bitnum, &pos))
+	{
+	  /* The encoding has a single stepped pattern.  */
+	  poly_uint64 nunits = GET_MODE_NUNITS (new_mode);
+	  int nelts = nunits.to_constant ();
+	  vec_perm_builder sel (nunits, nelts, 1);
+	  int delta = -pos.to_constant ();
+	  for (int i = 0; i < nelts; ++i)
+	    sel.quick_push ((i - delta) % nelts);
+	  vec_perm_indices indices (sel, 1, nunits);
+
+	  if (can_vec_perm_const_p (new_mode, new_mode, indices, false))
+	    {
+	      class expand_operand ops[4];
+	      machine_mode outermode = new_mode;
+	      machine_mode innermode = tmode;
+	      enum insn_code icode
+		= direct_optab_handler (vec_perm_optab, outermode);
+	      target = gen_reg_rtx (outermode);
+	      if (icode != CODE_FOR_nothing)
+		{
+		  rtx sel = vec_perm_indices_to_rtx (outermode, indices);
+		  create_output_operand (&ops[0], target, outermode);
+		  ops[0].target = 1;
+		  create_input_operand (&ops[1], op0, outermode);
+		  create_input_operand (&ops[2], op0, outermode);
+		  create_input_operand (&ops[3], sel, outermode);
+		  if (maybe_expand_insn (icode, 4, ops))
+		    return simplify_gen_subreg (innermode, target, outermode, 0);
+		}
+	      else if (targetm.vectorize.vec_perm_const != NULL)
+		{
+		  if (targetm.vectorize.vec_perm_const (outermode, outermode,
+							target, op0, op0, indices))
+		    return simplify_gen_subreg (innermode, target, outermode, 0);
+		}
+	    }
+	}
     }
 
   /* See if we can get a better vector mode before extracting.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/ext_1.c b/gcc/testsuite/gcc.target/aarch64/ext_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e571b3bc2ddf887a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ext_1.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <string.h>
+
+typedef unsigned int v4si __attribute__((vector_size (16)));
+typedef unsigned int v2si __attribute__((vector_size (8)));
+
+/*
+** extract: { xfail *-*-* }
+**	ext	v0.16b, v0.16b, v0.16b, #4
+**	ret
+*/
+v2si extract (v4si x)
+{
+    v2si res = {x[1], x[2]};
+    return res;
+}
+
+/*
+** extract1: { xfail *-*-* }
+**	ext	v0.16b, v0.16b, v0.16b, #4
+**	ret
+*/
+v2si extract1 (v4si x)
+{
+    v2si res;
+    memcpy (&res, ((int*)&x)+1, sizeof(res));
+    return res;
+}
+
+typedef struct cast {
+  int a;
+  v2si b __attribute__((packed));
+} cast_t;
+
+typedef union Data {
+   v4si x;
+   cast_t y;
+} data;  
+
+/*
+** extract2:
+**	ext	v0.16b, v0.16b, v0.16b, #4
+**	ret
+*/
+v2si extract2 (v4si x)
+{
+    data d;
+    d.x = x;
+    return d.y.b;
+}
+
  

Comments

Jeff Law Oct. 31, 2022, 9:44 p.m. UTC | #1
On 10/31/22 05:57, Tamar Christina wrote:
> Hi All,
>
> The current vector extract pattern can only extract from a vector when the
> position to extract is a multiple of the vector bitsize as a whole.
>
> That means extract something like a V2SI from a V4SI vector from position 32
> isn't possible as 32 is not a multiple of 64.  Ideally this optab should have
> worked on multiple of the element size, but too many targets rely on this
> semantic now.
>
> So instead add a new case which allows any extraction as long as the bit pos
> is a multiple of the element size.  We use a VEC_PERM to shuffle the elements
> into the bottom parts of the vector and then use a subreg to extract the values
> out.  This now allows various vector operations that before were being
> decomposed into very inefficient scalar operations.
>
> NOTE: I added 3 testcases, I only fixed the 3rd one.
>
> The 1st one missed because we don't optimize VEC_PERM expressions into
> bitfields.  The 2nd one is missed because extract_bit_field only works on
> vector modes.  In this case the intermediate extract is DImode.
>
> On targets where the scalar mode is tiable to vector modes the extract should
> work fine.
>
> However I ran out of time to fix the first two and so will do so in GCC 14.
> For now this catches the case that my pattern now introduces more easily.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* expmed.cc (extract_bit_field_1): Add support for vector element
> 	extracts.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/ext_1.c: New.

OK.

jeff
  
Richard Sandiford Nov. 1, 2022, 2:25 p.m. UTC | #2
Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Hi All,
>
> The current vector extract pattern can only extract from a vector when the
> position to extract is a multiple of the vector bitsize as a whole.
>
> That means extract something like a V2SI from a V4SI vector from position 32
> isn't possible as 32 is not a multiple of 64.  Ideally this optab should have
> worked on multiple of the element size, but too many targets rely on this
> semantic now.
>
> So instead add a new case which allows any extraction as long as the bit pos
> is a multiple of the element size.  We use a VEC_PERM to shuffle the elements
> into the bottom parts of the vector and then use a subreg to extract the values
> out.  This now allows various vector operations that before were being
> decomposed into very inefficient scalar operations.
>
> NOTE: I added 3 testcases, I only fixed the 3rd one.
>
> The 1st one missed because we don't optimize VEC_PERM expressions into
> bitfields.  The 2nd one is missed because extract_bit_field only works on
> vector modes.  In this case the intermediate extract is DImode.
>
> On targets where the scalar mode is tiable to vector modes the extract should
> work fine.
>
> However I ran out of time to fix the first two and so will do so in GCC 14.
> For now this catches the case that my pattern now introduces more easily.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* expmed.cc (extract_bit_field_1): Add support for vector element
> 	extracts.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/ext_1.c: New.
>
> --- inline copy of patch -- 
> diff --git a/gcc/expmed.cc b/gcc/expmed.cc
> index bab020c07222afa38305ef8d7333f271b1965b78..ffdf65210d17580a216477cfe4ac1598941ac9e4 100644
> --- a/gcc/expmed.cc
> +++ b/gcc/expmed.cc
> @@ -1718,6 +1718,45 @@ extract_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, poly_uint64 bitnum,
>  	      return target;
>  	    }
>  	}
> +      else if (!known_eq (bitnum, 0U)
> +	       && multiple_p (GET_MODE_UNIT_BITSIZE (tmode), bitnum, &pos))
> +	{
> +	  /* The encoding has a single stepped pattern.  */
> +	  poly_uint64 nunits = GET_MODE_NUNITS (new_mode);
> +	  int nelts = nunits.to_constant ();
> +	  vec_perm_builder sel (nunits, nelts, 1);
> +	  int delta = -pos.to_constant ();
> +	  for (int i = 0; i < nelts; ++i)
> +	    sel.quick_push ((i - delta) % nelts);
> +	  vec_perm_indices indices (sel, 1, nunits);

Thanks for doing this, looks good.  But I don't think the to_constant
calls are safe.  new_mode and pos could in principle be nonconstant.

To build a stepped pattern, we just need:

    vec_perm_builder sel (nunits, 1, 3);

and then push pos, pos + 1, and pos + 2 to it.  There's no need to
clamp the position to nelts, it happens automatically.

> +
> +	  if (can_vec_perm_const_p (new_mode, new_mode, indices, false))
> +	    {
> +	      class expand_operand ops[4];
> +	      machine_mode outermode = new_mode;
> +	      machine_mode innermode = tmode;
> +	      enum insn_code icode
> +		= direct_optab_handler (vec_perm_optab, outermode);
> +	      target = gen_reg_rtx (outermode);
> +	      if (icode != CODE_FOR_nothing)
> +		{
> +		  rtx sel = vec_perm_indices_to_rtx (outermode, indices);
> +		  create_output_operand (&ops[0], target, outermode);
> +		  ops[0].target = 1;
> +		  create_input_operand (&ops[1], op0, outermode);
> +		  create_input_operand (&ops[2], op0, outermode);
> +		  create_input_operand (&ops[3], sel, outermode);

I think this should be GET_MODE (sel).  Looks like the current
version would ICE for float vectors.  That said...

> +		  if (maybe_expand_insn (icode, 4, ops))
> +		    return simplify_gen_subreg (innermode, target, outermode, 0);
> +		}
> +	      else if (targetm.vectorize.vec_perm_const != NULL)
> +		{
> +		  if (targetm.vectorize.vec_perm_const (outermode, outermode,
> +							target, op0, op0, indices))
> +		    return simplify_gen_subreg (innermode, target, outermode, 0);
> +		}

...can we use expand_vec_perm_const here?  It will try the constant
expansion first, which is the preferred order.  It also has a few
variations up its sleeve.

Thanks,
Richard


> +	    }
> +	}
>      }
>  
>    /* See if we can get a better vector mode before extracting.  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/ext_1.c b/gcc/testsuite/gcc.target/aarch64/ext_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e571b3bc2ddf887a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/ext_1.c
> @@ -0,0 +1,54 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <string.h>
> +
> +typedef unsigned int v4si __attribute__((vector_size (16)));
> +typedef unsigned int v2si __attribute__((vector_size (8)));
> +
> +/*
> +** extract: { xfail *-*-* }
> +**	ext	v0.16b, v0.16b, v0.16b, #4
> +**	ret
> +*/
> +v2si extract (v4si x)
> +{
> +    v2si res = {x[1], x[2]};
> +    return res;
> +}
> +
> +/*
> +** extract1: { xfail *-*-* }
> +**	ext	v0.16b, v0.16b, v0.16b, #4
> +**	ret
> +*/
> +v2si extract1 (v4si x)
> +{
> +    v2si res;
> +    memcpy (&res, ((int*)&x)+1, sizeof(res));
> +    return res;
> +}
> +
> +typedef struct cast {
> +  int a;
> +  v2si b __attribute__((packed));
> +} cast_t;
> +
> +typedef union Data {
> +   v4si x;
> +   cast_t y;
> +} data;  
> +
> +/*
> +** extract2:
> +**	ext	v0.16b, v0.16b, v0.16b, #4
> +**	ret
> +*/
> +v2si extract2 (v4si x)
> +{
> +    data d;
> +    d.x = x;
> +    return d.y.b;
> +}
> +
  
Tamar Christina Nov. 11, 2022, 2:33 p.m. UTC | #3
Hi,

> 
> ...can we use expand_vec_perm_const here?  It will try the constant
> expansion first, which is the preferred order.  It also has a few variations up
> its sleeve.
> 

We can, however it this function seems to be incorrectly assuming it can always
Convert the input mode to a QI vector mode.  When I started using it we got a number
of miscompilations in the AArch64 codegen.  This had the knock-on effect of uncovering
bugs in both the AArch64 backend and i386.  I'll send patched out for those separately.

For now here's the new patch using that hook and updating the permute expansion code:

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* expmed.cc (extract_bit_field_1): Add support for vector element
	extracts.
	* optabs.cc (expand_vec_perm_const): Add checks before converting
	permute to QImode fallback.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/ext_1.c: New.

--- inline copy of patch ---

diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index bab020c07222afa38305ef8d7333f271b1965b78..7d38045ae525c8a4665a0c1384fc515e4de88c67 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -1718,6 +1718,21 @@ extract_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, poly_uint64 bitnum,
 	      return target;
 	    }
 	}
+      else if (!known_eq (bitnum, 0U)
+	       && multiple_p (GET_MODE_UNIT_BITSIZE (tmode), bitnum, &pos))
+	{
+	  /* The encoding has a single stepped pattern.  */
+	  poly_uint64 nunits = GET_MODE_NUNITS (new_mode);
+	  vec_perm_builder sel (nunits, 1, 3);
+	  sel.quick_push (pos);
+	  sel.quick_push (pos + 1);
+	  sel.quick_push (pos + 2);
+
+	  rtx res
+	    = expand_vec_perm_const (new_mode, op0, op0, sel, new_mode, NULL);
+	  if (res)
+	    return simplify_gen_subreg (tmode, res, new_mode, 0);
+	}
     }
 
   /* See if we can get a better vector mode before extracting.  */
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index cff37ccb0dfc3dd79b97d0abfd872f340855dc96..f338df410265dfe55b6896160090a453cc6a28d9 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -6267,6 +6267,7 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
       v0_qi = gen_lowpart (qimode, v0);
       v1_qi = gen_lowpart (qimode, v1);
       if (targetm.vectorize.vec_perm_const != NULL
+	  && targetm.can_change_mode_class (mode, qimode, ALL_REGS)
 	  && targetm.vectorize.vec_perm_const (qimode, qimode, target_qi, v0_qi,
 					       v1_qi, qimode_indices))
 	return gen_lowpart (mode, target_qi);
@@ -6311,7 +6312,8 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
     }
 
   if (qimode != VOIDmode
-      && selector_fits_mode_p (qimode, qimode_indices))
+      && selector_fits_mode_p (qimode, qimode_indices)
+      && targetm.can_change_mode_class (mode, qimode, ALL_REGS))
     {
       icode = direct_optab_handler (vec_perm_optab, qimode);
       if (icode != CODE_FOR_nothing)
diff --git a/gcc/testsuite/gcc.target/aarch64/ext_1.c b/gcc/testsuite/gcc.target/aarch64/ext_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e571b3bc2ddf887a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ext_1.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <string.h>
+
+typedef unsigned int v4si __attribute__((vector_size (16)));
+typedef unsigned int v2si __attribute__((vector_size (8)));
+
+/*
+** extract: { xfail *-*-* }
+**	ext	v0.16b, v0.16b, v0.16b, #4
+**	ret
+*/
+v2si extract (v4si x)
+{
+    v2si res = {x[1], x[2]};
+    return res;
+}
+
+/*
+** extract1: { xfail *-*-* }
+**	ext	v0.16b, v0.16b, v0.16b, #4
+**	ret
+*/
+v2si extract1 (v4si x)
+{
+    v2si res;
+    memcpy (&res, ((int*)&x)+1, sizeof(res));
+    return res;
+}
+
+typedef struct cast {
+  int a;
+  v2si b __attribute__((packed));
+} cast_t;
+
+typedef union Data {
+   v4si x;
+   cast_t y;
+} data;  
+
+/*
+** extract2:
+**	ext	v0.16b, v0.16b, v0.16b, #4
+**	ret
+*/
+v2si extract2 (v4si x)
+{
+    data d;
+    d.x = x;
+    return d.y.b;
+}
+
  
Hongtao Liu Nov. 15, 2022, 8:35 a.m. UTC | #4
Hi:
  I'm from https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606040.html.
>      }
>
>    /* See if we can get a better vector mode before extracting.  */
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc
> index cff37ccb0dfc3dd79b97d0abfd872f340855dc96..f338df410265dfe55b6896160090a453cc6a28d9 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -6267,6 +6267,7 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
>        v0_qi = gen_lowpart (qimode, v0);
>        v1_qi = gen_lowpart (qimode, v1);
>        if (targetm.vectorize.vec_perm_const != NULL
> +         && targetm.can_change_mode_class (mode, qimode, ALL_REGS)
It looks like you want to guard gen_lowpart, shouldn't it be better to
use validate_subreg  or (tmp = gen_lowpart_if_possible (mode,
target_qi)).
IMHO, targetm.can_change_mode_class is mostly used for RA, but not to
guard gen_lowpart.
I did similar things in
https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579296.html
(and ALL_REGS doesn't cover all cases for registers which are both
available for qimode and mode, ALL_REGS fail doesn't mean it can't be
subreg, it just means parts of ALL_REGS can't be subreg. but with a
subset of ALL_REGS, there could be a reg class which return true for
targetm.can_change_mode_class)
>           && targetm.vectorize.vec_perm_const (qimode, qimode, target_qi, v0_qi,
>                                                v1_qi, qimode_indices))
>         return gen_lowpart (mode, target_qi);
> @@ -6311,7 +6312,8 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
>      }
>
>    if (qimode != VOIDmode
> -      && selector_fits_mode_p (qimode, qimode_indices))
> +      && selector_fits_mode_p (qimode, qimode_indices)
> +      && targetm.can_change_mode_class (mode, qimode, ALL_REGS))
>      {
>        icode = direct_optab_handler (vec_perm_optab, qimode);
>        if (icode != CODE_FOR_nothing)
> diff --git a/gcc/testsuite/gcc.target/aarch64/ext_1.c b/gcc/testsuite/gcc.target/aarch64/ext_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e571b3bc2ddf887a
  
Tamar Christina Nov. 15, 2022, 8:51 a.m. UTC | #5
> -----Original Message-----
> From: Hongtao Liu <crazylht@gmail.com>
> Sent: Tuesday, November 15, 2022 8:36 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
> Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> rguenther@suse.de
> Subject: Re: [PATCH 3/8]middle-end: Support extractions of subvectors from
> arbitrary element position inside a vector
> 
> Hi:
>   I'm from https://gcc.gnu.org/pipermail/gcc-patches/2022-
> November/606040.html.
> >      }
> >
> >    /* See if we can get a better vector mode before extracting.  */
> > diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> >
> cff37ccb0dfc3dd79b97d0abfd872f340855dc96..f338df410265dfe55b689616009
> 0
> > a453cc6a28d9 100644
> > --- a/gcc/optabs.cc
> > +++ b/gcc/optabs.cc
> > @@ -6267,6 +6267,7 @@ expand_vec_perm_const (machine_mode mode,
> rtx v0, rtx v1,
> >        v0_qi = gen_lowpart (qimode, v0);
> >        v1_qi = gen_lowpart (qimode, v1);
> >        if (targetm.vectorize.vec_perm_const != NULL
> > +         && targetm.can_change_mode_class (mode, qimode, ALL_REGS)
> It looks like you want to guard gen_lowpart, shouldn't it be better to use
> validate_subreg  or (tmp = gen_lowpart_if_possible (mode, target_qi)).
> IMHO, targetm.can_change_mode_class is mostly used for RA, but not to
> guard gen_lowpart.

Hmm I don't think this is quite true, there are existing usages in expr.cc and rtanal.cc
That do this and aren't part of RA.  As I mentioned before for instance the
canoncalization of vec_select to subreg in rtlanal for instances uses this.

So there are already existing precedence for this.  And the documentation for
the hook says:

"This hook returns true if it is possible to bitcast values held in registers of class rclass from mode from to mode to and if doing so preserves the low-order bits that are common to both modes. The result is only meaningful if rclass has registers that can hold both from and to. The default implementation returns true"

So it looks like it's use outside of RA is perfectly valid.. and the documentation also mentions
in the example the use from the mid-end as an example.

But if the mid-end maintainers are happy I'll use something else.

Tamar

> I did similar things in
> https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579296.html
> (and ALL_REGS doesn't cover all cases for registers which are both available
> for qimode and mode, ALL_REGS fail doesn't mean it can't be subreg, it just
> means parts of ALL_REGS can't be subreg. but with a subset of ALL_REGS,
> there could be a reg class which return true for
> targetm.can_change_mode_class)
> >           && targetm.vectorize.vec_perm_const (qimode, qimode, target_qi,
> v0_qi,
> >                                                v1_qi, qimode_indices))
> >         return gen_lowpart (mode, target_qi); @@ -6311,7 +6312,8 @@
> > expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
> >      }
> >
> >    if (qimode != VOIDmode
> > -      && selector_fits_mode_p (qimode, qimode_indices))
> > +      && selector_fits_mode_p (qimode, qimode_indices)
> > +      && targetm.can_change_mode_class (mode, qimode, ALL_REGS))
> >      {
> >        icode = direct_optab_handler (vec_perm_optab, qimode);
> >        if (icode != CODE_FOR_nothing)
> > diff --git a/gcc/testsuite/gcc.target/aarch64/ext_1.c
> > b/gcc/testsuite/gcc.target/aarch64/ext_1.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e5
> 71
> > b3bc2ddf887a
> 
> 
> 
> 
> --
> BR,
> Hongtao
  
Hongtao Liu Nov. 15, 2022, 9:37 a.m. UTC | #6
On Tue, Nov 15, 2022 at 4:51 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> > -----Original Message-----
> > From: Hongtao Liu <crazylht@gmail.com>
> > Sent: Tuesday, November 15, 2022 8:36 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
> > Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> > rguenther@suse.de
> > Subject: Re: [PATCH 3/8]middle-end: Support extractions of subvectors from
> > arbitrary element position inside a vector
> >
> > Hi:
> >   I'm from https://gcc.gnu.org/pipermail/gcc-patches/2022-
> > November/606040.html.
> > >      }
> > >
> > >    /* See if we can get a better vector mode before extracting.  */
> > > diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> > >
> > cff37ccb0dfc3dd79b97d0abfd872f340855dc96..f338df410265dfe55b689616009
> > 0
> > > a453cc6a28d9 100644
> > > --- a/gcc/optabs.cc
> > > +++ b/gcc/optabs.cc
> > > @@ -6267,6 +6267,7 @@ expand_vec_perm_const (machine_mode mode,
> > rtx v0, rtx v1,
> > >        v0_qi = gen_lowpart (qimode, v0);
> > >        v1_qi = gen_lowpart (qimode, v1);
> > >        if (targetm.vectorize.vec_perm_const != NULL
> > > +         && targetm.can_change_mode_class (mode, qimode, ALL_REGS)
> > It looks like you want to guard gen_lowpart, shouldn't it be better to use
> > validate_subreg  or (tmp = gen_lowpart_if_possible (mode, target_qi)).
> > IMHO, targetm.can_change_mode_class is mostly used for RA, but not to
> > guard gen_lowpart.
>
> Hmm I don't think this is quite true, there are existing usages in expr.cc and rtanal.cc
> That do this and aren't part of RA.  As I mentioned before for instance the
> canoncalization of vec_select to subreg in rtlanal for instances uses this.
In theory, we need to iterate through all reg classes that can be
assigned for both qimode and mode, if any regclass returns true for
targetm.can_change_mode_class, the bitcast(validate_subreg) should be
ok.
Here we just passed ALL_REGS.
>
> So there are already existing precedence for this.  And the documentation for
> the hook says:
>
> "This hook returns true if it is possible to bitcast values held in registers of class rclass from mode from to mode to and if doing so preserves the low-order bits that are common to both modes. The result is only meaningful if rclass has registers that can hold both from and to. The default implementation returns true"
>
> So it looks like it's use outside of RA is perfectly valid.. and the documentation also mentions
> in the example the use from the mid-end as an example.
>
> But if the mid-end maintainers are happy I'll use something else.
>
> Tamar
>
> > I did similar things in
> > https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579296.html
> > (and ALL_REGS doesn't cover all cases for registers which are both available
> > for qimode and mode, ALL_REGS fail doesn't mean it can't be subreg, it just
> > means parts of ALL_REGS can't be subreg. but with a subset of ALL_REGS,
> > there could be a reg class which return true for
> > targetm.can_change_mode_class)
> > >           && targetm.vectorize.vec_perm_const (qimode, qimode, target_qi,
> > v0_qi,
> > >                                                v1_qi, qimode_indices))
> > >         return gen_lowpart (mode, target_qi); @@ -6311,7 +6312,8 @@
> > > expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
> > >      }
> > >
> > >    if (qimode != VOIDmode
> > > -      && selector_fits_mode_p (qimode, qimode_indices))
> > > +      && selector_fits_mode_p (qimode, qimode_indices)
> > > +      && targetm.can_change_mode_class (mode, qimode, ALL_REGS))
> > >      {
> > >        icode = direct_optab_handler (vec_perm_optab, qimode);
> > >        if (icode != CODE_FOR_nothing)
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/ext_1.c
> > > b/gcc/testsuite/gcc.target/aarch64/ext_1.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e5
> > 71
> > > b3bc2ddf887a
> >
> >
> >
> >
> > --
> > BR,
> > Hongtao
  
Tamar Christina Nov. 15, 2022, 10 a.m. UTC | #7
> -----Original Message-----
> From: Hongtao Liu <crazylht@gmail.com>
> Sent: Tuesday, November 15, 2022 9:37 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
> Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> rguenther@suse.de
> Subject: Re: [PATCH 3/8]middle-end: Support extractions of subvectors from
> arbitrary element position inside a vector
> 
> On Tue, Nov 15, 2022 at 4:51 PM Tamar Christina
> <Tamar.Christina@arm.com> wrote:
> >
> > > -----Original Message-----
> > > From: Hongtao Liu <crazylht@gmail.com>
> > > Sent: Tuesday, November 15, 2022 8:36 AM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina
> > > via Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> > > rguenther@suse.de
> > > Subject: Re: [PATCH 3/8]middle-end: Support extractions of
> > > subvectors from arbitrary element position inside a vector
> > >
> > > Hi:
> > >   I'm from https://gcc.gnu.org/pipermail/gcc-patches/2022-
> > > November/606040.html.
> > > >      }
> > > >
> > > >    /* See if we can get a better vector mode before extracting.
> > > > */ diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> > > >
> > >
> cff37ccb0dfc3dd79b97d0abfd872f340855dc96..f338df410265dfe55b68961600
> > > 9
> > > 0
> > > > a453cc6a28d9 100644
> > > > --- a/gcc/optabs.cc
> > > > +++ b/gcc/optabs.cc
> > > > @@ -6267,6 +6267,7 @@ expand_vec_perm_const (machine_mode
> mode,
> > > rtx v0, rtx v1,
> > > >        v0_qi = gen_lowpart (qimode, v0);
> > > >        v1_qi = gen_lowpart (qimode, v1);
> > > >        if (targetm.vectorize.vec_perm_const != NULL
> > > > +         && targetm.can_change_mode_class (mode, qimode,
> > > > + ALL_REGS)
> > > It looks like you want to guard gen_lowpart, shouldn't it be better
> > > to use validate_subreg  or (tmp = gen_lowpart_if_possible (mode,
> target_qi)).
> > > IMHO, targetm.can_change_mode_class is mostly used for RA, but not
> > > to guard gen_lowpart.
> >
> > Hmm I don't think this is quite true, there are existing usages in
> > expr.cc and rtanal.cc That do this and aren't part of RA.  As I
> > mentioned before for instance the canoncalization of vec_select to subreg
> in rtlanal for instances uses this.
> In theory, we need to iterate through all reg classes that can be assigned for
> both qimode and mode, if any regclass returns true for
> targetm.can_change_mode_class, the bitcast(validate_subreg) should be ok.
> Here we just passed ALL_REGS.

Yes, and most targets where this transformation is valid return true here.

I've checked:
 * alpha
 * arm
 * aarch64
 * rs6000
 * s390
 * sparc
 * pa
 * mips

And even the default example that other targets use from the documentation
would return true as the size of the modes are the same.

X86 and RISCV are the only two targets that I found (but didn't check all) that
blankly return a result based on just the register classes.

That is to say, there are more targets that adhere to the interpretation that
rclass here means "should be possible in some class in rclass" rather than
"should be possible in ALL classes of rclass".

> >
> > So there are already existing precedence for this.  And the
> > documentation for the hook says:
> >
> > "This hook returns true if it is possible to bitcast values held in registers of
> class rclass from mode from to mode to and if doing so preserves the low-
> order bits that are common to both modes. The result is only meaningful if
> rclass has registers that can hold both from and to. The default
> implementation returns true"
> >
> > So it looks like it's use outside of RA is perfectly valid.. and the
> > documentation also mentions in the example the use from the mid-end as
> an example.
> >
> > But if the mid-end maintainers are happy I'll use something else.
> >
> > Tamar
> >
> > > I did similar things in
> > > https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579296.html
> > > (and ALL_REGS doesn't cover all cases for registers which are both
> > > available for qimode and mode, ALL_REGS fail doesn't mean it can't
> > > be subreg, it just means parts of ALL_REGS can't be subreg. but with
> > > a subset of ALL_REGS, there could be a reg class which return true
> > > for
> > > targetm.can_change_mode_class)
> > > >           && targetm.vectorize.vec_perm_const (qimode, qimode,
> > > > target_qi,
> > > v0_qi,
> > > >                                                v1_qi, qimode_indices))
> > > >         return gen_lowpart (mode, target_qi); @@ -6311,7 +6312,8
> > > > @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
> > > >      }
> > > >
> > > >    if (qimode != VOIDmode
> > > > -      && selector_fits_mode_p (qimode, qimode_indices))
> > > > +      && selector_fits_mode_p (qimode, qimode_indices)
> > > > +      && targetm.can_change_mode_class (mode, qimode, ALL_REGS))
> > > >      {
> > > >        icode = direct_optab_handler (vec_perm_optab, qimode);
> > > >        if (icode != CODE_FOR_nothing) diff --git
> > > > a/gcc/testsuite/gcc.target/aarch64/ext_1.c
> > > > b/gcc/testsuite/gcc.target/aarch64/ext_1.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e5
> > > 71
> > > > b3bc2ddf887a
> > >
> > >
> > >
> > >
> > > --
> > > BR,
> > > Hongtao
> 
> 
> 
> --
> BR,
> Hongtao
  
Richard Sandiford Nov. 15, 2022, 5:39 p.m. UTC | #8
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Hongtao Liu <crazylht@gmail.com>
>> Sent: Tuesday, November 15, 2022 9:37 AM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
>> Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
>> rguenther@suse.de
>> Subject: Re: [PATCH 3/8]middle-end: Support extractions of subvectors from
>> arbitrary element position inside a vector
>>
>> On Tue, Nov 15, 2022 at 4:51 PM Tamar Christina
>> <Tamar.Christina@arm.com> wrote:
>> >
>> > > -----Original Message-----
>> > > From: Hongtao Liu <crazylht@gmail.com>
>> > > Sent: Tuesday, November 15, 2022 8:36 AM
>> > > To: Tamar Christina <Tamar.Christina@arm.com>
>> > > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina
>> > > via Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
>> > > rguenther@suse.de
>> > > Subject: Re: [PATCH 3/8]middle-end: Support extractions of
>> > > subvectors from arbitrary element position inside a vector
>> > >
>> > > Hi:
>> > >   I'm from https://gcc.gnu.org/pipermail/gcc-patches/2022-
>> > > November/606040.html.
>> > > >      }
>> > > >
>> > > >    /* See if we can get a better vector mode before extracting.
>> > > > */ diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
>> > > >
>> > >
>> cff37ccb0dfc3dd79b97d0abfd872f340855dc96..f338df410265dfe55b68961600
>> > > 9
>> > > 0
>> > > > a453cc6a28d9 100644
>> > > > --- a/gcc/optabs.cc
>> > > > +++ b/gcc/optabs.cc
>> > > > @@ -6267,6 +6267,7 @@ expand_vec_perm_const (machine_mode
>> mode,
>> > > rtx v0, rtx v1,
>> > > >        v0_qi = gen_lowpart (qimode, v0);
>> > > >        v1_qi = gen_lowpart (qimode, v1);
>> > > >        if (targetm.vectorize.vec_perm_const != NULL
>> > > > +         && targetm.can_change_mode_class (mode, qimode,
>> > > > + ALL_REGS)
>> > > It looks like you want to guard gen_lowpart, shouldn't it be better
>> > > to use validate_subreg  or (tmp = gen_lowpart_if_possible (mode,
>> target_qi)).
>> > > IMHO, targetm.can_change_mode_class is mostly used for RA, but not
>> > > to guard gen_lowpart.
>> >
>> > Hmm I don't think this is quite true, there are existing usages in
>> > expr.cc and rtanal.cc That do this and aren't part of RA.  As I
>> > mentioned before for instance the canoncalization of vec_select to subreg
>> in rtlanal for instances uses this.
>> In theory, we need to iterate through all reg classes that can be assigned for
>> both qimode and mode, if any regclass returns true for
>> targetm.can_change_mode_class, the bitcast(validate_subreg) should be ok.
>> Here we just passed ALL_REGS.
>
> Yes, and most targets where this transformation is valid return true here.
>
> I've checked:
>  * alpha
>  * arm
>  * aarch64
>  * rs6000
>  * s390
>  * sparc
>  * pa
>  * mips
>
> And even the default example that other targets use from the documentation
> would return true as the size of the modes are the same.
>
> X86 and RISCV are the only two targets that I found (but didn't check all) that
> blankly return a result based on just the register classes.
>
> That is to say, there are more targets that adhere to the interpretation that
> rclass here means "should be possible in some class in rclass" rather than
> "should be possible in ALL classes of rclass".

Yeah, I agree.  A query "can something stored in ALL_REGS change from
mode M1 to mode M2?" is meaningful if at least one register R in ALL_REGS
can hold both M1 and M2.  It's then the target's job to answer
conservatively so that the result covers all such R.

In principle it's OK for a target to err on the side of caution and forbid
things that are actually OK.  But that's going to risk losing performance
in some cases, and sometimes that loss of performance will be unacceptable.
IMO that's what's happening here.  The target is applying x87 rules to
things that (AIUI) are never stored in x87 registers, and so losing
performance as a result.

Note that the RA also uses ALL_REGS for some things, so this usage
isn't specific to non-RA code.

IMO it's not the job of target-independent code to iterate through
individual classes and aggregate the result.  One of the reasons for
having union classes is to avoid the need to do that.  And ALL_REGS
is the ultimate union class. :-)

The patch looks correct to me.

Thanks,
Richard

>> >
>> > So there are already existing precedence for this.  And the
>> > documentation for the hook says:
>> >
>> > "This hook returns true if it is possible to bitcast values held in registers of
>> class rclass from mode from to mode to and if doing so preserves the low-
>> order bits that are common to both modes. The result is only meaningful if
>> rclass has registers that can hold both from and to. The default
>> implementation returns true"
>> >
>> > So it looks like it's use outside of RA is perfectly valid.. and the
>> > documentation also mentions in the example the use from the mid-end as
>> an example.
>> >
>> > But if the mid-end maintainers are happy I'll use something else.
>> >
>> > Tamar
>> >
>> > > I did similar things in
>> > > https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579296.html
>> > > (and ALL_REGS doesn't cover all cases for registers which are both
>> > > available for qimode and mode, ALL_REGS fail doesn't mean it can't
>> > > be subreg, it just means parts of ALL_REGS can't be subreg. but with
>> > > a subset of ALL_REGS, there could be a reg class which return true
>> > > for
>> > > targetm.can_change_mode_class)
>> > > >           && targetm.vectorize.vec_perm_const (qimode, qimode,
>> > > > target_qi,
>> > > v0_qi,
>> > > >                                                v1_qi, qimode_indices))
>> > > >         return gen_lowpart (mode, target_qi); @@ -6311,7 +6312,8
>> > > > @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
>> > > >      }
>> > > >
>> > > >    if (qimode != VOIDmode
>> > > > -      && selector_fits_mode_p (qimode, qimode_indices))
>> > > > +      && selector_fits_mode_p (qimode, qimode_indices)
>> > > > +      && targetm.can_change_mode_class (mode, qimode, ALL_REGS))
>> > > >      {
>> > > >        icode = direct_optab_handler (vec_perm_optab, qimode);
>> > > >        if (icode != CODE_FOR_nothing) diff --git
>> > > > a/gcc/testsuite/gcc.target/aarch64/ext_1.c
>> > > > b/gcc/testsuite/gcc.target/aarch64/ext_1.c
>> > > > new file mode 100644
>> > > > index
>> > > >
>> > >
>> 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e5
>> > > 71
>> > > > b3bc2ddf887a
>> > >
>> > >
>> > >
>> > >
>> > > --
>> > > BR,
>> > > Hongtao
>>
>>
>>
>> --
>> BR,
>> Hongtao
  
Hongtao Liu Nov. 17, 2022, 8:04 a.m. UTC | #9
On Wed, Nov 16, 2022 at 1:39 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Hongtao Liu <crazylht@gmail.com>
> >> Sent: Tuesday, November 15, 2022 9:37 AM
> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
> >> Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> >> rguenther@suse.de
> >> Subject: Re: [PATCH 3/8]middle-end: Support extractions of subvectors from
> >> arbitrary element position inside a vector
> >>
> >> On Tue, Nov 15, 2022 at 4:51 PM Tamar Christina
> >> <Tamar.Christina@arm.com> wrote:
> >> >
> >> > > -----Original Message-----
> >> > > From: Hongtao Liu <crazylht@gmail.com>
> >> > > Sent: Tuesday, November 15, 2022 8:36 AM
> >> > > To: Tamar Christina <Tamar.Christina@arm.com>
> >> > > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina
> >> > > via Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> >> > > rguenther@suse.de
> >> > > Subject: Re: [PATCH 3/8]middle-end: Support extractions of
> >> > > subvectors from arbitrary element position inside a vector
> >> > >
> >> > > Hi:
> >> > >   I'm from https://gcc.gnu.org/pipermail/gcc-patches/2022-
> >> > > November/606040.html.
> >> > > >      }
> >> > > >
> >> > > >    /* See if we can get a better vector mode before extracting.
> >> > > > */ diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> >> > > >
> >> > >
> >> cff37ccb0dfc3dd79b97d0abfd872f340855dc96..f338df410265dfe55b68961600
> >> > > 9
> >> > > 0
> >> > > > a453cc6a28d9 100644
> >> > > > --- a/gcc/optabs.cc
> >> > > > +++ b/gcc/optabs.cc
> >> > > > @@ -6267,6 +6267,7 @@ expand_vec_perm_const (machine_mode
> >> mode,
> >> > > rtx v0, rtx v1,
> >> > > >        v0_qi = gen_lowpart (qimode, v0);
> >> > > >        v1_qi = gen_lowpart (qimode, v1);
> >> > > >        if (targetm.vectorize.vec_perm_const != NULL
> >> > > > +         && targetm.can_change_mode_class (mode, qimode,
> >> > > > + ALL_REGS)
> >> > > It looks like you want to guard gen_lowpart, shouldn't it be better
> >> > > to use validate_subreg  or (tmp = gen_lowpart_if_possible (mode,
> >> target_qi)).
> >> > > IMHO, targetm.can_change_mode_class is mostly used for RA, but not
> >> > > to guard gen_lowpart.
> >> >
> >> > Hmm I don't think this is quite true, there are existing usages in
> >> > expr.cc and rtanal.cc That do this and aren't part of RA.  As I
> >> > mentioned before for instance the canoncalization of vec_select to subreg
> >> in rtlanal for instances uses this.
> >> In theory, we need to iterate through all reg classes that can be assigned for
> >> both qimode and mode, if any regclass returns true for
> >> targetm.can_change_mode_class, the bitcast(validate_subreg) should be ok.
> >> Here we just passed ALL_REGS.
> >
> > Yes, and most targets where this transformation is valid return true here.
> >
> > I've checked:
> >  * alpha
> >  * arm
> >  * aarch64
> >  * rs6000
> >  * s390
> >  * sparc
> >  * pa
> >  * mips
> >
> > And even the default example that other targets use from the documentation
> > would return true as the size of the modes are the same.
> >
> > X86 and RISCV are the only two targets that I found (but didn't check all) that
> > blankly return a result based on just the register classes.
> >
> > That is to say, there are more targets that adhere to the interpretation that
> > rclass here means "should be possible in some class in rclass" rather than
> > "should be possible in ALL classes of rclass".
>
> Yeah, I agree.  A query "can something stored in ALL_REGS change from
> mode M1 to mode M2?" is meaningful if at least one register R in ALL_REGS
> can hold both M1 and M2.  It's then the target's job to answer
> conservatively so that the result covers all such R.
>
> In principle it's OK for a target to err on the side of caution and forbid
> things that are actually OK.  But that's going to risk losing performance
> in some cases, and sometimes that loss of performance will be unacceptable.
> IMO that's what's happening here.  The target is applying x87 rules to
> things that (AIUI) are never stored in x87 registers, and so losing
Yes, it can be optimized since some mode will never assigned to x87 registers.
> performance as a result.
>
> Note that the RA also uses ALL_REGS for some things, so this usage
> isn't specific to non-RA code.
RA passes the minimal reg class(REGNO_REG_CLASS) which contains REGN
to decide if can_change_mode_class, not ALL_REGS.
511/* Given a hard REGN a FROM mode and a TO mode, return true if
512   REGN can change from mode FROM to mode TO.  */
513#define REG_CAN_CHANGE_MODE_P(REGN, FROM, TO)                          \
514  (targetm.can_change_mode_class (FROM, TO, REGNO_REG_CLASS (REGN)))
515

So I still think using can_change_mode_class outside of RA with
ALL_REGS passed to decide whether it's ok to generate subreg is not a
good idea.
>
> IMO it's not the job of target-independent code to iterate through
> individual classes and aggregate the result.  One of the reasons for
> having union classes is to avoid the need to do that.  And ALL_REGS
> is the ultimate union class. :-)
>
> The patch looks correct to me.
>
> Thanks,
> Richard
>
> >> >
> >> > So there are already existing precedence for this.  And the
> >> > documentation for the hook says:
> >> >
> >> > "This hook returns true if it is possible to bitcast values held in registers of
> >> class rclass from mode from to mode to and if doing so preserves the low-
> >> order bits that are common to both modes. The result is only meaningful if
> >> rclass has registers that can hold both from and to. The default
> >> implementation returns true"
> >> >
> >> > So it looks like it's use outside of RA is perfectly valid.. and the
> >> > documentation also mentions in the example the use from the mid-end as
> >> an example.
> >> >
> >> > But if the mid-end maintainers are happy I'll use something else.
> >> >
> >> > Tamar
> >> >
> >> > > I did similar things in
> >> > > https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579296.html
> >> > > (and ALL_REGS doesn't cover all cases for registers which are both
> >> > > available for qimode and mode, ALL_REGS fail doesn't mean it can't
> >> > > be subreg, it just means parts of ALL_REGS can't be subreg. but with
> >> > > a subset of ALL_REGS, there could be a reg class which return true
> >> > > for
> >> > > targetm.can_change_mode_class)
> >> > > >           && targetm.vectorize.vec_perm_const (qimode, qimode,
> >> > > > target_qi,
> >> > > v0_qi,
> >> > > >                                                v1_qi, qimode_indices))
> >> > > >         return gen_lowpart (mode, target_qi); @@ -6311,7 +6312,8
> >> > > > @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
> >> > > >      }
> >> > > >
> >> > > >    if (qimode != VOIDmode
> >> > > > -      && selector_fits_mode_p (qimode, qimode_indices))
> >> > > > +      && selector_fits_mode_p (qimode, qimode_indices)
> >> > > > +      && targetm.can_change_mode_class (mode, qimode, ALL_REGS))
> >> > > >      {
> >> > > >        icode = direct_optab_handler (vec_perm_optab, qimode);
> >> > > >        if (icode != CODE_FOR_nothing) diff --git
> >> > > > a/gcc/testsuite/gcc.target/aarch64/ext_1.c
> >> > > > b/gcc/testsuite/gcc.target/aarch64/ext_1.c
> >> > > > new file mode 100644
> >> > > > index
> >> > > >
> >> > >
> >> 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e5
> >> > > 71
> >> > > > b3bc2ddf887a
> >> > >
> >> > >
> >> > >
> >> > >
> >> > > --
> >> > > BR,
> >> > > Hongtao
> >>
> >>
> >>
> >> --
> >> BR,
> >> Hongtao
  
Richard Sandiford Nov. 17, 2022, 9:39 a.m. UTC | #10
Hongtao Liu <crazylht@gmail.com> writes:
> On Wed, Nov 16, 2022 at 1:39 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Tamar Christina <Tamar.Christina@arm.com> writes:
>> >> -----Original Message-----
>> >> From: Hongtao Liu <crazylht@gmail.com>
>> >> Sent: Tuesday, November 15, 2022 9:37 AM
>> >> To: Tamar Christina <Tamar.Christina@arm.com>
>> >> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
>> >> Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
>> >> rguenther@suse.de
>> >> Subject: Re: [PATCH 3/8]middle-end: Support extractions of subvectors from
>> >> arbitrary element position inside a vector
>> >>
>> >> On Tue, Nov 15, 2022 at 4:51 PM Tamar Christina
>> >> <Tamar.Christina@arm.com> wrote:
>> >> >
>> >> > > -----Original Message-----
>> >> > > From: Hongtao Liu <crazylht@gmail.com>
>> >> > > Sent: Tuesday, November 15, 2022 8:36 AM
>> >> > > To: Tamar Christina <Tamar.Christina@arm.com>
>> >> > > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina
>> >> > > via Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
>> >> > > rguenther@suse.de
>> >> > > Subject: Re: [PATCH 3/8]middle-end: Support extractions of
>> >> > > subvectors from arbitrary element position inside a vector
>> >> > >
>> >> > > Hi:
>> >> > >   I'm from https://gcc.gnu.org/pipermail/gcc-patches/2022-
>> >> > > November/606040.html.
>> >> > > >      }
>> >> > > >
>> >> > > >    /* See if we can get a better vector mode before extracting.
>> >> > > > */ diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
>> >> > > >
>> >> > >
>> >> cff37ccb0dfc3dd79b97d0abfd872f340855dc96..f338df410265dfe55b68961600
>> >> > > 9
>> >> > > 0
>> >> > > > a453cc6a28d9 100644
>> >> > > > --- a/gcc/optabs.cc
>> >> > > > +++ b/gcc/optabs.cc
>> >> > > > @@ -6267,6 +6267,7 @@ expand_vec_perm_const (machine_mode
>> >> mode,
>> >> > > rtx v0, rtx v1,
>> >> > > >        v0_qi = gen_lowpart (qimode, v0);
>> >> > > >        v1_qi = gen_lowpart (qimode, v1);
>> >> > > >        if (targetm.vectorize.vec_perm_const != NULL
>> >> > > > +         && targetm.can_change_mode_class (mode, qimode,
>> >> > > > + ALL_REGS)
>> >> > > It looks like you want to guard gen_lowpart, shouldn't it be better
>> >> > > to use validate_subreg  or (tmp = gen_lowpart_if_possible (mode,
>> >> target_qi)).
>> >> > > IMHO, targetm.can_change_mode_class is mostly used for RA, but not
>> >> > > to guard gen_lowpart.
>> >> >
>> >> > Hmm I don't think this is quite true, there are existing usages in
>> >> > expr.cc and rtanal.cc That do this and aren't part of RA.  As I
>> >> > mentioned before for instance the canoncalization of vec_select to subreg
>> >> in rtlanal for instances uses this.
>> >> In theory, we need to iterate through all reg classes that can be assigned for
>> >> both qimode and mode, if any regclass returns true for
>> >> targetm.can_change_mode_class, the bitcast(validate_subreg) should be ok.
>> >> Here we just passed ALL_REGS.
>> >
>> > Yes, and most targets where this transformation is valid return true here.
>> >
>> > I've checked:
>> >  * alpha
>> >  * arm
>> >  * aarch64
>> >  * rs6000
>> >  * s390
>> >  * sparc
>> >  * pa
>> >  * mips
>> >
>> > And even the default example that other targets use from the documentation
>> > would return true as the size of the modes are the same.
>> >
>> > X86 and RISCV are the only two targets that I found (but didn't check all) that
>> > blankly return a result based on just the register classes.
>> >
>> > That is to say, there are more targets that adhere to the interpretation that
>> > rclass here means "should be possible in some class in rclass" rather than
>> > "should be possible in ALL classes of rclass".
>>
>> Yeah, I agree.  A query "can something stored in ALL_REGS change from
>> mode M1 to mode M2?" is meaningful if at least one register R in ALL_REGS
>> can hold both M1 and M2.  It's then the target's job to answer
>> conservatively so that the result covers all such R.
>>
>> In principle it's OK for a target to err on the side of caution and forbid
>> things that are actually OK.  But that's going to risk losing performance
>> in some cases, and sometimes that loss of performance will be unacceptable.
>> IMO that's what's happening here.  The target is applying x87 rules to
>> things that (AIUI) are never stored in x87 registers, and so losing
> Yes, it can be optimized since some mode will never assigned to x87 registers.
>> performance as a result.
>>
>> Note that the RA also uses ALL_REGS for some things, so this usage
>> isn't specific to non-RA code.
> RA passes the minimal reg class(REGNO_REG_CLASS) which contains REGN
> to decide if can_change_mode_class, not ALL_REGS.
> 511/* Given a hard REGN a FROM mode and a TO mode, return true if
> 512   REGN can change from mode FROM to mode TO.  */
> 513#define REG_CAN_CHANGE_MODE_P(REGN, FROM, TO)                          \
> 514  (targetm.can_change_mode_class (FROM, TO, REGNO_REG_CLASS (REGN)))
> 515
>
> So I still think using can_change_mode_class outside of RA with
> ALL_REGS passed to decide whether it's ok to generate subreg is not a
> good idea.

But if the argument is that the only valid uses of can_change_mode_class
are through this macro, the hook isn't describing a class property,
it's describing the property of individual registers.  If we say that
querying individual registers is the only valid thing to do them
we should change the hook to take a register number rather than
a class enum.

The reason we have a class-based rather than register-based interface
is because it is useful to query classes before you've picked a
specific register.

Thanks,
Richard

>> IMO it's not the job of target-independent code to iterate through
>> individual classes and aggregate the result.  One of the reasons for
>> having union classes is to avoid the need to do that.  And ALL_REGS
>> is the ultimate union class. :-)
>>
>> The patch looks correct to me.
>>
>> Thanks,
>> Richard
>>
>> >> >
>> >> > So there are already existing precedence for this.  And the
>> >> > documentation for the hook says:
>> >> >
>> >> > "This hook returns true if it is possible to bitcast values held in registers of
>> >> class rclass from mode from to mode to and if doing so preserves the low-
>> >> order bits that are common to both modes. The result is only meaningful if
>> >> rclass has registers that can hold both from and to. The default
>> >> implementation returns true"
>> >> >
>> >> > So it looks like it's use outside of RA is perfectly valid.. and the
>> >> > documentation also mentions in the example the use from the mid-end as
>> >> an example.
>> >> >
>> >> > But if the mid-end maintainers are happy I'll use something else.
>> >> >
>> >> > Tamar
>> >> >
>> >> > > I did similar things in
>> >> > > https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579296.html
>> >> > > (and ALL_REGS doesn't cover all cases for registers which are both
>> >> > > available for qimode and mode, ALL_REGS fail doesn't mean it can't
>> >> > > be subreg, it just means parts of ALL_REGS can't be subreg. but with
>> >> > > a subset of ALL_REGS, there could be a reg class which return true
>> >> > > for
>> >> > > targetm.can_change_mode_class)
>> >> > > >           && targetm.vectorize.vec_perm_const (qimode, qimode,
>> >> > > > target_qi,
>> >> > > v0_qi,
>> >> > > >                                                v1_qi, qimode_indices))
>> >> > > >         return gen_lowpart (mode, target_qi); @@ -6311,7 +6312,8
>> >> > > > @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
>> >> > > >      }
>> >> > > >
>> >> > > >    if (qimode != VOIDmode
>> >> > > > -      && selector_fits_mode_p (qimode, qimode_indices))
>> >> > > > +      && selector_fits_mode_p (qimode, qimode_indices)
>> >> > > > +      && targetm.can_change_mode_class (mode, qimode, ALL_REGS))
>> >> > > >      {
>> >> > > >        icode = direct_optab_handler (vec_perm_optab, qimode);
>> >> > > >        if (icode != CODE_FOR_nothing) diff --git
>> >> > > > a/gcc/testsuite/gcc.target/aarch64/ext_1.c
>> >> > > > b/gcc/testsuite/gcc.target/aarch64/ext_1.c
>> >> > > > new file mode 100644
>> >> > > > index
>> >> > > >
>> >> > >
>> >> 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e5
>> >> > > 71
>> >> > > > b3bc2ddf887a
>> >> > >
>> >> > >
>> >> > >
>> >> > >
>> >> > > --
>> >> > > BR,
>> >> > > Hongtao
>> >>
>> >>
>> >>
>> >> --
>> >> BR,
>> >> Hongtao
  
Hongtao Liu Nov. 17, 2022, 10:20 a.m. UTC | #11
On Thu, Nov 17, 2022 at 5:39 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Hongtao Liu <crazylht@gmail.com> writes:
> > On Wed, Nov 16, 2022 at 1:39 AM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> >> -----Original Message-----
> >> >> From: Hongtao Liu <crazylht@gmail.com>
> >> >> Sent: Tuesday, November 15, 2022 9:37 AM
> >> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
> >> >> Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> >> >> rguenther@suse.de
> >> >> Subject: Re: [PATCH 3/8]middle-end: Support extractions of subvectors from
> >> >> arbitrary element position inside a vector
> >> >>
> >> >> On Tue, Nov 15, 2022 at 4:51 PM Tamar Christina
> >> >> <Tamar.Christina@arm.com> wrote:
> >> >> >
> >> >> > > -----Original Message-----
> >> >> > > From: Hongtao Liu <crazylht@gmail.com>
> >> >> > > Sent: Tuesday, November 15, 2022 8:36 AM
> >> >> > > To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> > > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina
> >> >> > > via Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> >> >> > > rguenther@suse.de
> >> >> > > Subject: Re: [PATCH 3/8]middle-end: Support extractions of
> >> >> > > subvectors from arbitrary element position inside a vector
> >> >> > >
> >> >> > > Hi:
> >> >> > >   I'm from https://gcc.gnu.org/pipermail/gcc-patches/2022-
> >> >> > > November/606040.html.
> >> >> > > >      }
> >> >> > > >
> >> >> > > >    /* See if we can get a better vector mode before extracting.
> >> >> > > > */ diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> >> >> > > >
> >> >> > >
> >> >> cff37ccb0dfc3dd79b97d0abfd872f340855dc96..f338df410265dfe55b68961600
> >> >> > > 9
> >> >> > > 0
> >> >> > > > a453cc6a28d9 100644
> >> >> > > > --- a/gcc/optabs.cc
> >> >> > > > +++ b/gcc/optabs.cc
> >> >> > > > @@ -6267,6 +6267,7 @@ expand_vec_perm_const (machine_mode
> >> >> mode,
> >> >> > > rtx v0, rtx v1,
> >> >> > > >        v0_qi = gen_lowpart (qimode, v0);
> >> >> > > >        v1_qi = gen_lowpart (qimode, v1);
> >> >> > > >        if (targetm.vectorize.vec_perm_const != NULL
> >> >> > > > +         && targetm.can_change_mode_class (mode, qimode,
> >> >> > > > + ALL_REGS)
> >> >> > > It looks like you want to guard gen_lowpart, shouldn't it be better
> >> >> > > to use validate_subreg  or (tmp = gen_lowpart_if_possible (mode,
> >> >> target_qi)).
> >> >> > > IMHO, targetm.can_change_mode_class is mostly used for RA, but not
> >> >> > > to guard gen_lowpart.
> >> >> >
> >> >> > Hmm I don't think this is quite true, there are existing usages in
> >> >> > expr.cc and rtanal.cc That do this and aren't part of RA.  As I
> >> >> > mentioned before for instance the canoncalization of vec_select to subreg
> >> >> in rtlanal for instances uses this.
> >> >> In theory, we need to iterate through all reg classes that can be assigned for
> >> >> both qimode and mode, if any regclass returns true for
> >> >> targetm.can_change_mode_class, the bitcast(validate_subreg) should be ok.
> >> >> Here we just passed ALL_REGS.
> >> >
> >> > Yes, and most targets where this transformation is valid return true here.
> >> >
> >> > I've checked:
> >> >  * alpha
> >> >  * arm
> >> >  * aarch64
> >> >  * rs6000
> >> >  * s390
> >> >  * sparc
> >> >  * pa
> >> >  * mips
> >> >
> >> > And even the default example that other targets use from the documentation
> >> > would return true as the size of the modes are the same.
> >> >
> >> > X86 and RISCV are the only two targets that I found (but didn't check all) that
> >> > blankly return a result based on just the register classes.
> >> >
> >> > That is to say, there are more targets that adhere to the interpretation that
> >> > rclass here means "should be possible in some class in rclass" rather than
> >> > "should be possible in ALL classes of rclass".
> >>
> >> Yeah, I agree.  A query "can something stored in ALL_REGS change from
> >> mode M1 to mode M2?" is meaningful if at least one register R in ALL_REGS
> >> can hold both M1 and M2.  It's then the target's job to answer
> >> conservatively so that the result covers all such R.
> >>
> >> In principle it's OK for a target to err on the side of caution and forbid
> >> things that are actually OK.  But that's going to risk losing performance
> >> in some cases, and sometimes that loss of performance will be unacceptable.
> >> IMO that's what's happening here.  The target is applying x87 rules to
> >> things that (AIUI) are never stored in x87 registers, and so losing
> > Yes, it can be optimized since some mode will never assigned to x87 registers.
> >> performance as a result.
> >>
> >> Note that the RA also uses ALL_REGS for some things, so this usage
> >> isn't specific to non-RA code.
> > RA passes the minimal reg class(REGNO_REG_CLASS) which contains REGN
> > to decide if can_change_mode_class, not ALL_REGS.
> > 511/* Given a hard REGN a FROM mode and a TO mode, return true if
> > 512   REGN can change from mode FROM to mode TO.  */
> > 513#define REG_CAN_CHANGE_MODE_P(REGN, FROM, TO)                          \
> > 514  (targetm.can_change_mode_class (FROM, TO, REGNO_REG_CLASS (REGN)))
> > 515
> >
> > So I still think using can_change_mode_class outside of RA with
> > ALL_REGS passed to decide whether it's ok to generate subreg is not a
> > good idea.
>
> But if the argument is that the only valid uses of can_change_mode_class
> are through this macro, the hook isn't describing a class property,
> it's describing the property of individual registers.  If we say that
> querying individual registers is the only valid thing to do them
> we should change the hook to take a register number rather than
> a class enum.
>
> The reason we have a class-based rather than register-based interface
> is because it is useful to query classes before you've picked a
> specific register.
For individual registers in the minimal reg class, we assume they are
not different from each other, I guess that's why we have
REGNO_REG_CLASS and class-based interfaces other than register-based
interfaces.
But for ALL_REGS, it's not the minimal reg class, it's the largest.
Using it It's not that suitable.
If the argument is if some r in rclass is ok for mode change, the hook
would return true, then why would RA use REGNO_REG_CLASS other than
ALL_REGS.
Another spot is in validate_subreg, we're using the minimal reg class
instead of ALL_REGS.
 973  /* This is a normal subreg.  Verify that the offset is representable.  */
 974
 975  /* For hard registers, we already have most of these rules collected in
 976     subreg_offset_representable_p.  */
 977  if (reg && REG_P (reg) && HARD_REGISTER_P (reg))
 978    {
 979      unsigned int regno = REGNO (reg);
 980
 981      if ((COMPLEX_MODE_P (imode) || VECTOR_MODE_P (imode))
 982          && GET_MODE_INNER (imode) == omode)
 983        ;
 984      else if (!REG_CAN_CHANGE_MODE_P (regno, imode, omode))
 985        return false;

I think we do need some hook in the middle end to query things like if
some r in rclass is ok for mode change?  but not reusing
can_change_mode_class.
>
> Thanks,
> Richard
>
> >> IMO it's not the job of target-independent code to iterate through
> >> individual classes and aggregate the result.  One of the reasons for
> >> having union classes is to avoid the need to do that.  And ALL_REGS
> >> is the ultimate union class. :-)
> >>
> >> The patch looks correct to me.
> >>
> >> Thanks,
> >> Richard
> >>
> >> >> >
> >> >> > So there are already existing precedence for this.  And the
> >> >> > documentation for the hook says:
> >> >> >
> >> >> > "This hook returns true if it is possible to bitcast values held in registers of
> >> >> class rclass from mode from to mode to and if doing so preserves the low-
> >> >> order bits that are common to both modes. The result is only meaningful if
> >> >> rclass has registers that can hold both from and to. The default
> >> >> implementation returns true"
> >> >> >
> >> >> > So it looks like it's use outside of RA is perfectly valid.. and the
> >> >> > documentation also mentions in the example the use from the mid-end as
> >> >> an example.
> >> >> >
> >> >> > But if the mid-end maintainers are happy I'll use something else.
> >> >> >
> >> >> > Tamar
> >> >> >
> >> >> > > I did similar things in
> >> >> > > https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579296.html
> >> >> > > (and ALL_REGS doesn't cover all cases for registers which are both
> >> >> > > available for qimode and mode, ALL_REGS fail doesn't mean it can't
> >> >> > > be subreg, it just means parts of ALL_REGS can't be subreg. but with
> >> >> > > a subset of ALL_REGS, there could be a reg class which return true
> >> >> > > for
> >> >> > > targetm.can_change_mode_class)
> >> >> > > >           && targetm.vectorize.vec_perm_const (qimode, qimode,
> >> >> > > > target_qi,
> >> >> > > v0_qi,
> >> >> > > >                                                v1_qi, qimode_indices))
> >> >> > > >         return gen_lowpart (mode, target_qi); @@ -6311,7 +6312,8
> >> >> > > > @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
> >> >> > > >      }
> >> >> > > >
> >> >> > > >    if (qimode != VOIDmode
> >> >> > > > -      && selector_fits_mode_p (qimode, qimode_indices))
> >> >> > > > +      && selector_fits_mode_p (qimode, qimode_indices)
> >> >> > > > +      && targetm.can_change_mode_class (mode, qimode, ALL_REGS))
> >> >> > > >      {
> >> >> > > >        icode = direct_optab_handler (vec_perm_optab, qimode);
> >> >> > > >        if (icode != CODE_FOR_nothing) diff --git
> >> >> > > > a/gcc/testsuite/gcc.target/aarch64/ext_1.c
> >> >> > > > b/gcc/testsuite/gcc.target/aarch64/ext_1.c
> >> >> > > > new file mode 100644
> >> >> > > > index
> >> >> > > >
> >> >> > >
> >> >> 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e5
> >> >> > > 71
> >> >> > > > b3bc2ddf887a
> >> >> > >
> >> >> > >
> >> >> > >
> >> >> > >
> >> >> > > --
> >> >> > > BR,
> >> >> > > Hongtao
> >> >>
> >> >>
> >> >>
> >> >> --
> >> >> BR,
> >> >> Hongtao
  
Richard Sandiford Nov. 17, 2022, 1:59 p.m. UTC | #12
Hongtao Liu <crazylht@gmail.com> writes:
> On Thu, Nov 17, 2022 at 5:39 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Hongtao Liu <crazylht@gmail.com> writes:
>> > On Wed, Nov 16, 2022 at 1:39 AM Richard Sandiford
>> > <richard.sandiford@arm.com> wrote:
>> >>
>> >> Tamar Christina <Tamar.Christina@arm.com> writes:
>> >> >> -----Original Message-----
>> >> >> From: Hongtao Liu <crazylht@gmail.com>
>> >> >> Sent: Tuesday, November 15, 2022 9:37 AM
>> >> >> To: Tamar Christina <Tamar.Christina@arm.com>
>> >> >> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
>> >> >> Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
>> >> >> rguenther@suse.de
>> >> >> Subject: Re: [PATCH 3/8]middle-end: Support extractions of subvectors from
>> >> >> arbitrary element position inside a vector
>> >> >>
>> >> >> On Tue, Nov 15, 2022 at 4:51 PM Tamar Christina
>> >> >> <Tamar.Christina@arm.com> wrote:
>> >> >> >
>> >> >> > > -----Original Message-----
>> >> >> > > From: Hongtao Liu <crazylht@gmail.com>
>> >> >> > > Sent: Tuesday, November 15, 2022 8:36 AM
>> >> >> > > To: Tamar Christina <Tamar.Christina@arm.com>
>> >> >> > > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina
>> >> >> > > via Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
>> >> >> > > rguenther@suse.de
>> >> >> > > Subject: Re: [PATCH 3/8]middle-end: Support extractions of
>> >> >> > > subvectors from arbitrary element position inside a vector
>> >> >> > >
>> >> >> > > Hi:
>> >> >> > >   I'm from https://gcc.gnu.org/pipermail/gcc-patches/2022-
>> >> >> > > November/606040.html.
>> >> >> > > >      }
>> >> >> > > >
>> >> >> > > >    /* See if we can get a better vector mode before extracting.
>> >> >> > > > */ diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
>> >> >> > > >
>> >> >> > >
>> >> >> cff37ccb0dfc3dd79b97d0abfd872f340855dc96..f338df410265dfe55b68961600
>> >> >> > > 9
>> >> >> > > 0
>> >> >> > > > a453cc6a28d9 100644
>> >> >> > > > --- a/gcc/optabs.cc
>> >> >> > > > +++ b/gcc/optabs.cc
>> >> >> > > > @@ -6267,6 +6267,7 @@ expand_vec_perm_const (machine_mode
>> >> >> mode,
>> >> >> > > rtx v0, rtx v1,
>> >> >> > > >        v0_qi = gen_lowpart (qimode, v0);
>> >> >> > > >        v1_qi = gen_lowpart (qimode, v1);
>> >> >> > > >        if (targetm.vectorize.vec_perm_const != NULL
>> >> >> > > > +         && targetm.can_change_mode_class (mode, qimode,
>> >> >> > > > + ALL_REGS)
>> >> >> > > It looks like you want to guard gen_lowpart, shouldn't it be better
>> >> >> > > to use validate_subreg  or (tmp = gen_lowpart_if_possible (mode,
>> >> >> target_qi)).
>> >> >> > > IMHO, targetm.can_change_mode_class is mostly used for RA, but not
>> >> >> > > to guard gen_lowpart.
>> >> >> >
>> >> >> > Hmm I don't think this is quite true, there are existing usages in
>> >> >> > expr.cc and rtanal.cc That do this and aren't part of RA.  As I
>> >> >> > mentioned before for instance the canoncalization of vec_select to subreg
>> >> >> in rtlanal for instances uses this.
>> >> >> In theory, we need to iterate through all reg classes that can be assigned for
>> >> >> both qimode and mode, if any regclass returns true for
>> >> >> targetm.can_change_mode_class, the bitcast(validate_subreg) should be ok.
>> >> >> Here we just passed ALL_REGS.
>> >> >
>> >> > Yes, and most targets where this transformation is valid return true here.
>> >> >
>> >> > I've checked:
>> >> >  * alpha
>> >> >  * arm
>> >> >  * aarch64
>> >> >  * rs6000
>> >> >  * s390
>> >> >  * sparc
>> >> >  * pa
>> >> >  * mips
>> >> >
>> >> > And even the default example that other targets use from the documentation
>> >> > would return true as the size of the modes are the same.
>> >> >
>> >> > X86 and RISCV are the only two targets that I found (but didn't check all) that
>> >> > blankly return a result based on just the register classes.
>> >> >
>> >> > That is to say, there are more targets that adhere to the interpretation that
>> >> > rclass here means "should be possible in some class in rclass" rather than
>> >> > "should be possible in ALL classes of rclass".
>> >>
>> >> Yeah, I agree.  A query "can something stored in ALL_REGS change from
>> >> mode M1 to mode M2?" is meaningful if at least one register R in ALL_REGS
>> >> can hold both M1 and M2.  It's then the target's job to answer
>> >> conservatively so that the result covers all such R.
>> >>
>> >> In principle it's OK for a target to err on the side of caution and forbid
>> >> things that are actually OK.  But that's going to risk losing performance
>> >> in some cases, and sometimes that loss of performance will be unacceptable.
>> >> IMO that's what's happening here.  The target is applying x87 rules to
>> >> things that (AIUI) are never stored in x87 registers, and so losing
>> > Yes, it can be optimized since some mode will never assigned to x87 registers.
>> >> performance as a result.
>> >>
>> >> Note that the RA also uses ALL_REGS for some things, so this usage
>> >> isn't specific to non-RA code.
>> > RA passes the minimal reg class(REGNO_REG_CLASS) which contains REGN
>> > to decide if can_change_mode_class, not ALL_REGS.
>> > 511/* Given a hard REGN a FROM mode and a TO mode, return true if
>> > 512   REGN can change from mode FROM to mode TO.  */
>> > 513#define REG_CAN_CHANGE_MODE_P(REGN, FROM, TO)                          \
>> > 514  (targetm.can_change_mode_class (FROM, TO, REGNO_REG_CLASS (REGN)))
>> > 515
>> >
>> > So I still think using can_change_mode_class outside of RA with
>> > ALL_REGS passed to decide whether it's ok to generate subreg is not a
>> > good idea.
>>
>> But if the argument is that the only valid uses of can_change_mode_class
>> are through this macro, the hook isn't describing a class property,
>> it's describing the property of individual registers.  If we say that
>> querying individual registers is the only valid thing to do them
>> we should change the hook to take a register number rather than
>> a class enum.
>>
>> The reason we have a class-based rather than register-based interface
>> is because it is useful to query classes before you've picked a
>> specific register.
> For individual registers in the minimal reg class, we assume they are
> not different from each other, I guess that's why we have
> REGNO_REG_CLASS and class-based interfaces other than register-based
> interfaces.

I don't think that's necessarily true.  We have things like
hard_regno_nregs that operate on individual registers.  And even
the x86 implementation of the hook uses subset operations rather
than comparing the class for equality, which suggests some attempt
to handle classes other than those returned by REGNO_REG_CLASS.

> But for ALL_REGS, it's not the minimal reg class, it's the largest.
> Using it It's not that suitable.

But I think it is suitable for the gimple level, where we have no
information that would narrow the choice to a specific register class.

> If the argument is if some r in rclass is ok for mode change, the hook
> would return true, then why would RA use REGNO_REG_CLASS other than
> ALL_REGS.

If you know which hard register something is stored in, it makes
sense to ask about the closest enclosing class rather than something
more general.  If a particular mode can be stored in both general
registers and floating-point registers, and if floating-point registers
have restrictions that general registers don't, ALL_REGS should honour
the floating-point constraints.  But it wouldn't make sense to use the
floating-point constraints for something that is known to be in a
general register.

> Another spot is in validate_subreg, we're using the minimal reg class
> instead of ALL_REGS.
>  973  /* This is a normal subreg.  Verify that the offset is representable.  */
>  974
>  975  /* For hard registers, we already have most of these rules collected in
>  976     subreg_offset_representable_p.  */
>  977  if (reg && REG_P (reg) && HARD_REGISTER_P (reg))
>  978    {
>  979      unsigned int regno = REGNO (reg);
>  980
>  981      if ((COMPLEX_MODE_P (imode) || VECTOR_MODE_P (imode))
>  982          && GET_MODE_INNER (imode) == omode)
>  983        ;
>  984      else if (!REG_CAN_CHANGE_MODE_P (regno, imode, omode))
>  985        return false;

But here too, we're using REG_CAN_CHANGE_MODE_P because we know
the register number.  It wouldn't make sense to ask about a more
general class than necessary.

REG_CANNOT_CHANGE_MODE_P (as it was then) was added in 2002 as a
convenient interface to CANNOT_CHANGE_MODE_CLASS.  CANNOT_CHANGE_MODE_CLASS
in turn replaced CLASS_CANNOT_CHANGE_MODE_P, which only took two modes,
and wasn't given any class information.  So this interface was
originally a query about modes, not a query about classes.  The class
information was added later since (understandably) modes weren't always
enough on their own.  But I think it is still fundamentally a query
about modes, with the class provided for context, rather than a query
about classes, with modes provided by context.

> I think we do need some hook in the middle end to query things like if
> some r in rclass is ok for mode change?  but not reusing
> can_change_mode_class.

But if we add a hook to say "are mode changes from mode M1 to mode M2 OK?",
which is what Tamar's patch and some other existing code wants to know,
I fear we'll just reintroduce the old CLASS_CANNOT_CHANGE_MODE_P (but
hopefully without embedding the negative sense).  I don't think it makes
sense to have that hook alongside the existing one.  It would require
targets to duplicate information and would run the risk on conflicting
information for corner cases.  IMO it would repeat the mistake of having
both hard_regno_nregs and class_max_nregs; really, the latter ought
to be calculated from the former.

Thanks,
Richard

>> Thanks,
>> Richard
>>
>> >> IMO it's not the job of target-independent code to iterate through
>> >> individual classes and aggregate the result.  One of the reasons for
>> >> having union classes is to avoid the need to do that.  And ALL_REGS
>> >> is the ultimate union class. :-)
>> >>
>> >> The patch looks correct to me.
>> >>
>> >> Thanks,
>> >> Richard
>> >>
>> >> >> >
>> >> >> > So there are already existing precedence for this.  And the
>> >> >> > documentation for the hook says:
>> >> >> >
>> >> >> > "This hook returns true if it is possible to bitcast values held in registers of
>> >> >> class rclass from mode from to mode to and if doing so preserves the low-
>> >> >> order bits that are common to both modes. The result is only meaningful if
>> >> >> rclass has registers that can hold both from and to. The default
>> >> >> implementation returns true"
>> >> >> >
>> >> >> > So it looks like it's use outside of RA is perfectly valid.. and the
>> >> >> > documentation also mentions in the example the use from the mid-end as
>> >> >> an example.
>> >> >> >
>> >> >> > But if the mid-end maintainers are happy I'll use something else.
>> >> >> >
>> >> >> > Tamar
>> >> >> >
>> >> >> > > I did similar things in
>> >> >> > > https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579296.html
>> >> >> > > (and ALL_REGS doesn't cover all cases for registers which are both
>> >> >> > > available for qimode and mode, ALL_REGS fail doesn't mean it can't
>> >> >> > > be subreg, it just means parts of ALL_REGS can't be subreg. but with
>> >> >> > > a subset of ALL_REGS, there could be a reg class which return true
>> >> >> > > for
>> >> >> > > targetm.can_change_mode_class)
>> >> >> > > >           && targetm.vectorize.vec_perm_const (qimode, qimode,
>> >> >> > > > target_qi,
>> >> >> > > v0_qi,
>> >> >> > > >                                                v1_qi, qimode_indices))
>> >> >> > > >         return gen_lowpart (mode, target_qi); @@ -6311,7 +6312,8
>> >> >> > > > @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
>> >> >> > > >      }
>> >> >> > > >
>> >> >> > > >    if (qimode != VOIDmode
>> >> >> > > > -      && selector_fits_mode_p (qimode, qimode_indices))
>> >> >> > > > +      && selector_fits_mode_p (qimode, qimode_indices)
>> >> >> > > > +      && targetm.can_change_mode_class (mode, qimode, ALL_REGS))
>> >> >> > > >      {
>> >> >> > > >        icode = direct_optab_handler (vec_perm_optab, qimode);
>> >> >> > > >        if (icode != CODE_FOR_nothing) diff --git
>> >> >> > > > a/gcc/testsuite/gcc.target/aarch64/ext_1.c
>> >> >> > > > b/gcc/testsuite/gcc.target/aarch64/ext_1.c
>> >> >> > > > new file mode 100644
>> >> >> > > > index
>> >> >> > > >
>> >> >> > >
>> >> >> 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e5
>> >> >> > > 71
>> >> >> > > > b3bc2ddf887a
>> >> >> > >
>> >> >> > >
>> >> >> > >
>> >> >> > >
>> >> >> > > --
>> >> >> > > BR,
>> >> >> > > Hongtao
>> >> >>
>> >> >>
>> >> >>
>> >> >> --
>> >> >> BR,
>> >> >> Hongtao
  
Hongtao Liu Nov. 18, 2022, 2:31 a.m. UTC | #13
On Thu, Nov 17, 2022 at 9:59 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Hongtao Liu <crazylht@gmail.com> writes:
> > On Thu, Nov 17, 2022 at 5:39 PM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Hongtao Liu <crazylht@gmail.com> writes:
> >> > On Wed, Nov 16, 2022 at 1:39 AM Richard Sandiford
> >> > <richard.sandiford@arm.com> wrote:
> >> >>
> >> >> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> >> >> -----Original Message-----
> >> >> >> From: Hongtao Liu <crazylht@gmail.com>
> >> >> >> Sent: Tuesday, November 15, 2022 9:37 AM
> >> >> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> >> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
> >> >> >> Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> >> >> >> rguenther@suse.de
> >> >> >> Subject: Re: [PATCH 3/8]middle-end: Support extractions of subvectors from
> >> >> >> arbitrary element position inside a vector
> >> >> >>
> >> >> >> On Tue, Nov 15, 2022 at 4:51 PM Tamar Christina
> >> >> >> <Tamar.Christina@arm.com> wrote:
> >> >> >> >
> >> >> >> > > -----Original Message-----
> >> >> >> > > From: Hongtao Liu <crazylht@gmail.com>
> >> >> >> > > Sent: Tuesday, November 15, 2022 8:36 AM
> >> >> >> > > To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> >> > > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina
> >> >> >> > > via Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> >> >> >> > > rguenther@suse.de
> >> >> >> > > Subject: Re: [PATCH 3/8]middle-end: Support extractions of
> >> >> >> > > subvectors from arbitrary element position inside a vector
> >> >> >> > >
> >> >> >> > > Hi:
> >> >> >> > >   I'm from https://gcc.gnu.org/pipermail/gcc-patches/2022-
> >> >> >> > > November/606040.html.
> >> >> >> > > >      }
> >> >> >> > > >
> >> >> >> > > >    /* See if we can get a better vector mode before extracting.
> >> >> >> > > > */ diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> >> >> >> > > >
> >> >> >> > >
> >> >> >> cff37ccb0dfc3dd79b97d0abfd872f340855dc96..f338df410265dfe55b68961600
> >> >> >> > > 9
> >> >> >> > > 0
> >> >> >> > > > a453cc6a28d9 100644
> >> >> >> > > > --- a/gcc/optabs.cc
> >> >> >> > > > +++ b/gcc/optabs.cc
> >> >> >> > > > @@ -6267,6 +6267,7 @@ expand_vec_perm_const (machine_mode
> >> >> >> mode,
> >> >> >> > > rtx v0, rtx v1,
> >> >> >> > > >        v0_qi = gen_lowpart (qimode, v0);
> >> >> >> > > >        v1_qi = gen_lowpart (qimode, v1);
> >> >> >> > > >        if (targetm.vectorize.vec_perm_const != NULL
> >> >> >> > > > +         && targetm.can_change_mode_class (mode, qimode,
> >> >> >> > > > + ALL_REGS)
> >> >> >> > > It looks like you want to guard gen_lowpart, shouldn't it be better
> >> >> >> > > to use validate_subreg  or (tmp = gen_lowpart_if_possible (mode,
> >> >> >> target_qi)).
> >> >> >> > > IMHO, targetm.can_change_mode_class is mostly used for RA, but not
> >> >> >> > > to guard gen_lowpart.
> >> >> >> >
> >> >> >> > Hmm I don't think this is quite true, there are existing usages in
> >> >> >> > expr.cc and rtanal.cc That do this and aren't part of RA.  As I
> >> >> >> > mentioned before for instance the canoncalization of vec_select to subreg
> >> >> >> in rtlanal for instances uses this.
> >> >> >> In theory, we need to iterate through all reg classes that can be assigned for
> >> >> >> both qimode and mode, if any regclass returns true for
> >> >> >> targetm.can_change_mode_class, the bitcast(validate_subreg) should be ok.
> >> >> >> Here we just passed ALL_REGS.
> >> >> >
> >> >> > Yes, and most targets where this transformation is valid return true here.
> >> >> >
> >> >> > I've checked:
> >> >> >  * alpha
> >> >> >  * arm
> >> >> >  * aarch64
> >> >> >  * rs6000
> >> >> >  * s390
> >> >> >  * sparc
> >> >> >  * pa
> >> >> >  * mips
> >> >> >
> >> >> > And even the default example that other targets use from the documentation
> >> >> > would return true as the size of the modes are the same.
> >> >> >
> >> >> > X86 and RISCV are the only two targets that I found (but didn't check all) that
> >> >> > blankly return a result based on just the register classes.
> >> >> >
> >> >> > That is to say, there are more targets that adhere to the interpretation that
> >> >> > rclass here means "should be possible in some class in rclass" rather than
> >> >> > "should be possible in ALL classes of rclass".
> >> >>
> >> >> Yeah, I agree.  A query "can something stored in ALL_REGS change from
> >> >> mode M1 to mode M2?" is meaningful if at least one register R in ALL_REGS
> >> >> can hold both M1 and M2.  It's then the target's job to answer
> >> >> conservatively so that the result covers all such R.
> >> >>
> >> >> In principle it's OK for a target to err on the side of caution and forbid
> >> >> things that are actually OK.  But that's going to risk losing performance
> >> >> in some cases, and sometimes that loss of performance will be unacceptable.
> >> >> IMO that's what's happening here.  The target is applying x87 rules to
> >> >> things that (AIUI) are never stored in x87 registers, and so losing
> >> > Yes, it can be optimized since some mode will never assigned to x87 registers.
> >> >> performance as a result.
> >> >>
> >> >> Note that the RA also uses ALL_REGS for some things, so this usage
> >> >> isn't specific to non-RA code.
> >> > RA passes the minimal reg class(REGNO_REG_CLASS) which contains REGN
> >> > to decide if can_change_mode_class, not ALL_REGS.
> >> > 511/* Given a hard REGN a FROM mode and a TO mode, return true if
> >> > 512   REGN can change from mode FROM to mode TO.  */
> >> > 513#define REG_CAN_CHANGE_MODE_P(REGN, FROM, TO)                          \
> >> > 514  (targetm.can_change_mode_class (FROM, TO, REGNO_REG_CLASS (REGN)))
> >> > 515
> >> >
> >> > So I still think using can_change_mode_class outside of RA with
> >> > ALL_REGS passed to decide whether it's ok to generate subreg is not a
> >> > good idea.
> >>
> >> But if the argument is that the only valid uses of can_change_mode_class
> >> are through this macro, the hook isn't describing a class property,
> >> it's describing the property of individual registers.  If we say that
> >> querying individual registers is the only valid thing to do them
> >> we should change the hook to take a register number rather than
> >> a class enum.
> >>
> >> The reason we have a class-based rather than register-based interface
> >> is because it is useful to query classes before you've picked a
> >> specific register.
> > For individual registers in the minimal reg class, we assume they are
> > not different from each other, I guess that's why we have
> > REGNO_REG_CLASS and class-based interfaces other than register-based
> > interfaces.
>
> I don't think that's necessarily true.  We have things like
> hard_regno_nregs that operate on individual registers.  And even
> the x86 implementation of the hook uses subset operations rather
> than comparing the class for equality, which suggests some attempt
> to handle classes other than those returned by REGNO_REG_CLASS.
From the gcc doc for subreg, it says if any reg is not ok for subreg,
can_change_mode_class should be false for all classes that include
reg.
---------------------------------------------------------------------
The rules above apply to both pseudo regs and hard regs. If the semantics
are not correct for particular combinations of m1, m2 and hard reg,
the target specific code must ensure that those combinations are never
used. For example:
TARGET_CAN_CHANGE_MODE_CLASS (m2, m1, class)
must be false for ****every class class***** that includes reg.
--------------------------------------------------------------------
>
> > But for ALL_REGS, it's not the minimal reg class, it's the largest.
> > Using it It's not that suitable.
>
> But I think it is suitable for the gimple level, where we have no
> information that would narrow the choice to a specific register class.
>
Maybe we should have different semantics for ALL_REGS(and outside of
RA?) since it's probably a combination of many subclasses, and for RA,
it always query the smallest REG class, and not using ALL_REG(unless
it is the minimal reg class containing regno).
> > If the argument is if some r in rclass is ok for mode change, the hook
> > would return true, then why would RA use REGNO_REG_CLASS other than
> > ALL_REGS.
>
> If you know which hard register something is stored in, it makes
> sense to ask about the closest enclosing class rather than something
> more general.  If a particular mode can be stored in both general
> registers and floating-point registers, and if floating-point registers
> have restrictions that general registers don't, ALL_REGS should honour
> the floating-point constraints.  But it wouldn't make sense to use the
> floating-point constraints for something that is known to be in a
> general register.
Yes, the backend should optimize the hook according to hard_regno_mode_ok.
But it's ok to be conservative, it should be a performance issue, not
correctness issue, no?
>
> > Another spot is in validate_subreg, we're using the minimal reg class
> > instead of ALL_REGS.
> >  973  /* This is a normal subreg.  Verify that the offset is representable.  */
> >  974
> >  975  /* For hard registers, we already have most of these rules collected in
> >  976     subreg_offset_representable_p.  */
> >  977  if (reg && REG_P (reg) && HARD_REGISTER_P (reg))
> >  978    {
> >  979      unsigned int regno = REGNO (reg);
> >  980
> >  981      if ((COMPLEX_MODE_P (imode) || VECTOR_MODE_P (imode))
> >  982          && GET_MODE_INNER (imode) == omode)
> >  983        ;
> >  984      else if (!REG_CAN_CHANGE_MODE_P (regno, imode, omode))
> >  985        return false;
>
> But here too, we're using REG_CAN_CHANGE_MODE_P because we know
> the register number.  It wouldn't make sense to ask about a more
> general class than necessary.
>
> REG_CANNOT_CHANGE_MODE_P (as it was then) was added in 2002 as a
> convenient interface to CANNOT_CHANGE_MODE_CLASS.  CANNOT_CHANGE_MODE_CLASS
> in turn replaced CLASS_CANNOT_CHANGE_MODE_P, which only took two modes,
> and wasn't given any class information.  So this interface was
> originally a query about modes, not a query about classes.  The class
> information was added later since (understandably) modes weren't always
> enough on their own.  But I think it is still fundamentally a query
> about modes, with the class provided for context, rather than a query
> about classes, with modes provided by context.
>
> > I think we do need some hook in the middle end to query things like if
> > some r in rclass is ok for mode change?  but not reusing
> > can_change_mode_class.
>
> But if we add a hook to say "are mode changes from mode M1 to mode M2 OK?",
> which is what Tamar's patch and some other existing code wants to know,
> I fear we'll just reintroduce the old CLASS_CANNOT_CHANGE_MODE_P (but
> hopefully without embedding the negative sense).  I don't think it makes
> sense to have that hook alongside the existing one.  It would require
> targets to duplicate information and would run the risk on conflicting
> information for corner cases.  IMO it would repeat the mistake of having
> both hard_regno_nregs and class_max_nregs; really, the latter ought
> to be calculated from the former.
We already have conflict info between validate_subreg and
REG_CAN_CHANGE_MODE_P, and it caused the ICE which Tamar wants to fix
in another x86 patch.
I agree we shouldn't introduce more mess before we clean existed up.
>
> Thanks,
> Richard
>
> >> Thanks,
> >> Richard
> >>
> >> >> IMO it's not the job of target-independent code to iterate through
> >> >> individual classes and aggregate the result.  One of the reasons for
> >> >> having union classes is to avoid the need to do that.  And ALL_REGS
> >> >> is the ultimate union class. :-)
> >> >>
> >> >> The patch looks correct to me.
> >> >>
> >> >> Thanks,
> >> >> Richard
> >> >>
> >> >> >> >
> >> >> >> > So there are already existing precedence for this.  And the
> >> >> >> > documentation for the hook says:
> >> >> >> >
> >> >> >> > "This hook returns true if it is possible to bitcast values held in registers of
> >> >> >> class rclass from mode from to mode to and if doing so preserves the low-
> >> >> >> order bits that are common to both modes. The result is only meaningful if
> >> >> >> rclass has registers that can hold both from and to. The default
> >> >> >> implementation returns true"
> >> >> >> >
> >> >> >> > So it looks like it's use outside of RA is perfectly valid.. and the
> >> >> >> > documentation also mentions in the example the use from the mid-end as
> >> >> >> an example.
> >> >> >> >
> >> >> >> > But if the mid-end maintainers are happy I'll use something else.
> >> >> >> >
> >> >> >> > Tamar
> >> >> >> >
> >> >> >> > > I did similar things in
> >> >> >> > > https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579296.html
> >> >> >> > > (and ALL_REGS doesn't cover all cases for registers which are both
> >> >> >> > > available for qimode and mode, ALL_REGS fail doesn't mean it can't
> >> >> >> > > be subreg, it just means parts of ALL_REGS can't be subreg. but with
> >> >> >> > > a subset of ALL_REGS, there could be a reg class which return true
> >> >> >> > > for
> >> >> >> > > targetm.can_change_mode_class)
> >> >> >> > > >           && targetm.vectorize.vec_perm_const (qimode, qimode,
> >> >> >> > > > target_qi,
> >> >> >> > > v0_qi,
> >> >> >> > > >                                                v1_qi, qimode_indices))
> >> >> >> > > >         return gen_lowpart (mode, target_qi); @@ -6311,7 +6312,8
> >> >> >> > > > @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
> >> >> >> > > >      }
> >> >> >> > > >
> >> >> >> > > >    if (qimode != VOIDmode
> >> >> >> > > > -      && selector_fits_mode_p (qimode, qimode_indices))
> >> >> >> > > > +      && selector_fits_mode_p (qimode, qimode_indices)
> >> >> >> > > > +      && targetm.can_change_mode_class (mode, qimode, ALL_REGS))
> >> >> >> > > >      {
> >> >> >> > > >        icode = direct_optab_handler (vec_perm_optab, qimode);
> >> >> >> > > >        if (icode != CODE_FOR_nothing) diff --git
> >> >> >> > > > a/gcc/testsuite/gcc.target/aarch64/ext_1.c
> >> >> >> > > > b/gcc/testsuite/gcc.target/aarch64/ext_1.c
> >> >> >> > > > new file mode 100644
> >> >> >> > > > index
> >> >> >> > > >
> >> >> >> > >
> >> >> >> 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e5
> >> >> >> > > 71
> >> >> >> > > > b3bc2ddf887a
> >> >> >> > >
> >> >> >> > >
> >> >> >> > >
> >> >> >> > >
> >> >> >> > > --
> >> >> >> > > BR,
> >> >> >> > > Hongtao
> >> >> >>
> >> >> >>
> >> >> >>
> >> >> >> --
> >> >> >> BR,
> >> >> >> Hongtao
  
Richard Sandiford Nov. 18, 2022, 9:16 a.m. UTC | #14
Hongtao Liu <crazylht@gmail.com> writes:
> On Thu, Nov 17, 2022 at 9:59 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Hongtao Liu <crazylht@gmail.com> writes:
>> > On Thu, Nov 17, 2022 at 5:39 PM Richard Sandiford
>> > <richard.sandiford@arm.com> wrote:
>> >>
>> >> Hongtao Liu <crazylht@gmail.com> writes:
>> >> > On Wed, Nov 16, 2022 at 1:39 AM Richard Sandiford
>> >> > <richard.sandiford@arm.com> wrote:
>> >> >>
>> >> >> Tamar Christina <Tamar.Christina@arm.com> writes:
>> >> >> >> -----Original Message-----
>> >> >> >> From: Hongtao Liu <crazylht@gmail.com>
>> >> >> >> Sent: Tuesday, November 15, 2022 9:37 AM
>> >> >> >> To: Tamar Christina <Tamar.Christina@arm.com>
>> >> >> >> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
>> >> >> >> Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
>> >> >> >> rguenther@suse.de
>> >> >> >> Subject: Re: [PATCH 3/8]middle-end: Support extractions of subvectors from
>> >> >> >> arbitrary element position inside a vector
>> >> >> >>
>> >> >> >> On Tue, Nov 15, 2022 at 4:51 PM Tamar Christina
>> >> >> >> <Tamar.Christina@arm.com> wrote:
>> >> >> >> >
>> >> >> >> > > -----Original Message-----
>> >> >> >> > > From: Hongtao Liu <crazylht@gmail.com>
>> >> >> >> > > Sent: Tuesday, November 15, 2022 8:36 AM
>> >> >> >> > > To: Tamar Christina <Tamar.Christina@arm.com>
>> >> >> >> > > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina
>> >> >> >> > > via Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
>> >> >> >> > > rguenther@suse.de
>> >> >> >> > > Subject: Re: [PATCH 3/8]middle-end: Support extractions of
>> >> >> >> > > subvectors from arbitrary element position inside a vector
>> >> >> >> > >
>> >> >> >> > > Hi:
>> >> >> >> > >   I'm from https://gcc.gnu.org/pipermail/gcc-patches/2022-
>> >> >> >> > > November/606040.html.
>> >> >> >> > > >      }
>> >> >> >> > > >
>> >> >> >> > > >    /* See if we can get a better vector mode before extracting.
>> >> >> >> > > > */ diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
>> >> >> >> > > >
>> >> >> >> > >
>> >> >> >> cff37ccb0dfc3dd79b97d0abfd872f340855dc96..f338df410265dfe55b68961600
>> >> >> >> > > 9
>> >> >> >> > > 0
>> >> >> >> > > > a453cc6a28d9 100644
>> >> >> >> > > > --- a/gcc/optabs.cc
>> >> >> >> > > > +++ b/gcc/optabs.cc
>> >> >> >> > > > @@ -6267,6 +6267,7 @@ expand_vec_perm_const (machine_mode
>> >> >> >> mode,
>> >> >> >> > > rtx v0, rtx v1,
>> >> >> >> > > >        v0_qi = gen_lowpart (qimode, v0);
>> >> >> >> > > >        v1_qi = gen_lowpart (qimode, v1);
>> >> >> >> > > >        if (targetm.vectorize.vec_perm_const != NULL
>> >> >> >> > > > +         && targetm.can_change_mode_class (mode, qimode,
>> >> >> >> > > > + ALL_REGS)
>> >> >> >> > > It looks like you want to guard gen_lowpart, shouldn't it be better
>> >> >> >> > > to use validate_subreg  or (tmp = gen_lowpart_if_possible (mode,
>> >> >> >> target_qi)).
>> >> >> >> > > IMHO, targetm.can_change_mode_class is mostly used for RA, but not
>> >> >> >> > > to guard gen_lowpart.
>> >> >> >> >
>> >> >> >> > Hmm I don't think this is quite true, there are existing usages in
>> >> >> >> > expr.cc and rtanal.cc That do this and aren't part of RA.  As I
>> >> >> >> > mentioned before for instance the canoncalization of vec_select to subreg
>> >> >> >> in rtlanal for instances uses this.
>> >> >> >> In theory, we need to iterate through all reg classes that can be assigned for
>> >> >> >> both qimode and mode, if any regclass returns true for
>> >> >> >> targetm.can_change_mode_class, the bitcast(validate_subreg) should be ok.
>> >> >> >> Here we just passed ALL_REGS.
>> >> >> >
>> >> >> > Yes, and most targets where this transformation is valid return true here.
>> >> >> >
>> >> >> > I've checked:
>> >> >> >  * alpha
>> >> >> >  * arm
>> >> >> >  * aarch64
>> >> >> >  * rs6000
>> >> >> >  * s390
>> >> >> >  * sparc
>> >> >> >  * pa
>> >> >> >  * mips
>> >> >> >
>> >> >> > And even the default example that other targets use from the documentation
>> >> >> > would return true as the size of the modes are the same.
>> >> >> >
>> >> >> > X86 and RISCV are the only two targets that I found (but didn't check all) that
>> >> >> > blankly return a result based on just the register classes.
>> >> >> >
>> >> >> > That is to say, there are more targets that adhere to the interpretation that
>> >> >> > rclass here means "should be possible in some class in rclass" rather than
>> >> >> > "should be possible in ALL classes of rclass".
>> >> >>
>> >> >> Yeah, I agree.  A query "can something stored in ALL_REGS change from
>> >> >> mode M1 to mode M2?" is meaningful if at least one register R in ALL_REGS
>> >> >> can hold both M1 and M2.  It's then the target's job to answer
>> >> >> conservatively so that the result covers all such R.
>> >> >>
>> >> >> In principle it's OK for a target to err on the side of caution and forbid
>> >> >> things that are actually OK.  But that's going to risk losing performance
>> >> >> in some cases, and sometimes that loss of performance will be unacceptable.
>> >> >> IMO that's what's happening here.  The target is applying x87 rules to
>> >> >> things that (AIUI) are never stored in x87 registers, and so losing
>> >> > Yes, it can be optimized since some mode will never assigned to x87 registers.
>> >> >> performance as a result.
>> >> >>
>> >> >> Note that the RA also uses ALL_REGS for some things, so this usage
>> >> >> isn't specific to non-RA code.
>> >> > RA passes the minimal reg class(REGNO_REG_CLASS) which contains REGN
>> >> > to decide if can_change_mode_class, not ALL_REGS.
>> >> > 511/* Given a hard REGN a FROM mode and a TO mode, return true if
>> >> > 512   REGN can change from mode FROM to mode TO.  */
>> >> > 513#define REG_CAN_CHANGE_MODE_P(REGN, FROM, TO)                          \
>> >> > 514  (targetm.can_change_mode_class (FROM, TO, REGNO_REG_CLASS (REGN)))
>> >> > 515
>> >> >
>> >> > So I still think using can_change_mode_class outside of RA with
>> >> > ALL_REGS passed to decide whether it's ok to generate subreg is not a
>> >> > good idea.
>> >>
>> >> But if the argument is that the only valid uses of can_change_mode_class
>> >> are through this macro, the hook isn't describing a class property,
>> >> it's describing the property of individual registers.  If we say that
>> >> querying individual registers is the only valid thing to do them
>> >> we should change the hook to take a register number rather than
>> >> a class enum.
>> >>
>> >> The reason we have a class-based rather than register-based interface
>> >> is because it is useful to query classes before you've picked a
>> >> specific register.
>> > For individual registers in the minimal reg class, we assume they are
>> > not different from each other, I guess that's why we have
>> > REGNO_REG_CLASS and class-based interfaces other than register-based
>> > interfaces.
>>
>> I don't think that's necessarily true.  We have things like
>> hard_regno_nregs that operate on individual registers.  And even
>> the x86 implementation of the hook uses subset operations rather
>> than comparing the class for equality, which suggests some attempt
>> to handle classes other than those returned by REGNO_REG_CLASS.
> From the gcc doc for subreg, it says if any reg is not ok for subreg,
> can_change_mode_class should be false for all classes that include
> reg.
> ---------------------------------------------------------------------
> The rules above apply to both pseudo regs and hard regs. If the semantics
> are not correct for particular combinations of m1, m2 and hard reg,
> the target specific code must ensure that those combinations are never
> used. For example:
> TARGET_CAN_CHANGE_MODE_CLASS (m2, m1, class)
> must be false for ****every class class***** that includes reg.
> --------------------------------------------------------------------

Yeah.  My point was that the x86 hook is right to look at other classes,
and is right to use subset tests.  It corresponds to my later general
registers vs. floating-point registers example: ALL_REGS contains
floating-point registers, so ALL_REGS is bound by the floating-point
register constraints.

My point was: you seemed to be saying that the only valid use of the
hook was through REG_CAN_CHANGE_MODE_P, or for the minimal register
classes.  If that was true, the hook wouldn't need to handle superclasses.
But even the x86 hook (rightly) handles those too, even though they
would never be returned by REGNO_REG_CLASS.

The reason the current x86 hook doesn't really follow the documented
behaviour is that it sometimes (but not always) ignores m1 and m2.

>> > But for ALL_REGS, it's not the minimal reg class, it's the largest.
>> > Using it It's not that suitable.
>>
>> But I think it is suitable for the gimple level, where we have no
>> information that would narrow the choice to a specific register class.
>>
> Maybe we should have different semantics for ALL_REGS(and outside of
> RA?) since it's probably a combination of many subclasses, and for RA,
> it always query the smallest REG class, and not using ALL_REG(unless
> it is the minimal reg class containing regno).
>> > If the argument is if some r in rclass is ok for mode change, the hook
>> > would return true, then why would RA use REGNO_REG_CLASS other than
>> > ALL_REGS.
>>
>> If you know which hard register something is stored in, it makes
>> sense to ask about the closest enclosing class rather than something
>> more general.  If a particular mode can be stored in both general
>> registers and floating-point registers, and if floating-point registers
>> have restrictions that general registers don't, ALL_REGS should honour
>> the floating-point constraints.  But it wouldn't make sense to use the
>> floating-point constraints for something that is known to be in a
>> general register.
> Yes, the backend should optimize the hook according to hard_regno_mode_ok.
> But it's ok to be conservative, it should be a performance issue, not
> correctness issue, no?

Yeah, it's OK to be conservative.  As I understood it, the issue
in Tamar's case was that ix86_vectorize_vec_perm_const assumed that
target-independent code would handle some cases by converting the
permutation to operate on a vector of QIs, whereas x86_can_change_mode_class
said that the associated mode change wasn't valid.  Neither of the hooks
are wrong individually, it's the combination that causes problems.

>> > Another spot is in validate_subreg, we're using the minimal reg class
>> > instead of ALL_REGS.
>> >  973  /* This is a normal subreg.  Verify that the offset is representable.  */
>> >  974
>> >  975  /* For hard registers, we already have most of these rules collected in
>> >  976     subreg_offset_representable_p.  */
>> >  977  if (reg && REG_P (reg) && HARD_REGISTER_P (reg))
>> >  978    {
>> >  979      unsigned int regno = REGNO (reg);
>> >  980
>> >  981      if ((COMPLEX_MODE_P (imode) || VECTOR_MODE_P (imode))
>> >  982          && GET_MODE_INNER (imode) == omode)
>> >  983        ;
>> >  984      else if (!REG_CAN_CHANGE_MODE_P (regno, imode, omode))
>> >  985        return false;
>>
>> But here too, we're using REG_CAN_CHANGE_MODE_P because we know
>> the register number.  It wouldn't make sense to ask about a more
>> general class than necessary.
>>
>> REG_CANNOT_CHANGE_MODE_P (as it was then) was added in 2002 as a
>> convenient interface to CANNOT_CHANGE_MODE_CLASS.  CANNOT_CHANGE_MODE_CLASS
>> in turn replaced CLASS_CANNOT_CHANGE_MODE_P, which only took two modes,
>> and wasn't given any class information.  So this interface was
>> originally a query about modes, not a query about classes.  The class
>> information was added later since (understandably) modes weren't always
>> enough on their own.  But I think it is still fundamentally a query
>> about modes, with the class provided for context, rather than a query
>> about classes, with modes provided by context.
>>
>> > I think we do need some hook in the middle end to query things like if
>> > some r in rclass is ok for mode change?  but not reusing
>> > can_change_mode_class.
>>
>> But if we add a hook to say "are mode changes from mode M1 to mode M2 OK?",
>> which is what Tamar's patch and some other existing code wants to know,
>> I fear we'll just reintroduce the old CLASS_CANNOT_CHANGE_MODE_P (but
>> hopefully without embedding the negative sense).  I don't think it makes
>> sense to have that hook alongside the existing one.  It would require
>> targets to duplicate information and would run the risk on conflicting
>> information for corner cases.  IMO it would repeat the mistake of having
>> both hard_regno_nregs and class_max_nregs; really, the latter ought
>> to be calculated from the former.
> We already have conflict info between validate_subreg and
> REG_CAN_CHANGE_MODE_P, and it caused the ICE which Tamar wants to fix
> in another x86 patch.

Ah, haven't got to that thread yet.

Thanks,
Richard

> I agree we shouldn't introduce more mess before we clean existed up.
>>
>> Thanks,
>> Richard
>>
>> >> Thanks,
>> >> Richard
>> >>
>> >> >> IMO it's not the job of target-independent code to iterate through
>> >> >> individual classes and aggregate the result.  One of the reasons for
>> >> >> having union classes is to avoid the need to do that.  And ALL_REGS
>> >> >> is the ultimate union class. :-)
>> >> >>
>> >> >> The patch looks correct to me.
>> >> >>
>> >> >> Thanks,
>> >> >> Richard
>> >> >>
>> >> >> >> >
>> >> >> >> > So there are already existing precedence for this.  And the
>> >> >> >> > documentation for the hook says:
>> >> >> >> >
>> >> >> >> > "This hook returns true if it is possible to bitcast values held in registers of
>> >> >> >> class rclass from mode from to mode to and if doing so preserves the low-
>> >> >> >> order bits that are common to both modes. The result is only meaningful if
>> >> >> >> rclass has registers that can hold both from and to. The default
>> >> >> >> implementation returns true"
>> >> >> >> >
>> >> >> >> > So it looks like it's use outside of RA is perfectly valid.. and the
>> >> >> >> > documentation also mentions in the example the use from the mid-end as
>> >> >> >> an example.
>> >> >> >> >
>> >> >> >> > But if the mid-end maintainers are happy I'll use something else.
>> >> >> >> >
>> >> >> >> > Tamar
>> >> >> >> >
>> >> >> >> > > I did similar things in
>> >> >> >> > > https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579296.html
>> >> >> >> > > (and ALL_REGS doesn't cover all cases for registers which are both
>> >> >> >> > > available for qimode and mode, ALL_REGS fail doesn't mean it can't
>> >> >> >> > > be subreg, it just means parts of ALL_REGS can't be subreg. but with
>> >> >> >> > > a subset of ALL_REGS, there could be a reg class which return true
>> >> >> >> > > for
>> >> >> >> > > targetm.can_change_mode_class)
>> >> >> >> > > >           && targetm.vectorize.vec_perm_const (qimode, qimode,
>> >> >> >> > > > target_qi,
>> >> >> >> > > v0_qi,
>> >> >> >> > > >                                                v1_qi, qimode_indices))
>> >> >> >> > > >         return gen_lowpart (mode, target_qi); @@ -6311,7 +6312,8
>> >> >> >> > > > @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
>> >> >> >> > > >      }
>> >> >> >> > > >
>> >> >> >> > > >    if (qimode != VOIDmode
>> >> >> >> > > > -      && selector_fits_mode_p (qimode, qimode_indices))
>> >> >> >> > > > +      && selector_fits_mode_p (qimode, qimode_indices)
>> >> >> >> > > > +      && targetm.can_change_mode_class (mode, qimode, ALL_REGS))
>> >> >> >> > > >      {
>> >> >> >> > > >        icode = direct_optab_handler (vec_perm_optab, qimode);
>> >> >> >> > > >        if (icode != CODE_FOR_nothing) diff --git
>> >> >> >> > > > a/gcc/testsuite/gcc.target/aarch64/ext_1.c
>> >> >> >> > > > b/gcc/testsuite/gcc.target/aarch64/ext_1.c
>> >> >> >> > > > new file mode 100644
>> >> >> >> > > > index
>> >> >> >> > > >
>> >> >> >> > >
>> >> >> >> 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e5
>> >> >> >> > > 71
>> >> >> >> > > > b3bc2ddf887a
>> >> >> >> > >
>> >> >> >> > >
>> >> >> >> > >
>> >> >> >> > >
>> >> >> >> > > --
>> >> >> >> > > BR,
>> >> >> >> > > Hongtao
>> >> >> >>
>> >> >> >>
>> >> >> >>
>> >> >> >> --
>> >> >> >> BR,
>> >> >> >> Hongtao
  

Patch

--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -1718,6 +1718,45 @@  extract_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, poly_uint64 bitnum,
 	      return target;
 	    }
 	}
+      else if (!known_eq (bitnum, 0U)
+	       && multiple_p (GET_MODE_UNIT_BITSIZE (tmode), bitnum, &pos))
+	{
+	  /* The encoding has a single stepped pattern.  */
+	  poly_uint64 nunits = GET_MODE_NUNITS (new_mode);
+	  int nelts = nunits.to_constant ();
+	  vec_perm_builder sel (nunits, nelts, 1);
+	  int delta = -pos.to_constant ();
+	  for (int i = 0; i < nelts; ++i)
+	    sel.quick_push ((i - delta) % nelts);
+	  vec_perm_indices indices (sel, 1, nunits);
+
+	  if (can_vec_perm_const_p (new_mode, new_mode, indices, false))
+	    {
+	      class expand_operand ops[4];
+	      machine_mode outermode = new_mode;
+	      machine_mode innermode = tmode;
+	      enum insn_code icode
+		= direct_optab_handler (vec_perm_optab, outermode);
+	      target = gen_reg_rtx (outermode);
+	      if (icode != CODE_FOR_nothing)
+		{
+		  rtx sel = vec_perm_indices_to_rtx (outermode, indices);
+		  create_output_operand (&ops[0], target, outermode);
+		  ops[0].target = 1;
+		  create_input_operand (&ops[1], op0, outermode);
+		  create_input_operand (&ops[2], op0, outermode);
+		  create_input_operand (&ops[3], sel, outermode);
+		  if (maybe_expand_insn (icode, 4, ops))
+		    return simplify_gen_subreg (innermode, target, outermode, 0);
+		}
+	      else if (targetm.vectorize.vec_perm_const != NULL)
+		{
+		  if (targetm.vectorize.vec_perm_const (outermode, outermode,
+							target, op0, op0, indices))
+		    return simplify_gen_subreg (innermode, target, outermode, 0);
+		}
+	    }
+	}
     }
 
   /* See if we can get a better vector mode before extracting.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/ext_1.c b/gcc/testsuite/gcc.target/aarch64/ext_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..18a10a14f1161584267a8472e571b3bc2ddf887a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ext_1.c
@@ -0,0 +1,54 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <string.h>
+
+typedef unsigned int v4si __attribute__((vector_size (16)));
+typedef unsigned int v2si __attribute__((vector_size (8)));
+
+/*
+** extract: { xfail *-*-* }
+**	ext	v0.16b, v0.16b, v0.16b, #4
+**	ret
+*/
+v2si extract (v4si x)
+{
+    v2si res = {x[1], x[2]};
+    return res;
+}
+
+/*
+** extract1: { xfail *-*-* }
+**	ext	v0.16b, v0.16b, v0.16b, #4
+**	ret
+*/
+v2si extract1 (v4si x)
+{
+    v2si res;
+    memcpy (&res, ((int*)&x)+1, sizeof(res));
+    return res;
+}
+
+typedef struct cast {
+  int a;
+  v2si b __attribute__((packed));
+} cast_t;
+
+typedef union Data {
+   v4si x;
+   cast_t y;
+} data;  
+
+/*
+** extract2:
+**	ext	v0.16b, v0.16b, v0.16b, #4
+**	ret
+*/
+v2si extract2 (v4si x)
+{
+    data d;
+    d.x = x;
+    return d.y.b;
+}
+