[2/3] AArch64 Promote function arguments using a paradoxical subreg when beneficial.

Message ID Yn6RThbeeE97co2Y@arm.com
State Dropped
Headers
Series [1/3] middle-end: Add the ability to let the target decide the method of argument promotions.

Commit Message

Tamar Christina May 13, 2022, 5:11 p.m. UTC
  Hi All,

PROMOTE_MODE always promotes 8- and 16-bit parameters to 32 bits.
This promotion is not required by the ABI, which states:


```
C.9	If the argument is an Integral or Pointer Type, the size of the argument is
less than or equal to 8 bytes and the NGRN is less than 8, the argument is
copied to the least significant bits in x[NGRN]. The NGRN is incremented by one.
The argument has now been allocated.

C.16	If the size of the argument is less than 8 bytes then the size of the
argument is set to 8 bytes. The effect is as if the argument was copied to the
least significant bits of a 64-bit register and the remaining bits filled with
unspecified values
```

That is, the upper bits of the argument registers are unspecified and callees
cannot assume any particular value.
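
Concretely (a hedged sketch, not part of the patch; `g' and `caller' are
made-up names), the caller is allowed to leave arbitrary values in the upper
bits of the argument register:

```c
#include <stdint.h>

extern void g (uint8_t v);

void
caller (uint32_t x)
{
  /* The ABI only requires bits 0..7 of w0 to hold (uint8_t) x; the caller
     may legitimately pass x unmasked, so g() must not rely on bits 8..63
     of x0 having any particular value.  */
  g ((uint8_t) x);
}
```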

This means that we can avoid the promotion and still get correct code, because
the language-level promotion rules require values to be extended whenever the
extra bits are significant.

So if we are e.g. OR-ing two 8-bit values, no extend is needed, as the top bits
are irrelevant.  If we are doing e.g. addition, then the top bits *might* be
relevant depending on the result type, but in those cases the middle end will
always contain the appropriate extend.
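
As a small, hedged illustration of the two cases (not part of the patch):

```c
#include <stdint.h>

/* Only the low 8 bits of the result are observable, so neither a nor b
   needs to be extended first: unspecified upper bits cannot leak into the
   result.  */
uint8_t
or8 (uint8_t a, uint8_t b)
{
  return a | b;
}

/* The 16-bit result depends on the carry out of bit 7, so zero-extensions
   of a and b are required; the middle end already emits them, so the
   backend does not need to add its own.  */
uint16_t
add16 (uint8_t a, uint8_t b)
{
  return a + b;
}
```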

The middle end also has optimizations that rely on this assumption, and the
AArch64 port actively undoes them.

So for instance

uint16_t fd (uint8_t xr){
    return xr + 1;
}

uint8_t fd2 (uint8_t xr){
    return xr + 1;
}

should produce

fd:                                     // @fd
        and     w8, w0, #0xff
        add     w0, w8, #1
        ret
fd2:                                    // @fd2
        add     w0, w0, #1
        ret

like clang does instead of

fd:
        and     w0, w0, 255
        add     w0, w0, 1
        ret
fd2:
        and     w0, w0, 255
        add     w0, w0, 1
        ret

like we do now.  Removing this forced expansion maintains correctness and fixes
various codegen defects.  It also brings us in line with clang.

Note that C, C++, Fortran, etc. all correctly specify what should happen w.r.t.
extensions for things like array indexing and pointer arithmetic, so we never
generate incorrect code.

There is, however, a second reason for doing this promotion: RTL efficiency.
The promotion saves us from having to extend the values to SImode in order to
use them in instructions and then truncate them again afterwards.

To get both the efficiency and the simpler RTL, we can instead promote using a
paradoxical subreg.  This patch implements the new hook for AArch64 and adds an
explicit opt-out for values that feed into comparisons.  The opt-out is there
because:

1. Our comparison patterns already allow us to absorb the zero extend.
2. The extension allows us to use cbz/cbnz/tbz etc.  In some cases, such as

int foo (char a, char b)
{
  if (a)
    {
      if (b)
        bar1 ();
      else
        ...
    }
  else
    {
      if (b)
        bar2 ();
      else
        ...
    }
}

by zero-extending the value we can avoid having to repeatedly test it before
each branch.  Allowing the zero extend also lets our existing `ands` patterns
work as expected.
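
For reference, here is a rough, hedged sketch of what the two promotion
strategies look like in RTL terms, written against GCC's internal rtx-building
helpers, with `arg' standing for a hypothetical QImode pseudo that holds an
incoming 8-bit argument:

```cpp
/* Sketch only, not part of the patch.  */
rtx arg = gen_reg_rtx (QImode);

/* What PROMOTE_MODE gives us today: an explicit extension,
   (zero_extend:SI (reg:QI arg)), which later has to be proven redundant
   or truncated away again.  */
rtx extended = gen_rtx_ZERO_EXTEND (SImode, arg);

/* What the new hook asks for instead: a paradoxical subreg,
   (subreg:SI (reg:QI arg) 0), i.e. an SImode view of the value whose
   upper 24 bits are simply undefined, matching what the ABI guarantees.  */
rtx widened = gen_rtx_SUBREG (SImode, arg, 0);
```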

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
This patch and the last one have to be committed together, but for ease of
review I have split them up here.  With them, 209 missed-optimization XFAILs
are fixed.

No performance difference on SPEC CPU 2017, and no failures either.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_promote_function_args_subreg_p,
	TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P): New.
	* config/aarch64/aarch64.h (PROMOTE_MODE): Expand doc.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/apc-subreg.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index efa46ac0b8799b5849b609d591186e26e5cb37ff..cc74a816fcc6458aa065246a30a4d2184692ad74 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -34,7 +34,8 @@
 
 #define REGISTER_TARGET_PRAGMAS() aarch64_register_pragmas ()
 
-/* Target machine storage layout.  */
+/* Target machine storage layout.  See also
+   TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P.  */
 
 #define PROMOTE_MODE(MODE, UNSIGNEDP, TYPE)	\
   if (GET_MODE_CLASS (MODE) == MODE_INT		\
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 2f559600cff55af9d468e8d0810545583cc986f5..252d6c2af72afc1dfee1a86644a5753784b41f59 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3736,6 +3736,57 @@ aarch64_array_mode_supported_p (machine_mode mode,
   return false;
 }
 
+/* Implement target hook TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P to complement
+   PROMOTE_MODE.  If any argument promotion was done, do them as subregs.  */
+static bool
+aarch64_promote_function_args_subreg_p (machine_mode mode,
+					machine_mode promoted_mode,
+					int /* unsignedp */, tree parm)
+{
+  bool candidate_p = GET_MODE_CLASS (mode) == MODE_INT
+		     && GET_MODE_CLASS (promoted_mode) == MODE_INT
+		     && known_lt (GET_MODE_SIZE (mode), 4)
+		     && promoted_mode == SImode;
+
+  if (!candidate_p)
+    return false;
+
+  if (!parm || !is_gimple_reg (parm))
+    return true;
+
+  tree var = parm;
+  if (!VAR_P (var))
+    {
+      if (TREE_CODE (parm) == SSA_NAME
+	   && !(var = SSA_NAME_VAR (var)))
+	return true;
+      else if (TREE_CODE (parm) != PARM_DECL)
+	return true;
+    }
+
+  /* If the variable is used inside a comparison which sets CC then we should
+     still promote using an extend.  By doing this we make it easier to use
+     cbz/cbnz but also repeatedly having to test the value in certain
+     circumstances like nested if values that test the same value with calls
+     in between. */
+  tree ssa_var = ssa_default_def (cfun, var);
+  if (!ssa_var)
+    return true;
+
+  const ssa_use_operand_t *const head = &(SSA_NAME_IMM_USE_NODE (ssa_var));
+  const ssa_use_operand_t *ptr;
+
+  for (ptr = head->next; ptr != head; ptr = ptr->next)
+    if (USE_STMT(ptr) && is_gimple_assign (USE_STMT (ptr)))
+      {
+	tree_code code = gimple_assign_rhs_code (USE_STMT(ptr));
+	if (TREE_CODE_CLASS (code) == tcc_comparison)
+	  return false;
+      }
+
+  return true;
+}
+
 /* MODE is some form of SVE vector mode.  For data modes, return the number
    of vector register bits that each element of MODE occupies, such as 64
    for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
@@ -27490,6 +27541,10 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_ARRAY_MODE_SUPPORTED_P
 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
 
+#undef TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P
+#define TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P \
+  aarch64_promote_function_args_subreg_p
+
 #undef TARGET_VECTORIZE_CREATE_COSTS
 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
 
diff --git a/gcc/testsuite/gcc.target/aarch64/apc-subreg.c b/gcc/testsuite/gcc.target/aarch64/apc-subreg.c
new file mode 100644
index 0000000000000000000000000000000000000000..2d7563a11ce11fa677f7ad4bf2a090e6a136e4d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/apc-subreg.c
@@ -0,0 +1,103 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdint.h>
+
+/*
+** f0:
+**	mvn	w0, w0
+**	ret
+*/
+uint8_t f0 (uint8_t xr){
+    return (uint8_t) (0xff - xr);
+}
+
+/*
+** f1:
+**	mvn	w0, w0
+**	ret
+*/
+int8_t f1 (int8_t xr){
+    return (int8_t) (0xff - xr);
+}
+
+/*
+** f2:
+**	mvn	w0, w0
+**	ret
+*/
+uint16_t f2 (uint16_t xr){
+    return (uint16_t) (0xffFF - xr);
+}
+
+/*
+** f3:
+**	mvn	w0, w0
+**	ret
+*/
+uint32_t f3 (uint32_t xr){
+    return (uint32_t) (0xffFFffff - xr);
+}
+
+/*
+** f4:
+**	mvn	x0, x0
+**	ret
+*/
+uint64_t f4 (uint64_t xr){
+    return (uint64_t) (0xffFFffffffffffff - xr);
+}
+
+/*
+** f5:
+**	mvn	w0, w0
+**	sub	w0, w0, w1
+**	ret
+*/
+uint8_t f5 (uint8_t xr, uint8_t xc){
+    return (uint8_t) (0xff - xr - xc);
+}
+
+/*
+** f6:
+**	mvn	w0, w0
+**	and	w0, w0, 255
+**	and	w1, w1, 255
+**	mul	w0, w0, w1
+**	ret
+*/
+uint16_t f6 (uint8_t xr, uint8_t xc){
+    return ((uint8_t) (0xff - xr)) * xc;
+}
+
+/*
+** f7:
+**	and	w0, w0, 255
+**	and	w1, w1, 255
+**	mul	w0, w0, w1
+**	ret
+*/
+uint16_t f7 (uint8_t xr, uint8_t xc){
+    return xr * xc;
+}
+
+/*
+** f8:
+**	mul	w0, w0, w1
+**	and	w0, w0, 255
+**	ret
+*/
+uint16_t f8 (uint8_t xr, uint8_t xc){
+    return (uint8_t)(xr * xc);
+}
+
+/*
+** f9:
+**	and	w0, w0, 255
+**	add	w0, w0, w1
+**	ret
+*/
+uint16_t f9 (uint8_t xr, uint16_t xc){
+    return xr + xc;
+}
  

Comments

Andrew Pinski Oct. 27, 2022, 3:15 a.m. UTC | #1
On Fri, May 13, 2022 at 10:14 AM Tamar Christina via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi All,
>
> The PROMOTE_MODE always promotes 8 and 16-bit parameters to 32-bits.
> This promotion is not required for the ABI which states:
>
>
> ```
> C.9     If the argument is an Integral or Pointer Type, the size of the argument is
> less than or equal to 8 bytes and the NGRN is less than 8, the argument is
> copied to the least significant bits in x[NGRN]. The NGRN is incremented by one.
> The argument has now been allocated.
>
> C.16    If the size of the argument is less than 8 bytes then the size of the
> argument is set to 8 bytes. The effect is as if the argument was copied to the
> least significant bits of a 64-bit register and the remaining bits filled with
> unspecified values
> ```
>
> That is, the bits in the registers are unspecified and callees cannot assume
> any particular status.
>
> This means that we can avoid the promotion and still get correct code as the
> language level promotion rules require values to be extended when the bits are
> significant.
>
> So if we are .e.g OR-ing two 8-bit values no extend is needed as the top bits
> are irrelevant.  If we are doing e.g. addition, then the top bits *might* be
> relevant depending on the result type.  But the middle end will always
> contain the appropriate extend in those cases.
>
> The mid-end also has optimizations around this assumption and the AArch64 port
> actively undoes them.
>
> So for instance
>
> uint16_t fd (uint8_t xr){
>     return xr + 1;
> }
>
> uint8_t fd2 (uint8_t xr){
>     return xr + 1;
> }
>
> should produce
>
> fd:                                     // @fd
>         and     w8, w0, #0xff
>         add     w0, w8, #1
>         ret
> fd2:                                    // @fd2
>         add     w0, w0, #1
>         ret
>
> like clang does instead of
>
> fd:
>         and     w0, w0, 255
>         add     w0, w0, 1
>         ret
> fd2:
>         and     w0, w0, 255
>         add     w0, w0, 1
>         ret
>
> like we do now.  Removing this forced expansion maintains correctness but fixes
> issues with various codegen defects.  It also brings us inline with clang.
>
> Note that C, C++ and Fortran etc all correctly specify what should happen w.r.t
> extends and e.g. array indexing, pointer arith etc so we never get incorrect
> code.
>
> There is however a second reason for doing this promotion: RTL efficiency.
> The promotion stops us from having to promote the values to SI to be able to
> use them in instructions and then truncating again afterwards.
>
> To get both the efficiency and the simpler RTL we can instead promote to a
> paradoxical subreg.  This patch implements the hook for AArch64 and adds an
> explicit opt-out for values that feed into comparisons.  This is done because:
>
> 1. our comparisons patterns already allow us to absorb the zero extend
> 2. The extension allows us to use cbz/cbnz/tbz etc.  In some cases such as
>
> int foo (char a, char b)
> {
>    if (a)
>      if (b)
>        bar1 ();
>      else
>        ...
>     else
>      if (b)
>        bar2 ();
>      else
>        ...
> }
>
> by zero extending the value we can avoid having to repeatedly test the value
> before a branch.  Allowing the zero extend also allows our existing `ands`
> patterns to work as expected.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> I have to commit this and the last patch together but ease of review
> I have split them up here. However 209 missed optimization xfails are
> fixed.
>
> No performance difference on SPECCPU 2017 but no failures.
>
> Ok for master?

Did this patch ever get approved? It is a nice improvement that would
be nice to get into GCC 13 before the close of stage 1.

Thanks,
Andrew

>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64.cc (aarch64_promote_function_args_subreg_p):
>         (TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P): New.
>         * config/aarch64/aarch64.h (PROMOTE_MODE): Expand doc.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/apc-subreg.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index efa46ac0b8799b5849b609d591186e26e5cb37ff..cc74a816fcc6458aa065246a30a4d2184692ad74 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -34,7 +34,8 @@
>
>  #define REGISTER_TARGET_PRAGMAS() aarch64_register_pragmas ()
>
> -/* Target machine storage layout.  */
> +/* Target machine storage layout.  See also
> +   TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P.  */
>
>  #define PROMOTE_MODE(MODE, UNSIGNEDP, TYPE)    \
>    if (GET_MODE_CLASS (MODE) == MODE_INT                \
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 2f559600cff55af9d468e8d0810545583cc986f5..252d6c2af72afc1dfee1a86644a5753784b41f59 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -3736,6 +3736,57 @@ aarch64_array_mode_supported_p (machine_mode mode,
>    return false;
>  }
>
> +/* Implement target hook TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P to complement
> +   PROMOTE_MODE.  If any argument promotion was done, do them as subregs.  */
> +static bool
> +aarch64_promote_function_args_subreg_p (machine_mode mode,
> +                                       machine_mode promoted_mode,
> +                                       int /* unsignedp */, tree parm)
> +{
> +  bool candidate_p = GET_MODE_CLASS (mode) == MODE_INT
> +                    && GET_MODE_CLASS (promoted_mode) == MODE_INT
> +                    && known_lt (GET_MODE_SIZE (mode), 4)
> +                    && promoted_mode == SImode;
> +
> +  if (!candidate_p)
> +    return false;
> +
> +  if (!parm || !is_gimple_reg (parm))
> +    return true;
> +
> +  tree var = parm;
> +  if (!VAR_P (var))
> +    {
> +      if (TREE_CODE (parm) == SSA_NAME
> +          && !(var = SSA_NAME_VAR (var)))
> +       return true;
> +      else if (TREE_CODE (parm) != PARM_DECL)
> +       return true;
> +    }
> +
> +  /* If the variable is used inside a comparison which sets CC then we should
> +     still promote using an extend.  By doing this we make it easier to use
> +     cbz/cbnz but also repeatedly having to test the value in certain
> +     circumstances like nested if values that test the same value with calls
> +     in between. */
> +  tree ssa_var = ssa_default_def (cfun, var);
> +  if (!ssa_var)
> +    return true;
> +
> +  const ssa_use_operand_t *const head = &(SSA_NAME_IMM_USE_NODE (ssa_var));
> +  const ssa_use_operand_t *ptr;
> +
> +  for (ptr = head->next; ptr != head; ptr = ptr->next)
> +    if (USE_STMT(ptr) && is_gimple_assign (USE_STMT (ptr)))
> +      {
> +       tree_code code = gimple_assign_rhs_code (USE_STMT(ptr));
> +       if (TREE_CODE_CLASS (code) == tcc_comparison)
> +         return false;
> +      }
> +
> +  return true;
> +}
> +
>  /* MODE is some form of SVE vector mode.  For data modes, return the number
>     of vector register bits that each element of MODE occupies, such as 64
>     for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
> @@ -27490,6 +27541,10 @@ aarch64_libgcc_floating_mode_supported_p
>  #undef TARGET_ARRAY_MODE_SUPPORTED_P
>  #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
>
> +#undef TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P
> +#define TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P \
> +  aarch64_promote_function_args_subreg_p
> +
>  #undef TARGET_VECTORIZE_CREATE_COSTS
>  #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/apc-subreg.c b/gcc/testsuite/gcc.target/aarch64/apc-subreg.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..2d7563a11ce11fa677f7ad4bf2a090e6a136e4d9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/apc-subreg.c
> @@ -0,0 +1,103 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <stdint.h>
> +
> +/*
> +** f0:
> +**     mvn     w0, w0
> +**     ret
> +*/
> +uint8_t f0 (uint8_t xr){
> +    return (uint8_t) (0xff - xr);
> +}
> +
> +/*
> +** f1:
> +**     mvn     w0, w0
> +**     ret
> +*/
> +int8_t f1 (int8_t xr){
> +    return (int8_t) (0xff - xr);
> +}
> +
> +/*
> +** f2:
> +**     mvn     w0, w0
> +**     ret
> +*/
> +uint16_t f2 (uint16_t xr){
> +    return (uint16_t) (0xffFF - xr);
> +}
> +
> +/*
> +** f3:
> +**     mvn     w0, w0
> +**     ret
> +*/
> +uint32_t f3 (uint32_t xr){
> +    return (uint32_t) (0xffFFffff - xr);
> +}
> +
> +/*
> +** f4:
> +**     mvn     x0, x0
> +**     ret
> +*/
> +uint64_t f4 (uint64_t xr){
> +    return (uint64_t) (0xffFFffffffffffff - xr);
> +}
> +
> +/*
> +** f5:
> +**     mvn     w0, w0
> +**     sub     w0, w0, w1
> +**     ret
> +*/
> +uint8_t f5 (uint8_t xr, uint8_t xc){
> +    return (uint8_t) (0xff - xr - xc);
> +}
> +
> +/*
> +** f6:
> +**     mvn     w0, w0
> +**     and     w0, w0, 255
> +**     and     w1, w1, 255
> +**     mul     w0, w0, w1
> +**     ret
> +*/
> +uint16_t f6 (uint8_t xr, uint8_t xc){
> +    return ((uint8_t) (0xff - xr)) * xc;
> +}
> +
> +/*
> +** f7:
> +**     and     w0, w0, 255
> +**     and     w1, w1, 255
> +**     mul     w0, w0, w1
> +**     ret
> +*/
> +uint16_t f7 (uint8_t xr, uint8_t xc){
> +    return xr * xc;
> +}
> +
> +/*
> +** f8:
> +**     mul     w0, w0, w1
> +**     and     w0, w0, 255
> +**     ret
> +*/
> +uint16_t f8 (uint8_t xr, uint8_t xc){
> +    return (uint8_t)(xr * xc);
> +}
> +
> +/*
> +** f9:
> +**     and     w0, w0, 255
> +**     add     w0, w0, w1
> +**     ret
> +*/
> +uint16_t f9 (uint8_t xr, uint16_t xc){
> +    return xr + xc;
> +}
>
>
>
>
> --
  
Tamar Christina Oct. 28, 2022, 9:57 a.m. UTC | #2
> -----Original Message-----
> From: Andrew Pinski <pinskia@gmail.com>
> Sent: Thursday, October 27, 2022 4:15 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>
> Subject: Re: [PATCH 2/3]AArch64 Promote function arguments using a
> paradoxical subreg when beneficial.
> 
> On Fri, May 13, 2022 at 10:14 AM Tamar Christina via Gcc-patches <gcc-
> patches@gcc.gnu.org> wrote:
> >
> > Hi All,
> >
> > The PROMOTE_MODE always promotes 8 and 16-bit parameters to 32-bits.
> > This promotion is not required for the ABI which states:
> >
> >
> > ```
> > C.9     If the argument is an Integral or Pointer Type, the size of the
> argument is
> > less than or equal to 8 bytes and the NGRN is less than 8, the
> > argument is copied to the least significant bits in x[NGRN]. The NGRN is
> incremented by one.
> > The argument has now been allocated.
> >
> > C.16    If the size of the argument is less than 8 bytes then the size of the
> > argument is set to 8 bytes. The effect is as if the argument was
> > copied to the least significant bits of a 64-bit register and the
> > remaining bits filled with unspecified values ```
> >
> > That is, the bits in the registers are unspecified and callees cannot
> > assume any particular status.
> >
> > This means that we can avoid the promotion and still get correct code
> > as the language level promotion rules require values to be extended
> > when the bits are significant.
> >
> > So if we are .e.g OR-ing two 8-bit values no extend is needed as the
> > top bits are irrelevant.  If we are doing e.g. addition, then the top
> > bits *might* be relevant depending on the result type.  But the middle
> > end will always contain the appropriate extend in those cases.
> >
> > The mid-end also has optimizations around this assumption and the
> > AArch64 port actively undoes them.
> >
> > So for instance
> >
> > uint16_t fd (uint8_t xr){
> >     return xr + 1;
> > }
> >
> > uint8_t fd2 (uint8_t xr){
> >     return xr + 1;
> > }
> >
> > should produce
> >
> > fd:                                     // @fd
> >         and     w8, w0, #0xff
> >         add     w0, w8, #1
> >         ret
> > fd2:                                    // @fd2
> >         add     w0, w0, #1
> >         ret
> >
> > like clang does instead of
> >
> > fd:
> >         and     w0, w0, 255
> >         add     w0, w0, 1
> >         ret
> > fd2:
> >         and     w0, w0, 255
> >         add     w0, w0, 1
> >         ret
> >
> > like we do now.  Removing this forced expansion maintains correctness
> > but fixes issues with various codegen defects.  It also brings us inline with
> clang.
> >
> > Note that C, C++ and Fortran etc all correctly specify what should
> > happen w.r.t extends and e.g. array indexing, pointer arith etc so we
> > never get incorrect code.
> >
> > There is however a second reason for doing this promotion: RTL efficiency.
> > The promotion stops us from having to promote the values to SI to be
> > able to use them in instructions and then truncating again afterwards.
> >
> > To get both the efficiency and the simpler RTL we can instead promote
> > to a paradoxical subreg.  This patch implements the hook for AArch64
> > and adds an explicit opt-out for values that feed into comparisons.  This is
> done because:
> >
> > 1. our comparisons patterns already allow us to absorb the zero extend
> > 2. The extension allows us to use cbz/cbnz/tbz etc.  In some cases
> > such as
> >
> > int foo (char a, char b)
> > {
> >    if (a)
> >      if (b)
> >        bar1 ();
> >      else
> >        ...
> >     else
> >      if (b)
> >        bar2 ();
> >      else
> >        ...
> > }
> >
> > by zero extending the value we can avoid having to repeatedly test the
> > value before a branch.  Allowing the zero extend also allows our
> > existing `ands` patterns to work as expected.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > I have to commit this and the last patch together but ease of review I
> > have split them up here. However 209 missed optimization xfails are
> > fixed.
> >
> > No performance difference on SPECCPU 2017 but no failures.
> >
> > Ok for master?
> 
> Did this patch ever get approved? It is a nice improvement that would be nice
> to get into GCC 13 before the close of stage 1.

No, it was requested that I make a standalone pass introducing a new kind of extension
in the mid-end.  Unfortunately, due to constraints on how much time I can dedicate to
that this year, I've had to drop it for GCC 13.

I'll try to pick it up again during GCC 14.

Regards,
Tamar

> 
> Thanks,
> Andrew
> 
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >         * config/aarch64/aarch64.cc
> (aarch64_promote_function_args_subreg_p):
> >         (TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P): New.
> >         * config/aarch64/aarch64.h (PROMOTE_MODE): Expand doc.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/aarch64/apc-subreg.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64.h
> > b/gcc/config/aarch64/aarch64.h index
> >
> efa46ac0b8799b5849b609d591186e26e5cb37ff..cc74a816fcc6458aa065246a30
> a4
> > d2184692ad74 100644
> > --- a/gcc/config/aarch64/aarch64.h
> > +++ b/gcc/config/aarch64/aarch64.h
> > @@ -34,7 +34,8 @@
> >
> >  #define REGISTER_TARGET_PRAGMAS() aarch64_register_pragmas ()
> >
> > -/* Target machine storage layout.  */
> > +/* Target machine storage layout.  See also
> > +   TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P.  */
> >
> >  #define PROMOTE_MODE(MODE, UNSIGNEDP, TYPE)    \
> >    if (GET_MODE_CLASS (MODE) == MODE_INT                \
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc index
> >
> 2f559600cff55af9d468e8d0810545583cc986f5..252d6c2af72afc1dfee1a86644a5
> > 753784b41f59 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -3736,6 +3736,57 @@ aarch64_array_mode_supported_p
> (machine_mode mode,
> >    return false;
> >  }
> >
> > +/* Implement target hook
> TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P to complement
> > +   PROMOTE_MODE.  If any argument promotion was done, do them as
> > +subregs.  */ static bool aarch64_promote_function_args_subreg_p
> > +(machine_mode mode,
> > +                                       machine_mode promoted_mode,
> > +                                       int /* unsignedp */, tree
> > +parm) {
> > +  bool candidate_p = GET_MODE_CLASS (mode) == MODE_INT
> > +                    && GET_MODE_CLASS (promoted_mode) == MODE_INT
> > +                    && known_lt (GET_MODE_SIZE (mode), 4)
> > +                    && promoted_mode == SImode;
> > +
> > +  if (!candidate_p)
> > +    return false;
> > +
> > +  if (!parm || !is_gimple_reg (parm))
> > +    return true;
> > +
> > +  tree var = parm;
> > +  if (!VAR_P (var))
> > +    {
> > +      if (TREE_CODE (parm) == SSA_NAME
> > +          && !(var = SSA_NAME_VAR (var)))
> > +       return true;
> > +      else if (TREE_CODE (parm) != PARM_DECL)
> > +       return true;
> > +    }
> > +
> > +  /* If the variable is used inside a comparison which sets CC then we
> should
> > +     still promote using an extend.  By doing this we make it easier to use
> > +     cbz/cbnz but also repeatedly having to test the value in certain
> > +     circumstances like nested if values that test the same value with calls
> > +     in between. */
> > +  tree ssa_var = ssa_default_def (cfun, var);  if (!ssa_var)
> > +    return true;
> > +
> > +  const ssa_use_operand_t *const head =
> &(SSA_NAME_IMM_USE_NODE
> > + (ssa_var));  const ssa_use_operand_t *ptr;
> > +
> > +  for (ptr = head->next; ptr != head; ptr = ptr->next)
> > +    if (USE_STMT(ptr) && is_gimple_assign (USE_STMT (ptr)))
> > +      {
> > +       tree_code code = gimple_assign_rhs_code (USE_STMT(ptr));
> > +       if (TREE_CODE_CLASS (code) == tcc_comparison)
> > +         return false;
> > +      }
> > +
> > +  return true;
> > +}
> > +
> >  /* MODE is some form of SVE vector mode.  For data modes, return the
> number
> >     of vector register bits that each element of MODE occupies, such as 64
> >     for both VNx2DImode and VNx2SImode (where each 32-bit value is
> > stored @@ -27490,6 +27541,10 @@
> > aarch64_libgcc_floating_mode_supported_p
> >  #undef TARGET_ARRAY_MODE_SUPPORTED_P
> >  #define TARGET_ARRAY_MODE_SUPPORTED_P
> aarch64_array_mode_supported_p
> >
> > +#undef TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P
> > +#define TARGET_PROMOTE_FUNCTION_ARGS_SUBREG_P \
> > +  aarch64_promote_function_args_subreg_p
> > +
> >  #undef TARGET_VECTORIZE_CREATE_COSTS
> >  #define TARGET_VECTORIZE_CREATE_COSTS
> aarch64_vectorize_create_costs
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/apc-subreg.c
> > b/gcc/testsuite/gcc.target/aarch64/apc-subreg.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..2d7563a11ce11fa677f7ad4bf2
> a0
> > 90e6a136e4d9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/apc-subreg.c
> > @@ -0,0 +1,103 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include <stdint.h>
> > +
> > +/*
> > +** f0:
> > +**     mvn     w0, w0
> > +**     ret
> > +*/
> > +uint8_t f0 (uint8_t xr){
> > +    return (uint8_t) (0xff - xr);
> > +}
> > +
> > +/*
> > +** f1:
> > +**     mvn     w0, w0
> > +**     ret
> > +*/
> > +int8_t f1 (int8_t xr){
> > +    return (int8_t) (0xff - xr);
> > +}
> > +
> > +/*
> > +** f2:
> > +**     mvn     w0, w0
> > +**     ret
> > +*/
> > +uint16_t f2 (uint16_t xr){
> > +    return (uint16_t) (0xffFF - xr);
> > +}
> > +
> > +/*
> > +** f3:
> > +**     mvn     w0, w0
> > +**     ret
> > +*/
> > +uint32_t f3 (uint32_t xr){
> > +    return (uint32_t) (0xffFFffff - xr); }
> > +
> > +/*
> > +** f4:
> > +**     mvn     x0, x0
> > +**     ret
> > +*/
> > +uint64_t f4 (uint64_t xr){
> > +    return (uint64_t) (0xffFFffffffffffff - xr); }
> > +
> > +/*
> > +** f5:
> > +**     mvn     w0, w0
> > +**     sub     w0, w0, w1
> > +**     ret
> > +*/
> > +uint8_t f5 (uint8_t xr, uint8_t xc){
> > +    return (uint8_t) (0xff - xr - xc); }
> > +
> > +/*
> > +** f6:
> > +**     mvn     w0, w0
> > +**     and     w0, w0, 255
> > +**     and     w1, w1, 255
> > +**     mul     w0, w0, w1
> > +**     ret
> > +*/
> > +uint16_t f6 (uint8_t xr, uint8_t xc){
> > +    return ((uint8_t) (0xff - xr)) * xc; }
> > +
> > +/*
> > +** f7:
> > +**     and     w0, w0, 255
> > +**     and     w1, w1, 255
> > +**     mul     w0, w0, w1
> > +**     ret
> > +*/
> > +uint16_t f7 (uint8_t xr, uint8_t xc){
> > +    return xr * xc;
> > +}
> > +
> > +/*
> > +** f8:
> > +**     mul     w0, w0, w1
> > +**     and     w0, w0, 255
> > +**     ret
> > +*/
> > +uint16_t f8 (uint8_t xr, uint8_t xc){
> > +    return (uint8_t)(xr * xc);
> > +}
> > +
> > +/*
> > +** f9:
> > +**     and     w0, w0, 255
> > +**     add     w0, w0, w1
> > +**     ret
> > +*/
> > +uint16_t f9 (uint8_t xr, uint16_t xc){
> > +    return xr + xc;
> > +}
> >
> >
> >
> >
> > --
  
