[V4] rs6000: Optimize cmp on rotated 16bits constant

Message ID 20220725132922.45470-1-guojiufu@linux.ibm.com
State New
Headers
Series [V4] rs6000: Optimize cmp on rotated 16bits constant |

Commit Message

Jiufu Guo July 25, 2022, 1:29 p.m. UTC
  Hi,

When checking eq/neq with a constant which has only 16bits, it can be
optimized to check the rotated data.  By this, the constant building
is optimized.

As the example in PR103743:
For "in == 0x8000000000000000LL", this patch generates:
        rotldi %r3,%r3,16
        cmpldi %cr0,%r3,32768
instead of:
        li %r9,-1
        rldicr %r9,%r9,0,0
        cmpd %cr0,%r3,%r9

This is a new patch to optimize compare(eq/ne) on rotatable constant.
This patch would be more straightforward than the previous patch, like:
https://gcc.gnu.org/pipermail/gcc-patches/2022-May/595576.html.

This patch passes bootstrap and regtest on ppc64 and ppc64le.
Is it ok for trunk?  Thanks for comments!

BR,
Jeff(Jiufu)


	PR target/103743

gcc/ChangeLog:

	* config/rs6000/rs6000-protos.h (rotate_from_leading_zeros_const):
	New decl.
	* config/rs6000/rs6000.cc (rotate_from_leading_zeros_const): New
	define for checking simply rotated constant.
	* config/rs6000/rs6000.md (*rotate_on_cmpdi): New
	define_insn_and_split to optimze comparing on rotated constant.

gcc/testsuite/ChangeLog:

	* gcc.target/powerpc/pr103743.c: New test.
	* gcc.target/powerpc/pr103743_1.c: New test.

---
 gcc/config/rs6000/rs6000-protos.h             |  1 +
 gcc/config/rs6000/rs6000.cc                   | 29 ++++++
 gcc/config/rs6000/rs6000.md                   | 54 ++++++++++-
 gcc/testsuite/gcc.target/powerpc/pr103743.c   | 52 ++++++++++
 gcc/testsuite/gcc.target/powerpc/pr103743_1.c | 95 +++++++++++++++++++
 5 files changed, 230 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr103743.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr103743_1.c
  

Comments

Jiufu Guo Aug. 17, 2022, 3:12 a.m. UTC | #1
Gentle ping:
https://gcc.gnu.org/pipermail/gcc-patches/2022-July/598769.html

BR,
Jeff(Jiufu)

Jiufu Guo <guojiufu@linux.ibm.com> writes:

> Hi,
>
> When checking eq/neq with a constant which has only 16bits, it can be
> optimized to check the rotated data.  By this, the constant building
> is optimized.
>
> As the example in PR103743:
> For "in == 0x8000000000000000LL", this patch generates:
>         rotldi %r3,%r3,16
>         cmpldi %cr0,%r3,32768
> instead:
>         li %r9,-1
>         rldicr %r9,%r9,0,0
>         cmpd %cr0,%r3,%r9
>
> This is a new patch to optimize compare(eq/ne) on rotatable constant.
> This patch would be more straitforward than previous patch, like:
> https://gcc.gnu.org/pipermail/gcc-patches/2022-May/595576.html.
>
> This patch pass bootstrap and regtest on ppc64 and ppc64le.
> Is it ok for trunk?  Thanks for comments!
>
> BR,
> Jeff(Jiufu)
>
>
> 	PR target/103743
>
> gcc/ChangeLog:
>
> 	* config/rs6000/rs6000-protos.h (rotate_from_leading_zeros_const):
> 	New decl.
> 	* config/rs6000/rs6000.cc (rotate_from_leading_zeros_const): New
> 	define for checking simply rotated constant.
> 	* config/rs6000/rs6000.md (*rotate_on_cmpdi): New
> 	define_insn_and_split to optimze comparing on rotated constant.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/powerpc/pr103743.c: New test.
> 	* gcc.target/powerpc/pr103743_1.c: New test.
>
> ---
>  gcc/config/rs6000/rs6000-protos.h             |  1 +
>  gcc/config/rs6000/rs6000.cc                   | 29 ++++++
>  gcc/config/rs6000/rs6000.md                   | 54 ++++++++++-
>  gcc/testsuite/gcc.target/powerpc/pr103743.c   | 52 ++++++++++
>  gcc/testsuite/gcc.target/powerpc/pr103743_1.c | 95 +++++++++++++++++++
>  5 files changed, 230 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr103743.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr103743_1.c
>
> diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
> index 3ea01023609..bc7f6ff64ef 100644
> --- a/gcc/config/rs6000/rs6000-protos.h
> +++ b/gcc/config/rs6000/rs6000-protos.h
> @@ -35,6 +35,7 @@ extern bool xxspltib_constant_p (rtx, machine_mode, int *, int *);
>  extern int vspltis_shifted (rtx);
>  extern HOST_WIDE_INT const_vector_elt_as_int (rtx, unsigned int);
>  extern bool macho_lo_sum_memory_operand (rtx, machine_mode);
> +extern int rotate_from_leading_zeros_const (unsigned HOST_WIDE_INT, int);
>  extern int num_insns_constant (rtx, machine_mode);
>  extern int small_data_operand (rtx, machine_mode);
>  extern bool mem_operand_gpr (rtx, machine_mode);
> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
> index 3ff16b8ae04..9c0996603d0 100644
> --- a/gcc/config/rs6000/rs6000.cc
> +++ b/gcc/config/rs6000/rs6000.cc
> @@ -14861,6 +14861,35 @@ rs6000_reverse_condition (machine_mode mode, enum rtx_code code)
>      return reverse_condition (code);
>  }
>  
> +/* Check if C can be rotated from an immediate which contains leading
> +   zeros at least CLZ.
> +
> +   Return the number by which C can be rotated from the immediate.
> +   Return -1 if C can not be rotated as from.  */
> +
> +int
> +rotate_from_leading_zeros_const (unsigned HOST_WIDE_INT c, int clz)
> +{
> +  /* case. 0..0xxx: already at least clz zeros.  */
> +  int lz = clz_hwi (c);
> +  if (lz >= clz)
> +    return 0;
> +
> +  /* case a. 0..0xxx0..0: at least clz zeros.  */
> +  int tz = ctz_hwi (c);
> +  if (lz + tz >= clz)
> +    return tz;
> +
> +  /* xx0..0xx: rotate enough bits firstly, then check case a.  */
> +  const int rot_bits = HOST_BITS_PER_WIDE_INT - clz + 1;
> +  unsigned HOST_WIDE_INT rc = (c >> rot_bits) | (c << (clz - 1));
> +  tz = ctz_hwi (rc);
> +  if (clz_hwi (rc) + tz >= clz)
> +    return tz + rot_bits;
> +
> +  return -1;
> +}
> +
>  /* Generate a compare for CODE.  Return a brand-new rtx that
>     represents the result of the compare.  */
>  
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index 1367a2cb779..603781b29aa 100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -7720,6 +7720,59 @@ (define_insn "*movsi_from_df"
>    "xscvdpsp %x0,%x1"
>    [(set_attr "type" "fp")])
>  
> +
> +(define_code_iterator eqne [eq ne])
> +
> +;; "i == C" ==> "rotl(i,N) == rotl(C,N)"
> +(define_insn_and_split "*rotate_on_cmpdi"
> +  [(set (pc)
> +     (if_then_else (eqne (match_operand:DI 1 "gpc_reg_operand" "r")
> +			 (match_operand:DI 2 "const_int_operand" "n"))
> +			 (label_ref (match_operand 0 ""))
> +		   (pc)))]
> +  "TARGET_POWERPC64 && !reload_completed && can_create_pseudo_p ()
> +   && num_insns_constant (operands[2], DImode) > 1
> +   && (rotate_from_leading_zeros_const (~UINTVAL (operands[2]), 49) > 0
> +       || rotate_from_leading_zeros_const (UINTVAL (operands[2]), 48) > 0)"
> +   "#"
> +   "&& 1"
> +  [(pc)]
> +{
> +  rtx cnd = XEXP (SET_SRC (single_set (curr_insn)), 0);
> +  bool ne = GET_CODE (cnd) == NE;
> +
> +  bool sgn = false;
> +  unsigned HOST_WIDE_INT C = INTVAL (operands[2]);
> +  int rot = rotate_from_leading_zeros_const (C, 48);
> +  if (rot < 0)
> +  {
> +     sgn = true;
> +     rot = rotate_from_leading_zeros_const (~C, 49);
> +  }
> +  rtx n = GEN_INT (HOST_BITS_PER_WIDE_INT - rot);
> +
> +  /* i' = rotl (i, n) */
> +  rtx op0 = gen_reg_rtx (DImode);
> +  emit_insn (gen_rtx_SET (op0, gen_rtx_ROTATE (DImode, operands[1], n)));
> +
> +  /* C' = rotl (C, n) */
> +  rtx op1 = GEN_INT ((C << INTVAL (n)) | (C >> rot));
> +
> +  /* i' ==  C' */
> +  machine_mode comp_mode = sgn ? CCmode : CCUNSmode;
> +  rtx cc = gen_reg_rtx (comp_mode);
> +  emit_insn (gen_rtx_SET (cc, gen_rtx_COMPARE (comp_mode, op0, op1)));
> +
> +  rtx cmp = ne ? gen_rtx_NE (CCmode, cc, const0_rtx)
> +	       : gen_rtx_EQ (CCmode, cc, const0_rtx);
> +  rtx loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[0]);
> +  emit_jump_insn (gen_rtx_SET (pc_rtx,
> +			       gen_rtx_IF_THEN_ELSE (VOIDmode, cmp,
> +						     loc_ref, pc_rtx)));
> +  DONE;
> +}
> +)
> +
>  ;; Split a load of a large constant into the appropriate two-insn
>  ;; sequence.
>  
> @@ -13426,7 +13479,6 @@ (define_expand "@ctr<mode>"
>  ;; rs6000_legitimate_combined_insn prevents combine creating any of
>  ;; the ctr<mode> insns.
>  
> -(define_code_iterator eqne [eq ne])
>  (define_code_attr bd [(eq "bdz") (ne "bdnz")])
>  (define_code_attr bd_neg [(eq "bdnz") (ne "bdz")])
>  
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr103743.c b/gcc/testsuite/gcc.target/powerpc/pr103743.c
> new file mode 100644
> index 00000000000..e18833685fb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr103743.c
> @@ -0,0 +1,52 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target lp64 } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "cmpldi" 10  } } */
> +/* { dg-final { scan-assembler-times "cmpdi" 4  } } */
> +/* { dg-final { scan-assembler-times "rotldi" 14  } } */
> +
> +int foo (int a);
> +
> +int __attribute__ ((noinline)) udi_fun (unsigned long long in)
> +{
> +  if (in == (0x8642000000000000ULL))
> +    return foo (1);
> +  if (in == (0x7642000000000000ULL))
> +    return foo (12);
> +  if (in == (0x8000000000000000ULL))
> +    return foo (32);
> +  if (in == (0x8000000000000001ULL))
> +    return foo (33);
> +  if (in == (0x8642FFFFFFFFFFFFULL))
> +    return foo (46);
> +  if (in == (0x7642FFFFFFFFFFFFULL))
> +    return foo (51);
> +  if (in == (0x7567000000ULL))
> +    return foo (9);
> +  if (in == (0xFFF8567FFFFFFFFFULL))
> +    return foo (19);
> +
> +  return 0;
> +}
> +
> +int __attribute__ ((noinline)) di_fun (long long in)
> +{
> +  if (in == (0x8642000000000000LL))
> +    return foo (1);
> +  if (in == (0x7642000000000000LL))
> +    return foo (12);
> +  if (in == (0x8000000000000000LL))
> +    return foo (32);
> +  if (in == (0x8000000000000001LL))
> +    return foo (33);
> +  if (in == (0x8642FFFFFFFFFFFFLL))
> +    return foo (46);
> +  if (in == (0x7642FFFFFFFFFFFFLL))
> +    return foo (51);
> +  if (in == (0x7567000000LL))
> +    return foo (9);
> +  if (in == (0xFFF8567FFFFFFFFFLL))
> +    return foo (19);
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr103743_1.c b/gcc/testsuite/gcc.target/powerpc/pr103743_1.c
> new file mode 100644
> index 00000000000..2c08c56714a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr103743_1.c
> @@ -0,0 +1,95 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -std=c99" } */
> +
> +int
> +foo (int a)
> +{
> +  return a + 6;
> +}
> +
> +int __attribute__ ((noinline)) udi_fun (unsigned long long in)
> +{
> +  if (in == (0x8642000000000000ULL))
> +    return foo (1);
> +  if (in == (0x7642000000000000ULL))
> +    return foo (12);
> +  if (in == (0x8000000000000000ULL))
> +    return foo (32);
> +  if (in == (0x8000000000000001ULL))
> +    return foo (33);
> +  if (in == (0x8642FFFFFFFFFFFFULL))
> +    return foo (46);
> +  if (in == (0x7642FFFFFFFFFFFFULL))
> +    return foo (51);
> +  if (in == (0x7567000000ULL))
> +    return foo (9);
> +  if (in == (0xFFF8567FFFFFFFFFULL))
> +    return foo (19);
> +  
> +  return 0;
> +}
> +
> +int __attribute__ ((noinline)) di_fun (long long in)
> +{
> +  if (in == (0x8642000000000000LL))
> +    return foo (1);
> +  if (in == (0x7642000000000000LL))
> +    return foo (12);
> +  if (in == (0x8000000000000000LL))
> +    return foo (32);
> +  if (in == (0x8000000000000001LL))
> +    return foo (33);
> +  if (in == (0x8642FFFFFFFFFFFFLL))
> +    return foo (46);
> +  if (in == (0x7642FFFFFFFFFFFFLL))
> +    return foo (51);
> +  return 0;
> +}
> +
> +int
> +main ()
> +{
> +  int e = 0;
> +  if (udi_fun (6) != 0)
> +    e++;
> +  if (udi_fun (0x8642000000000000ULL) != foo (1))
> +    e++;
> +  if (udi_fun (0x7642000000000000ULL) != foo (12))
> +    e++;
> +  if (udi_fun (0x8000000000000000ULL) != foo (32))
> +    e++;
> +  if (udi_fun (0x8000000000000001ULL) != foo (33))
> +    e++;
> +  if (udi_fun (0x8642FFFFFFFFFFFFULL) != foo (46))
> +    e++;
> +  if (udi_fun (0x7642FFFFFFFFFFFFULL) != foo (51))
> +    e++;
> +  if (udi_fun (0x7567000000ULL) != foo (9))
> +    e++;
> +  if (udi_fun (0xFFF8567FFFFFFFFFULL) != foo (19))
> +    e++;
> +
> +  if (di_fun (6) != 0)
> +    e++;
> +  if (di_fun (0x8642000000000000LL) != foo (1))
> +    e++;
> +  if (di_fun (0x7642000000000000LL) != foo (12))
> +    e++;
> +  if (di_fun (0x8000000000000000LL) != foo (32))
> +    e++;
> +  if (di_fun (0x8000000000000001LL) != foo (33))
> +    e++;
> +  if (di_fun (0x8642FFFFFFFFFFFFLL) != foo (46))
> +    e++;
> +  if (di_fun (0x7642FFFFFFFFFFFFLL) != foo (51))
> +    e++;
> +  if (udi_fun (0x7567000000LL) != foo (9))
> +    e++;
> +  if (udi_fun (0xFFF8567FFFFFFFFFLL) != foo (19))
> +    e++;
> +
> +  if (e)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
  
Segher Boessenkool Aug. 23, 2022, 10:18 p.m. UTC | #2
Hi!

On Mon, Jul 25, 2022 at 09:29:22PM +0800, Jiufu Guo wrote:
> When checking eq/neq with a constant which has only 16bits, it can be
> optimized to check the rotated data.  By this, the constant building
> is optimized.

"ne", not "neq".

> gcc/ChangeLog:
> 
> 	* config/rs6000/rs6000-protos.h (rotate_from_leading_zeros_const):
> 	New decl.

"New declaration." or just "New".  Also, don't break lines early please,
especially if that means ending a line in a colon, which then looks as
if you forgot to write something there.

> 	* config/rs6000/rs6000.cc (rotate_from_leading_zeros_const): New
> 	define for checking simply rotated constant.

"New." or "New definition." or such.

> +/* Check if C can be rotated from an immediate which contains leading
> +   zeros at least CLZ.

"Which starts (as 64 bit integer) with at least CLZ bits zero" or such.

> +  /* xx0..0xx: rotate enough bits firstly, then check case a.  */
> +  const int rot_bits = HOST_BITS_PER_WIDE_INT - clz + 1;
> +  unsigned HOST_WIDE_INT rc = (c >> rot_bits) | (c << (clz - 1));
> +  tz = ctz_hwi (rc);
> +  if (clz_hwi (rc) + tz >= clz)
> +    return tz + rot_bits;

This could use some more explanation.

> +(define_code_iterator eqne [eq ne])


> +;; "i == C" ==> "rotl(i,N) == rotl(C,N)"
> +(define_insn_and_split "*rotate_on_cmpdi"
> +  [(set (pc)
> +     (if_then_else (eqne (match_operand:DI 1 "gpc_reg_operand" "r")

Wrong indentation.  The ( should be in the same column as the preceding (
it matches.

Setting "pc" to either 0 or 1 is never correct.

> +  "TARGET_POWERPC64 && !reload_completed && can_create_pseudo_p ()

reload_completed in splitters is almost always wrong.  It isn't any
better if it is in the insn condition of a define_insn_and_split :-)

> +   && num_insns_constant (operands[2], DImode) > 1
> +   && (rotate_from_leading_zeros_const (~UINTVAL (operands[2]), 49) > 0
> +       || rotate_from_leading_zeros_const (UINTVAL (operands[2]), 48) > 0)"

There must be a better way to describe this.

> +  if (rot < 0)
> +  {
> +     sgn = true;
> +     rot = rotate_from_leading_zeros_const (~C, 49);
> +  }

Bad indentation.

> +  rtx cmp = ne ? gen_rtx_NE (CCmode, cc, const0_rtx)
> +	       : gen_rtx_EQ (CCmode, cc, const0_rtx);

  rtx cmp = gen_rtx_<EQNE> (...);

(and define a code_attr EQNE to just output the uppercase EQ or NE).

Why is this doing a conditional branch at all?  Unpredictable
conditional branches are extremely costly.

> +/* { dg-require-effective-target lp64 } */

arch_ppc64

> +/* { dg-final { scan-assembler-times "cmpldi" 10  } } */
> +/* { dg-final { scan-assembler-times "cmpdi" 4  } } */
> +/* { dg-final { scan-assembler-times "rotldi" 14  } } */

Please use \m and \M .

Thanks,


Segher
  
Jiufu Guo Aug. 24, 2022, 7:48 a.m. UTC | #3
Hi!

Segher Boessenkool <segher@kernel.crashing.org> writes:

> Hi!
>
> On Mon, Jul 25, 2022 at 09:29:22PM +0800, Jiufu Guo wrote:
>> When checking eq/neq with a constant which has only 16bits, it can be
>> optimized to check the rotated data.  By this, the constant building
>> is optimized.
>
> "ne", not "neq".
Oh, thanks!
>
>> gcc/ChangeLog:
>> 
>> 	* config/rs6000/rs6000-protos.h (rotate_from_leading_zeros_const):
>> 	New decl.
>
> "New declaration." or just "New".  Also, don't break lines early please,
> especially if that means ending a line in a colon, which then looks as
> if you forgot to write something there.
OK, will update.
>
>> 	* config/rs6000/rs6000.cc (rotate_from_leading_zeros_const): New
>> 	define for checking simply rotated constant.
>
> "New." or "New definition." or such.
OK, will update.
>
>> +/* Check if C can be rotated from an immediate which contains leading
>> +   zeros at least CLZ.
>
> "Which starts (as 64 bit integer) with at least CLZ bits zero" or such.
Thanks a lot! Will update.
>
>> +  /* xx0..0xx: rotate enough bits firstly, then check case a.  */
>> +  const int rot_bits = HOST_BITS_PER_WIDE_INT - clz + 1;
>> +  unsigned HOST_WIDE_INT rc = (c >> rot_bits) | (c << (clz - 1));
>> +  tz = ctz_hwi (rc);
>> +  if (clz_hwi (rc) + tz >= clz)
>> +    return tz + rot_bits;
>
> This could use some more explanation.
OK, thanks, will update.
>
>> +(define_code_iterator eqne [eq ne])
>
>
>> +;; "i == C" ==> "rotl(i,N) == rotl(C,N)"
>> +(define_insn_and_split "*rotate_on_cmpdi"
>> +  [(set (pc)
>> +     (if_then_else (eqne (match_operand:DI 1 "gpc_reg_operand" "r")
>
> Wrong indentation.  The ( should be in the same column as the preceding (
> it matches.
Oh, thanks for pointing this out! Will update.
>
> Setting "pc" to either 0 or 1 is never correct.
>
>> +  "TARGET_POWERPC64 && !reload_completed && can_create_pseudo_p ()
>
> reload_completed in splitters is almost always wrong.  It isn't any
> better if it is in the insn condition of a define_insn_and_split :-)
>
Thanks, 'can_create_pseudo_p' would be ok for this patch.
Or just FAIL, if !can_create_pseudo_p()?

>> +   && num_insns_constant (operands[2], DImode) > 1
>> +   && (rotate_from_leading_zeros_const (~UINTVAL (operands[2]), 49) > 0
>> +       || rotate_from_leading_zeros_const (UINTVAL (operands[2]), 48) > 0)"
>
> There must be a better way to describe this.
Will update this. I'm thinking to replace this with a meaning function,
maybe 'compare_rotate_immediate_p'.

>
>> +  if (rot < 0)
>> +  {
>> +     sgn = true;
>> +     rot = rotate_from_leading_zeros_const (~C, 49);
>> +  }
>
> Bad indentation.
Will update.
>
>> +  rtx cmp = ne ? gen_rtx_NE (CCmode, cc, const0_rtx)
>> +	       : gen_rtx_EQ (CCmode, cc, const0_rtx);
>
>   rtx cmp = gen_rtx_<EQNE> (...);
>
> (and define a code_attr EQNE to just output the uppercase EQ or NE).
Great! Thanks for always helpful suggestions!
>
> Why is this doing a conditional branch at all?  Unpredictable
> conditional branches are extremely costly.
This optimization needs to check whether the comparison code is ne/eq or
not.  To get the comparison code, we need to check the parent insn of
the 'cmp' insn.  This is why a conditional branch pattern is used here.

This patch should not change the information (about prediction) of the
branch insn. I'm  thinking of updating the patch to keep the 'note info
REG_BR_PROB' for the branch instruction.

I will submit the updated patch for review.

>
>> +/* { dg-require-effective-target lp64 } */
>
> arch_ppc64
OK, will update.
>
>> +/* { dg-final { scan-assembler-times "cmpldi" 10  } } */
>> +/* { dg-final { scan-assembler-times "cmpdi" 4  } } */
>> +/* { dg-final { scan-assembler-times "rotldi" 14  } } */
>
> Please use \m and \M .
OK, will update.

Thanks again!

BR,
Jeff (Jiufu)

>
> Thanks,
>
>
> Segher
  
Segher Boessenkool Aug. 24, 2022, 2:07 p.m. UTC | #4
On Wed, Aug 24, 2022 at 03:48:49PM +0800, Jiufu Guo wrote:
> Segher Boessenkool <segher@kernel.crashing.org> writes:
> >> +  "TARGET_POWERPC64 && !reload_completed && can_create_pseudo_p ()
> >
> > reload_completed in splitters is almost always wrong.  It isn't any
> > better if it is in the insn condition of a define_insn_and_split :-)
> >
> Thanks, 'can_create_pseudo_p' would be ok for this patch.
> Or just FAIL, if !can_create_pseudo_p()?

You usually can split fine if you cannot create new pseudos, by reusing
existing registers.

FAIL will cause an ICE: the RTL instruction does match, but will fail
when trying to generate machine code for it.

> >> +   && num_insns_constant (operands[2], DImode) > 1
> >> +   && (rotate_from_leading_zeros_const (~UINTVAL (operands[2]), 49) > 0
> >> +       || rotate_from_leading_zeros_const (UINTVAL (operands[2]), 48) > 0)"
> > There must be a better way to describe this.
> Will update this. I'm thinking to replace this with a meaning function,
> maybe 'compare_rotate_immediate_p'.

Thanks!

> > Why is this doing a conditional branch at all?  Unpredictable
> > conditional branches are extremely costly.
> This optimization needs to check whether the comparison code is ne/eq or
> not.  To get the comparison code, we need to check the parent insn of
> the 'cmp' insn.  This is why conditional branch patterns in used here.
> 
> This patch should not change the information (about prediction) of the
> branch insn. I'm  thinking of updating the patch to keep the 'note info
> REG_BR_PROB' for the branch instruction.

Ah, good.  Explain a bit about that?  In a code comment or in the commit
message, whichever works best here.

Thanks!


Segher
  
Jiufu Guo Aug. 25, 2022, 12:11 p.m. UTC | #5
Hi,

Segher Boessenkool <segher@kernel.crashing.org> writes:

> On Wed, Aug 24, 2022 at 03:48:49PM +0800, Jiufu Guo wrote:
>> Segher Boessenkool <segher@kernel.crashing.org> writes:
>> >> +  "TARGET_POWERPC64 && !reload_completed && can_create_pseudo_p ()
>> >
>> > reload_completed in splitters is almost always wrong.  It isn't any
>> > better if it is in the insn condition of a define_insn_and_split :-)
>> >
>> Thanks, 'can_create_pseudo_p' would be ok for this patch.
>> Or just FAIL, if !can_create_pseudo_p()?
>
> You usually can split fine if you cannot create new pseudos, by reusing
> existing registers.
>
> FAIL will cause an ICE: the RTL instruction does match, but will fail
> when trying to generate machine code for it.
>
The previous patch is using "gen_reg_rtx (DImode)" to generate a pseudo for
the rotated result to prevent the original one being changed accidentally.
So, an 'assert (can_create_pseudo_p ())' would catch it after RA.

To enable this splitter works after RA, we may need to reserve one
register (clobber would be ok).  Such as below:

  [(set (pc)
	(if_then_else (eqne (match_operand:DI 1 "gpc_reg_operand" "r")
			    (match_operand:DI 2 "const_int_operand" "n"))
		      (label_ref (match_operand 0 ""))
		      (pc)))
  (clobber (match_scratch:DI 3 "=r"))
  (clobber (match_scratch:CCUNS 4 "=y"))]
  "TARGET_POWERPC64 && num_insns_constant (operands[2], DImode) > 1
   && compare_rotate_immediate_p (UINTVAL (operands[2]))"
   "#"
   "&& 1"
  [(pc)]

>> >> +   && num_insns_constant (operands[2], DImode) > 1
>> >> +   && (rotate_from_leading_zeros_const (~UINTVAL (operands[2]), 49) > 0
>> >> +       || rotate_from_leading_zeros_const (UINTVAL (operands[2]), 48) > 0)"
>> > There must be a better way to describe this.
>> Will update this. I'm thinking to replace this with a meaning function,
>> maybe 'compare_rotate_immediate_p'.
>
> Thanks!
>
>> > Why is this doing a conditional branch at all?  Unpredictable
>> > conditional branches are extremely costly.
>> This optimization needs to check whether the comparison code is ne/eq or
>> not.  To get the comparison code, we need to check the parent insn of
>> the 'cmp' insn.  This is why conditional branch patterns in used here.
>> 
>> This patch should not change the information (about prediction) of the
>> branch insn. I'm  thinking of updating the patch to keep the 'note info
>> REG_BR_PROB' for the branch instruction.
>
> Ah, good.  Explain a bit about that?  In a code comment or in the commit
> message, whichever works best here.
>
Thanks! will add a comment for this.

BR,
Jeff(Jiufu)

> Thanks!
>
>
> Segher
  
Segher Boessenkool Aug. 25, 2022, 12:34 p.m. UTC | #6
Hi!

On Thu, Aug 25, 2022 at 08:11:31PM +0800, Jiufu Guo wrote:
> Segher Boessenkool <segher@kernel.crashing.org> writes:
> > You usually can split fine if you cannot create new pseudos, by reusing
> > existing registers.
> >
> > FAIL will cause an ICE: the RTL instruction does match, but will fail
> > when trying to generate machine code for it.
> >
> Previous patch is using "gen_reg_rtx (DImode)" to generate a pseudo for
> the rotated result to prevent orignal one being changed accidently.
> So, an 'assert (can_create_pseudo_p ())' would catch it in after RA.

It sounds like you want a define_split, not a define_insn_and_split.
That is much more stomachable anyway.

Anything that creates conditional branches together with compars insns
belongs before RA, before sched1 even.

> To enable this splitter works after RA, we may need to reserve one
> register (clobber would be ok).  Such as below:
> 
>   [(set (pc)
> 	(if_then_else (eqne (match_operand:DI 1 "gpc_reg_operand" "r")
> 			    (match_operand:DI 2 "const_int_operand" "n"))
> 		      (label_ref (match_operand 0 ""))
> 		      (pc)))
>   (clobber (match_scratch:DI 3 "=r"))
>   (clobber (match_scratch:CCUNS 4 "=y"))]

Yes, that is one way to do it.  Another way is to reuse operand 1.  A
clobber is probably better in this case though :-)

If this is only so combine can match things, you certainly want just a
define_split, and the compare+branch in one pattern is not as bad then.


Segher
  
Jiufu Guo Aug. 26, 2022, 9:28 a.m. UTC | #7
Hi,

Segher Boessenkool <segher@kernel.crashing.org> writes:

> Hi!
>
> On Thu, Aug 25, 2022 at 08:11:31PM +0800, Jiufu Guo wrote:
>> Segher Boessenkool <segher@kernel.crashing.org> writes:
>> > You usually can split fine if you cannot create new pseudos, by reusing
>> > existing registers.
>> >
>> > FAIL will cause an ICE: the RTL instruction does match, but will fail
>> > when trying to generate machine code for it.
>> >
>> Previous patch is using "gen_reg_rtx (DImode)" to generate a pseudo for
>> the rotated result to prevent orignal one being changed accidently.
>> So, an 'assert (can_create_pseudo_p ())' would catch it in after RA.
>
> It sounds like you want a define_split, not a define_insn_and_split.
> That is much more stomachable anyway.
>
Thanks for pointing out this!

As you mentioned, since it is only 'combine pass' that can match the
patterns, it would be better to just a define_split.  While I tried to
use this way, like:

(define_split
  [(set (pc)
        (if_then_else (eqne (match_operand:DI 1 "gpc_reg_operand")
                            (match_operand:DI 2 "const_int_operand"))
                      (label_ref (match_operand 0))
                      (pc)))]
  "TARGET_POWERPC64 && num_insns_constant (operands[2], DImode) > 1
   && compare_rotate_immediate_p (UINTVAL (operands[2]))"
  [(pc)]

But this does not work.  With more debugging, it seems that
"combine_split_insns/split_insns" returns correctly with a sequence of
three insns.   But after return, only less than two insns can be
handled.  Just as the code comment:
     If we were combining three insns and the result is a simple SET
     with no ASM_OPERANDS that wasn't recognized, try to split it into two
     insns.

then, that 'define_split' fail to keep the result.


In the patch, for 'define_insn_and_split', it is handled as the
process:
In 'combine' pass, the new defined insns "rotate_on_cmpdi" is combined
from three instructions; 
And then, in the 'split1' pass, it was split into other three insns.


> Anything that creates conditional branches together with compars insns
> belongs before RA, before sched1 even.
>
For this patch, it would run in 'split1' mostly.  The good thing is
'split1' is before sched1. :-)

>> To enable this splitter works after RA, we may need to reserve one
>> register (clobber would be ok).  Such as below:
>> 
>>   [(set (pc)
>> 	(if_then_else (eqne (match_operand:DI 1 "gpc_reg_operand" "r")
>> 			    (match_operand:DI 2 "const_int_operand" "n"))
>> 		      (label_ref (match_operand 0 ""))
>> 		      (pc)))
>>   (clobber (match_scratch:DI 3 "=r"))
>>   (clobber (match_scratch:CCUNS 4 "=y"))]
>
> Yes, that is one way to do it.  Another way is to reuse operand 1.  A
> clobber is probably better in this case though :-)
Yes, a clobber would be better -:)  For example:
If %3 is used later, it would not be safe to change:
"%3:DI!=0x8642000000000000"==>"%3:DI=%3DI<-15, %3:DI!=0x4321"

>
> If this is only so combine can match things, you certainly want just a
> define_split, and the compare+branch in one pattern is not as bad
> then.
As in the above comments, since I failed to use 'define_split',
'define_insn_and_split' is used in the patch. :(


BR,
Jeff(Jiufu)

>
>
> Segher
  

Patch

diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index 3ea01023609..bc7f6ff64ef 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -35,6 +35,7 @@  extern bool xxspltib_constant_p (rtx, machine_mode, int *, int *);
 extern int vspltis_shifted (rtx);
 extern HOST_WIDE_INT const_vector_elt_as_int (rtx, unsigned int);
 extern bool macho_lo_sum_memory_operand (rtx, machine_mode);
+extern int rotate_from_leading_zeros_const (unsigned HOST_WIDE_INT, int);
 extern int num_insns_constant (rtx, machine_mode);
 extern int small_data_operand (rtx, machine_mode);
 extern bool mem_operand_gpr (rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 3ff16b8ae04..9c0996603d0 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -14861,6 +14861,35 @@  rs6000_reverse_condition (machine_mode mode, enum rtx_code code)
     return reverse_condition (code);
 }
 
+/* Check if C can be rotated from an immediate which contains leading
+   zeros at least CLZ.
+
+   Return the number by which C can be rotated from the immediate.
+   Return -1 if C can not be rotated as from.  */
+
+int
+rotate_from_leading_zeros_const (unsigned HOST_WIDE_INT c, int clz)
+{
+  /* case. 0..0xxx: already at least clz zeros.  */
+  int lz = clz_hwi (c);
+  if (lz >= clz)
+    return 0;
+
+  /* case a. 0..0xxx0..0: at least clz zeros.  */
+  int tz = ctz_hwi (c);
+  if (lz + tz >= clz)
+    return tz;
+
+  /* xx0..0xx: rotate enough bits firstly, then check case a.  */
+  const int rot_bits = HOST_BITS_PER_WIDE_INT - clz + 1;
+  unsigned HOST_WIDE_INT rc = (c >> rot_bits) | (c << (clz - 1));
+  tz = ctz_hwi (rc);
+  if (clz_hwi (rc) + tz >= clz)
+    return tz + rot_bits;
+
+  return -1;
+}
+
 /* Generate a compare for CODE.  Return a brand-new rtx that
    represents the result of the compare.  */
 
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 1367a2cb779..603781b29aa 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7720,6 +7720,59 @@  (define_insn "*movsi_from_df"
   "xscvdpsp %x0,%x1"
   [(set_attr "type" "fp")])
 
+
+(define_code_iterator eqne [eq ne])
+
+;; "i == C" ==> "rotl(i,N) == rotl(C,N)"
+(define_insn_and_split "*rotate_on_cmpdi"
+  [(set (pc)
+     (if_then_else (eqne (match_operand:DI 1 "gpc_reg_operand" "r")
+			 (match_operand:DI 2 "const_int_operand" "n"))
+			 (label_ref (match_operand 0 ""))
+		   (pc)))]
+  "TARGET_POWERPC64 && !reload_completed && can_create_pseudo_p ()
+   && num_insns_constant (operands[2], DImode) > 1
+   && (rotate_from_leading_zeros_const (~UINTVAL (operands[2]), 49) > 0
+       || rotate_from_leading_zeros_const (UINTVAL (operands[2]), 48) > 0)"
+   "#"
+   "&& 1"
+  [(pc)]
+{
+  rtx cnd = XEXP (SET_SRC (single_set (curr_insn)), 0);
+  bool ne = GET_CODE (cnd) == NE;
+
+  bool sgn = false;
+  unsigned HOST_WIDE_INT C = INTVAL (operands[2]);
+  int rot = rotate_from_leading_zeros_const (C, 48);
+  if (rot < 0)
+  {
+     sgn = true;
+     rot = rotate_from_leading_zeros_const (~C, 49);
+  }
+  rtx n = GEN_INT (HOST_BITS_PER_WIDE_INT - rot);
+
+  /* i' = rotl (i, n) */
+  rtx op0 = gen_reg_rtx (DImode);
+  emit_insn (gen_rtx_SET (op0, gen_rtx_ROTATE (DImode, operands[1], n)));
+
+  /* C' = rotl (C, n) */
+  rtx op1 = GEN_INT ((C << INTVAL (n)) | (C >> rot));
+
+  /* i' ==  C' */
+  machine_mode comp_mode = sgn ? CCmode : CCUNSmode;
+  rtx cc = gen_reg_rtx (comp_mode);
+  emit_insn (gen_rtx_SET (cc, gen_rtx_COMPARE (comp_mode, op0, op1)));
+
+  rtx cmp = ne ? gen_rtx_NE (CCmode, cc, const0_rtx)
+	       : gen_rtx_EQ (CCmode, cc, const0_rtx);
+  rtx loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[0]);
+  emit_jump_insn (gen_rtx_SET (pc_rtx,
+			       gen_rtx_IF_THEN_ELSE (VOIDmode, cmp,
+						     loc_ref, pc_rtx)));
+  DONE;
+}
+)
+
 ;; Split a load of a large constant into the appropriate two-insn
 ;; sequence.
 
@@ -13426,7 +13479,6 @@  (define_expand "@ctr<mode>"
 ;; rs6000_legitimate_combined_insn prevents combine creating any of
 ;; the ctr<mode> insns.
 
-(define_code_iterator eqne [eq ne])
 (define_code_attr bd [(eq "bdz") (ne "bdnz")])
 (define_code_attr bd_neg [(eq "bdnz") (ne "bdz")])
 
diff --git a/gcc/testsuite/gcc.target/powerpc/pr103743.c b/gcc/testsuite/gcc.target/powerpc/pr103743.c
new file mode 100644
index 00000000000..e18833685fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr103743.c
@@ -0,0 +1,52 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-times "cmpldi" 10  } } */
+/* { dg-final { scan-assembler-times "cmpdi" 4  } } */
+/* { dg-final { scan-assembler-times "rotldi" 14  } } */
+
+int foo (int a);
+
+int __attribute__ ((noinline)) udi_fun (unsigned long long in)
+{
+  if (in == (0x8642000000000000ULL))
+    return foo (1);
+  if (in == (0x7642000000000000ULL))
+    return foo (12);
+  if (in == (0x8000000000000000ULL))
+    return foo (32);
+  if (in == (0x8000000000000001ULL))
+    return foo (33);
+  if (in == (0x8642FFFFFFFFFFFFULL))
+    return foo (46);
+  if (in == (0x7642FFFFFFFFFFFFULL))
+    return foo (51);
+  if (in == (0x7567000000ULL))
+    return foo (9);
+  if (in == (0xFFF8567FFFFFFFFFULL))
+    return foo (19);
+
+  return 0;
+}
+
+int __attribute__ ((noinline)) di_fun (long long in)
+{
+  if (in == (0x8642000000000000LL))
+    return foo (1);
+  if (in == (0x7642000000000000LL))
+    return foo (12);
+  if (in == (0x8000000000000000LL))
+    return foo (32);
+  if (in == (0x8000000000000001LL))
+    return foo (33);
+  if (in == (0x8642FFFFFFFFFFFFLL))
+    return foo (46);
+  if (in == (0x7642FFFFFFFFFFFFLL))
+    return foo (51);
+  if (in == (0x7567000000LL))
+    return foo (9);
+  if (in == (0xFFF8567FFFFFFFFFLL))
+    return foo (19);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr103743_1.c b/gcc/testsuite/gcc.target/powerpc/pr103743_1.c
new file mode 100644
index 00000000000..2c08c56714a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr103743_1.c
@@ -0,0 +1,95 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -std=c99" } */
+
+int
+foo (int a)
+{
+  return a + 6;
+}
+
+int __attribute__ ((noinline)) udi_fun (unsigned long long in)
+{
+  if (in == (0x8642000000000000ULL))
+    return foo (1);
+  if (in == (0x7642000000000000ULL))
+    return foo (12);
+  if (in == (0x8000000000000000ULL))
+    return foo (32);
+  if (in == (0x8000000000000001ULL))
+    return foo (33);
+  if (in == (0x8642FFFFFFFFFFFFULL))
+    return foo (46);
+  if (in == (0x7642FFFFFFFFFFFFULL))
+    return foo (51);
+  if (in == (0x7567000000ULL))
+    return foo (9);
+  if (in == (0xFFF8567FFFFFFFFFULL))
+    return foo (19);
+
+  return 0;
+}
+
+int __attribute__ ((noinline)) di_fun (long long in)
+{
+  if (in == (0x8642000000000000LL))
+    return foo (1);
+  if (in == (0x7642000000000000LL))
+    return foo (12);
+  if (in == (0x8000000000000000LL))
+    return foo (32);
+  if (in == (0x8000000000000001LL))
+    return foo (33);
+  if (in == (0x8642FFFFFFFFFFFFLL))
+    return foo (46);
+  if (in == (0x7642FFFFFFFFFFFFLL))
+    return foo (51);
+  return 0;
+}
+
+int
+main ()
+{
+  int e = 0;
+  if (udi_fun (6) != 0)
+    e++;
+  if (udi_fun (0x8642000000000000ULL) != foo (1))
+    e++;
+  if (udi_fun (0x7642000000000000ULL) != foo (12))
+    e++;
+  if (udi_fun (0x8000000000000000ULL) != foo (32))
+    e++;
+  if (udi_fun (0x8000000000000001ULL) != foo (33))
+    e++;
+  if (udi_fun (0x8642FFFFFFFFFFFFULL) != foo (46))
+    e++;
+  if (udi_fun (0x7642FFFFFFFFFFFFULL) != foo (51))
+    e++;
+  if (udi_fun (0x7567000000ULL) != foo (9))
+    e++;
+  if (udi_fun (0xFFF8567FFFFFFFFFULL) != foo (19))
+    e++;
+
+  if (di_fun (6) != 0)
+    e++;
+  if (di_fun (0x8642000000000000LL) != foo (1))
+    e++;
+  if (di_fun (0x7642000000000000LL) != foo (12))
+    e++;
+  if (di_fun (0x8000000000000000LL) != foo (32))
+    e++;
+  if (di_fun (0x8000000000000001LL) != foo (33))
+    e++;
+  if (di_fun (0x8642FFFFFFFFFFFFLL) != foo (46))
+    e++;
+  if (di_fun (0x7642FFFFFFFFFFFFLL) != foo (51))
+    e++;
+  if (udi_fun (0x7567000000LL) != foo (9))
+    e++;
+  if (udi_fun (0xFFF8567FFFFFFFFFLL) != foo (19))
+    e++;
+
+  if (e)
+    __builtin_abort ();
+  return 0;
+}
+