[V2] rs6000: Store complicated constant into pool

Message ID 20220614132355.117887-1-guojiufu@linux.ibm.com
State New

Commit Message

Jiufu Guo June 14, 2022, 1:23 p.m. UTC
  Hi,

This patch is nearly the same as:
https://gcc.gnu.org/pipermail/gcc-patches/2022-May/595378.html

The concept of this patch is similar to that of the patches
attached to PR63281, e.g.
https://gcc.gnu.org/bugzilla/attachment.cgi?id=42186

I ran a test with perlbench from SPEC2017.  As expected, at -O2 on P9
there is a ~2% performance improvement with this patch.

This patch reduces the instruction-count threshold for storing a
constant in the constant pool and updates the costs of constants and
memory accesses.  With it, if building a constant needs more than 2
instructions (or more than 1 instruction on P10), we prefer to load it
from the constant pool.
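
As a small illustration (this is essentially the new pr63281.c test
added below; the instruction sequences mentioned here are what I would
expect, not a guarantee), a constant like the one in this function was
previously built with a chain of li/oris/sldi/oris/ori style
instructions, while with this patch it is loaded from the constant
pool instead (ld, or pld when prefixed insns are available):

void
foo (unsigned long long *a)
{
  *a = 0x020805006106003;
}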

Bootstrapped and regtested on ppc64le and ppc64.
Is this ok for trunk?  Thanks for comments and suggestions.


BR,
Jiufu


	PR target/63281

gcc/ChangeLog:
2022-06-14  Jiufu Guo  <guojiufu@linux.ibm.com>
	    Alan Modra <amodra@gmail.com>

	* config/rs6000/rs6000.cc (rs6000_cannot_force_const_mem):
	Exclude rtx with code 'HIGH'.
	(rs6000_emit_move): Update threshold of const insn.
	(rs6000_rtx_costs): Update cost of constant and mem.

gcc/testsuite/ChangeLog:
2022-06-14  Jiufu Guo  <guojiufu@linux.ibm.com>
	    Alan Modra <amodra@gmail.com>

	* gcc.target/powerpc/medium_offset.c: Update.
	* gcc.target/powerpc/pr93012.c: Update.
	* gcc.target/powerpc/pr63281.c: New test.


---
 gcc/config/rs6000/rs6000.cc                   | 23 +++++++++++++++----
 .../gcc.target/powerpc/medium_offset.c        |  2 +-
 gcc/testsuite/gcc.target/powerpc/pr63281.c    | 11 +++++++++
 gcc/testsuite/gcc.target/powerpc/pr93012.c    |  2 +-
 4 files changed, 31 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr63281.c
  

Comments

Segher Boessenkool June 14, 2022, 10:34 p.m. UTC | #1
Hi!

On Tue, Jun 14, 2022 at 09:23:55PM +0800, Jiufu Guo wrote:
> This patch reduces the threshold of instruction number for storing
> constant to pool and update cost for constant and mem accessing.
> And then if building the constant needs more than 2 instructions (or
> more than 1 instruction on P10), then prefer to load it from constant
> pool.

Have you tried with different limits?  And, p10 is a red herring, you
actually test if prefixed insns are used.

> 	* config/rs6000/rs6000.cc (rs6000_cannot_force_const_mem):
> 	Exclude rtx with code 'HIGH'.

> --- a/gcc/config/rs6000/rs6000.cc
> +++ b/gcc/config/rs6000/rs6000.cc
> @@ -9706,8 +9706,9 @@ rs6000_init_stack_protect_guard (void)
>  static bool
>  rs6000_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
>  {
> -  if (GET_CODE (x) == HIGH
> -      && GET_CODE (XEXP (x, 0)) == UNSPEC)
> +  /* Exclude CONSTANT HIGH part.  e.g.
> +     (high:DI (symbol_ref:DI ("var") [flags 0xc0] <var_decl>)).  */
> +  if (GET_CODE (x) == HIGH)
>      return true;

So, why is this?  You didn't explain.

> @@ -11139,7 +11140,7 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode mode)
>  		    && FP_REGNO_P (REGNO (operands[0])))
>  		   || !CONST_INT_P (operands[1])
>  		   || (num_insns_constant (operands[1], mode)
> -		       > (TARGET_CMODEL != CMODEL_SMALL ? 3 : 2)))
> +		       > (TARGET_PREFIXED ? 1 : 2)))
>  	       && !toc_relative_expr_p (operands[1], false, NULL, NULL)
>  	       && (TARGET_CMODEL == CMODEL_SMALL
>  		   || can_create_pseudo_p ()

This is the more obvious part.

> @@ -22101,6 +22102,14 @@ rs6000_rtx_costs (rtx x, machine_mode mode, int outer_code,
>  
>      case CONST_DOUBLE:
>      case CONST_WIDE_INT:
> +      /* It may needs a few insns for const to SET. -1 for outer SET code.  */
> +      if (outer_code == SET)
> +	{
> +	  *total = COSTS_N_INSNS (num_insns_constant (x, mode)) - 1;
> +	  return true;
> +	}
> +      /* FALLTHRU */
> +
>      case CONST:
>      case HIGH:
>      case SYMBOL_REF:

But this again isn't an obvious improvement at all, and needs
performance testing -- separately of the other changes.

> @@ -22110,8 +22119,12 @@ rs6000_rtx_costs (rtx x, machine_mode mode, int outer_code,
>      case MEM:
>        /* When optimizing for size, MEM should be slightly more expensive
>  	 than generating address, e.g., (plus (reg) (const)).
> -	 L1 cache latency is about two instructions.  */
> -      *total = !speed ? COSTS_N_INSNS (1) + 1 : COSTS_N_INSNS (2);
> +	 L1 cache latency is about two instructions.
> +	 For prefixed load (pld), we would set it slightly faster than
> +	 than two instructions. */
> +      *total = !speed
> +		 ? COSTS_N_INSNS (1) + 1
> +		 : TARGET_PREFIXED ? COSTS_N_INSNS (2) - 1 : COSTS_N_INSNS (2);
>        if (rs6000_slow_unaligned_access (mode, MEM_ALIGN (x)))
>  	*total += COSTS_N_INSNS (100);
>        return true;

And this is completely independent of the rest as well.  Cost 5 or 7 are
completely random numbers, why did you pick these?  Does it work better
than 8, or 4, etc.?


> --- a/gcc/testsuite/gcc.target/powerpc/medium_offset.c
> +++ b/gcc/testsuite/gcc.target/powerpc/medium_offset.c
> @@ -1,7 +1,7 @@
>  /* { dg-do compile { target { powerpc*-*-* } } } */
>  /* { dg-require-effective-target lp64 } */
>  /* { dg-options "-O" } */
> -/* { dg-final { scan-assembler-not "\\+4611686018427387904" } } */
> +/* { dg-final { scan-assembler-times {\msldi|pld\M} 1 } } */

Why?  This is still better generated in code, no?  It should never be
loaded from a constant pool (it is hex 4000_0000_0000_0000, easy to
construct with just one or two insns).

> --- a/gcc/testsuite/gcc.target/powerpc/pr93012.c
> +++ b/gcc/testsuite/gcc.target/powerpc/pr93012.c
> @@ -10,4 +10,4 @@ unsigned long long mskh1() { return 0xffff9234ffff9234ULL; }
>  unsigned long long mskl1() { return 0x2bcdffff2bcdffffULL; }
>  unsigned long long mskse() { return 0xffff1234ffff1234ULL; }
>  
> -/* { dg-final { scan-assembler-times {\mrldimi\M} 7 } } */
> +/* { dg-final { scan-assembler-times {\mrldimi|ld|pld\M} 7 } } */

Please make this the exact number of times you want to see rldimi and
the number of times you want a load.

Btw, you need to write
  \m(?:rldimi|ld|pld)\M
or it will mean
  \mrldimi
or
  ld
or
  pld\M
(and that "ld" will match anything that "pld$" will match of course).


So no doubt this will improve things, but we need testing of each part
separately.  Also look at code size, or differences in the generated
code in general: this is much more sensitive to detect than performance,
and is not itself sensitive to things like system load, so a) is easier
to measure, and b) has more useful outputs, outputs that tell more of
the whole story.


Segher
  
Jiufu Guo June 15, 2022, 8:19 a.m. UTC | #2
Segher Boessenkool <segher@kernel.crashing.org> writes:

Hi Segher,

Thanks so much for your review and suggestions!

> Hi!
>
> On Tue, Jun 14, 2022 at 09:23:55PM +0800, Jiufu Guo wrote:
>> This patch reduces the threshold of instruction number for storing
>> constant to pool and update cost for constant and mem accessing.
>> And then if building the constant needs more than 2 instructions (or
>> more than 1 instruction on P10), then prefer to load it from constant
>> pool.
>
> Have you tried with different limits?  And, p10 is a red herring, you
> actually test if prefixed insns are used.

I drafted test cases (C code, and hand-updated asm code) to measure the
runtime costs of different code sequences: building constants with 5
insns, with 3 insns, and with 2 insns; and loading from rodata.
'pld' is the instruction we care about rather than the p10 target :-).
So in the patch, 'TARGET_PREFIXED' is tested.

>
>> 	* config/rs6000/rs6000.cc (rs6000_cannot_force_const_mem):
>> 	Exclude rtx with code 'HIGH'.
>
>> --- a/gcc/config/rs6000/rs6000.cc
>> +++ b/gcc/config/rs6000/rs6000.cc
>> @@ -9706,8 +9706,9 @@ rs6000_init_stack_protect_guard (void)
>>  static bool
>>  rs6000_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
>>  {
>> -  if (GET_CODE (x) == HIGH
>> -      && GET_CODE (XEXP (x, 0)) == UNSPEC)
>> +  /* Exclude CONSTANT HIGH part.  e.g.
>> +     (high:DI (symbol_ref:DI ("var") [flags 0xc0] <var_decl>)).  */
>> +  if (GET_CODE (x) == HIGH)
>>      return true;
>
> So, why is this?  You didn't explain.

In the function rs6000_cannot_force_const_mem, we already exclude
"HIGH with UNSPEC", like "high:P (unspec:P [symbol_ref..".
I find that "high:DI (symbol_ref:DI" is similar and may also need to be
excluded.  The high half of an address is invalid for the constant pool.

Or maybe we could use the code below to exclude it.
  /* The high part of a symbol_ref cannot be put into the constant pool.  */
  if (GET_CODE (x) == HIGH
      && (GET_CODE (XEXP (x, 0)) == UNSPEC || SYMBOL_REF_P (XEXP (x, 0))))

One interesting thing is how the rtx "(high:DI (symbol_ref:DI (var" is
generated.  It seems that in the middle of an optimization pass it is
checked to see whether it is ok to store it in constant memory.
  
>
>> @@ -11139,7 +11140,7 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode mode)
>>  		    && FP_REGNO_P (REGNO (operands[0])))
>>  		   || !CONST_INT_P (operands[1])
>>  		   || (num_insns_constant (operands[1], mode)
>> -		       > (TARGET_CMODEL != CMODEL_SMALL ? 3 : 2)))
>> +		       > (TARGET_PREFIXED ? 1 : 2)))
>>  	       && !toc_relative_expr_p (operands[1], false, NULL, NULL)
>>  	       && (TARGET_CMODEL == CMODEL_SMALL
>>  		   || can_create_pseudo_p ()
>
> This is the more obvious part.
>
>> @@ -22101,6 +22102,14 @@ rs6000_rtx_costs (rtx x, machine_mode mode, int outer_code,
>>  
>>      case CONST_DOUBLE:
>>      case CONST_WIDE_INT:
>> +      /* It may needs a few insns for const to SET. -1 for outer SET code.  */
>> +      if (outer_code == SET)
>> +	{
>> +	  *total = COSTS_N_INSNS (num_insns_constant (x, mode)) - 1;
>> +	  return true;
>> +	}
>> +      /* FALLTHRU */
>> +
>>      case CONST:
>>      case HIGH:
>>      case SYMBOL_REF:
>
> But this again isn't an obvious improvement at all, and needs
> performance testing -- separately of the other changes.

This code updates the cost of assigning a constant; we need this to
keep CSE from replacing the constant load with a constant-building
sequence.
The above change (the threshold for storing the constant in the pool)
depends on this code.
This part does not depend on the other parts, so it could be tested and
tuned separately.
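
Purely as an illustration (the constant here is the one from the new
test case, and the insn count is a rough assumption, not measured),
these are roughly the two SET sources whose rtx costs CSE ends up
comparing:

  (set (reg:DI 9) (const_int 0x020805006106003)) ;; build: ~COSTS_N_INSNS (5) - 1
  (set (reg:DI 9) (mem:DI (...)))                ;; load: the MEM cost below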

>
>> @@ -22110,8 +22119,12 @@ rs6000_rtx_costs (rtx x, machine_mode mode, int outer_code,
>>      case MEM:
>>        /* When optimizing for size, MEM should be slightly more expensive
>>  	 than generating address, e.g., (plus (reg) (const)).
>> -	 L1 cache latency is about two instructions.  */
>> -      *total = !speed ? COSTS_N_INSNS (1) + 1 : COSTS_N_INSNS (2);
>> +	 L1 cache latency is about two instructions.
>> +	 For prefixed load (pld), we would set it slightly faster than
>> +	 than two instructions. */
>> +      *total = !speed
>> +		 ? COSTS_N_INSNS (1) + 1
>> +		 : TARGET_PREFIXED ? COSTS_N_INSNS (2) - 1 : COSTS_N_INSNS (2);
>>        if (rs6000_slow_unaligned_access (mode, MEM_ALIGN (x)))
>>  	*total += COSTS_N_INSNS (100);
>>        return true;
>
> And this is completely independent of the rest as well.  Cost 5 or 7 are
> completely random numbers, why did you pick these?  Does it work better
> than 8, or 4, etc.?
For 'pld', the asm code would be "pld %r9,.LC0@pcrel"; in my tests it is
faster than 2 insns (like "li+oris"), but still slower than one
instruction (e.g. li, pli).  So "COSTS_N_INSNS (2) - 1" is selected.
And with this, CSE will not replace 'pld' with 'li + oris'.
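
For reference, since COSTS_N_INSNS (N) expands to N * 4, the values
being discussed work out to:

  COSTS_N_INSNS (1)     = 4   /* one simple insn, e.g. li or pli  */
  COSTS_N_INSNS (2) - 1 = 7   /* what the patch assigns to pld    */
  COSTS_N_INSNS (2)     = 8   /* two simple insns, e.g. li + oris */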

>
>
>> --- a/gcc/testsuite/gcc.target/powerpc/medium_offset.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/medium_offset.c
>> @@ -1,7 +1,7 @@
>>  /* { dg-do compile { target { powerpc*-*-* } } } */
>>  /* { dg-require-effective-target lp64 } */
>>  /* { dg-options "-O" } */
>> -/* { dg-final { scan-assembler-not "\\+4611686018427387904" } } */
>> +/* { dg-final { scan-assembler-times {\msldi|pld\M} 1 } } */
>
> Why?  This is still better generated in code, no?  It should never be
> loaded from a constant pool (it is hex 4000_0000_0000_0000, easy to
> construct with just one or two insns).

For p8/9, two insns ("lis 0x4000" + "sldi 32") are used:
        addis %r3,%r2,.LANCHOR0@toc@ha
        addi %r3,%r3,.LANCHOR0@toc@l
        lis %r9,0x4000
        sldi %r9,%r9,32
        add %r3,%r3,%r9
	blr

On p10, as expected, 'pld' would be better than 'lis+sldi'.
And for this case, it also saves other instructions (addis/addi):
        pld %r3,.LC1@pcrel
        blr
....
.LC1:
     	.quad   .LANCHOR0+4611686018427387904
....
        .set    .LANCHOR0,. + 0
        .type   x, @object
        .size   x, 4
x:
        .zero   4
        
>
>> --- a/gcc/testsuite/gcc.target/powerpc/pr93012.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr93012.c
>> @@ -10,4 +10,4 @@ unsigned long long mskh1() { return 0xffff9234ffff9234ULL; }
>>  unsigned long long mskl1() { return 0x2bcdffff2bcdffffULL; }
>>  unsigned long long mskse() { return 0xffff1234ffff1234ULL; }
>>  
>> -/* { dg-final { scan-assembler-times {\mrldimi\M} 7 } } */
>> +/* { dg-final { scan-assembler-times {\mrldimi|ld|pld\M} 7 } } */
>
> Please make this the exact number of times you want to see rldimi and
> the number of times you want a load.
>
> Btw, you need to write
>   \m(?:rldimi|ld|pld)\M
Oh, thanks!  I was not aware of this.

> or it will mean
>   \mrldimi
> or
>   ld
> or
>   pld\M
> (and that "ld" will match anything that "pld$" will match of course).
Maybe I should separate this out into two cases, one for
"-mdejagnu-cpu=power10" and one for "-mdejagnu-cpu=power7", because the
number of load instructions would be different.

>
>
> So no doubt this will improve things, but we need testing of each part
> separately.  Also look at code size, or differences in the generated
> code in general: this is much more sensitive to detect than performance,
> and is not itself sensitive to things like system load, so a) is easier
> to measure, and b) has more useful outputs, outputs that tell more of
> the whole story.
Thanks for your suggestions!
I also feel it is very sensitive when I test performance.  The change
(e.g. the cost of constants) may affect other optimization passes
indirectly.  I will also check the object-code changes (object size or
differences).  I may use objects from the GCC bootstrap and the GCC
test suite.

BR,
Jiufu Guo

>
>
> Segher
  
Segher Boessenkool June 15, 2022, 5:44 p.m. UTC | #3
Hi!

On Wed, Jun 15, 2022 at 04:19:41PM +0800, Jiufu Guo wrote:
> Segher Boessenkool <segher@kernel.crashing.org> writes:
> > Have you tried with different limits?
> I drafted cases(C code, and updated asm code) to test runtime costs for
> different code sequences:  building constants with 5 insns; with 3
> insns and 2 insns; and loading from rodata.

I'm not asking about trivial examples here.  Have you tried the
resulting compiler, on some big real-life code, with various choices for
these essentially random choices, on different cpus?

Huge changes like this need quite a bit of experimental data to back it
up, or some convincing other evidence.  That is the only way to move
forward: anything else will resemble Brownian motion :-(

> >  And, p10 is a red herring, you
> > actually test if prefixed insns are used.
> 
> 'pld' is the instruction which we care about instead target p10 :-). So
> in patch, 'TARGET_PREFIXED' is tested.

Huh.  I would think "pli" is the most important thing here!  Which is
not a load instruction at all, notwithstanding its name.

> >> --- a/gcc/config/rs6000/rs6000.cc
> >> +++ b/gcc/config/rs6000/rs6000.cc
> >> @@ -9706,8 +9706,9 @@ rs6000_init_stack_protect_guard (void)
> >>  static bool
> >>  rs6000_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
> >>  {
> >> -  if (GET_CODE (x) == HIGH
> >> -      && GET_CODE (XEXP (x, 0)) == UNSPEC)
> >> +  /* Exclude CONSTANT HIGH part.  e.g.
> >> +     (high:DI (symbol_ref:DI ("var") [flags 0xc0] <var_decl>)).  */
> >> +  if (GET_CODE (x) == HIGH)
> >>      return true;
> >
> > So, why is this?  You didn't explain.
> 
> In the function rs6000_cannot_force_const_mem, we already excluded
> "HIGH with UNSPEC" like: "high:P (unspec:P [symbol_ref..".
> I find, "high:DI (symbol_ref:DI" is similar, which may also need to
> exclude.  Half high part of address would be invalid for constant pool.
> 
> Or maybe, we could use below code to exclude it.
>   /* high part of a symbol_ref could not be put into constant pool.  */
>   if (GET_CODE (x) == HIGH
>       && (GET_CODE (XEXP (x, 0)) == UNSPEC || SYMBOL_REF_P (XEXP (x, 0))))
> 
> One interesting thing is how the rtx "(high:DI (symbol_ref:DI (var" is
> generated.  It seems in the middle of optimization, it is checked to
> see if ok to store in constant memory.

It probably is fine to not allow the HIGH of *anything* to be put in a
constant pool, sure.  But again, that is a change independent of other
things, and it could use some supporting evidence.

> >>      case CONST_DOUBLE:
> >>      case CONST_WIDE_INT:
> >> +      /* It may needs a few insns for const to SET. -1 for outer SET code.  */
> >> +      if (outer_code == SET)
> >> +	{
> >> +	  *total = COSTS_N_INSNS (num_insns_constant (x, mode)) - 1;
> >> +	  return true;
> >> +	}
> >> +      /* FALLTHRU */
> >> +
> >>      case CONST:
> >>      case HIGH:
> >>      case SYMBOL_REF:
> >
> > But this again isn't an obvious improvement at all, and needs
> > performance testing -- separately of the other changes.
> 
> This code updates the cost of constant assigning, we need this to avoid
> CSE to replace the constant loading with constant assigning.

No.  This is rtx_cost, which makes a cost estimate of random RTL, not
typically corresponding to existing RTL insns at all.  We do not use it
a lot anymore thankfully (we use insn_cost in most places), but things
like CSE still use it.

> The above change (the threshold for storing const in the pool) depends
> on this code.

Yes, and a gazillion other places as well still (or about 50 really :-) )

> > And this is completely independent of the rest as well.  Cost 5 or 7 are
> > completely random numbers, why did you pick these?  Does it work better
> > than 8, or 4, etc.?
> For 'pld', asm code would be "pld %r9,.LC0@pcrel"; as tests, it is
> faster than 2insns (like, "li+oris"), but still slower than one
> instruction (e.g. li, pli).  So, "COSTS_N_INSNS (2) - 1" is selected.
> And with this, cse will not replace 'pld' with 'li + oris'. 

(That is WinNT assembler syntax btw.  We never use that.  We write
either "pld 9,.LC0@pcrel" or "pld r9,.LC0@pcrel".)

COSTS_N_INSNS(1) means 4, always.  The costs are not additive at all in
reality of course, so you cannot simply add them and get any sensible
result :-(

Why did you choose 7?  Your explanation makes 5 and 6 good candidates
as well.  My money is on 6 performing best here, but there is no way to
know without actually trying out things.

> >> --- a/gcc/testsuite/gcc.target/powerpc/medium_offset.c
> >> +++ b/gcc/testsuite/gcc.target/powerpc/medium_offset.c
> >> @@ -1,7 +1,7 @@
> >>  /* { dg-do compile { target { powerpc*-*-* } } } */
> >>  /* { dg-require-effective-target lp64 } */
> >>  /* { dg-options "-O" } */
> >> -/* { dg-final { scan-assembler-not "\\+4611686018427387904" } } */
> >> +/* { dg-final { scan-assembler-times {\msldi|pld\M} 1 } } */
> >
> > Why?  This is still better generated in code, no?  It should never be
> > loaded from a constant pool (it is hex 4000_0000_0000_0000, easy to
> > construct with just one or two insns).
> 
> For p8/9, two insns "lis 0x4000+sldi 32" are used:
>         addis %r3,%r2,.LANCHOR0@toc@ha
>         addi %r3,%r3,.LANCHOR0@toc@l
>         lis %r9,0x4000
>         sldi %r9,%r9,32
>         add %r3,%r3,%r9
> 	blr

That does not mean putting this constant in the constant pool is a good
idea at all, of course.

> On p10, as expected, 'pld' would be better than 'lis+sldi'.

Is it?

> And for this case, it also saves other instructions(addis/addi):
>         pld %r3,.LC1@pcrel
>         blr

This explanation then belongs in your commit message :-)

> > Btw, you need to write
> >   \m(?:rldimi|ld|pld)\M
> Oh, thanks!  I did not aware of this.

(?:xxx) is just the same as (xxx), but the parentheses in the latter are
capturing, which messes up scan-assembler-times, so you need
non-capturing grouping.

> Maybe I should separate out two cases one for "-mdejagnu-cpu=power10",
> and one for "-mdejagnu-cpu=power7", because the load instructions
> the number would be different.

Yeah, it is probably best to never allow both a load and non-load insns
in the same check.

> > So no doubt this will improve things, but we need testing of each part
> > separately.  Also look at code size, or differences in the generated
> > code in general: this is much more sensitive to detect than performance,
> > and is not itself sensitive to things like system load, so a) is easier
> > to measure, and b) has more useful outputs, outputs that tell more of
> > the whole story.
> Thanks for your suggestions!
> I also feel it is very sensitive when I test performance.

Yeah, there are constants *everywhere* in typical compiler output, so a
small change will easily move the needle already.

> The change
> (e.g. cost of constant) may affect other optimization passes indirectly.

That too, yes.

> I will also check the object changes (object size or differences). I may
> use objects from GCC bootstrap and GCC test suite.

Good choice :-)

Looking forward to it,


Segher
  
Jiufu Guo June 16, 2022, 7:47 a.m. UTC | #4
Segher Boessenkool <segher@kernel.crashing.org> writes:

Hi Segher!

Thanks for your comments and suggestions!!

> Hi!
>
> On Wed, Jun 15, 2022 at 04:19:41PM +0800, Jiufu Guo wrote:
>> Segher Boessenkool <segher@kernel.crashing.org> writes:
>> > Have you tried with different limits?
>> I drafted cases(C code, and updated asm code) to test runtime costs for
>> different code sequences:  building constants with 5 insns; with 3
>> insns and 2 insns; and loading from rodata.
>
> I'm not asking about trivial examples here.  Have you tried the
> resulting compiler, on some big real-life code, with various choices for
> these essentially random choices, on different cpus?

I tested the patch with spec2017 on p9 and p10.  I only tested combined
variations: 'threshold 2 insns' + 'constant cost COSTS_N_INSNS (N)
- 1' (or without the -1).  And I find only a few benchmarks are
affected noticeably.
I will test more variations.  Any suggestions about workloads?

>
> Huge changes like this need quite a bit of experimental data to back it
> up, or some convincing other evidence.  That is the only way to move
> forward: anything else will resemble Brownian motion :-(

Understood!  I will collect more data to see if it is good in general.

>
>> >  And, p10 is a red herring, you
>> > actually test if prefixed insns are used.
>> 
>> 'pld' is the instruction which we care about instead target p10 :-). So
>> in patch, 'TARGET_PREFIXED' is tested.
>
> Huh.  I would think "pli" is the most important thing here!  Which is
> not a load instruction at all, notwithstanding its name.

"pli" could help a lot on constant building!  When loading constant from
memory, "pld" would also be good for some cases. :-)

>
>> >> --- a/gcc/config/rs6000/rs6000.cc
>> >> +++ b/gcc/config/rs6000/rs6000.cc
>> >> @@ -9706,8 +9706,9 @@ rs6000_init_stack_protect_guard (void)
>> >>  static bool
>> >>  rs6000_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
>> >>  {
>> >> -  if (GET_CODE (x) == HIGH
>> >> -      && GET_CODE (XEXP (x, 0)) == UNSPEC)
>> >> +  /* Exclude CONSTANT HIGH part.  e.g.
>> >> +     (high:DI (symbol_ref:DI ("var") [flags 0xc0] <var_decl>)).  */
>> >> +  if (GET_CODE (x) == HIGH)
>> >>      return true;
>> >
>> > So, why is this?  You didn't explain.
>> 
>> In the function rs6000_cannot_force_const_mem, we already excluded
>> "HIGH with UNSPEC" like: "high:P (unspec:P [symbol_ref..".
>> I find, "high:DI (symbol_ref:DI" is similar, which may also need to
>> exclude.  Half high part of address would be invalid for constant pool.
>> 
>> Or maybe, we could use below code to exclude it.
>>   /* high part of a symbol_ref could not be put into constant pool.  */
>>   if (GET_CODE (x) == HIGH
>>       && (GET_CODE (XEXP (x, 0)) == UNSPEC || SYMBOL_REF_P (XEXP (x, 0))))
>> 
>> One interesting thing is how the rtx "(high:DI (symbol_ref:DI (var" is
>> generated.  It seems in the middle of optimization, it is checked to
>> see if ok to store in constant memory.
>
> It probably is fine to not allow the HIGH of *anything* to be put in a
> constant pool, sure.  But again, that is a change independent of other
> things, and it could use some supporting evidence.

In the code, I will add a comment about these examples of the high part
of an address that need to be excluded.

/* The high part of an address is invalid for the constant pool.  The
   form of the high part would be "high:P (unspec:P [symbol_ref".  In
   the middle of a pass, the form could also be "high:DI (symbol_ref".
   Return true for any rtx with code HIGH.  */

>
>> >>      case CONST_DOUBLE:
>> >>      case CONST_WIDE_INT:
>> >> +      /* It may needs a few insns for const to SET. -1 for outer SET code.  */
>> >> +      if (outer_code == SET)
>> >> +	{
>> >> +	  *total = COSTS_N_INSNS (num_insns_constant (x, mode)) - 1;
>> >> +	  return true;
>> >> +	}
>> >> +      /* FALLTHRU */
>> >> +
>> >>      case CONST:
>> >>      case HIGH:
>> >>      case SYMBOL_REF:
>> >
>> > But this again isn't an obvious improvement at all, and needs
>> > performance testing -- separately of the other changes.
>> 
>> This code updates the cost of constant assigning, we need this to avoid
>> CSE to replace the constant loading with constant assigning.
>
> No.  This is rtx_cost, which makes a cost estimate of random RTL, not
> typically corresponding to existing RTL insns at all.  We do not use it
> a lot anymore thankfully (we use insn_cost in most places), but things
> like CSE still use it.

This change increases the rtx_cost of a constant as a SET source,
because CSE uses rtx_cost.  With this, we avoid CSE incorrectly
computing a lower cost for building a complicated constant.

>
>> The above change (the threshold for storing const in the pool) depends
>> on this code.
>
> Yes, and a gazillion other places as well still (or about 50 really :-) )
>
>> > And this is completely independent of the rest as well.  Cost 5 or 7 are
>> > completely random numbers, why did you pick these?  Does it work better
>> > than 8, or 4, etc.?
>> For 'pld', asm code would be "pld %r9,.LC0@pcrel"; as tests, it is
>> faster than 2insns (like, "li+oris"), but still slower than one
>> instruction (e.g. li, pli).  So, "COSTS_N_INSNS (2) - 1" is selected.
>> And with this, cse will not replace 'pld' with 'li + oris'. 
>
> (That is WinNT assembler syntax btw.  We never use that.  We write
> either "pld 9,.LC0@pcrel" or "pld r9,.LC0@pcrel".)
Thanks.
We usually see asm code like "pld 9,.LC0@pcrel".  What I wanted to show
is "pld r9,.LC0@pcrel" :-).  I did not find a way to dump the asm in
that form; "-mregnames" generates "%r9".

>
> COSTS_N_INSNS(1) means 4, always.  The costs are not additive at all in
> reality of course, so you cannot simply add them and get any sensible
> result :-(
>
> Why did you choose 7?  Your explanation makes 5 and 6 good candidates
> as well.  My money is on 6 performing best here, but there is no way to
> know without actually trying out things.

Thanks for the suggestion!
From my tests, it would be a value between 4 and 8.  I also considered
"5" (slightly slower than 4 -- one instruction), and selected "7"
because it is meant to be close to the cost of two instructions.
"6" may be better; I will run a test.

>
>> >> --- a/gcc/testsuite/gcc.target/powerpc/medium_offset.c
>> >> +++ b/gcc/testsuite/gcc.target/powerpc/medium_offset.c
>> >> @@ -1,7 +1,7 @@
>> >>  /* { dg-do compile { target { powerpc*-*-* } } } */
>> >>  /* { dg-require-effective-target lp64 } */
>> >>  /* { dg-options "-O" } */
>> >> -/* { dg-final { scan-assembler-not "\\+4611686018427387904" } } */
>> >> +/* { dg-final { scan-assembler-times {\msldi|pld\M} 1 } } */
>> >
>> > Why?  This is still better generated in code, no?  It should never be
>> > loaded from a constant pool (it is hex 4000_0000_0000_0000, easy to
>> > construct with just one or two insns).
>> 
>> For p8/9, two insns "lis 0x4000+sldi 32" are used:
>>         addis %r3,%r2,.LANCHOR0@toc@ha
>>         addi %r3,%r3,.LANCHOR0@toc@l
>>         lis %r9,0x4000
>>         sldi %r9,%r9,32
>>         add %r3,%r3,%r9
>> 	blr
>
> That does not mean putting this constant in the constant pool is a good
> idea at all, of course.
>
>> On p10, as expected, 'pld' would be better than 'lis+sldi'.
>
> Is it?

In simple cases, 'pld' seems better.  Perlbench may also indicate this,
but I did not test this part separately.
As you suggested, I will collect more data to check this change.

>
>> And for this case, it also saves other instructions(addis/addi):
>>         pld %r3,.LC1@pcrel
>>         blr
>
> This explanation then belongs in your commit message :-)
>
>> > Btw, you need to write
>> >   \m(?:rldimi|ld|pld)\M
>> Oh, thanks!  I did not aware of this.
>
> (?:xxx) is just the same as (xxx), but the parentheses in the latter are
> capturing, which messes up scan-assembler-times, so you need
> non-capturing grouping.
Thanks.
>
>> Maybe I should separate out two cases one for "-mdejagnu-cpu=power10",
>> and one for "-mdejagnu-cpu=power7", because the load instructions
>> the number would be different.
>
> Yeah, it is probably best to never allow both a load and non-load insns
> in the same check.
OK, thanks.
>
>> > So no doubt this will improve things, but we need testing of each part
>> > separately.  Also look at code size, or differences in the generated
>> > code in general: this is much more sensitive to detect than performance,
>> > and is not itself sensitive to things like system load, so a) is easier
>> > to measure, and b) has more useful outputs, outputs that tell more of
>> > the whole story.
>> Thanks for your suggestions!
>> I also feel it is very sensitive when I test performance.
>
> Yeah, there are constants *everywhere* in typical compiler output, so a
> small change will easily move the needle already.
>
>> The change
>> (e.g. cost of constant) may affect other optimization passes indirectly.
>
> That too, yes.
>
>> I will also check the object changes (object size or differences). I may
>> use objects from GCC bootstrap and GCC test suite.
>
> Good choice :-)
>
> Looking forward to it,

Thanks in advance for any further comments or suggestions!

BR,
Jiufu Guo

>
>
> Segher
  
Segher Boessenkool June 21, 2022, 6:10 p.m. UTC | #5
Hi!

On Thu, Jun 16, 2022 at 03:47:49PM +0800, Jiufu Guo wrote:
> Segher Boessenkool <segher@kernel.crashing.org> writes:
> >> >> --- a/gcc/testsuite/gcc.target/powerpc/medium_offset.c
> >> >> +++ b/gcc/testsuite/gcc.target/powerpc/medium_offset.c
> >> >> @@ -1,7 +1,7 @@
> >> >>  /* { dg-do compile { target { powerpc*-*-* } } } */
> >> >>  /* { dg-require-effective-target lp64 } */
> >> >>  /* { dg-options "-O" } */
> >> >> -/* { dg-final { scan-assembler-not "\\+4611686018427387904" } } */
> >> >> +/* { dg-final { scan-assembler-times {\msldi|pld\M} 1 } } */
> >> >
> >> > Why?  This is still better generated in code, no?  It should never be
> >> > loaded from a constant pool (it is hex 4000_0000_0000_0000, easy to
> >> > construct with just one or two insns).
> >> 
> >> For p8/9, two insns "lis 0x4000+sldi 32" are used:
> >>         addis %r3,%r2,.LANCHOR0@toc@ha
> >>         addi %r3,%r3,.LANCHOR0@toc@l
> >>         lis %r9,0x4000
> >>         sldi %r9,%r9,32
> >>         add %r3,%r3,%r9
> >> 	blr
> >
> > That does not mean putting this constant in the constant pool is a good
> > idea at all, of course.
> >
> >> On p10, as expected, 'pld' would be better than 'lis+sldi'.
> >
> > Is it?
> 
> With simple cases, it shows 'pld' seems better. For perlbench, it may
> also indicate this. But I did not test this part separately.
> As you suggested, I will collect more data to check this change.

Look at p10 for example.  There can be only two pld's in flight
concurrently, and they might miss in the cache as well (not likely,
hopefully, but it is costly).  pld has between 4 and 6 cycles of
latency, so it is never better than the 1+1 to 3+3 cycles of the
addi+rldicr (li+sldi) sequences, and easily worse.
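
Side by side, the two alternatives being discussed (the same sequences
as quoted above; register numbers are arbitrary) are roughly:

        lis 9,0x4000            # or li+sldi / addi+rldicr: two cheap insns
        sldi 9,9,32
versus
        pld 9,.LC0@pcrel        # one load: 4-6 cycles, at most two in flight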

If you really see loads being better than two simple integer insns, we
need to rethink more :-/


Segher
  

Patch

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index cd291f93019..90c91a8e1ea 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -9706,8 +9706,9 @@  rs6000_init_stack_protect_guard (void)
 static bool
 rs6000_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
 {
-  if (GET_CODE (x) == HIGH
-      && GET_CODE (XEXP (x, 0)) == UNSPEC)
+  /* Exclude CONSTANT HIGH part.  e.g.
+     (high:DI (symbol_ref:DI ("var") [flags 0xc0] <var_decl>)).  */
+  if (GET_CODE (x) == HIGH)
     return true;
 
   /* A TLS symbol in the TOC cannot contain a sum.  */
@@ -11139,7 +11140,7 @@  rs6000_emit_move (rtx dest, rtx source, machine_mode mode)
 		    && FP_REGNO_P (REGNO (operands[0])))
 		   || !CONST_INT_P (operands[1])
 		   || (num_insns_constant (operands[1], mode)
-		       > (TARGET_CMODEL != CMODEL_SMALL ? 3 : 2)))
+		       > (TARGET_PREFIXED ? 1 : 2)))
 	       && !toc_relative_expr_p (operands[1], false, NULL, NULL)
 	       && (TARGET_CMODEL == CMODEL_SMALL
 		   || can_create_pseudo_p ()
@@ -22101,6 +22102,14 @@  rs6000_rtx_costs (rtx x, machine_mode mode, int outer_code,
 
     case CONST_DOUBLE:
     case CONST_WIDE_INT:
+      /* The const may need a few insns in a SET.  -1 for the outer SET code.  */
+      if (outer_code == SET)
+	{
+	  *total = COSTS_N_INSNS (num_insns_constant (x, mode)) - 1;
+	  return true;
+	}
+      /* FALLTHRU */
+
     case CONST:
     case HIGH:
     case SYMBOL_REF:
@@ -22110,8 +22119,12 @@  rs6000_rtx_costs (rtx x, machine_mode mode, int outer_code,
     case MEM:
       /* When optimizing for size, MEM should be slightly more expensive
 	 than generating address, e.g., (plus (reg) (const)).
-	 L1 cache latency is about two instructions.  */
-      *total = !speed ? COSTS_N_INSNS (1) + 1 : COSTS_N_INSNS (2);
+	 L1 cache latency is about two instructions.
+	 For a prefixed load (pld), we set it slightly faster than
+	 two instructions.  */
+      *total = !speed
+		 ? COSTS_N_INSNS (1) + 1
+		 : TARGET_PREFIXED ? COSTS_N_INSNS (2) - 1 : COSTS_N_INSNS (2);
       if (rs6000_slow_unaligned_access (mode, MEM_ALIGN (x)))
 	*total += COSTS_N_INSNS (100);
       return true;
diff --git a/gcc/testsuite/gcc.target/powerpc/medium_offset.c b/gcc/testsuite/gcc.target/powerpc/medium_offset.c
index f29eba08c38..4889e8fa8ec 100644
--- a/gcc/testsuite/gcc.target/powerpc/medium_offset.c
+++ b/gcc/testsuite/gcc.target/powerpc/medium_offset.c
@@ -1,7 +1,7 @@ 
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-require-effective-target lp64 } */
 /* { dg-options "-O" } */
-/* { dg-final { scan-assembler-not "\\+4611686018427387904" } } */
+/* { dg-final { scan-assembler-times {\msldi|pld\M} 1 } } */
 
 static int x;
 
diff --git a/gcc/testsuite/gcc.target/powerpc/pr63281.c b/gcc/testsuite/gcc.target/powerpc/pr63281.c
new file mode 100644
index 00000000000..469a8f64400
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr63281.c
@@ -0,0 +1,11 @@ 
+/* PR target/63281 */
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O2 -std=c99" } */
+
+void
+foo (unsigned long long *a)
+{
+  *a = 0x020805006106003;
+}
+
+/* { dg-final { scan-assembler-times {\mp?ld\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr93012.c b/gcc/testsuite/gcc.target/powerpc/pr93012.c
index 4f764d0576f..5afb4f79c45 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr93012.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr93012.c
@@ -10,4 +10,4 @@  unsigned long long mskh1() { return 0xffff9234ffff9234ULL; }
 unsigned long long mskl1() { return 0x2bcdffff2bcdffffULL; }
 unsigned long long mskse() { return 0xffff1234ffff1234ULL; }
 
-/* { dg-final { scan-assembler-times {\mrldimi\M} 7 } } */
+/* { dg-final { scan-assembler-times {\mrldimi|ld|pld\M} 7 } } */