[v2] rs6000: fmr gets used instead of faster xxlor [PR93571]

Message ID 174972e2-3792-935b-ed4e-4e9d3d4ec26a@linux.ibm.com
State New
Headers
Series [v2] rs6000: fmr gets used instead of faster xxlor [PR93571] |

Commit Message

Ajit Agarwal Feb. 25, 2023, 9:50 a.m. UTC
  Hello All:

Here is the patch that uses xxlor instead of fmr where possible.
Performance results shows that fmr is better in power9 and 
power10 architectures whereas xxlor is better in power7 and
power 8 architectures. fmr is the only option before p7.

Bootstrapped and regtested on powerpc64-linux-gnu

Thanks & Regards
Ajit

	rs6000: Use xxlor instead of fmr where possible

	Replaces fmr with xxlor instruction for power7 and power8
	architectures whereas for power9 and power10 keep fmr
	instruction.

	Perf measurement results:

	Power9 fmr:  201,847,661 cycles.
	Power9 xxlor: 201,877,78 cycles.
	Power8 fmr: 200,901,043 cycles.
	Power8 xxlor: 201,020,518 cycles.
	Power7 fmr: 201,059,524 cycles.
	Power7 xxlor: 201,042,851 cycles.

	2023-02-25  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>

gcc/ChangeLog:

	* config/rs6000/rs6000.md (*movdf_hardfloat64): Use xxlor for power7
	and power8 and fmr for power9 and power10.
---
 gcc/config/rs6000/rs6000.md | 44 +++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 16 deletions(-)
  

Comments

Ajit Agarwal June 12, 2023, 8:43 a.m. UTC | #1
Hello Segher:

Please review and let me know your feedback to submit in trunk.

Thanks & Regards
Ajit

On 25/02/23 3:20 pm, Ajit Agarwal via Gcc-patches wrote:
> Hello All:
> 
> Here is the patch that uses xxlor instead of fmr where possible.
> Performance results shows that fmr is better in power9 and 
> power10 architectures whereas xxlor is better in power7 and
> power 8 architectures. fmr is the only option before p7.
> 
> Bootstrapped and regtested on powerpc64-linux-gnu
> 
> Thanks & Regards
> Ajit
> 
> 	rs6000: Use xxlor instead of fmr where possible
> 
> 	Replaces fmr with xxlor instruction for power7 and power8
> 	architectures whereas for power9 and power10 keep fmr
> 	instruction.
> 
> 	Perf measurement results:
> 
> 	Power9 fmr:  201,847,661 cycles.
> 	Power9 xxlor: 201,877,78 cycles.
> 	Power8 fmr: 200,901,043 cycles.
> 	Power8 xxlor: 201,020,518 cycles.
> 	Power7 fmr: 201,059,524 cycles.
> 	Power7 xxlor: 201,042,851 cycles.
> 
> 	2023-02-25  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>
> 
> gcc/ChangeLog:
> 
> 	* config/rs6000/rs6000.md (*movdf_hardfloat64): Use xxlor for power7
> 	and power8 and fmr for power9 and power10.
> ---
>  gcc/config/rs6000/rs6000.md | 44 +++++++++++++++++++++++--------------
>  1 file changed, 28 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index 81bffb04ceb..e101f7f5fc1 100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -354,7 +354,7 @@ (define_attr "cpu"
>    (const (symbol_ref "(enum attr_cpu) rs6000_tune")))
>  
>  ;; The ISA we implement.
> -(define_attr "isa" "any,p5,p6,p7,p7v,p8v,p9,p9v,p9kf,p9tf,p10"
> +(define_attr "isa" "any,p5,p6,p7,p7v,p8v,p7p8v,p9,p9v,p9kf,p9tf,p10"
>    (const_string "any"))
>  
>  ;; Is this alternative enabled for the current CPU/ISA/etc.?
> @@ -402,6 +402,11 @@ (define_attr "enabled" ""
>       (and (eq_attr "isa" "p10")
>  	  (match_test "TARGET_POWER10"))
>       (const_int 1)
> +      
> +     (and (eq_attr "isa" "p7p8v")
> +	  (match_test "TARGET_VSX && !TARGET_P9_VECTOR"))
> +     (const_int 1)
> +
>      ] (const_int 0)))
>  
>  ;; If this instruction is microcoded on the CELL processor
> @@ -8436,27 +8441,29 @@ (define_insn "*mov<mode>_softfloat32"
>  
>  (define_insn "*mov<mode>_hardfloat64"
>    [(set (match_operand:FMOVE64 0 "nonimmediate_operand"
> -           "=m,           d,          d,          <f64_p9>,   wY,
> -             <f64_av>,    Z,          <f64_vsx>,  <f64_vsx>,  !r,
> +           "=m,           d,          <f64_vsx>,  <f64_p9>,   wY,
> +             <f64_av>,    Z,          wa,         <f64_vsx>,  !r,
>               YZ,          r,          !r,         *c*l,       !r,
> -            *h,           r,          <f64_dm>,   wa")
> +            *h,           r,          <f64_dm>,   d,          wn,
> +            wa")
>  	(match_operand:FMOVE64 1 "input_operand"
> -            "d,           m,          d,          wY,         <f64_p9>,
> -             Z,           <f64_av>,   <f64_vsx>,  <zero_fp>,  <zero_fp>,
> +            "d,           m,          <f64_vsx>,  wY,         <f64_p9>,
> +             Z,           <f64_av>,   wa,         <zero_fp>,  <zero_fp>,
>               r,           YZ,         r,          r,          *h,
> -             0,           <f64_dm>,   r,          eP"))]
> +             0,           <f64_dm>,   r,          d,          wn,
> +             eP"))]
>    "TARGET_POWERPC64 && TARGET_HARD_FLOAT
>     && (gpc_reg_operand (operands[0], <MODE>mode)
>         || gpc_reg_operand (operands[1], <MODE>mode))"
>    "@
>     stfd%U0%X0 %1,%0
>     lfd%U1%X1 %0,%1
> -   fmr %0,%1
> +   xxlor %x0,%x1,%x1
>     lxsd %0,%1
>     stxsd %1,%0
>     lxsdx %x0,%y1
>     stxsdx %x1,%y0
> -   xxlor %x0,%x1,%x1
> +   fmr %0,%1
>     xxlxor %x0,%x0,%x0
>     li %0,0
>     std%U0%X0 %1,%0
> @@ -8467,23 +8474,28 @@ (define_insn "*mov<mode>_hardfloat64"
>     nop
>     mfvsrd %0,%x1
>     mtvsrd %x0,%1
> +   fmr %0,%1
> +   fmr %0,%1
>     #"
>    [(set_attr "type"
> -            "fpstore,     fpload,     fpsimple,   fpload,     fpstore,
> +            "fpstore,     fpload,     veclogical, fpload,     fpstore,
>               fpload,      fpstore,    veclogical, veclogical, integer,
>               store,       load,       *,          mtjmpr,     mfjmpr,
> -             *,           mfvsr,      mtvsr,      vecperm")
> +             *,           mfvsr,      mtvsr,      fpsimple,   fpsimple,
> +             vecperm")
>     (set_attr "size" "64")
>     (set_attr "isa"
> -            "*,           *,          *,          p9v,        p9v,
> -             p7v,         p7v,        *,          *,          *,
> -             *,           *,          *,          *,          *,
> -             *,           p8v,        p8v,        p10")
> +            "*,           *,          p7p8v,        p9v,        p9v,
> +             p7v,         p7v,        *,           *,          *,
> +             *,           *,          *,           *,          *,
> +             *,           p8v,        p8v,         *,          *,
> +             p10")
>     (set_attr "prefixed"
>              "*,           *,          *,          *,          *,
>               *,           *,          *,          *,          *,
>               *,           *,          *,          *,          *,
> -             *,           *,          *,          *")])
> +             *,           *,          *,          *,          *,
> +             *")])
>  
>  ;;           STD      LD       MR      MT<SPR> MF<SPR> G-const
>  ;;           H-const  F-const  Special
  
Segher Boessenkool June 12, 2023, 12:51 p.m. UTC | #2
Hi!

On Sat, Feb 25, 2023 at 03:20:33PM +0530, Ajit Agarwal wrote:
> Here is the patch that uses xxlor instead of fmr where possible.
> Performance results shows that fmr is better in power9 and 
> power10 architectures whereas xxlor is better in power7 and
> power 8 architectures. fmr is the only option before p7.

>  ;; The ISA we implement.
> -(define_attr "isa" "any,p5,p6,p7,p7v,p8v,p9,p9v,p9kf,p9tf,p10"
> +(define_attr "isa" "any,p5,p6,p7,p7v,p8v,p7p8v,p9,p9v,p9kf,p9tf,p10"
>    (const_string "any"))

This isn't really about what insn we *can* use here.

> +     (and (eq_attr "isa" "p7p8v")
> +	  (match_test "TARGET_VSX && !TARGET_P9_VECTOR"))
> +     (const_int 1)

What is needed here is test the *tune* setting.  For example if someone
uses -mcpu=power8 -mtune=power9 (this is a setting that is really used,
or was a few years ago anyway) you *do* want fmr insns generated.

So don't do this via the isa attribute at all, just add some insn
condition (testing the tune setting)?


Segher
  
Surya Kumari Jangala June 14, 2023, 6:59 a.m. UTC | #3
On 25/02/23 3:20 pm, Ajit Agarwal via Gcc-patches wrote:
> Hello All:
> 
> Here is the patch that uses xxlor instead of fmr where possible.
> Performance results shows that fmr is better in power9 and 
> power10 architectures whereas xxlor is better in power7 and
> power 8 architectures. fmr is the only option before p7.
> 
> Bootstrapped and regtested on powerpc64-linux-gnu
> 
> Thanks & Regards
> Ajit
> 
> 	rs6000: Use xxlor instead of fmr where possible
> 
> 	Replaces fmr with xxlor instruction for power7 and power8
> 	architectures whereas for power9 and power10 keep fmr
> 	instruction.
> 
> 	Perf measurement results:
> 
> 	Power9 fmr:  201,847,661 cycles.
> 	Power9 xxlor: 201,877,78 cycles.
> 	Power8 fmr: 200,901,043 cycles.
> 	Power8 xxlor: 201,020,518 cycles.

'fmr' is better than 'xxlor' for power8 according to the above numbers. Should we then replace fmr with xxlor?

-Surya
  

Patch

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 81bffb04ceb..e101f7f5fc1 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -354,7 +354,7 @@  (define_attr "cpu"
   (const (symbol_ref "(enum attr_cpu) rs6000_tune")))
 
 ;; The ISA we implement.
-(define_attr "isa" "any,p5,p6,p7,p7v,p8v,p9,p9v,p9kf,p9tf,p10"
+(define_attr "isa" "any,p5,p6,p7,p7v,p8v,p7p8v,p9,p9v,p9kf,p9tf,p10"
   (const_string "any"))
 
 ;; Is this alternative enabled for the current CPU/ISA/etc.?
@@ -402,6 +402,11 @@  (define_attr "enabled" ""
      (and (eq_attr "isa" "p10")
 	  (match_test "TARGET_POWER10"))
      (const_int 1)
+      
+     (and (eq_attr "isa" "p7p8v")
+	  (match_test "TARGET_VSX && !TARGET_P9_VECTOR"))
+     (const_int 1)
+
     ] (const_int 0)))
 
 ;; If this instruction is microcoded on the CELL processor
@@ -8436,27 +8441,29 @@  (define_insn "*mov<mode>_softfloat32"
 
 (define_insn "*mov<mode>_hardfloat64"
   [(set (match_operand:FMOVE64 0 "nonimmediate_operand"
-           "=m,           d,          d,          <f64_p9>,   wY,
-             <f64_av>,    Z,          <f64_vsx>,  <f64_vsx>,  !r,
+           "=m,           d,          <f64_vsx>,  <f64_p9>,   wY,
+             <f64_av>,    Z,          wa,         <f64_vsx>,  !r,
              YZ,          r,          !r,         *c*l,       !r,
-            *h,           r,          <f64_dm>,   wa")
+            *h,           r,          <f64_dm>,   d,          wn,
+            wa")
 	(match_operand:FMOVE64 1 "input_operand"
-            "d,           m,          d,          wY,         <f64_p9>,
-             Z,           <f64_av>,   <f64_vsx>,  <zero_fp>,  <zero_fp>,
+            "d,           m,          <f64_vsx>,  wY,         <f64_p9>,
+             Z,           <f64_av>,   wa,         <zero_fp>,  <zero_fp>,
              r,           YZ,         r,          r,          *h,
-             0,           <f64_dm>,   r,          eP"))]
+             0,           <f64_dm>,   r,          d,          wn,
+             eP"))]
   "TARGET_POWERPC64 && TARGET_HARD_FLOAT
    && (gpc_reg_operand (operands[0], <MODE>mode)
        || gpc_reg_operand (operands[1], <MODE>mode))"
   "@
    stfd%U0%X0 %1,%0
    lfd%U1%X1 %0,%1
-   fmr %0,%1
+   xxlor %x0,%x1,%x1
    lxsd %0,%1
    stxsd %1,%0
    lxsdx %x0,%y1
    stxsdx %x1,%y0
-   xxlor %x0,%x1,%x1
+   fmr %0,%1
    xxlxor %x0,%x0,%x0
    li %0,0
    std%U0%X0 %1,%0
@@ -8467,23 +8474,28 @@  (define_insn "*mov<mode>_hardfloat64"
    nop
    mfvsrd %0,%x1
    mtvsrd %x0,%1
+   fmr %0,%1
+   fmr %0,%1
    #"
   [(set_attr "type"
-            "fpstore,     fpload,     fpsimple,   fpload,     fpstore,
+            "fpstore,     fpload,     veclogical, fpload,     fpstore,
              fpload,      fpstore,    veclogical, veclogical, integer,
              store,       load,       *,          mtjmpr,     mfjmpr,
-             *,           mfvsr,      mtvsr,      vecperm")
+             *,           mfvsr,      mtvsr,      fpsimple,   fpsimple,
+             vecperm")
    (set_attr "size" "64")
    (set_attr "isa"
-            "*,           *,          *,          p9v,        p9v,
-             p7v,         p7v,        *,          *,          *,
-             *,           *,          *,          *,          *,
-             *,           p8v,        p8v,        p10")
+            "*,           *,          p7p8v,        p9v,        p9v,
+             p7v,         p7v,        *,           *,          *,
+             *,           *,          *,           *,          *,
+             *,           p8v,        p8v,         *,          *,
+             p10")
    (set_attr "prefixed"
             "*,           *,          *,          *,          *,
              *,           *,          *,          *,          *,
              *,           *,          *,          *,          *,
-             *,           *,          *,          *")])
+             *,           *,          *,          *,          *,
+             *")])
 
 ;;           STD      LD       MR      MT<SPR> MF<SPR> G-const
 ;;           H-const  F-const  Special