[1/3] Add power10 zero cycle moves for switches & indirect jumps
Commit Message
Add power10 zero cycle moves for switches.
Power10 will fuse adjacenet 'mtctr' and 'bctr' instructions to form zero
cycle moves. This code exploits this fusion opportunity.
I have built bootstrapped compilers with this patch on little endian power9 and
power10 systems with no regressions. Can I install this into the master
branch?
2021-11-19 Michael Meissner <meissner@the-meissners.org>
* config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add
support for -mpower10-fusion-zero-cycle.
(POWERPC_MASKS): Likewise.
* config/rs6000/rs6000.c (rs6000_option_override_internal):
Likewise.
* config/rs6000/rs6000.md (indirect_jump): Support zero cycle
moves.
(indirect_jump<mode>_zero_cycle): New insns.
(tablejump<mode>_normal): Likewise.
(tablejump<mode>_absolute): Likewise.
(tablejump<mode>_insn_zero_cycle): New insn.
* config/rs6000/rs6000.opt (-mpower10-fusion-zero-cycle): New
debug switch.
---
gcc/config/rs6000/rs6000-cpus.def | 4 ++-
gcc/config/rs6000/rs6000.c | 4 +++
gcc/config/rs6000/rs6000.md | 52 ++++++++++++++++++++++++++++---
gcc/config/rs6000/rs6000.opt | 4 +++
4 files changed, 59 insertions(+), 5 deletions(-)
Comments
Hi Mike,
Thanks for this patch!
On 11/19/21 8:53 AM, Michael Meissner wrote:
> Add power10 zero cycle moves for switches.
>
> Power10 will fuse adjacenet 'mtctr' and 'bctr' instructions to form zero
> cycle moves. This code exploits this fusion opportunity.
>
> I have built bootstrapped compilers with this patch on little endian power9 and
> power10 systems with no regressions. Can I install this into the master
> branch?
>
> 2021-11-19 Michael Meissner <meissner@the-meissners.org>
>
> * config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add
> support for -mpower10-fusion-zero-cycle.
> (POWERPC_MASKS): Likewise.
> * config/rs6000/rs6000.c (rs6000_option_override_internal):
> Likewise.
> * config/rs6000/rs6000.md (indirect_jump): Support zero cycle
> moves.
> (indirect_jump<mode>_zero_cycle): New insns.
> (tablejump<mode>_normal): Likewise.
> (tablejump<mode>_absolute): Likewise.
> (tablejump<mode>_insn_zero_cycle): New insn.
> * config/rs6000/rs6000.opt (-mpower10-fusion-zero-cycle): New
> debug switch.
> ---
> gcc/config/rs6000/rs6000-cpus.def | 4 ++-
> gcc/config/rs6000/rs6000.c | 4 +++
> gcc/config/rs6000/rs6000.md | 52 ++++++++++++++++++++++++++++---
> gcc/config/rs6000/rs6000.opt | 4 +++
> 4 files changed, 59 insertions(+), 5 deletions(-)
>
> diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def
> index f5812da0184..cc072ee94ea 100644
> --- a/gcc/config/rs6000/rs6000-cpus.def
> +++ b/gcc/config/rs6000/rs6000-cpus.def
> @@ -91,7 +91,8 @@
> | OPTION_MASK_P10_FUSION_LOGADD \
> | OPTION_MASK_P10_FUSION_ADDLOG \
> | OPTION_MASK_P10_FUSION_2ADD \
> - | OPTION_MASK_P10_FUSION_2STORE)
> + | OPTION_MASK_P10_FUSION_2STORE \
> + | OPTION_MASK_P10_FUSION_ZERO_CYCLE)
I guess it's fine to introduce one more for now, but ultimately we want
all these to get collapsed down to one. No worries from me.
>
> /* Flags that need to be turned off if -mno-power9-vector. */
> #define OTHER_P9_VECTOR_MASKS (OPTION_MASK_FLOAT128_HW \
> @@ -145,6 +146,7 @@
> | OPTION_MASK_P10_FUSION_ADDLOG \
> | OPTION_MASK_P10_FUSION_2ADD \
> | OPTION_MASK_P10_FUSION_2STORE \
> + | OPTION_MASK_P10_FUSION_ZERO_CYCLE \
> | OPTION_MASK_HTM \
> | OPTION_MASK_ISEL \
> | OPTION_MASK_MFCRF \
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index e4843eb0f1c..6780304a5eb 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -4497,6 +4497,10 @@ rs6000_option_override_internal (bool global_init_p)
> && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0)
> rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE;
>
> + if (TARGET_POWER10
> + && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_ZERO_CYCLE) == 0)
> + rs6000_isa_flags |= OPTION_MASK_P10_FUSION_ZERO_CYCLE;
> +
> /* Turn off vector pair/mma options on non-power10 systems. */
> else if (!TARGET_POWER10 && TARGET_MMA)
> {
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index 6bec2bddbde..ea41eb4ada3 100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -12988,15 +12988,34 @@ (define_expand "indirect_jump"
> emit_jump_insn (gen_indirect_jump_nospec (Pmode, operands[0], ccreg));
> DONE;
> }
> + if (TARGET_P10_FUSION && TARGET_P10_FUSION_ZERO_CYCLE)
> + {
> + emit_jump_insn (gen_indirect_jump_zero_cycle (Pmode, operands[0]));
> + DONE;
> + }
> })
>
> (define_insn "*indirect_jump<mode>"
> [(set (pc)
> (match_operand:P 0 "register_operand" "c,*l"))]
> - "rs6000_speculate_indirect_jumps"
> + "rs6000_speculate_indirect_jumps
> + && !(TARGET_P10_FUSION && TARGET_P10_FUSION_ZERO_CYCLE)"
> "b%T0"
> [(set_attr "type" "jmpreg")])
>
> +(define_insn "@indirect_jump<mode>_zero_cycle"
I don't know why this is an "@" pattern, but honestly I don't
know why @indirect_jump<mode>_nospec is an "@" pattern either.
The documentation for such things is hard for me to understand,
so I'm probably just missing something obvious, but I don't
immediately see why we would need the @ here.
> + [(set (pc)
> + (match_operand:P 0 "register_operand" "r,r,!cl"))
> + (clobber (match_scratch:P 1 "=c,*l,X"))]
Do we need the *l and X alternatives if we're only doing this for
mtctr/bctr?
> + "rs6000_speculate_indirect_jumps && TARGET_P10_FUSION
> + && TARGET_P10_FUSION_ZERO_CYCLE"
> + "@
> + mt%T1 %0\;b%T1
> + mt%T1 %0\;b%T1
> + b%T0"
> + [(set_attr "type" "jmpreg")
> + (set_attr "length" "8,8,4")])
> +
> (define_insn "@indirect_jump<mode>_nospec"
> [(set (pc) (match_operand:P 0 "register_operand" "c,*l"))
> (clobber (match_operand:CC 1 "cc_reg_operand" "=y,y"))]
> @@ -13050,7 +13069,11 @@ (define_expand "@tablejump<mode>_normal"
> rtx addr = gen_reg_rtx (Pmode);
>
> emit_insn (gen_add<mode>3 (addr, off, lab));
> - emit_jump_insn (gen_tablejump_insn_normal (Pmode, addr, operands[1]));
> + rtx insn = (TARGET_P10_FUSION && TARGET_P10_FUSION_ZERO_CYCLE
> + ? gen_tablejump_insn_zero_cycle (Pmode, addr, operands[1])
> + : gen_tablejump_insn_normal (Pmode, addr, operands[1]));
> +
> + emit_jump_insn (insn);
> DONE;
> })
>
> @@ -13062,7 +13085,11 @@ (define_expand "@tablejump<mode>_absolute"
> rtx addr = gen_reg_rtx (Pmode);
> emit_move_insn (addr, operands[0]);
>
> - emit_jump_insn (gen_tablejump_insn_normal (Pmode, addr, operands[1]));
> + rtx insn = (TARGET_P10_FUSION && TARGET_P10_FUSION_ZERO_CYCLE
> + ? gen_tablejump_insn_zero_cycle (Pmode, addr, operands[1])
> + : gen_tablejump_insn_normal (Pmode, addr, operands[1]));
> +
> + emit_jump_insn (insn);
> DONE;
> })
>
> @@ -13107,10 +13134,27 @@ (define_insn "@tablejump<mode>_insn_normal"
> [(set (pc)
> (match_operand:P 0 "register_operand" "c,*l"))
> (use (label_ref (match_operand 1)))]
> - "rs6000_speculate_indirect_jumps"
> + "rs6000_speculate_indirect_jumps
> + && !(TARGET_P10_FUSION && TARGET_P10_FUSION_ZERO_CYCLE)"
> "b%T0"
> [(set_attr "type" "jmpreg")])
>
> +;; Version of indirect jump that fuses the mtctr to bctr to achieve 0 cycle
> +;; moves on Power10.
> +(define_insn "@tablejump<mode>_insn_zero_cycle"
Same question about @.
> + [(set (pc)
> + (match_operand:P 0 "register_operand" "r,r,!cl"))
> + (use (label_ref (match_operand 1)))
> + (clobber (match_scratch:P 2 "=c,*l,X"))]
Same question about 2nd and 3rd alternatives.
Otherwise LGTM... over to the maintainers. :)
Thanks!
Bill
> + "rs6000_speculate_indirect_jumps && TARGET_P10_FUSION
> + && TARGET_P10_FUSION_ZERO_CYCLE"
> + "@
> + mt%T2 %0\;b%T2
> + mt%T2 %0\;b%T2
> + b%T0"
> + [(set_attr "type" "jmpreg")
> + (set_attr "length" "8,8,4")])
> +
> (define_insn "@tablejump<mode>_insn_nospec"
> [(set (pc)
> (match_operand:P 0 "register_operand" "c,*l"))
> diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
> index 9d7878f144a..ba674947557 100644
> --- a/gcc/config/rs6000/rs6000.opt
> +++ b/gcc/config/rs6000/rs6000.opt
> @@ -518,6 +518,10 @@ mpower10-fusion-2store
> Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags)
> Fuse certain store operations together for better performance on power10.
>
> +mpower10-fusion-zero-cycle
> +Target Undocumented Mask(P10_FUSION_ZERO_CYCLE) Var(rs6000_isa_flags)
> +Fuse move to special register and jump for better performance on power10.
> +
> mcrypto
> Target Mask(CRYPTO) Var(rs6000_isa_flags)
> Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions.
On Mon, Nov 22, 2021 at 10:36:13AM -0600, Bill Schmidt wrote:
> Hi Mike,
>
> Thanks for this patch!
> > --- a/gcc/config/rs6000/rs6000.md
> > +++ b/gcc/config/rs6000/rs6000.md
> > @@ -12988,15 +12988,34 @@ (define_expand "indirect_jump"
> > emit_jump_insn (gen_indirect_jump_nospec (Pmode, operands[0], ccreg));
> > DONE;
> > }
> > + if (TARGET_P10_FUSION && TARGET_P10_FUSION_ZERO_CYCLE)
> > + {
> > + emit_jump_insn (gen_indirect_jump_zero_cycle (Pmode, operands[0]));
> > + DONE;
> > + }
> > })
> >
> > (define_insn "*indirect_jump<mode>"
> > [(set (pc)
> > (match_operand:P 0 "register_operand" "c,*l"))]
> > - "rs6000_speculate_indirect_jumps"
> > + "rs6000_speculate_indirect_jumps
> > + && !(TARGET_P10_FUSION && TARGET_P10_FUSION_ZERO_CYCLE)"
> > "b%T0"
> > [(set_attr "type" "jmpreg")])
> >
> > +(define_insn "@indirect_jump<mode>_zero_cycle"
>
> I don't know why this is an "@" pattern, but honestly I don't
> know why @indirect_jump<mode>_nospec is an "@" pattern either.
> The documentation for such things is hard for me to understand,
> so I'm probably just missing something obvious, but I don't
> immediately see why we would need the @ here.
I didn't know about it either. Basically the next insn used it:
(define_insn "@indirect_jump<mode>_nospec"
[(set (pc) (match_operand:P 0 "register_operand" "c,*l"))
(clobber (match_operand:CC 1 "cc_reg_operand" "=y,y"))]
"!rs6000_speculate_indirect_jumps"
"crset %E1\;beq%T0- %1\;b $"
[(set_attr "type" "jmpreg")
(set_attr "length" "12")])
This creates a function:
gen_indirect_jump_nospec (machine_mode arg0, rtx x0, rtx x1)
where the mode of the P iterator is passed as argument. I.e. you can do:
rtx foo = gen_indirect_jump_nospec (Pmode, op0, op1);
instead of:
rtx foo;
if (Pmode == SImode)
foo = gen_indirect_jumpsi_nospec (op0, op1);
else if (Pmode == DImode)
foo = gen_indirect_jumpdi_nospec (op0, op1);
else
gcc_unreachable ();
> > + [(set (pc)
> > + (match_operand:P 0 "register_operand" "r,r,!cl"))
> > + (clobber (match_scratch:P 1 "=c,*l,X"))]
>
> Do we need the *l and X alternatives if we're only doing this for
> mtctr/bctr?
Probably not, but I recall back before the current allocator, that it would
cause crashes if we didn't have LR. I could certainly eliminate the *l
alternative.
Ping patch.
| Date: Fri, 19 Nov 2021 09:53:14 -0500
| From: Michael Meissner <meissner@linux.ibm.com>
| Subject: [PATCH 1/3] Add power10 zero cycle moves for switches & indirect jumps
| Message-ID: <YZe6WugqvxPKNQJj@toto.the-meissners.org>
https://gcc.gnu.org/pipermail/gcc-patches/2021-November/585005.html
Note, I will on-line through December 20th. I will be off-line from December
21st through January 1st.
@@ -91,7 +91,8 @@
| OPTION_MASK_P10_FUSION_LOGADD \
| OPTION_MASK_P10_FUSION_ADDLOG \
| OPTION_MASK_P10_FUSION_2ADD \
- | OPTION_MASK_P10_FUSION_2STORE)
+ | OPTION_MASK_P10_FUSION_2STORE \
+ | OPTION_MASK_P10_FUSION_ZERO_CYCLE)
/* Flags that need to be turned off if -mno-power9-vector. */
#define OTHER_P9_VECTOR_MASKS (OPTION_MASK_FLOAT128_HW \
@@ -145,6 +146,7 @@
| OPTION_MASK_P10_FUSION_ADDLOG \
| OPTION_MASK_P10_FUSION_2ADD \
| OPTION_MASK_P10_FUSION_2STORE \
+ | OPTION_MASK_P10_FUSION_ZERO_CYCLE \
| OPTION_MASK_HTM \
| OPTION_MASK_ISEL \
| OPTION_MASK_MFCRF \
@@ -4497,6 +4497,10 @@ rs6000_option_override_internal (bool global_init_p)
&& (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0)
rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE;
+ if (TARGET_POWER10
+ && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_ZERO_CYCLE) == 0)
+ rs6000_isa_flags |= OPTION_MASK_P10_FUSION_ZERO_CYCLE;
+
/* Turn off vector pair/mma options on non-power10 systems. */
else if (!TARGET_POWER10 && TARGET_MMA)
{
@@ -12988,15 +12988,34 @@ (define_expand "indirect_jump"
emit_jump_insn (gen_indirect_jump_nospec (Pmode, operands[0], ccreg));
DONE;
}
+ if (TARGET_P10_FUSION && TARGET_P10_FUSION_ZERO_CYCLE)
+ {
+ emit_jump_insn (gen_indirect_jump_zero_cycle (Pmode, operands[0]));
+ DONE;
+ }
})
(define_insn "*indirect_jump<mode>"
[(set (pc)
(match_operand:P 0 "register_operand" "c,*l"))]
- "rs6000_speculate_indirect_jumps"
+ "rs6000_speculate_indirect_jumps
+ && !(TARGET_P10_FUSION && TARGET_P10_FUSION_ZERO_CYCLE)"
"b%T0"
[(set_attr "type" "jmpreg")])
+(define_insn "@indirect_jump<mode>_zero_cycle"
+ [(set (pc)
+ (match_operand:P 0 "register_operand" "r,r,!cl"))
+ (clobber (match_scratch:P 1 "=c,*l,X"))]
+ "rs6000_speculate_indirect_jumps && TARGET_P10_FUSION
+ && TARGET_P10_FUSION_ZERO_CYCLE"
+ "@
+ mt%T1 %0\;b%T1
+ mt%T1 %0\;b%T1
+ b%T0"
+ [(set_attr "type" "jmpreg")
+ (set_attr "length" "8,8,4")])
+
(define_insn "@indirect_jump<mode>_nospec"
[(set (pc) (match_operand:P 0 "register_operand" "c,*l"))
(clobber (match_operand:CC 1 "cc_reg_operand" "=y,y"))]
@@ -13050,7 +13069,11 @@ (define_expand "@tablejump<mode>_normal"
rtx addr = gen_reg_rtx (Pmode);
emit_insn (gen_add<mode>3 (addr, off, lab));
- emit_jump_insn (gen_tablejump_insn_normal (Pmode, addr, operands[1]));
+ rtx insn = (TARGET_P10_FUSION && TARGET_P10_FUSION_ZERO_CYCLE
+ ? gen_tablejump_insn_zero_cycle (Pmode, addr, operands[1])
+ : gen_tablejump_insn_normal (Pmode, addr, operands[1]));
+
+ emit_jump_insn (insn);
DONE;
})
@@ -13062,7 +13085,11 @@ (define_expand "@tablejump<mode>_absolute"
rtx addr = gen_reg_rtx (Pmode);
emit_move_insn (addr, operands[0]);
- emit_jump_insn (gen_tablejump_insn_normal (Pmode, addr, operands[1]));
+ rtx insn = (TARGET_P10_FUSION && TARGET_P10_FUSION_ZERO_CYCLE
+ ? gen_tablejump_insn_zero_cycle (Pmode, addr, operands[1])
+ : gen_tablejump_insn_normal (Pmode, addr, operands[1]));
+
+ emit_jump_insn (insn);
DONE;
})
@@ -13107,10 +13134,27 @@ (define_insn "@tablejump<mode>_insn_normal"
[(set (pc)
(match_operand:P 0 "register_operand" "c,*l"))
(use (label_ref (match_operand 1)))]
- "rs6000_speculate_indirect_jumps"
+ "rs6000_speculate_indirect_jumps
+ && !(TARGET_P10_FUSION && TARGET_P10_FUSION_ZERO_CYCLE)"
"b%T0"
[(set_attr "type" "jmpreg")])
+;; Version of indirect jump that fuses the mtctr to bctr to achieve 0 cycle
+;; moves on Power10.
+(define_insn "@tablejump<mode>_insn_zero_cycle"
+ [(set (pc)
+ (match_operand:P 0 "register_operand" "r,r,!cl"))
+ (use (label_ref (match_operand 1)))
+ (clobber (match_scratch:P 2 "=c,*l,X"))]
+ "rs6000_speculate_indirect_jumps && TARGET_P10_FUSION
+ && TARGET_P10_FUSION_ZERO_CYCLE"
+ "@
+ mt%T2 %0\;b%T2
+ mt%T2 %0\;b%T2
+ b%T0"
+ [(set_attr "type" "jmpreg")
+ (set_attr "length" "8,8,4")])
+
(define_insn "@tablejump<mode>_insn_nospec"
[(set (pc)
(match_operand:P 0 "register_operand" "c,*l"))
@@ -518,6 +518,10 @@ mpower10-fusion-2store
Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags)
Fuse certain store operations together for better performance on power10.
+mpower10-fusion-zero-cycle
+Target Undocumented Mask(P10_FUSION_ZERO_CYCLE) Var(rs6000_isa_flags)
+Fuse move to special register and jump for better performance on power10.
+
mcrypto
Target Mask(CRYPTO) Var(rs6000_isa_flags)
Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions.