AArch64: Improve immediate expansion [PR105928]
Checks

Context                                        | Check   | Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm     | success | Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 | success | Testing passed
Commit Message
Support immediate expansion of immediates which can be created from 2 MOVKs
and a shifted ORR or BIC instruction. Change aarch64_split_dimode_const_store
to apply if we save one instruction.
This reduces the number of 4-instruction immediates in SPECINT/FP by 5%.
Passes regress, OK for commit?
gcc/ChangeLog:
PR target/105928
* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
Add support for immediates using shifted ORR/BIC.
(aarch64_split_dimode_const_store): Apply if we save one instruction.
* config/aarch64/aarch64.md (<LOGICAL:optab>_<SHIFT:optab><mode>3):
Make pattern global.
gcc/testsuite:
PR target/105928
* gcc.target/aarch64/pr105928.c: Add new test.
* gcc.target/aarch64/vect-cse-codegen.c: Fix test.
---
Comments
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Support immediate expansion of immediates which can be created from 2 MOVKs
> and a shifted ORR or BIC instruction. Change aarch64_split_dimode_const_store
> to apply if we save one instruction.
>
> This reduces the number of 4-instruction immediates in SPECINT/FP by 5%.
>
> Passes regress, OK for commit?
>
> gcc/ChangeLog:
> PR target/105928
> * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
> Add support for immediates using shifted ORR/BIC.
> (aarch64_split_dimode_const_store): Apply if we save one instruction.
> * config/aarch64/aarch64.md (<LOGICAL:optab>_<SHIFT:optab><mode>3):
> Make pattern global.
>
> gcc/testsuite:
> PR target/105928
> * gcc.target/aarch64/pr105928.c: Add new test.
> * gcc.target/aarch64/vect-cse-codegen.c: Fix test.
Looks good apart from a comment below about the test.
I was worried that reusing "dest" for intermediate results would
prevent CSE for cases like:
void g (long long, long long);
void
f (long long *ptr)
{
  g (0xee11ee22ee11ee22LL, 0xdc23dc44ee11ee22LL);
}
where the same 32-bit lowpart pattern is used for two immediates.
In principle, that could be avoided using:
if (generate)
  {
    rtx tmp = aarch64_target_reg (dest, DImode);
    emit_insn (gen_rtx_SET (tmp, GEN_INT (val2 & 0xffff)));
    emit_insn (gen_insv_immdi (tmp, GEN_INT (16),
                               GEN_INT (val2 >> 16)));
    set_unique_reg_note (get_last_insn (), REG_EQUAL,
                         GEN_INT (val2));
    emit_insn (gen_ior_ashldi3 (dest, tmp, GEN_INT (i), tmp));
  }
return 3;
But it doesn't work, since we only expose the individual immediates
during split1, and nothing between split1 and ira is able to remove
redundancies. There's no point complicating the code for a theoretical
future optimisation.
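(Editorial aside, not part of the original thread: the condition the new loop tests can be sketched as a small standalone C program. It is only an illustration of the bit patterns involved; the function name is made up, and the assembly in the comment shows the expected shape of the 3-instruction sequence rather than exact compiler output.)

#include <stdint.h>
#include <stdio.h>

/* Return the shift amount i for which val can be built either as
   low32 | (low32 << i) (MOV + MOVK building the low half, then ORR with
   a shifted register), or as val3 & ~(val3 << i) where val3 is the low
   half with an all-ones top half (MOV + MOVK, then BIC with a shifted
   register).  Return 0 if neither form applies.  */
static int
shifted_orr_bic_shift (uint64_t val)
{
  uint64_t val2 = val & 0xffffffff;
  uint64_t val3 = val2 | (0xffffffffULL << 32);
  for (int i = 17; i < 48; i++)
    {
      if ((val2 | (val2 << i)) == val)
        return i;  /* ORR form.  */
      if ((val3 & ~(val3 << i)) == val)
        return i;  /* BIC form.  */
    }
  return 0;
}

int
main (void)
{
  /* 0x1234567812345678 repeats its low half, so i == 32 is found and the
     expected expansion is roughly:
       mov  x0, 0x5678
       movk x0, 0x1234, lsl 16
       orr  x0, x0, x0, lsl 32  */
  printf ("%d\n", shifted_orr_bic_shift (0x1234567812345678ULL));
  return 0;
}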
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index c44c0b979d0cc3755c61dcf566cfddedccebf1ea..832f8197ac8d1a04986791e6f3e51861e41944b2 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -5639,7 +5639,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
> machine_mode mode)
> {
> int i;
> - unsigned HOST_WIDE_INT val, val2, mask;
> + unsigned HOST_WIDE_INT val, val2, val3, mask;
> int one_match, zero_match;
> int num_insns;
>
> @@ -5721,6 +5721,35 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
> }
> return 3;
> }
> +
> + /* Try shifting and inserting the bottom 32-bits into the top bits. */
> + val2 = val & 0xffffffff;
> + val3 = 0xffffffff;
> + val3 = val2 | (val3 << 32);
> + for (i = 17; i < 48; i++)
> + if ((val2 | (val2 << i)) == val)
> + {
> + if (generate)
> + {
> + emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
> + emit_insn (gen_insv_immdi (dest, GEN_INT (16),
> + GEN_INT (val2 >> 16)));
> + emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
> + }
> + return 3;
> + }
> + else if ((val3 & ~(val3 << i)) == val)
> + {
> + if (generate)
> + {
> + emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
> + emit_insn (gen_insv_immdi (dest, GEN_INT (16),
> + GEN_INT (val2 >> 16)));
> + emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
> + dest));
> + }
> + return 3;
> + }
> }
>
> /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
> @@ -25506,8 +25535,6 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
> rtx lo = gen_lowpart (SImode, src);
> rtx hi = gen_highpart_mode (SImode, DImode, src);
>
> - bool size_p = optimize_function_for_size_p (cfun);
> -
> if (!rtx_equal_p (lo, hi))
> return false;
>
> @@ -25526,14 +25553,8 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
> MOV w1, 49370
> MOVK w1, 0x140, lsl 16
> STP w1, w1, [x0]
> - So we want to perform this only when we save two instructions
> - or more. When optimizing for size, however, accept any code size
> - savings we can. */
> - if (size_p && orig_cost <= lo_cost)
> - return false;
> -
> - if (!size_p
> - && (orig_cost <= lo_cost + 1))
> + So we want to perform this when we save at least one instruction. */
> + if (orig_cost <= lo_cost)
> return false;
>
> rtx mem_lo = adjust_address (dst, SImode, 0);
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 97f70d39cc0ddeb330e044bae0544d85a695567d..932d4d47a5db1a74e0d0565b565afbd769090853 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -4618,7 +4618,7 @@ (define_insn "*and_<SHIFT:optab>si3_compare0_uxtw"
> [(set_attr "type" "logics_shift_imm")]
> )
>
> -(define_insn "*<LOGICAL:optab>_<SHIFT:optab><mode>3"
> +(define_insn "<LOGICAL:optab>_<SHIFT:optab><mode>3"
> [(set (match_operand:GPI 0 "register_operand" "=r")
> (LOGICAL:GPI (SHIFT:GPI
> (match_operand:GPI 1 "register_operand" "r")
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr105928.c b/gcc/testsuite/gcc.target/aarch64/pr105928.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ab52247df66020d0b8fe70bc81f572e8b64c2bb5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr105928.c
> @@ -0,0 +1,43 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O2 --save-temps" } */
> +
> +long f1 (void)
> +{
> + return 0x80402010080400;
> +}
> +
> +long f2 (void)
> +{
> + return 0x1234567812345678;
> +}
> +
> +long f3 (void)
> +{
> + return 0x4567800012345678;
> +}
> +
> +long f4 (void)
> +{
> + return 0x3ecccccd3ecccccd;
> +}
> +
> +long f5 (void)
> +{
> + return 0x38e38e38e38e38e;
> +}
> +
> +long f6 (void)
> +{
> + return 0x1745d1745d1745d;
> +}
> +
> +void f7 (long *p)
> +{
> + *p = 0x1234567812345678;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tmovk\t} 7 } } */
> +/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */
> +/* { dg-final { scan-assembler-times {\tbic\t} 2 } } */
> +/* { dg-final { scan-assembler-times {\torr\t} 4 } } */
> +/* { dg-final { scan-assembler-times {\tstp\t} 1 } } */
Using "long" isn't ILP32-friendly. It needs to be "long long"
(or uint64_t, etc.) instead.
OK with that change, thanks.
Richard
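(Editorial aside: a sketch of the adjustment Richard asks for, switching the test's return and pointer types to long long so the constants stay 64-bit under ILP32 as well; only the first and last functions are shown.)

long long f1 (void)
{
  return 0x80402010080400;
}

void f7 (long long *p)
{
  *p = 0x1234567812345678LL;
}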
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> index d025e989a1e67f00f4f4ce94897a961d38abfab7..2b8e64313bb47f995f071c728b1d84473807bc64 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> @@ -72,8 +72,7 @@ test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
> ** ushr v[0-9]+.16b, v[0-9]+.16b, 7
> ** mov x[0-9]+, 16512
> ** movk x[0-9]+, 0x1020, lsl 16
> -** movk x[0-9]+, 0x408, lsl 32
> -** movk x[0-9]+, 0x102, lsl 48
> +** orr x[0-9]+, x[0-9]+, x[0-9]+, lsl 28
> ** fmov d[0-9]+, x[0-9]+
> ** pmull v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
> ** dup v[0-9]+.2d, v[0-9]+.d\[0\]
Hi Richard,
> I was worried that reusing "dest" for intermediate results would
> prevent CSE for cases like:
>
> void g (long long, long long);
> void
> f (long long *ptr)
> {
> g (0xee11ee22ee11ee22LL, 0xdc23dc44ee11ee22LL);
> }
Note that aarch64_internal_mov_immediate may be called after reload,
so it would end up even more complex. This should be done as a
dedicated mid-end optimization similar to TARGET_CONST_ANCHOR.
However, the number of 3/4-instruction immediates is so small that
sharable cases would be very rare, so I don't believe it is worth it.
Cheers,
Wilco
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>> I was worried that reusing "dest" for intermediate results would
>> prevent CSE for cases like:
>>
>> void g (long long, long long);
>> void
>> f (long long *ptr)
>> {
>> g (0xee11ee22ee11ee22LL, 0xdc23dc44ee11ee22LL);
>> }
>
> Note that aarch64_internal_mov_immediate may be called after reload,
> so it would end up even more complex.
The sequence I quoted was supposed to work before and after reload. The:
rtx tmp = aarch64_target_reg (dest, DImode);
would create a fresh temporary before reload and reuse dest otherwise.
So the sequence after reload would be the same as in your patch,
but the sequence before reload would use a temporary.
> This should be done as a
> dedicated mid-end optimization similar to TARGET_CONST_ANCHOR.
> However, the number of 3/4-instruction immediates is so small that
> sharable cases would be very rare, so I don't believe it is worth it.
Yeah. If, with a few tweaks, we could easily reuse the existing pass
flow to optimise the split forms, then it might have been worth it.
But I agree it's not worth doing something special that only works
for multi-insn immediates.
I think there are other cases where CSE after split would help though.
Thanks,
Richard
Hi Richard,
>> Note that aarch64_internal_mov_immediate may be called after reload,
>> so it would end up even more complex.
>
> The sequence I quoted was supposed to work before and after reload. The:
>
> rtx tmp = aarch64_target_reg (dest, DImode);
>
> would create a fresh temporary before reload and reuse dest otherwise.
> So the sequence after reload would be the same as in your patch,
> but the sequence before reload would use a temporary.
aarch64_target_reg just returns the input register, so it won't do that.
Also, the movsi/movdi patterns only split if the destination register is physical.
That's typically after register allocation, but not uniformly so (e.g. immediates in
returns will get split early), which is inconsistent. Given that we always emit
register notes, it's not obvious whether splitting early or late is better overall.
Cheers,
Wilco