AArch64: Enable TARGET_CONST_ANCHOR
Commit Message
Enable TARGET_CONST_ANCHOR to allow complex constants to be created via immediate add.
Use a 24-bit range as that enables a 3 or 4-instruction immediate to be replaced by
2 additions. Fix the costing of immediate add to support 24-bit immediate and 12-bit shifted
immediates. The generated code for the testcase is now the same or better than LLVM.
It also results in a small codesize reduction on SPEC.
Passes bootstrap and regress, OK for commit?
gcc/
* config/aarch64/aarch64.cc (aarch64_rtx_costs): Add correct costs for
24-bit immediate add and 12-bit high immediate add.
(TARGET_CONST_ANCHOR): Define.
gcc/testsuite/
* gcc.target/aarch64/movk_3.c: New test.
---
Comments
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Enable TARGET_CONST_ANCHOR to allow complex constants to be created via immediate add.
> Use a 24-bit range as that enables a 3 or 4-instruction immediate to be replaced by
> 2 additions. Fix the costing of immediate add to support 24-bit immediate and 12-bit shifted
> immediates. The generated code for the testcase is now the same or better than LLVM.
> It also results in a small codesize reduction on SPEC.
>
> Passes bootstrap and regress, OK for commit?
>
> gcc/
> * config/aarch64/aarch64.cc (aarch64_rtx_costs): Add correct costs for
> 24-bit immediate add and 12-bit high immediate add.
Very minor, but it might be worth saying "add/sub" rather than "add".
The reason picking the 24-bit range is right is that add & sub together
give us a 25-bit range.
> (TARGET_CONST_ANCHOR): Define.
>
> gcc/testsuite/
> * gcc.target/aarch64/movk_3.c: New test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index e97f3b32f7c7f43564d6a4207eae5a34b9e9bfe7..a73741800c963ee6605fd2cfa918f4399da4bfdf 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -14257,6 +14257,16 @@ cost_plus:
> return true;
> }
>
> + if (aarch64_pluslong_immediate (op1, mode))
> + {
> + /* 24-bit add in 2 instructions or 12-bit shifted add. */
> + if ((INTVAL (op1) & 0xfff) != 0)
> + *cost += COSTS_N_INSNS (1);
> +
> + *cost += rtx_cost (op0, mode, PLUS, 0, speed);
> + return true;
> + }
> +
I guess for consistency, we arguably should add extra_cost->alu.arith
for each instruction, like we do for the other cases. But that seems
a bit daft when all arith costs are 0. And it's hard to believe that
they would be nonzero for any new core (i.e. that a plain addition
would be more expensive than a typical "fast" instruction). ADD
immediate is effectively the benchmark cost of COSTS_N_INSN (1).
So I agree what you wrote is what we should use. It might be good to
get rid of the existing uses of alu.arith (for integer ADD only), as a
separate follow-up patch.
It looks like there's an off-by-one error in:
(define_predicate "aarch64_pluslong_immediate"
(and (match_code "const_int")
(match_test "(INTVAL (op) < 0xffffff && INTVAL (op) > -0xffffff)")))
which might make the optimisation fail at the extremes. I think it
should be:
(define_predicate "aarch64_pluslong_immediate"
(and (match_code "const_int")
(match_test "IN_RANGE (ival, -0xffffff, 0xffffff)")))
instead.
OK with that change to the predicate, thanks.
Richard
> *cost += rtx_cost (op1, mode, PLUS, 1, speed);
>
> /* Look for ADD (extended register). */
> @@ -28051,6 +28061,9 @@ aarch64_libgcc_floating_mode_supported_p
> #undef TARGET_HAVE_SHADOW_CALL_STACK
> #define TARGET_HAVE_SHADOW_CALL_STACK true
>
> +#undef TARGET_CONST_ANCHOR
> +#define TARGET_CONST_ANCHOR 0x1000000
> +
> struct gcc_target targetm = TARGET_INITIALIZER;
>
> #include "gt-aarch64.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/movk_3.c b/gcc/testsuite/gcc.target/aarch64/movk_3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..9e8c0c42671bef3f63028b4e51d0bd78c9903994
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/movk_3.c
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 --save-temps" } */
> +
> +
> +/* 2 MOV */
> +void f16 (long *p)
> +{
> + p[0] = 0x1234;
> + p[2] = 0x1235;
> +}
> +
> +/* MOV, MOVK and ADD */
> +void f32_1 (long *p)
> +{
> + p[0] = 0x12345678;
> + p[2] = 0x12345678 + 0xfff;
> +}
> +
> +/* 2 MOV, 2 MOVK */
> +void f32_2 (long *p)
> +{
> + p[0] = 0x12345678;
> + p[2] = 0x12345678 + 0x555555;
> +}
> +
> +/* MOV, MOVK and ADD */
> +void f32_3 (long *p)
> +{
> + p[0] = 0x12345678;
> + p[2] = 0x12345678 + 0x999000;
> +}
> +
> +/* MOV, 2 MOVK and ADD */
> +void f48_1 (long *p)
> +{
> + p[0] = 0x123456789abc;
> + p[2] = 0x123456789abc + 0xfff;
> +}
> +
> +/* MOV, 2 MOVK and 2 ADD */
> +void f48_2 (long *p)
> +{
> + p[0] = 0x123456789abc;
> + p[2] = 0x123456789abc + 0x666666;
> +}
> +
> +/* 2 MOV, 4 MOVK */
> +void f48_3 (long *p)
> +{
> + p[0] = 0x123456789abc;
> + p[2] = 0x123456789abc + 0x1666666;
> +}
> +
> +/* { dg-final { scan-assembler-times "mov\tx\[0-9\]+, \[0-9\]+" 10 } } */
> +/* { dg-final { scan-assembler-times "movk\tx\[0-9\]+, 0x\[0-9a-f\]+" 12 } } */
> +/* { dg-final { scan-assembler-times "add\tx\[0-9\]+, x\[0-9\]+, \[0-9\]+" 5 } } */
@@ -14257,6 +14257,16 @@ cost_plus:
return true;
}
+ if (aarch64_pluslong_immediate (op1, mode))
+ {
+ /* 24-bit add in 2 instructions or 12-bit shifted add. */
+ if ((INTVAL (op1) & 0xfff) != 0)
+ *cost += COSTS_N_INSNS (1);
+
+ *cost += rtx_cost (op0, mode, PLUS, 0, speed);
+ return true;
+ }
+
*cost += rtx_cost (op1, mode, PLUS, 1, speed);
/* Look for ADD (extended register). */
@@ -28051,6 +28061,9 @@ aarch64_libgcc_floating_mode_supported_p
#undef TARGET_HAVE_SHADOW_CALL_STACK
#define TARGET_HAVE_SHADOW_CALL_STACK true
+#undef TARGET_CONST_ANCHOR
+#define TARGET_CONST_ANCHOR 0x1000000
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-aarch64.h"
new file mode 100644
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 --save-temps" } */
+
+
+/* 2 MOV */
+void f16 (long *p)
+{
+ p[0] = 0x1234;
+ p[2] = 0x1235;
+}
+
+/* MOV, MOVK and ADD */
+void f32_1 (long *p)
+{
+ p[0] = 0x12345678;
+ p[2] = 0x12345678 + 0xfff;
+}
+
+/* 2 MOV, 2 MOVK */
+void f32_2 (long *p)
+{
+ p[0] = 0x12345678;
+ p[2] = 0x12345678 + 0x555555;
+}
+
+/* MOV, MOVK and ADD */
+void f32_3 (long *p)
+{
+ p[0] = 0x12345678;
+ p[2] = 0x12345678 + 0x999000;
+}
+
+/* MOV, 2 MOVK and ADD */
+void f48_1 (long *p)
+{
+ p[0] = 0x123456789abc;
+ p[2] = 0x123456789abc + 0xfff;
+}
+
+/* MOV, 2 MOVK and 2 ADD */
+void f48_2 (long *p)
+{
+ p[0] = 0x123456789abc;
+ p[2] = 0x123456789abc + 0x666666;
+}
+
+/* 2 MOV, 4 MOVK */
+void f48_3 (long *p)
+{
+ p[0] = 0x123456789abc;
+ p[2] = 0x123456789abc + 0x1666666;
+}
+
+/* { dg-final { scan-assembler-times "mov\tx\[0-9\]+, \[0-9\]+" 10 } } */
+/* { dg-final { scan-assembler-times "movk\tx\[0-9\]+, 0x\[0-9a-f\]+" 12 } } */
+/* { dg-final { scan-assembler-times "add\tx\[0-9\]+, x\[0-9\]+, \[0-9\]+" 5 } } */