[v2] aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]
Checks
Context |
Check |
Description |
linaro-tcwg-bot/tcwg_gcc_build--master-arm |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 |
success
|
Build passed
|
Commit Message
This patch improves the Advanced SIMD popcount expansion by using SVE if
available.
For example, GCC currently generates the following code sequence for V2DI:
cnt v31.16b, v31.16b
uaddlp v31.8h, v31.16b
uaddlp v31.4s, v31.8h
uaddlp v31.2d, v31.4s
However, by using SVE, we can generate the following sequence instead:
ptrue p7.b, all
cnt z31.d, p7/m, z31.d
Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.
The scalar popcount expansion can also be improved similarly by using SVE and
those changes will be included in a separate patch.
PR target/113860
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md (popcount<mode>2): Add TARGET_SVE
support.
* config/aarch64/aarch64-sve.md (@aarch64_pred_<optab><mode>): Use new
iterator SVE_VDQ_I.
* config/aarch64/iterators.md (SVE_VDQ_I): New mode iterator.
(VPRED): Add V8QI, V16QI, V4HI, V8HI and V2SI.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/popcnt-sve.c: New test.
Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
---
gcc/config/aarch64/aarch64-simd.md | 9 ++
gcc/config/aarch64/aarch64-sve.md | 13 +--
gcc/config/aarch64/iterators.md | 5 ++
gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++++++++++++++++++
4 files changed, 109 insertions(+), 6 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
Comments
Pushed as r15-2659-ge4b8db26de352.
Pengxuan
> This patch improves the Advanced SIMD popcount expansion by using SVE if
> available.
>
> For example, GCC currently generates the following code sequence for V2DI:
> cnt v31.16b, v31.16b
> uaddlp v31.8h, v31.16b
> uaddlp v31.4s, v31.8h
> uaddlp v31.2d, v31.4s
>
> However, by using SVE, we can generate the following sequence instead:
> ptrue p7.b, all
> cnt z31.d, p7/m, z31.d
>
> Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.
>
> The scalar popcount expansion can also be improved similarly by using SVE
> and those changes will be included in a separate patch.
>
> PR target/113860
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-simd.md (popcount<mode>2): Add TARGET_SVE
> support.
> * config/aarch64/aarch64-sve.md (@aarch64_pred_<optab><mode>): Use new
> iterator SVE_VDQ_I.
> * config/aarch64/iterators.md (SVE_VDQ_I): New mode iterator.
> (VPRED): Add V8QI, V16QI, V4HI, V8HI and V2SI.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/popcnt-sve.c: New test.
>
> Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
> ---
> gcc/config/aarch64/aarch64-simd.md | 9 ++
> gcc/config/aarch64/aarch64-sve.md | 13 +--
> gcc/config/aarch64/iterators.md | 5 ++
> gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++++++++++++++++++
> 4 files changed, 109 insertions(+), 6 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index bbeee221f37..895d6e5eab5 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3508,6 +3508,15 @@ (define_expand "popcount<mode>2"
> (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
> "TARGET_SIMD"
> {
> + if (TARGET_SVE)
> + {
> + rtx p = aarch64_ptrue_reg (<VPRED>mode);
> + emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
> + p,
> + operands[1]));
> + DONE;
> + }
> +
> /* Generate a byte popcount. */
> machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
> rtx tmp = gen_reg_rtx (mode);
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 5331e7121d5..eb3705ae515 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -3104,16 +3104,16 @@ (define_expand "<optab><mode>2"
>
> ;; Integer unary arithmetic predicated with a PTRUE.
> (define_insn "@aarch64_pred_<optab><mode>"
> - [(set (match_operand:SVE_I 0 "register_operand")
> - (unspec:SVE_I
> + [(set (match_operand:SVE_VDQ_I 0 "register_operand")
> + (unspec:SVE_VDQ_I
> [(match_operand:<VPRED> 1 "register_operand")
> - (SVE_INT_UNARY:SVE_I
> - (match_operand:SVE_I 2 "register_operand"))]
> + (SVE_INT_UNARY:SVE_VDQ_I
> + (match_operand:SVE_VDQ_I 2 "register_operand"))]
> UNSPEC_PRED_X))]
> "TARGET_SVE"
> {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
> - [ w , Upl , 0 ; * ] <sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
> - [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
> + [ w , Upl , 0 ; * ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
> + [ ?&w , Upl , w ; yes ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
> }
> )
>
> @@ -3168,6 +3168,7 @@ (define_insn "*cond_<optab><mode>_any"
> }
> )
>
> +
> ;; -------------------------------------------------------------------------
> ;; ---- [INT] General unary arithmetic corresponding to unspecs
> ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index f527b2cfeb8..ee3d1fb98fd 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -559,6 +559,9 @@ (define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
> ;; element modes
> (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
>
> +;; All SVE and Advanced SIMD integer vector modes.
> +(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
> +
> ;; SVE integer vector modes whose elements are 16 bits or wider.
> (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
> VNx4SI VNx2SI
> @@ -2278,6 +2281,8 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI "VNx8BI")
> (VNx32BF "VNx8BI")
> (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
> (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
> + (V8QI "VNx8BI") (V16QI "VNx16BI")
> + (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
> (V4SI "VNx4BI") (V2DI "VNx2BI")])
>
> ;; ...and again in lower case.
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> new file mode 100644
> index 00000000000..8e349efe390
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> @@ -0,0 +1,88 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=armv8.2-a+sve -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** f_v4hi:
> +** ptrue (p[0-7]).b, all
> +** ldr d([0-9]+), \[x0\]
> +** cnt z\2.h, \1/m, z\2.h
> +** str d\2, \[x1\]
> +** ret
> +*/
> +void
> +f_v4hi (unsigned short *__restrict b, unsigned short *__restrict d)
> +{
> + d[0] = __builtin_popcount (b[0]);
> + d[1] = __builtin_popcount (b[1]);
> + d[2] = __builtin_popcount (b[2]);
> + d[3] = __builtin_popcount (b[3]);
> +}
> +
> +/*
> +** f_v8hi:
> +** ptrue (p[0-7]).b, all
> +** ldr q([0-9]+), \[x0\]
> +** cnt z\2.h, \1/m, z\2.h
> +** str q\2, \[x1\]
> +** ret
> +*/
> +void
> +f_v8hi (unsigned short *__restrict b, unsigned short *__restrict d)
> +{
> + d[0] = __builtin_popcount (b[0]);
> + d[1] = __builtin_popcount (b[1]);
> + d[2] = __builtin_popcount (b[2]);
> + d[3] = __builtin_popcount (b[3]);
> + d[4] = __builtin_popcount (b[4]);
> + d[5] = __builtin_popcount (b[5]);
> + d[6] = __builtin_popcount (b[6]);
> + d[7] = __builtin_popcount (b[7]);
> +}
> +
> +/*
> +** f_v2si:
> +** ptrue (p[0-7]).b, all
> +** ldr d([0-9]+), \[x0\]
> +** cnt z\2.s, \1/m, z\2.s
> +** str d\2, \[x1\]
> +** ret
> +*/
> +void
> +f_v2si (unsigned int *__restrict b, unsigned int *__restrict d)
> +{
> + d[0] = __builtin_popcount (b[0]);
> + d[1] = __builtin_popcount (b[1]);
> +}
> +
> +/*
> +** f_v4si:
> +** ptrue (p[0-7]).b, all
> +** ldr q([0-9]+), \[x0\]
> +** cnt z\2.s, \1/m, z\2.s
> +** str q\2, \[x1\]
> +** ret
> +*/
> +void
> +f_v4si (unsigned int *__restrict b, unsigned int *__restrict d)
> +{
> + d[0] = __builtin_popcount (b[0]);
> + d[1] = __builtin_popcount (b[1]);
> + d[2] = __builtin_popcount (b[2]);
> + d[3] = __builtin_popcount (b[3]);
> +}
> +
> +/*
> +** f_v2di:
> +** ptrue (p[0-7]).b, all
> +** ldr q([0-9]+), \[x0\]
> +** cnt z\2.d, \1/m, z\2.d
> +** str q\2, \[x1\]
> +** ret
> +*/
> +void
> +f_v2di (unsigned long *__restrict b, unsigned long *__restrict d)
> +{
> + d[0] = __builtin_popcountll (b[0]);
> + d[1] = __builtin_popcountll (b[1]);
> +}
> --
> 2.17.1
@@ -3508,6 +3508,15 @@ (define_expand "popcount<mode>2"
(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
"TARGET_SIMD"
{
+ if (TARGET_SVE)
+ {
+ rtx p = aarch64_ptrue_reg (<VPRED>mode);
+ emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
+ p,
+ operands[1]));
+ DONE;
+ }
+
/* Generate a byte popcount. */
machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
rtx tmp = gen_reg_rtx (mode);
@@ -3104,16 +3104,16 @@ (define_expand "<optab><mode>2"
;; Integer unary arithmetic predicated with a PTRUE.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:SVE_I 0 "register_operand")
- (unspec:SVE_I
+ [(set (match_operand:SVE_VDQ_I 0 "register_operand")
+ (unspec:SVE_VDQ_I
[(match_operand:<VPRED> 1 "register_operand")
- (SVE_INT_UNARY:SVE_I
- (match_operand:SVE_I 2 "register_operand"))]
+ (SVE_INT_UNARY:SVE_VDQ_I
+ (match_operand:SVE_VDQ_I 2 "register_operand"))]
UNSPEC_PRED_X))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
- [ w , Upl , 0 ; * ] <sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
- [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
+ [ w , Upl , 0 ; * ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
+ [ ?&w , Upl , w ; yes ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
}
)
@@ -3168,6 +3168,7 @@ (define_insn "*cond_<optab><mode>_any"
}
)
+
;; -------------------------------------------------------------------------
;; ---- [INT] General unary arithmetic corresponding to unspecs
;; -------------------------------------------------------------------------
@@ -559,6 +559,9 @@ (define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
;; element modes
(define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
+;; All SVE and Advanced SIMD integer vector modes.
+(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
+
;; SVE integer vector modes whose elements are 16 bits or wider.
(define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
VNx4SI VNx2SI
@@ -2278,6 +2281,8 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI "VNx8BI")
(VNx32BF "VNx8BI")
(VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
(VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
+ (V8QI "VNx8BI") (V16QI "VNx16BI")
+ (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
(V4SI "VNx4BI") (V2DI "VNx2BI")])
;; ...and again in lower case.
new file mode 100644
@@ -0,0 +1,88 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+sve -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f_v4hi:
+** ptrue (p[0-7]).b, all
+** ldr d([0-9]+), \[x0\]
+** cnt z\2.h, \1/m, z\2.h
+** str d\2, \[x1\]
+** ret
+*/
+void
+f_v4hi (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+ d[0] = __builtin_popcount (b[0]);
+ d[1] = __builtin_popcount (b[1]);
+ d[2] = __builtin_popcount (b[2]);
+ d[3] = __builtin_popcount (b[3]);
+}
+
+/*
+** f_v8hi:
+** ptrue (p[0-7]).b, all
+** ldr q([0-9]+), \[x0\]
+** cnt z\2.h, \1/m, z\2.h
+** str q\2, \[x1\]
+** ret
+*/
+void
+f_v8hi (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+ d[0] = __builtin_popcount (b[0]);
+ d[1] = __builtin_popcount (b[1]);
+ d[2] = __builtin_popcount (b[2]);
+ d[3] = __builtin_popcount (b[3]);
+ d[4] = __builtin_popcount (b[4]);
+ d[5] = __builtin_popcount (b[5]);
+ d[6] = __builtin_popcount (b[6]);
+ d[7] = __builtin_popcount (b[7]);
+}
+
+/*
+** f_v2si:
+** ptrue (p[0-7]).b, all
+** ldr d([0-9]+), \[x0\]
+** cnt z\2.s, \1/m, z\2.s
+** str d\2, \[x1\]
+** ret
+*/
+void
+f_v2si (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+ d[0] = __builtin_popcount (b[0]);
+ d[1] = __builtin_popcount (b[1]);
+}
+
+/*
+** f_v4si:
+** ptrue (p[0-7]).b, all
+** ldr q([0-9]+), \[x0\]
+** cnt z\2.s, \1/m, z\2.s
+** str q\2, \[x1\]
+** ret
+*/
+void
+f_v4si (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+ d[0] = __builtin_popcount (b[0]);
+ d[1] = __builtin_popcount (b[1]);
+ d[2] = __builtin_popcount (b[2]);
+ d[3] = __builtin_popcount (b[3]);
+}
+
+/*
+** f_v2di:
+** ptrue (p[0-7]).b, all
+** ldr q([0-9]+), \[x0\]
+** cnt z\2.d, \1/m, z\2.d
+** str q\2, \[x1\]
+** ret
+*/
+void
+f_v2di (unsigned long *__restrict b, unsigned long *__restrict d)
+{
+ d[0] = __builtin_popcountll (b[0]);
+ d[1] = __builtin_popcountll (b[1]);
+}