[v3] aarch64: remove extra XTN in vector concatenation

Message ID 20250106172202.3672-1-Akram.Ahmad@arm.com
State New
Series [v3] aarch64: remove extra XTN in vector concatenation

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 fail Patch failed to apply
linaro-tcwg-bot/tcwg_gcc_build--master-arm fail Patch failed to apply

Commit Message

Akram Ahmad Jan. 6, 2025, 5:22 p.m. UTC
Hi Richard,

Thanks for the feedback. I've copied the resulting patch in here; if
this is okay, please could it be committed on my behalf? The patch
continues below.

Many thanks,

Akram

---

GIMPLE code which performs a narrowing truncation on the result of a
vector concatenation currently results in an unnecessary XTN being
emitted after the UZP1 that concatenates the operands. In such cases,
UZP1 should instead use a smaller arrangement specifier, making the
trailing XTN redundant. This can be seen in the following GIMPLE
example:

	int32x2_t foo (svint64_t a, svint64_t b)
	{
	  vector(2) int vect__2.8;
	  long int _1;
	  long int _3;
	  vector(2) long int _12;

	  <bb 2> [local count: 1073741824]:
	  _1 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, a_6(D));
	  _3 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, b_7(D));
	  _12 = {_1, _3};
	  vect__2.8_13 = (vector(2) int) _12;
	  return vect__2.8_13;

	}
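
For reference, source along the following lines produces GIMPLE of this
shape (a sketch using the ACLE NEON and SVE intrinsics; it mirrors f3 in
the new test further below):

	#include <arm_neon.h>
	#include <arm_sve.h>

	int32x2_t foo (svint64_t a, svint64_t b)
	{
	  int32x2_t ab = vdup_n_s32 (0);
	  /* Each svaddv_s64 reduces one SVE vector to a 64-bit scalar;
	     the results are truncated to 32 bits and packed into a
	     64-bit NEON vector.  */
	  ab = vset_lane_s32 ((int) svaddv_s64 (svptrue_b64 (), a), ab, 0);
	  ab = vset_lane_s32 ((int) svaddv_s64 (svptrue_b64 (), b), ab, 1);
	  return ab;
	}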

Original assembly generated:

	foo:
	        ptrue   p3.b, all
	        uaddv   d0, p3, z0.d
	        uaddv   d1, p3, z1.d
	        uzp1    v0.2d, v0.2d, v1.2d
	        xtn     v0.2s, v0.2d
	        ret

This patch therefore defines the *aarch64_trunc_concat<mode> insn, which
truncates the result of the concatenation rather than concatenating the
already-truncated operands (as *aarch64_narrow_trunc<mode> does),
resulting in the following optimised assembly being emitted:

	foo:
	        ptrue   p3.b, all
	        uaddv   d0, p3, z0.d
	        uaddv   d1, p3, z1.d
	        uzp1    v0.2s, v0.2s, v1.2s
	        ret
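
For the example above, the new define_insn matches RTL of roughly the
following shape (a sketch of the V2DI-to-V2SI case; the pseudo-register
numbers are purely illustrative):

	(set (reg:V2SI 104)
	     (truncate:V2SI
	       (vec_concat:V2DI (reg:DI 101)
	                        (reg:DI 102))))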

This patch passes all regression tests on aarch64 with no new failures.
A new test covering this optimisation is also added and passes.

OK for master? I do not have commit rights so I cannot push the patch
myself.

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_trunc_concat<mode>):
	New insn definition.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/truncated_concatenation_1.c: New test
	covering the above example and the other modes handled by the
	new insn definition.
---
 gcc/config/aarch64/aarch64-simd.md            | 16 ++++++++++
 .../aarch64/sve/truncated_concatenation_1.c   | 32 +++++++++++++++++++
 2 files changed, 48 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
  

Comments

Richard Sandiford Jan. 6, 2025, 8:10 p.m. UTC | #1
Akram Ahmad <Akram.Ahmad@arm.com> writes:
> Hi Richard,
>
> Thanks for the feedback. I've copied the resulting patch in here; if
> this is okay, please could it be committed on my behalf? The patch
> continues below.
>
> Many thanks,
>
> Akram

Thanks. LGTM.  Pushed to trunk.

Richard


Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cfe95bd4c31..6c129d6c4a8 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1872,6 +1872,22 @@ 
   [(set_attr "type" "neon_permute<q>")]
 )
 
+(define_insn "*aarch64_trunc_concat<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+	(truncate:<VNARROWQ>
+	  (vec_concat:VQN
+	    (match_operand:<VHALF> 1 "register_operand" "w")
+	    (match_operand:<VHALF> 2 "register_operand" "w"))))]
+  "TARGET_SIMD"
+{
+  if (!BYTES_BIG_ENDIAN)
+    return "uzp1\\t%0.<Vntype>, %1.<Vntype>, %2.<Vntype>";
+  else
+    return "uzp1\\t%0.<Vntype>, %2.<Vntype>, %1.<Vntype>";
+}
+  [(set_attr "type" "neon_permute<q>")]
+)
+
 ;; Packing doubles.
 
 (define_expand "vec_pack_trunc_<mode>"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c b/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
new file mode 100644
index 00000000000..95577a1a9ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
@@ -0,0 +1,32 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -Wall -march=armv8.2-a+sve" } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+int8x8_t f1 (int16x4_t a, int16x4_t b) {
+    int8x8_t ab = vdup_n_s8 (0);
+    int16x8_t ab_concat = vcombine_s16 (a, b);
+    ab = vmovn_s16 (ab_concat);
+    return ab;
+}
+
+int16x4_t f2 (int32x2_t a, int32x2_t b) {
+    int16x4_t ab = vdup_n_s16 (0);
+    int32x4_t ab_concat = vcombine_s32 (a, b);
+    ab = vmovn_s32 (ab_concat);
+    return ab;
+}
+
+int32x2_t f3 (svint64_t a, svint64_t b) {
+    int32x2_t ab = vdup_n_s32 (0);
+    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), a), ab, 0);
+    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), b), ab, 1);
+    return ab;
+}
+
+/* { dg-final { scan-assembler-not {\txtn\t} } }*/
+/* { dg-final { scan-assembler-not {\tfcvtn\t} } }*/
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.8b, v[0-9]+\.8b, v[0-9]+\.8b} 1 } }*/
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.4h, v[0-9]+\.4h, v[0-9]+\.4h} 1 } }*/
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.2s, v[0-9]+\.2s, v[0-9]+\.2s} 1 } }*/
\ No newline at end of file