From patchwork Tue May 17 13:28:54 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Patchwork-Submitter: "Andre Vieira (lists)" <Andre.SimoesDiasVieira@arm.com>
X-Patchwork-Id: 54095
Return-Path: <gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org>
X-Original-To: patchwork@sourceware.org
Delivered-To: patchwork@sourceware.org
Received: from server2.sourceware.org (localhost [IPv6:::1])
	by sourceware.org (Postfix) with ESMTP id 6B04E385782C
	for <patchwork@sourceware.org>; Tue, 17 May 2022 13:29:38 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 6B04E385782C
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1652794178;
	bh=A+VlO5zEJPFFCITkgq54qR4okOQ71C9RTkUAAclHZ3w=;
	h=Date:To:Subject:List-Id:List-Unsubscribe:List-Archive:List-Post:
	 List-Help:List-Subscribe:From:Reply-To:From;
	b=OAjR7qtdXA9N44ohJOT2Pn1pWz5Ht5ZjStwGCsd3XQXKg2YybBf3yuZVtuI1ERcPa
	 dnuLVhrap9SKidINOlPNE2uPw2HqUA0BY71EyZLdgSWwhN1yr5Tqa5PZ3m6Js7Q+0K
	 n/argQVvw+J/Vo3mnxKZMLj89JstucNynIjayHOU=
X-Original-To: gcc-patches@gcc.gnu.org
Delivered-To: gcc-patches@gcc.gnu.org
Received: from foss.arm.com (foss.arm.com [217.140.110.172])
 by sourceware.org (Postfix) with ESMTP id 381353858C51
 for <gcc-patches@gcc.gnu.org>; Tue, 17 May 2022 13:29:08 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 381353858C51
Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14])
 by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id D1F7F1063;
 Tue, 17 May 2022 06:29:07 -0700 (PDT)
Received: from [10.57.1.176] (unknown [10.57.1.176])
 by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id A65053F718;
 Tue, 17 May 2022 06:29:02 -0700 (PDT)
Message-ID: <084394f1-9c22-126f-719b-7746e119d616@arm.com>
Date: Tue, 17 May 2022 14:28:54 +0100
MIME-Version: 1.0
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101
 Thunderbird/91.9.0
Content-Language: en-US
To: "gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org>,
 Richard Sandiford <richard.sandiford@arm.com>,
 Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
Subject: [AArch64] Improve SVE dup intrinsics codegen
X-Spam-Status: No, score=-11.5 required=5.0 tests=BAYES_00, BODY_8BITS,
 GIT_PATCH_0, KAM_DMARC_STATUS, KAM_LOTSOFHASH, KAM_SHORT, SPF_HELO_NONE,
 SPF_PASS, TXREP,
 T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on
 server2.sourceware.org
X-BeenThere: gcc-patches@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-patches mailing list <gcc-patches.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-patches>,
 <mailto:gcc-patches-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-patches>,
 <mailto:gcc-patches-request@gcc.gnu.org?subject=subscribe>
X-Patchwork-Original-From: "Andre Vieira \(lists\) via Gcc-patches"
 <gcc-patches@gcc.gnu.org>
From: "Andre Vieira (lists)" <Andre.SimoesDiasVieira@arm.com>
Reply-To: "Andre Vieira \(lists\)" <andre.simoesdiasvieira@arm.com>
Errors-To: gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org
Sender: "Gcc-patches"
 <gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org>

Hi,

This patch teaches the aarch64 backend to improve codegen when using dup 
with NEON vectors with repeating patterns. It will attempt to use a 
smaller NEON vector (or element) to limit the number of instructions 
needed to construct the input vector.

Bootstrapped and regression tested  aarch64-none-linux-gnu.

Is his OK for trunk?

gcc/ChangeLog:

         * config/aarch64/aarch64.cc (aarch64_simd_container_mode): Make 
it global.
         * config/aarch64/aarch64-protos.h 
(aarch64_simd_container_mode): Declare it.
         * config/aarch64/aarch64-sve.md (*vec_duplicate<mode>_reg): 
Rename this to ...
         (@aarch64_vec_duplicae_reg_<mode>): ... this.
         * gcc/config/aarch64-sve-builtins-base.cc 
(svdup_lane_impl::expand): Improve codegen when inputs form a repeating 
pattern.

gcc/testsuite/ChangeLog:

         * gcc.target/aarch64/sve/dup_opt.c: New test.
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 2ac781dff4a93cbe0f0b091147b2521ed1a88750..cfc31b467cf1d3cd79b2dfe6a54e6910dd43b5d8 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -771,6 +771,7 @@ int aarch64_branch_cost (bool, bool);
 enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
 bool aarch64_advsimd_struct_mode_p (machine_mode mode);
 opt_machine_mode aarch64_vq_mode (scalar_mode);
+machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 opt_machine_mode aarch64_full_sve_mode (scalar_mode);
 bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
 bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index c24c05487246f529f81867d6429e636fd6dc74d0..f8b755a83dc37578363270618323f87c95fa327f 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -875,13 +875,98 @@ public:
        argument N to go into architectural lane N, whereas Advanced SIMD
        vectors are loaded memory lsb to register lsb.  We therefore need
        to reverse the elements for big-endian targets.  */
-    rtx vq_reg = gen_reg_rtx (vq_mode);
     rtvec vec = rtvec_alloc (elements_per_vq);
     for (unsigned int i = 0; i < elements_per_vq; ++i)
       {
 	unsigned int argno = BYTES_BIG_ENDIAN ? elements_per_vq - i - 1 : i;
 	RTVEC_ELT (vec, i) = e.args[argno];
       }
+
+    /* Look for a repeating pattern in the 128-bit input as that potentially
+       simplifies constructing the input vector.
+       For example, codegen for svdupq_n_s32 (a, b, a, b), could be simplified
+       from:
+	dup     v0.4s, w0
+	fmov    s1, w1
+	ins     v0.s[1], v1.s[0]
+	ins     v0.s[3], v1.s[0]
+	dup     z0.q, z0.q[0]
+       to:
+	fmov	d0, x0
+	ins	v0.s[1], w1
+	mov	z0.d, d0
+       where we can see it uses a [a, b] input vector reducing the number of
+       needed instructions.  */
+    if  (elements_per_vq > 1 && mode == e.vector_mode(0))
+      {
+	unsigned int new_elements_n = elements_per_vq;
+	bool group = true;
+	while (group && new_elements_n > 1)
+	  {
+	    for (unsigned int i = 0; i < new_elements_n / 2; ++i)
+	      {
+		if (rtx_equal_p (RTVEC_ELT (vec, i),
+				 RTVEC_ELT (vec, new_elements_n / 2 + i)) == 0)
+		  {
+		    group = false;
+		    break;
+		  }
+	      }
+	    if (group)
+	      new_elements_n /= 2;
+	  }
+	/* We have found a repeating pattern smaller than 128-bits, so use that
+	   instead.  */
+	if (new_elements_n < elements_per_vq)
+	  {
+	    unsigned int input_size = 128 / elements_per_vq * new_elements_n;
+	    scalar_mode new_mode
+	      = int_mode_for_size (input_size, 0).require ();
+	    rtx input;
+	    if (new_elements_n > 1)
+	      {
+		if (input_size < 64)
+		  {
+		    /* TODO: Remove this when support for 32- and 16-bit vectors
+		       is added.
+		       */
+		    new_elements_n *= 64/input_size;
+		    input_size = 64;
+		    new_mode = int_mode_for_size (input_size, 0).require ();
+		  }
+		input = gen_reg_rtx (new_mode);
+		rtvec new_vec = rtvec_alloc (new_elements_n);
+		for (unsigned int i = 0; i < new_elements_n; ++i)
+		  RTVEC_ELT (new_vec, i) = RTVEC_ELT (vec, i);
+
+		machine_mode merge_mode
+		  = aarch64_simd_container_mode (element_mode, input_size);
+
+		rtx merge_subreg = simplify_gen_subreg (merge_mode, input,
+							new_mode, 0);
+		aarch64_expand_vector_init (merge_subreg,
+					    gen_rtx_PARALLEL (merge_mode,
+							      new_vec));
+	      }
+	    else
+	      input = simplify_gen_subreg (new_mode, RTVEC_ELT (vec, 0),
+					   element_mode, 0);
+	    machine_mode sve_mode
+	      = aarch64_full_sve_mode (new_mode).require ();
+
+	    rtx target = simplify_gen_subreg (sve_mode, e.possible_target,
+					      mode, 0);
+
+	    expand_operand ops[2];
+	    create_output_operand (&ops[0], target, sve_mode);
+	    create_fixed_operand (&ops[1], input);
+	    expand_insn (code_for_aarch64_vec_duplicate_reg (sve_mode), 2,
+			 ops);
+	    return e.possible_target;
+	  }
+      }
+
+    rtx vq_reg = gen_reg_rtx (vq_mode);
     aarch64_expand_vector_init (vq_reg, gen_rtx_PARALLEL (vq_mode, vec));
 
     /* If the result is a boolean, compare the data vector against zero.  */
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index bd60e65b0c3f05f1c931f03807170f3b9d699de5..a7d6041bcda03318ff10f6d425889801b9a8fa63 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2508,7 +2508,7 @@ (define_expand "vec_duplicate<mode>"
 ;; the scalar input gets spilled to memory during RA.  We want to split
 ;; the load at the first opportunity in order to allow the PTRUE to be
 ;; optimized with surrounding code.
-(define_insn_and_split "*vec_duplicate<mode>_reg"
+(define_insn_and_split "@aarch64_vec_duplicate_reg_<mode>"
   [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w")
 	(vec_duplicate:SVE_ALL
 	  (match_operand:<VEL> 1 "aarch64_sve_dup_operand" "r, w, Uty")))
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f650abbc4ce49cf0947049931f86bad1130c3428..f5e66a43ec5d47e6f5d5540cb41fba0e0e9f92d6 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -301,7 +301,6 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 							 const_tree type,
 							 int misalignment,
 							 bool is_packed);
-static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
@@ -20502,7 +20501,7 @@ aarch64_vq_mode (scalar_mode mode)
 
 /* Return appropriate SIMD container
    for MODE within a vector of WIDTH bits.  */
-static machine_mode
+machine_mode
 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
 {
   if (TARGET_SVE
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dup_opt.c b/gcc/testsuite/gcc.target/aarch64/sve/dup_opt.c
new file mode 100644
index 0000000000000000000000000000000000000000..66a1fcfb585b2c2b36a1344d4a33811257188dee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/dup_opt.c
@@ -0,0 +1,203 @@
+/* { dg-options { "-O2" } } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+#include <arm_sve.h>
+
+/*
+** float32_0:
+**	ins	v0.s\[1\], v1.s\[0\]
+**	mov	z0.d, d0
+**	ret
+*/
+svfloat32_t float32_0(float x, float y)
+{
+    return svdupq_n_f32(x, y, x, y);
+}
+
+/*
+** float32_1:
+**	mov	z0.s, s0
+**	ret
+*/
+
+svfloat32_t float32_1(float x)
+{
+    return svdupq_n_f32(x, x, x, x);
+}
+
+/*
+** float16_0:
+**	ins	v0.h\[1\], v1.h\[0\]
+**	ins	v0.h\[2\], v2.h\[0\]
+**	ins	v0.h\[3\], v3.h\[0\]
+**	mov	z0.d, d0
+**	ret
+*/
+
+svfloat16_t float16_0 (float16_t a, float16_t b, float16_t c, float16_t d)
+{
+  return svdupq_n_f16 (a, b, c, d, a, b, c, d);
+}
+
+/*
+** float16_1:
+**	dup	v0.4h, v0.h\[0\]
+**	ins	v0.h\[1\], v1.h\[0\]
+**	ins	v0.h\[3\], v1.h\[0\]
+**	mov	z0.d, d0
+**	ret
+*/
+
+svfloat16_t float16_1 (float16_t a, float16_t b)
+{
+  return svdupq_n_f16 (a, b, a, b, a, b, a, b);
+}
+
+/*
+** float16_2:
+**	mov	z0.h, h0
+**	ret
+*/
+
+svfloat16_t float16_2 (float16_t a)
+{
+  return svdupq_n_f16 (a, a, a, a, a, a, a, a);
+}
+
+/*
+** int64_0:
+**	mov	z0.d, x0
+**	ret
+*/
+
+svint64_t int64_0 (int64_t a)
+{
+    return svdupq_n_s64 (a, a);
+}
+
+/*
+** int32_0:
+**	fmov	d0, x0
+**	ins	v0.s\[1\], w1
+**	mov	z0.d, d0
+**	ret
+*/
+
+svuint32_t int32_0(uint32_t a, uint32_t b) {
+    return svdupq_n_u32(a, b, a, b);
+}
+
+/*
+** int32_1:
+**	mov	z0.s, w0
+**	ret
+*/
+
+svint32_t int32_1(int32_t a)
+{
+    return svdupq_n_s32(a, a, a, a);
+}
+
+/*
+** int16_0:
+**	...
+**	fmov	d0, x0
+**	ins	v0.h\[1\], w1
+**	ins	v0.h\[2\], w2
+**	ins	v0.h\[3\], w3
+**	mov	z0.d, d0
+**	ret
+*/
+
+svint16_t int16_0(int16_t a, int16_t b, int16_t c, int16_t d)
+{
+    return svdupq_n_s16(a, b, c, d, a, b, c, d);
+}
+
+/*
+** int16_1:
+**	dup	v0.4h, w0
+**	ins	v0.h\[1\], w1
+**	ins	v0.h\[3\], w1
+**	mov	z0.d, d0
+**	ret
+*/
+
+svuint16_t int16_1(uint16_t a, uint16_t b)
+{
+    return svdupq_n_u16(a, b, a, b, a, b, a, b);
+}
+
+/*
+** int16_2:
+**	mov	z0.h, w0
+**	ret
+*/
+
+svint16_t int16_2(int16_t a)
+{
+    return svdupq_n_s16(a, a, a, a, a, a, a, a);
+}
+/*
+** int8_0:
+**	...
+**	fmov	d0, x0
+**	ins	v0.b\[1\], w1
+**	ins	v0.b\[2\], w2
+**	ins	v0.b\[3\], w3
+**	ins	v0.b\[4\], w4
+**	ins	v0.b\[5\], w5
+**	ins	v0.b\[6\], w6
+**	ins	v0.b\[7\], w7
+**	mov	z0.d, d0
+**	ret
+*/
+
+svuint8_t int8_0(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h)
+{
+    return svdupq_n_u8(a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h);
+}
+
+/*
+** int8_1:
+**	dup	v0.8b, w0
+**	ins	v0.b\[1\], w1
+**	ins	v0.b\[2\], w2
+**	ins	v0.b\[3\], w3
+**	ins	v0.b\[5\], w1
+**	ins	v0.b\[6\], w2
+**	ins	v0.b\[7\], w3
+**	mov	z0.d, d0
+**	ret
+*/
+
+svint8_t int8_1(int8_t a, int8_t b, int8_t c, int8_t d)
+{
+    return svdupq_n_s8(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d);
+}
+
+/*
+** int8_2:
+**	dup	v0.8b, w0
+**	ins	v0.b\[1\], w1
+**	ins	v0.b\[3\], w1
+**	ins	v0.b\[5\], w1
+**	ins	v0.b\[7\], w1
+**	mov	z0.d, d0
+**	ret
+*/
+
+svint8_t int8_2(int8_t a, int8_t b)
+{
+    return svdupq_n_s8(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
+}
+
+/*
+** int8_3:
+**	mov	z0.b, w0
+**	ret
+*/
+
+svint8_t int8_3(int8_t a)
+{
+    return svdupq_n_s8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a);
+}