[7/7] AArch64: Implement vector concat of partial SVE vectors

Message ID: Z1BIjoIUZDTLQAd5@arm.com
State: New
Series: None

Commit Message

Tamar Christina Dec. 4, 2024, 12:18 p.m. UTC
  Hi All,

This patch adds support for vector constructors concatenating two partial SVE
vectors into a full SVE vector.  It also implements support for the standard
vec_init optab to do this.
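
For illustration, here is a hand-worked sketch of the intended expansion for
one mode (register names are schematic; this is not part of the patch itself).
Initializing a VNx8QI from two VNx4QI halves

  (set (reg:VNx8QI z0)
       (unspec:VNx8QI [(reg:VNx4QI z1) (reg:VNx4QI z2)] UNSPEC_PACK))

assembles to

  uzp1    z0.h, z1.h, z2.h

Each VNx4QI element lives in the low byte of a 32-bit container, so keeping
the even 16-bit containers of the two inputs packs the payload bytes into the
16-bit containers that VNx8QI expects.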

gcc/ChangeLog:

	PR target/96342
	* config/aarch64/aarch64-sve.md (vec_init<mode><Vhalf>): New.
	(@aarch64_pack_partial<mode>): New.
	* config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init): Special
	case constructors of two vectors.
	* config/aarch64/iterators.md (SVE_NO2E, SVE_PARTIAL_NO2E): New.
	(VHALF, Vhalf, Vwstype): Add SVE partial vectors.

gcc/testsuite/ChangeLog:

	PR target/96342
	* gcc.target/aarch64/vect-simd-clone-2.c: New test.

Bootstrapped and regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu
(-m32 and -m64) with no issues.

Ok for master?

Thanks,
Tamar

  

Comments

Tamar Christina Dec. 11, 2024, 1:56 p.m. UTC | #1
ping

> -----Original Message-----
> From: Tamar Christina
> Sent: Wednesday, December 4, 2024 12:18 PM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> ktkachov@gcc.gnu.org; Richard Sandiford <Richard.Sandiford@arm.com>
> Subject: [PATCH 7/7] AArch64: Implement vector concat of partial SVE vectors
  
Richard Sandiford Dec. 12, 2024, 12:30 p.m. UTC | #2
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> This patch adds support for vector constructors concatenating two partial SVE
> vectors into a full SVE vector.  It also implements support for the standard
> vec_init optab to do this.
>
> gcc/ChangeLog:
>
> 	PR target/96342
> 	* config/aarch64/aarch64-sve.md (vec_init<mode><Vhalf>): New.
> 	(@aarch64_pack_partial<mode>): New.
> 	* config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init): Special
> 	case constructors of two vectors.
> 	* config/aarch64/iterators.md (SVE_NO2E, SVE_PARTIAL_NO2E): New.
> 	(VHALF, Vhalf, Vwstype): Add SVE partial vectors.
>
> gcc/testsuite/ChangeLog:
>
> 	PR target/96342
> 	* gcc.target/aarch64/vect-simd-clone-2.c: New test.

This triggers an ICE for:

typedef unsigned int v8si __attribute__((vector_size(32)));
typedef unsigned int v16si __attribute__((vector_size(64)));

v16si __GIMPLE
foo (v8si x, v8si y)
{
  v16si res;

  res = _Literal (v16si) { x, y };
  return res;
}

compiled with -O2 -march=armv8-a+sve -msve-vector-bits=512 -fgimple.
Suggested fix below.

> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu
> (-m32 and -m64) with no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 9afd11d347626eeb640722fdba2ab763b8479aa7..9e3577be6e943d7a5c951196463873d4bcfee07c 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -2840,6 +2840,16 @@ (define_expand "vec_init<mode><Vel>"
>    }
>  )
>  
> +(define_expand "vec_init<mode><Vhalf>"
> +  [(match_operand:SVE_NO2E 0 "register_operand")
> +    (match_operand 1 "")]

Nit: excess indentation.

> +  "TARGET_SVE"
> +  {
> +    aarch64_sve_expand_vector_init (operands[0], operands[1]);
> +    DONE;
> +  }
> +)
> +
>  ;; Shift an SVE vector left and insert a scalar into element 0.
>  (define_insn "vec_shl_insert_<mode>"
>    [(set (match_operand:SVE_FULL 0 "register_operand")
> @@ -9347,6 +9357,20 @@ (define_insn "vec_pack_trunc_<Vwide>"
>    "uzp1\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
>  )
>  
> +;; Integer partial pack packing two partial SVE types into a single full SVE
> +;; type of the same element type.  Use UZP1 on the wider type, which discards
> +;; the high part of each wide element.  This allows concatenating SVE partial
> +;; types into a wider vector.
> +(define_insn "@aarch64_pack_partial<mode>"
> +  [(set (match_operand:SVE_PARTIAL_NO2E 0 "register_operand" "=w")
> +	(unspec:SVE_PARTIAL_NO2E
> +	  [(match_operand:<VHALF> 1 "register_operand" "w")
> +	   (match_operand:<VHALF> 2 "register_operand" "w")]
> +	  UNSPEC_PACK))]
> +  "TARGET_SVE"
> +  "uzp1\t%0.<Vwstype>, %1.<Vwstype>, %2.<Vwstype>"
> +)
> +

To fix the ICE above, I think we should define this pattern for
all SVE_NO2E.  We can also make it a vec_concat, which should work
for both endiannesses.

Rather than use Vwstype, I think this is conceptually a permute of the
containers, so should use Vctype.  That will change VNx4QI from using .h
(as in the patch) to .s (to match VNx4SI), but both work.
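
Concretely (a hand-worked sketch, with schematic registers), for a VNx4QI
destination built from two VNx2QI halves the two choices would be:

  uzp1    z0.h, z1.h, z2.h    // <Vwstype>, as in the patch
  uzp1    z0.s, z1.s, z2.s    // <Vctype>, matching VNx4SI

Both put each payload byte in the low byte of the right 32-bit container of
the result; they differ only in the bits outside the payloads, which are
don't-care for these partial modes.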

>  ;; -------------------------------------------------------------------------
>  ;; ---- [INT<-INT] Unpacks
>  ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index af6fede102c2be6673c24f8020d000ea56322997..690d54b0a2954327e00d559f96c414c81c2e18cd 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -24790,6 +24790,17 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
>      v.quick_push (XVECEXP (vals, 0, i));
>    v.finalize ();
>  
> +  /* If we have two elements and are concatenating vectors.  */
> +  machine_mode elem_mode = GET_MODE (v.elt (0));
> +  if (nelts == 2 && VECTOR_MODE_P (elem_mode))
> +    {
> +      /* We've failed expansion using a dup.  Try using a cheeky truncate. */
> +      rtx arg0 = force_reg (elem_mode, v.elt(0));
> +      rtx arg1 = force_reg (elem_mode, v.elt(1));
> +      emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
> +      return;
> +    }
> +

I think it'd be better to use an independent routine for this,
since there's not really any overlap with the scalar-element code.
In particular, we might as well get the vectors directly from
XVECEXP (val, 0, ...), since we don't need the rtx_vector_builder
for the expansion.

>    /* If neither sub-vectors of v could be initialized specially,
>       then use INSR to insert all elements from v into TARGET.
>       ??? This might not be optimal for vectors with large
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 023893d35f3e955e222c322ce370e84c95c29ee6..77d23d6ad795630d3d5fb5c076c086a479d46fee 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -138,6 +138,14 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
>  ;; VQ without 2 element modes.
>  (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
>  
> +;; SVE modes without 2 element modes.
> +(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF
> +				VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
> +
> +;; Partial SVE modes without 2 element modes.
> +(define_mode_iterator SVE_PARTIAL_NO2E [VNx8QI VNx4QI VNx4HI
> +					VNx4HF VNx8BF VNx4BF])
> +
>  ;; 2 element quad vector modes.
>  (define_mode_iterator VQ_2E [V2DI V2DF])
>  
> @@ -1678,7 +1686,15 @@ (define_mode_attr VHALF [(V8QI "V4QI")  (V16QI "V8QI")
>  			 (V2DI "DI")    (V2SF  "SF")
>  			 (V4SF "V2SF")  (V4HF "V2HF")
>  			 (V8HF "V4HF")  (V2DF  "DF")
> -			 (V8BF "V4BF")])
> +			 (V8BF "V4BF")
> +			 (VNx16QI "VNx8QI") (VNx8QI "VNx4QI")
> +			 (VNx4QI "VNx2QI")  (VNx2QI "QI")
> +			 (VNx8HI "VNx4HI")  (VNx4HI "VNx2HI") (VNx2HI "HI")
> +			 (VNx8HF "VNx4HF")  (VNx4HF "VNx2HF") (VNx2HF "HF")
> +			 (VNx8BF "VNx4BF")  (VNx4BF "VNx2BF") (VNx2BF "BF")
> +			 (VNx4SI "VNx2SI")  (VNx2SI "SI")
> +			 (VNx4SF "VNx2SF")  (VNx2SF "SF")
> +			 (VNx2DI "DI")      (VNx2DF "DF")])

Are the x2 entries necessary, given that the new uses are restricted
to NO2E?

Thanks,
Richard

>  ;; Half modes of all vector modes, in lower-case.
>  (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
> @@ -1686,7 +1702,15 @@ (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
>  			 (V8HF  "v4hf") (V8BF  "v4bf")
>  			 (V2SI "si")    (V4SI  "v2si")
>  			 (V2DI "di")    (V2SF  "sf")
> -			 (V4SF "v2sf")  (V2DF  "df")])
> +			 (V4SF "v2sf")  (V2DF  "df")
> +			 (VNx16QI "vnx8qi") (VNx8QI "vnx4qi")
> +			 (VNx4QI "vnx2qi")  (VNx2QI "qi")
> +			 (VNx8HI "vnx4hi")  (VNx4HI "vnx2hi") (VNx2HI "hi")
> +			 (VNx8HF "vnx4hf")  (VNx4HF "vnx2hf") (VNx2HF "hf")
> +			 (VNx8BF "vnx4bf")  (VNx4BF "vnx2bf") (VNx2BF "bf")
> +			 (VNx4SI "vnx2si")  (VNx2SI "si")
> +			 (VNx4SF "vnx2sf")  (VNx2SF "sf")
> +			 (VNx2DI "di")      (VNx2DF "df")])
>  
>  ;; Single-element half modes of quad vector modes.
>  (define_mode_attr V1HALF [(V2DI "V1DI")  (V2DF  "V1DF")])
> @@ -1815,7 +1839,10 @@ (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
>  ;; Widened scalar register suffixes.
>  (define_mode_attr Vwstype [(V8QI "h") (V4HI "s")
>  			  (V2SI "") (V16QI "h")
> -			  (V8HI "s") (V4SI "d")])
> +			  (V8HI "s") (V4SI "d")
> +			  (VNx8QI "h") (VNx4QI "h")
> +			  (VNx4HF "s") (VNx4HI "s")
> +			  (VNx8BF "h") (VNx4BF "h")])
>  ;; Add a .1d for V2SI.
>  (define_mode_attr Vwsuf [(V8QI "") (V4HI "")
>  			  (V2SI ".1d") (V16QI "")
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a25cae2708dd18cc91a7732f845419bbdb06c5c1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile }  */
> +/* { dg-options "-std=c99" } */
> +/* { dg-additional-options "-O3 -march=armv8-a" } */
> +
> +#pragma GCC target ("+sve")
> +extern char __attribute__ ((simd, const)) fn3 (int, char);
> +void test_fn3 (int *a, int *b, char *c, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
> +}
> +
> +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */
  
Tamar Christina Dec. 13, 2024, 9:37 p.m. UTC | #3
> >  ;; 2 element quad vector modes.
> >  (define_mode_iterator VQ_2E [V2DI V2DF])
> >
> > @@ -1678,7 +1686,15 @@ (define_mode_attr VHALF [(V8QI "V4QI")  (V16QI
> "V8QI")
> >  			 (V2DI "DI")    (V2SF  "SF")
> >  			 (V4SF "V2SF")  (V4HF "V2HF")
> >  			 (V8HF "V4HF")  (V2DF  "DF")
> > -			 (V8BF "V4BF")])
> > +			 (V8BF "V4BF")
> > +			 (VNx16QI "VNx8QI") (VNx8QI "VNx4QI")
> > +			 (VNx4QI "VNx2QI")  (VNx2QI "QI")
> > +			 (VNx8HI "VNx4HI")  (VNx4HI "VNx2HI") (VNx2HI "HI")
> > +			 (VNx8HF "VNx4HF")  (VNx4HF "VNx2HF") (VNx2HF "HF")
> > +			 (VNx8BF "VNx4BF")  (VNx4BF "VNx2BF") (VNx2BF "BF")
> > +			 (VNx4SI "VNx2SI")  (VNx2SI "SI")
> > +			 (VNx4SF "VNx2SF")  (VNx2SF "SF")
> > +			 (VNx2DI "DI")      (VNx2DF "DF")])
> 
> Are the x2 entries necessary, given that the new uses are restricted
> to NO2E?
> 

No, but I wanted to keep the symmetry with the Adv. SIMD modes.   Since the
mode attributes don't really control the number of alternatives I thought it would
be better to have the attributes be "fully" defined rather than only the subset I use.

gcc/ChangeLog:

	PR target/96342
	* config/aarch64/aarch64-sve.md (vec_init<mode><Vhalf>): New.
	(@aarch64_pack_partial<mode>): New.
	* config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init_subvector): New.
	* config/aarch64/iterators.md (SVE_NO2E): New.
	(VHALF, Vhalf): Add SVE partial vectors.

gcc/testsuite/ChangeLog:

	PR target/96342
	* gcc.target/aarch64/vect-simd-clone-2.c: New test.

Bootstrapped and regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu
(-m32 and -m64) with no issues.

Ok for master?

Thanks,
Tamar

-- inline copy of patch --

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index a72ca2a500d394598268c6adfe717eed94a304b3..8ed4221dbe5c49db97b37f186365fa391900eadb 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2839,6 +2839,16 @@ (define_expand "vec_init<mode><Vel>"
   }
 )
 
+(define_expand "vec_init<mode><Vhalf>"
+  [(match_operand:SVE_NO2E 0 "register_operand")
+   (match_operand 1 "")]
+  "TARGET_SVE"
+  {
+    aarch64_sve_expand_vector_init (operands[0], operands[1]);
+    DONE;
+  }
+)
+
 ;; Shift an SVE vector left and insert a scalar into element 0.
 (define_insn "vec_shl_insert_<mode>"
   [(set (match_operand:SVE_FULL 0 "register_operand")
@@ -9289,6 +9299,19 @@ (define_insn "vec_pack_trunc_<Vwide>"
   "uzp1\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; Integer partial pack packing two partial SVE types into a single full SVE
+;; type of the same element type.  Use UZP1 on the wider type, which discards
+;; the high part of each wide element.  This allows concatenating SVE partial
+;; types into a wider vector.
+(define_insn "@aarch64_pack_partial<mode>"
+  [(set (match_operand:SVE_NO2E 0 "register_operand" "=w")
+	(vec_concat:SVE_NO2E
+	  (match_operand:<VHALF> 1 "register_operand" "w")
+	  (match_operand:<VHALF> 2 "register_operand" "w")))]
+  "TARGET_SVE"
+  "uzp1\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>"
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT<-INT] Unpacks
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index de4c0a0783912b54ac35d7c818c24574b27a4ca0..40214e318f3c4e30e619d96073b253887c973efc 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24859,6 +24859,17 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
     v.quick_push (XVECEXP (vals, 0, i));
   v.finalize ();
 
+  /* If we have two elements and are concatenating vectors.  */
+  machine_mode elem_mode = GET_MODE (v.elt (0));
+  if (nelts == 2 && VECTOR_MODE_P (elem_mode))
+    {
+      /* We've failed expansion using a dup.  Try using a cheeky truncate. */
+      rtx arg0 = force_reg (elem_mode, v.elt(0));
+      rtx arg1 = force_reg (elem_mode, v.elt(1));
+      emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
+      return;
+    }
+
   /* If neither sub-vectors of v could be initialized specially,
      then use INSR to insert all elements from v into TARGET.
      ??? This might not be optimal for vectors with large
@@ -24870,6 +24881,30 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
     aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
 }
 
+/* Initialize register TARGET from the two vector subelements in PARALLEL
+   rtx VALS.  */
+
+void
+aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals)
+{
+  machine_mode mode = GET_MODE (target);
+  int nelts = XVECLEN (vals, 0);
+
+  gcc_assert (nelts == 2);
+
+  rtx arg0 = XVECEXP (vals, 0, 0);
+  rtx arg1 = XVECEXP (vals, 0, 1);
+
+  /* If we have two elements and are concatenating vectors.  */
+  machine_mode elem_mode = GET_MODE (arg0);
+  gcc_assert (VECTOR_MODE_P (elem_mode));
+
+  arg0 = force_reg (elem_mode, arg0);
+  arg1 = force_reg (elem_mode, arg1);
+  emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
+  return;
+}
+
 /* Check whether VALUE is a vector constant in which every element
    is either a power of 2 or a negated power of 2.  If so, return
    a constant vector of log2s, and flip CODE between PLUS and MINUS
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 89c72b24aeb791adbbd3edfdb131478d52b248e6..09c2d24c4b8f1f39c27ea691f7cfe0b51bc4f788 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -140,6 +140,10 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
 ;; VQ without 2 element modes.
 (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
 
+;; SVE modes without 2 element modes.
+(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF
+				VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
+
 ;; 2 element quad vector modes.
 (define_mode_iterator VQ_2E [V2DI V2DF])
 
@@ -1737,7 +1741,15 @@ (define_mode_attr VHALF [(V8QI "V4QI")  (V16QI "V8QI")
 			 (V2DI "DI")    (V2SF  "SF")
 			 (V4SF "V2SF")  (V4HF "V2HF")
 			 (V8HF "V4HF")  (V2DF  "DF")
-			 (V8BF "V4BF")])
+			 (V8BF "V4BF")
+			 (VNx16QI "VNx8QI") (VNx8QI "VNx4QI")
+			 (VNx4QI "VNx2QI")  (VNx2QI "QI")
+			 (VNx8HI "VNx4HI")  (VNx4HI "VNx2HI") (VNx2HI "HI")
+			 (VNx8HF "VNx4HF")  (VNx4HF "VNx2HF") (VNx2HF "HF")
+			 (VNx8BF "VNx4BF")  (VNx4BF "VNx2BF") (VNx2BF "BF")
+			 (VNx4SI "VNx2SI")  (VNx2SI "SI")
+			 (VNx4SF "VNx2SF")  (VNx2SF "SF")
+			 (VNx2DI "DI")      (VNx2DF "DF")])
 
 ;; Half modes of all vector modes, in lower-case.
 (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
@@ -1745,7 +1757,15 @@ (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
 			 (V8HF  "v4hf") (V8BF  "v4bf")
 			 (V2SI "si")    (V4SI  "v2si")
 			 (V2DI "di")    (V2SF  "sf")
-			 (V4SF "v2sf")  (V2DF  "df")])
+			 (V4SF "v2sf")  (V2DF  "df")
+			 (VNx16QI "vnx8qi") (VNx8QI "vnx4qi")
+			 (VNx4QI "vnx2qi")  (VNx2QI "qi")
+			 (VNx8HI "vnx4hi")  (VNx4HI "vnx2hi") (VNx2HI "hi")
+			 (VNx8HF "vnx4hf")  (VNx4HF "vnx2hf") (VNx2HF "hf")
+			 (VNx8BF "vnx4bf")  (VNx4BF "vnx2bf") (VNx2BF "bf")
+			 (VNx4SI "vnx2si")  (VNx2SI "si")
+			 (VNx4SF "vnx2sf")  (VNx2SF "sf")
+			 (VNx2DI "di")      (VNx2DF "df")])
 
 ;; Single-element half modes of quad vector modes.
 (define_mode_attr V1HALF [(V2DI "V1DI")  (V2DF  "V1DF")])
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a25cae2708dd18cc91a7732f845419bbdb06c5c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile }  */
+/* { dg-options "-std=c99" } */
+/* { dg-additional-options "-O3 -march=armv8-a" } */
+
+#pragma GCC target ("+sve")
+extern char __attribute__ ((simd, const)) fn3 (int, char);
+void test_fn3 (int *a, int *b, char *c, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
+}
+
+/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */
  
Richard Sandiford Dec. 19, 2024, 11:03 a.m. UTC | #4
Tamar Christina <Tamar.Christina@arm.com> writes:
>> >  ;; 2 element quad vector modes.
>> >  (define_mode_iterator VQ_2E [V2DI V2DF])
>> >
>> > @@ -1678,7 +1686,15 @@ (define_mode_attr VHALF [(V8QI "V4QI")  (V16QI
>> "V8QI")
>> >  			 (V2DI "DI")    (V2SF  "SF")
>> >  			 (V4SF "V2SF")  (V4HF "V2HF")
>> >  			 (V8HF "V4HF")  (V2DF  "DF")
>> > -			 (V8BF "V4BF")])
>> > +			 (V8BF "V4BF")
>> > +			 (VNx16QI "VNx8QI") (VNx8QI "VNx4QI")
>> > +			 (VNx4QI "VNx2QI")  (VNx2QI "QI")
>> > +			 (VNx8HI "VNx4HI")  (VNx4HI "VNx2HI") (VNx2HI "HI")
>> > +			 (VNx8HF "VNx4HF")  (VNx4HF "VNx2HF") (VNx2HF "HF")
>> > +			 (VNx8BF "VNx4BF")  (VNx4BF "VNx2BF") (VNx2BF "BF")
>> > +			 (VNx4SI "VNx2SI")  (VNx2SI "SI")
>> > +			 (VNx4SF "VNx2SF")  (VNx2SF "SF")
>> > +			 (VNx2DI "DI")      (VNx2DF "DF")])
>> 
>> Are the x2 entries necessary, given that the new uses are restricted
>> to NO2E?
>> 
>
> No, but I wanted to keep the symmetry with the Adv. SIMD modes.   Since the
> mode attributes don't really control the number of alternatives I thought it would
> be better to have the attributes be "fully" defined rather than only the subset I use.

But these are variable-length modes, so DI is only half of VNx2DI for
the minimum vector length.  It's less than half for Neoverse V1 or A64FX.

IMO it'd be better to leave them out for now and defined them when needed,
at which point the right choice would be more obvious.

Thanks,
Richard

>
> gcc/ChangeLog:
>
> 	PR target/96342
> 	* config/aarch64/aarch64-sve.md (vec_init<mode><Vhalf>): New.
> 	(@aarch64_pack_partial<mode>): New.
> 	* config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init_subvector): New.
> 	* config/aarch64/iterators.md (SVE_NO2E): New.
> 	(VHALF, Vhalf): Add SVE partial vectors.
>
> gcc/testsuite/ChangeLog:
>
> 	PR target/96342
> 	* gcc.target/aarch64/vect-simd-clone-2.c: New test.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu
> (-m32 and -m64) with no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> -- inline copy of patch --
>
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index a72ca2a500d394598268c6adfe717eed94a304b3..8ed4221dbe5c49db97b37f186365fa391900eadb 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -2839,6 +2839,16 @@ (define_expand "vec_init<mode><Vel>"
>    }
>  )
>  
> +(define_expand "vec_init<mode><Vhalf>"
> +  [(match_operand:SVE_NO2E 0 "register_operand")
> +   (match_operand 1 "")]
> +  "TARGET_SVE"
> +  {
> +    aarch64_sve_expand_vector_init (operands[0], operands[1]);
> +    DONE;
> +  }
> +)
> +
>  ;; Shift an SVE vector left and insert a scalar into element 0.
>  (define_insn "vec_shl_insert_<mode>"
>    [(set (match_operand:SVE_FULL 0 "register_operand")
> @@ -9289,6 +9299,19 @@ (define_insn "vec_pack_trunc_<Vwide>"
>    "uzp1\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
>  )
>  
> +;; Integer partial pack packing two partial SVE types into a single full SVE
> +;; type of the same element type.  Use UZP1 on the wider type, which discards
> +;; the high part of each wide element.  This allows concatenating SVE partial
> +;; types into a wider vector.
> +(define_insn "@aarch64_pack_partial<mode>"
> +  [(set (match_operand:SVE_NO2E 0 "register_operand" "=w")
> +	(vec_concat:SVE_NO2E
> +	  (match_operand:<VHALF> 1 "register_operand" "w")
> +	  (match_operand:<VHALF> 2 "register_operand" "w")))]
> +  "TARGET_SVE"
> +  "uzp1\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>"
> +)
> +
>  ;; -------------------------------------------------------------------------
>  ;; ---- [INT<-INT] Unpacks
>  ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index de4c0a0783912b54ac35d7c818c24574b27a4ca0..40214e318f3c4e30e619d96073b253887c973efc 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -24859,6 +24859,17 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
>      v.quick_push (XVECEXP (vals, 0, i));
>    v.finalize ();
>  
> +  /* If we have two elements and are concatenating vectors.  */
> +  machine_mode elem_mode = GET_MODE (v.elt (0));
> +  if (nelts == 2 && VECTOR_MODE_P (elem_mode))
> +    {
> +      /* We've failed expansion using a dup.  Try using a cheeky truncate. */
> +      rtx arg0 = force_reg (elem_mode, v.elt(0));
> +      rtx arg1 = force_reg (elem_mode, v.elt(1));
> +      emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
> +      return;
> +    }
> +
>    /* If neither sub-vectors of v could be initialized specially,
>       then use INSR to insert all elements from v into TARGET.
>       ??? This might not be optimal for vectors with large
> @@ -24870,6 +24881,30 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
>      aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
>  }
>  
> +/* Initialize register TARGET from the two vector subelements in PARALLEL
> +   rtx VALS.  */
> +
> +void
> +aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals)
> +{
> +  machine_mode mode = GET_MODE (target);
> +  int nelts = XVECLEN (vals, 0);
> +
> +  gcc_assert (nelts == 2);
> +
> +  rtx arg0 = XVECEXP (vals, 0, 0);
> +  rtx arg1 = XVECEXP (vals, 0, 1);
> +
> +  /* If we have two elements and are concatenating vectors.  */
> +  machine_mode elem_mode = GET_MODE (arg0);
> +  gcc_assert (VECTOR_MODE_P (elem_mode));
> +
> +  arg0 = force_reg (elem_mode, arg0);
> +  arg1 = force_reg (elem_mode, arg1);
> +  emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
> +  return;
> +}
> +
>  /* Check whether VALUE is a vector constant in which every element
>     is either a power of 2 or a negated power of 2.  If so, return
>     a constant vector of log2s, and flip CODE between PLUS and MINUS
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 89c72b24aeb791adbbd3edfdb131478d52b248e6..09c2d24c4b8f1f39c27ea691f7cfe0b51bc4f788 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -140,6 +140,10 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
>  ;; VQ without 2 element modes.
>  (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
>  
> +;; SVE modes without 2 element modes.
> +(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF
> +				VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
> +
>  ;; 2 element quad vector modes.
>  (define_mode_iterator VQ_2E [V2DI V2DF])
>  
> @@ -1737,7 +1741,15 @@ (define_mode_attr VHALF [(V8QI "V4QI")  (V16QI "V8QI")
>  			 (V2DI "DI")    (V2SF  "SF")
>  			 (V4SF "V2SF")  (V4HF "V2HF")
>  			 (V8HF "V4HF")  (V2DF  "DF")
> -			 (V8BF "V4BF")])
> +			 (V8BF "V4BF")
> +			 (VNx16QI "VNx8QI") (VNx8QI "VNx4QI")
> +			 (VNx4QI "VNx2QI")  (VNx2QI "QI")
> +			 (VNx8HI "VNx4HI")  (VNx4HI "VNx2HI") (VNx2HI "HI")
> +			 (VNx8HF "VNx4HF")  (VNx4HF "VNx2HF") (VNx2HF "HF")
> +			 (VNx8BF "VNx4BF")  (VNx4BF "VNx2BF") (VNx2BF "BF")
> +			 (VNx4SI "VNx2SI")  (VNx2SI "SI")
> +			 (VNx4SF "VNx2SF")  (VNx2SF "SF")
> +			 (VNx2DI "DI")      (VNx2DF "DF")])
>  
>  ;; Half modes of all vector modes, in lower-case.
>  (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
> @@ -1745,7 +1757,15 @@ (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
>  			 (V8HF  "v4hf") (V8BF  "v4bf")
>  			 (V2SI "si")    (V4SI  "v2si")
>  			 (V2DI "di")    (V2SF  "sf")
> -			 (V4SF "v2sf")  (V2DF  "df")])
> +			 (V4SF "v2sf")  (V2DF  "df")
> +			 (VNx16QI "vnx8qi") (VNx8QI "vnx4qi")
> +			 (VNx4QI "vnx2qi")  (VNx2QI "qi")
> +			 (VNx8HI "vnx4hi")  (VNx4HI "vnx2hi") (VNx2HI "hi")
> +			 (VNx8HF "vnx4hf")  (VNx4HF "vnx2hf") (VNx2HF "hf")
> +			 (VNx8BF "vnx4bf")  (VNx4BF "vnx2bf") (VNx2BF "bf")
> +			 (VNx4SI "vnx2si")  (VNx2SI "si")
> +			 (VNx4SF "vnx2sf")  (VNx2SF "sf")
> +			 (VNx2DI "di")      (VNx2DF "df")])
>  
>  ;; Single-element half modes of quad vector modes.
>  (define_mode_attr V1HALF [(V2DI "V1DI")  (V2DF  "V1DF")])
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a25cae2708dd18cc91a7732f845419bbdb06c5c1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile }  */
> +/* { dg-options "-std=c99" } */
> +/* { dg-additional-options "-O3 -march=armv8-a" } */
> +
> +#pragma GCC target ("+sve")
> +extern char __attribute__ ((simd, const)) fn3 (int, char);
> +void test_fn3 (int *a, int *b, char *c, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
> +}
> +
> +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */
  
Tamar Christina Dec. 19, 2024, 12:36 p.m. UTC | #5
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Thursday, December 19, 2024 11:03 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; ktkachov@gcc.gnu.org
> Subject: Re: [PATCH 7/7] AArch64: Implement vector concat of partial SVE vectors
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> >  ;; 2 element quad vector modes.
> >> >  (define_mode_iterator VQ_2E [V2DI V2DF])
> >> >
> >> > @@ -1678,7 +1686,15 @@ (define_mode_attr VHALF [(V8QI "V4QI")
> (V16QI
> >> "V8QI")
> >> >  			 (V2DI "DI")    (V2SF  "SF")
> >> >  			 (V4SF "V2SF")  (V4HF "V2HF")
> >> >  			 (V8HF "V4HF")  (V2DF  "DF")
> >> > -			 (V8BF "V4BF")])
> >> > +			 (V8BF "V4BF")
> >> > +			 (VNx16QI "VNx8QI") (VNx8QI "VNx4QI")
> >> > +			 (VNx4QI "VNx2QI")  (VNx2QI "QI")
> >> > +			 (VNx8HI "VNx4HI")  (VNx4HI "VNx2HI") (VNx2HI "HI")
> >> > +			 (VNx8HF "VNx4HF")  (VNx4HF "VNx2HF") (VNx2HF "HF")
> >> > +			 (VNx8BF "VNx4BF")  (VNx4BF "VNx2BF") (VNx2BF "BF")
> >> > +			 (VNx4SI "VNx2SI")  (VNx2SI "SI")
> >> > +			 (VNx4SF "VNx2SF")  (VNx2SF "SF")
> >> > +			 (VNx2DI "DI")      (VNx2DF "DF")])
> >>
> >> Are the x2 entries necessary, given that the new uses are restricted
> >> to NO2E?
> >>
> >
> > No, but I wanted to keep the symmetry with the Adv. SIMD modes.   Since the
> > mode attributes don't really control the number of alternatives I thought it
> would
> > be better to have the attributes be "fully" defined rather than only the subset I
> use.
> 
> But these are variable-length modes, so DI is only half of VNx2DI for
> the minimum vector length.  It's less than half for Neoverse V1 or A64FX.
> 
> IMO it'd be better to leave them out for now and defined them when needed,
> at which point the right choice would be more obvious.
> 

OK.

gcc/ChangeLog:

	PR target/96342
	* config/aarch64/aarch64-sve.md (vec_init<mode><Vhalf>): New.
	(@aarch64_pack_partial<mode>): New.
	* config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init_subvector): New.
	* config/aarch64/iterators.md (SVE_NO2E): New.
	(VHALF, Vhalf): Add SVE partial vectors.

gcc/testsuite/ChangeLog:

	PR target/96342
	* gcc.target/aarch64/vect-simd-clone-2.c: New test.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

-- inline copy of patch --

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index a72ca2a500d394598268c6adfe717eed94a304b3..8ed4221dbe5c49db97b37f186365fa391900eadb 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2839,6 +2839,16 @@ (define_expand "vec_init<mode><Vel>"
   }
 )
 
+(define_expand "vec_init<mode><Vhalf>"
+  [(match_operand:SVE_NO2E 0 "register_operand")
+   (match_operand 1 "")]
+  "TARGET_SVE"
+  {
+    aarch64_sve_expand_vector_init (operands[0], operands[1]);
+    DONE;
+  }
+)
+
 ;; Shift an SVE vector left and insert a scalar into element 0.
 (define_insn "vec_shl_insert_<mode>"
   [(set (match_operand:SVE_FULL 0 "register_operand")
@@ -9289,6 +9299,19 @@ (define_insn "vec_pack_trunc_<Vwide>"
   "uzp1\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; Integer partial pack packing two partial SVE types into a single full SVE
+;; type of the same element type.  Use UZP1 on the wider type, which discards
+;; the high part of each wide element.  This allows concatenating SVE partial
+;; types into a wider vector.
+(define_insn "@aarch64_pack_partial<mode>"
+  [(set (match_operand:SVE_NO2E 0 "register_operand" "=w")
+	(vec_concat:SVE_NO2E
+	  (match_operand:<VHALF> 1 "register_operand" "w")
+	  (match_operand:<VHALF> 2 "register_operand" "w")))]
+  "TARGET_SVE"
+  "uzp1\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>"
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT<-INT] Unpacks
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index de4c0a0783912b54ac35d7c818c24574b27a4ca0..40214e318f3c4e30e619d96073b253887c973efc 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24859,6 +24859,17 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
     v.quick_push (XVECEXP (vals, 0, i));
   v.finalize ();
 
+  /* If we have two elements and are concatenating vectors.  */
+  machine_mode elem_mode = GET_MODE (v.elt (0));
+  if (nelts == 2 && VECTOR_MODE_P (elem_mode))
+    {
+      /* We've failed expansion using a dup.  Try using a cheeky truncate. */
+      rtx arg0 = force_reg (elem_mode, v.elt(0));
+      rtx arg1 = force_reg (elem_mode, v.elt(1));
+      emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
+      return;
+    }
+
   /* If neither sub-vectors of v could be initialized specially,
      then use INSR to insert all elements from v into TARGET.
      ??? This might not be optimal for vectors with large
@@ -24870,6 +24881,30 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
     aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
 }
 
+/* Initialize register TARGET from the two vector subelements in PARALLEL
+   rtx VALS.  */
+
+void
+aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals)
+{
+  machine_mode mode = GET_MODE (target);
+  int nelts = XVECLEN (vals, 0);
+
+  gcc_assert (nelts == 2);
+
+  rtx arg0 = XVECEXP (vals, 0, 0);
+  rtx arg1 = XVECEXP (vals, 0, 1);
+
+  /* If we have two elements and are concatenating vectors.  */
+  machine_mode elem_mode = GET_MODE (arg0);
+  gcc_assert (VECTOR_MODE_P (elem_mode));
+
+  arg0 = force_reg (elem_mode, arg0);
+  arg1 = force_reg (elem_mode, arg1);
+  emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
+  return;
+}
+
 /* Check whether VALUE is a vector constant in which every element
    is either a power of 2 or a negated power of 2.  If so, return
    a constant vector of log2s, and flip CODE between PLUS and MINUS
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 89c72b24aeb791adbbd3edfdb131478d52b248e6..34200b05a3abf6d51919313de1027aa4988bcb8d 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -140,6 +140,10 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
 ;; VQ without 2 element modes.
 (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
 
+;; SVE modes without 2 element modes.
+(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF
+				VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
+
 ;; 2 element quad vector modes.
 (define_mode_iterator VQ_2E [V2DI V2DF])
 
@@ -1737,7 +1741,13 @@ (define_mode_attr VHALF [(V8QI "V4QI")  (V16QI "V8QI")
 			 (V2DI "DI")    (V2SF  "SF")
 			 (V4SF "V2SF")  (V4HF "V2HF")
 			 (V8HF "V4HF")  (V2DF  "DF")
-			 (V8BF "V4BF")])
+			 (V8BF "V4BF")
+			 (VNx16QI "VNx8QI") (VNx8QI "VNx4QI")
+			 (VNx4QI "VNx2QI")
+			 (VNx8HI "VNx4HI")  (VNx4HI "VNx2HI")
+			 (VNx8HF "VNx4HF")  (VNx4HF "VNx2HF")
+			 (VNx8BF "VNx4BF")  (VNx4BF "VNx2BF")
+			 (VNx4SI "VNx2SI")  (VNx4SF "VNx2SF")])
 
 ;; Half modes of all vector modes, in lower-case.
 (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
@@ -1745,7 +1755,13 @@ (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
 			 (V8HF  "v4hf") (V8BF  "v4bf")
 			 (V2SI "si")    (V4SI  "v2si")
 			 (V2DI "di")    (V2SF  "sf")
-			 (V4SF "v2sf")  (V2DF  "df")])
+			 (V4SF "v2sf")  (V2DF  "df")
+			 (VNx16QI "vnx8qi") (VNx8QI "vnx4qi")
+			 (VNx4QI "vnx2qi")
+			 (VNx8HI "vnx4hi")  (VNx4HI "vnx2hi")
+			 (VNx8HF "vnx4hf")  (VNx4HF "vnx2hf")
+			 (VNx8BF "vnx4bf")  (VNx4BF "vnx2bf")
+			 (VNx4SI "vnx2si")  (VNx4SF "vnx2sf")])
 
 ;; Single-element half modes of quad vector modes.
 (define_mode_attr V1HALF [(V2DI "V1DI")  (V2DF  "V1DF")])
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a25cae2708dd18cc91a7732f845419bbdb06c5c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile }  */
+/* { dg-options "-std=c99" } */
+/* { dg-additional-options "-O3 -march=armv8-a" } */
+
+#pragma GCC target ("+sve")
+extern char __attribute__ ((simd, const)) fn3 (int, char);
+void test_fn3 (int *a, int *b, char *c, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
+}
+
+/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */
  
Richard Sandiford Dec. 19, 2024, 1:38 p.m. UTC | #6
Tamar Christina <Tamar.Christina@arm.com> writes:
> gcc/ChangeLog:
>
> 	PR target/96342
> 	* config/aarch64/aarch64-sve.md (vec_init<mode><Vhalf>): New.
> 	(@aarch64_pack_partial<mode>): New.
> 	* config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init_subvector): New.
> 	* config/aarch64/iterators.md (SVE_NO2E): New.
> 	(VHALF, Vhalf): Add SVE partial vectors.
>
> gcc/testsuite/ChangeLog:
>
> 	PR target/96342
> 	* gcc.target/aarch64/vect-simd-clone-2.c: New test.

OK, thanks.

Richard

> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> -- inline copy of patch --
>
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index a72ca2a500d394598268c6adfe717eed94a304b3..8ed4221dbe5c49db97b37f186365fa391900eadb 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -2839,6 +2839,16 @@ (define_expand "vec_init<mode><Vel>"
>    }
>  )
>  
> +(define_expand "vec_init<mode><Vhalf>"
> +  [(match_operand:SVE_NO2E 0 "register_operand")
> +   (match_operand 1 "")]
> +  "TARGET_SVE"
> +  {
> +    aarch64_sve_expand_vector_init (operands[0], operands[1]);
> +    DONE;
> +  }
> +)
> +
>  ;; Shift an SVE vector left and insert a scalar into element 0.
>  (define_insn "vec_shl_insert_<mode>"
>    [(set (match_operand:SVE_FULL 0 "register_operand")
> @@ -9289,6 +9299,19 @@ (define_insn "vec_pack_trunc_<Vwide>"
>    "uzp1\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
>  )
>  
> +;; Integer partial pack packing two partial SVE types into a single full SVE
> +;; type of the same element type.  Use UZP1 on the wider type, which discards
> +;; the high part of each wide element.  This allows concatenating SVE partial
> +;; types into a wider vector.
> +(define_insn "@aarch64_pack_partial<mode>"
> +  [(set (match_operand:SVE_NO2E 0 "register_operand" "=w")
> +	(vec_concat:SVE_NO2E
> +	  (match_operand:<VHALF> 1 "register_operand" "w")
> +	  (match_operand:<VHALF> 2 "register_operand" "w")))]
> +  "TARGET_SVE"
> +  "uzp1\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>"
> +)
> +
>  ;; -------------------------------------------------------------------------
>  ;; ---- [INT<-INT] Unpacks
>  ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index de4c0a0783912b54ac35d7c818c24574b27a4ca0..40214e318f3c4e30e619d96073b253887c973efc 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -24859,6 +24859,17 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
>      v.quick_push (XVECEXP (vals, 0, i));
>    v.finalize ();
>  
> +  /* If we have two elements and are concatenating vectors.  */
> +  machine_mode elem_mode = GET_MODE (v.elt (0));
> +  if (nelts == 2 && VECTOR_MODE_P (elem_mode))
> +    {
> +      /* We've failed expansion using a dup.  Try using a cheeky truncate. */
> +      rtx arg0 = force_reg (elem_mode, v.elt(0));
> +      rtx arg1 = force_reg (elem_mode, v.elt(1));
> +      emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
> +      return;
> +    }
> +
>    /* If neither sub-vectors of v could be initialized specially,
>       then use INSR to insert all elements from v into TARGET.
>       ??? This might not be optimal for vectors with large
> @@ -24870,6 +24881,30 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
>      aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
>  }
>  
> +/* Initialize register TARGET from the two vector subelements in PARALLEL
> +   rtx VALS.  */
> +
> +void
> +aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals)
> +{
> +  machine_mode mode = GET_MODE (target);
> +  int nelts = XVECLEN (vals, 0);
> +
> +  gcc_assert (nelts == 2);
> +
> +  rtx arg0 = XVECEXP (vals, 0, 0);
> +  rtx arg1 = XVECEXP (vals, 0, 1);
> +
> +  /* If we have two elements and are concatenating vectors.  */
> +  machine_mode elem_mode = GET_MODE (arg0);
> +  gcc_assert (VECTOR_MODE_P (elem_mode));
> +
> +  arg0 = force_reg (elem_mode, arg0);
> +  arg1 = force_reg (elem_mode, arg1);
> +  emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
> +  return;
> +}
> +
>  /* Check whether VALUE is a vector constant in which every element
>     is either a power of 2 or a negated power of 2.  If so, return
>     a constant vector of log2s, and flip CODE between PLUS and MINUS
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 89c72b24aeb791adbbd3edfdb131478d52b248e6..34200b05a3abf6d51919313de1027aa4988bcb8d 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -140,6 +140,10 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
>  ;; VQ without 2 element modes.
>  (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
>  
> +;; SVE modes without 2 element modes.
> +(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF
> +				VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
> +
>  ;; 2 element quad vector modes.
>  (define_mode_iterator VQ_2E [V2DI V2DF])
>  
> @@ -1737,7 +1741,13 @@ (define_mode_attr VHALF [(V8QI "V4QI")  (V16QI "V8QI")
>  			 (V2DI "DI")    (V2SF  "SF")
>  			 (V4SF "V2SF")  (V4HF "V2HF")
>  			 (V8HF "V4HF")  (V2DF  "DF")
> -			 (V8BF "V4BF")])
> +			 (V8BF "V4BF")
> +			 (VNx16QI "VNx8QI") (VNx8QI "VNx4QI")
> +			 (VNx4QI "VNx2QI")
> +			 (VNx8HI "VNx4HI")  (VNx4HI "VNx2HI")
> +			 (VNx8HF "VNx4HF")  (VNx4HF "VNx2HF")
> +			 (VNx8BF "VNx4BF")  (VNx4BF "VNx2BF")
> +			 (VNx4SI "VNx2SI")  (VNx4SF "VNx2SF")])
>  
>  ;; Half modes of all vector modes, in lower-case.
>  (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
> @@ -1745,7 +1755,13 @@ (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
>  			 (V8HF  "v4hf") (V8BF  "v4bf")
>  			 (V2SI "si")    (V4SI  "v2si")
>  			 (V2DI "di")    (V2SF  "sf")
> -			 (V4SF "v2sf")  (V2DF  "df")])
> +			 (V4SF "v2sf")  (V2DF  "df")
> +			 (VNx16QI "vnx8qi") (VNx8QI "vnx4qi")
> +			 (VNx4QI "vnx2qi")
> +			 (VNx8HI "vnx4hi")  (VNx4HI "vnx2hi")
> +			 (VNx8HF "vnx4hf")  (VNx4HF "vnx2hf")
> +			 (VNx8BF "vnx4bf")  (VNx4BF "vnx2bf")
> +			 (VNx4SI "vnx2si")  (VNx4SF "vnx2sf")])
>  
>  ;; Single-element half modes of quad vector modes.
>  (define_mode_attr V1HALF [(V2DI "V1DI")  (V2DF  "V1DF")])
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a25cae2708dd18cc91a7732f845419bbdb06c5c1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile }  */
> +/* { dg-options "-std=c99" } */
> +/* { dg-additional-options "-O3 -march=armv8-a" } */
> +
> +#pragma GCC target ("+sve")
> +extern char __attribute__ ((simd, const)) fn3 (int, char);
> +void test_fn3 (int *a, int *b, char *c, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
> +}
> +
> +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */
  

Patch

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 9afd11d347626eeb640722fdba2ab763b8479aa7..9e3577be6e943d7a5c951196463873d4bcfee07c 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2840,6 +2840,16 @@  (define_expand "vec_init<mode><Vel>"
   }
 )
 
+(define_expand "vec_init<mode><Vhalf>"
+  [(match_operand:SVE_NO2E 0 "register_operand")
+    (match_operand 1 "")]
+  "TARGET_SVE"
+  {
+    aarch64_sve_expand_vector_init (operands[0], operands[1]);
+    DONE;
+  }
+)
+
 ;; Shift an SVE vector left and insert a scalar into element 0.
 (define_insn "vec_shl_insert_<mode>"
   [(set (match_operand:SVE_FULL 0 "register_operand")
@@ -9347,6 +9357,20 @@  (define_insn "vec_pack_trunc_<Vwide>"
   "uzp1\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; Integer partial pack packing two partial SVE types into a single full SVE
+;; type of the same element type.  Use UZP1 on the wider type, which discards
+;; the high part of each wide element.  This allows concatenating SVE partial
+;; types into a wider vector.
+(define_insn "@aarch64_pack_partial<mode>"
+  [(set (match_operand:SVE_PARTIAL_NO2E 0 "register_operand" "=w")
+	(unspec:SVE_PARTIAL_NO2E
+	  [(match_operand:<VHALF> 1 "register_operand" "w")
+	   (match_operand:<VHALF> 2 "register_operand" "w")]
+	  UNSPEC_PACK))]
+  "TARGET_SVE"
+  "uzp1\t%0.<Vwstype>, %1.<Vwstype>, %2.<Vwstype>"
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT<-INT] Unpacks
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index af6fede102c2be6673c24f8020d000ea56322997..690d54b0a2954327e00d559f96c414c81c2e18cd 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24790,6 +24790,17 @@  aarch64_sve_expand_vector_init (rtx target, rtx vals)
     v.quick_push (XVECEXP (vals, 0, i));
   v.finalize ();
 
+  /* If we have two elements and are concatenating vectors.  */
+  machine_mode elem_mode = GET_MODE (v.elt (0));
+  if (nelts == 2 && VECTOR_MODE_P (elem_mode))
+    {
+      /* We've failed expansion using a dup.  Try using a cheeky truncate. */
+      rtx arg0 = force_reg (elem_mode, v.elt(0));
+      rtx arg1 = force_reg (elem_mode, v.elt(1));
+      emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
+      return;
+    }
+
   /* If neither sub-vectors of v could be initialized specially,
      then use INSR to insert all elements from v into TARGET.
      ??? This might not be optimal for vectors with large
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 023893d35f3e955e222c322ce370e84c95c29ee6..77d23d6ad795630d3d5fb5c076c086a479d46fee 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -138,6 +138,14 @@  (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
 ;; VQ without 2 element modes.
 (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
 
+;; SVE modes without 2 element modes.
+(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF
+				VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
+
+;; Partial SVE modes without 2 element modes.
+(define_mode_iterator SVE_PARTIAL_NO2E [VNx8QI VNx4QI VNx4HI
+					VNx4HF VNx8BF VNx4BF])
+
 ;; 2 element quad vector modes.
 (define_mode_iterator VQ_2E [V2DI V2DF])
 
@@ -1678,7 +1686,15 @@  (define_mode_attr VHALF [(V8QI "V4QI")  (V16QI "V8QI")
 			 (V2DI "DI")    (V2SF  "SF")
 			 (V4SF "V2SF")  (V4HF "V2HF")
 			 (V8HF "V4HF")  (V2DF  "DF")
-			 (V8BF "V4BF")])
+			 (V8BF "V4BF")
+			 (VNx16QI "VNx8QI") (VNx8QI "VNx4QI")
+			 (VNx4QI "VNx2QI")  (VNx2QI "QI")
+			 (VNx8HI "VNx4HI")  (VNx4HI "VNx2HI") (VNx2HI "HI")
+			 (VNx8HF "VNx4HF")  (VNx4HF "VNx2HF") (VNx2HF "HF")
+			 (VNx8BF "VNx4BF")  (VNx4BF "VNx2BF") (VNx2BF "BF")
+			 (VNx4SI "VNx2SI")  (VNx2SI "SI")
+			 (VNx4SF "VNx2SF")  (VNx2SF "SF")
+			 (VNx2DI "DI")      (VNx2DF "DF")])
 
 ;; Half modes of all vector modes, in lower-case.
 (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
@@ -1686,7 +1702,15 @@  (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
 			 (V8HF  "v4hf") (V8BF  "v4bf")
 			 (V2SI "si")    (V4SI  "v2si")
 			 (V2DI "di")    (V2SF  "sf")
-			 (V4SF "v2sf")  (V2DF  "df")])
+			 (V4SF "v2sf")  (V2DF  "df")
+			 (VNx16QI "vnx8qi") (VNx8QI "vnx4qi")
+			 (VNx4QI "vnx2qi")  (VNx2QI "qi")
+			 (VNx8HI "vnx4hi")  (VNx4HI "vnx2hi") (VNx2HI "hi")
+			 (VNx8HF "vnx4hf")  (VNx4HF "vnx2hf") (VNx2HF "hf")
+			 (VNx8BF "vnx4bf")  (VNx4BF "vnx2bf") (VNx2BF "bf")
+			 (VNx4SI "vnx2si")  (VNx2SI "si")
+			 (VNx4SF "vnx2sf")  (VNx2SF "sf")
+			 (VNx2DI "di")      (VNx2DF "df")])
 
 ;; Single-element half modes of quad vector modes.
 (define_mode_attr V1HALF [(V2DI "V1DI")  (V2DF  "V1DF")])
@@ -1815,7 +1839,10 @@  (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
 ;; Widened scalar register suffixes.
 (define_mode_attr Vwstype [(V8QI "h") (V4HI "s")
 			  (V2SI "") (V16QI "h")
-			  (V8HI "s") (V4SI "d")])
+			  (V8HI "s") (V4SI "d")
+			  (VNx8QI "h") (VNx4QI "h")
+			  (VNx4HF "s") (VNx4HI "s")
+			  (VNx8BF "h") (VNx4BF "h")])
 ;; Add a .1d for V2SI.
 (define_mode_attr Vwsuf [(V8QI "") (V4HI "")
 			  (V2SI ".1d") (V16QI "")
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a25cae2708dd18cc91a7732f845419bbdb06c5c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile }  */
+/* { dg-options "-std=c99" } */
+/* { dg-additional-options "-O3 -march=armv8-a" } */
+
+#pragma GCC target ("+sve")
+extern char __attribute__ ((simd, const)) fn3 (int, char);
+void test_fn3 (int *a, int *b, char *c, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
+}
+
+/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */