[pushed,8/8] aarch64: Extend vec_concat patterns to 8-byte vectors

Message ID mptleyk55yc.fsf@arm.com
State Committed
Commit 83d7e720cd1d075312e798c4ebd2e093f03465fb
Series aarch64: Fix regression in vec_init code quality

Commit Message

Richard Sandiford Feb. 9, 2022, 5:02 p.m. UTC
  This patch extends the previous support for 16-byte vec_concat
so that it supports pairs of 4-byte elements.  This too isn't
strictly a regression fix, since the 8-byte forms weren't affected
by the same problems as the 16-byte forms, but it leaves things in
a more consistent state.
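
As a quick sketch of the effect (function names here are illustrative;
the expected code is taken from the new vec-init-17.c test below),
initialising an 8-byte vector from one 4-byte element plus zeros can
now go straight through the extended combine patterns:

    #include <arm_neon.h>

    /* Little-endian: now a single "fmov s0, w0".  */
    int32x2_t set_low(int32_t a0) {
      return (int32x2_t) { a0, 0 };
    }

    /* Little-endian: now a single "ldr s0, [x0]".  */
    int32x2_t load_low(int32_t *ptr) {
      return (int32x2_t) { ptr[0], 0 };
    }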

gcc/
	* config/aarch64/iterators.md (VDCSIF): New mode iterator.
	(VDBL): Handle SF.
	(single_wx, single_type, single_dtype, dblq): New mode attributes.
	* config/aarch64/aarch64-simd.md (load_pair_lanes<mode>): Extend
	from VDC to VDCSIF.
	(store_pair_lanes<mode>): Likewise.
	(*aarch64_combine_internal<mode>): Likewise.
	(*aarch64_combine_internal_be<mode>): Likewise.
	(*aarch64_combinez<mode>): Likewise.
	(*aarch64_combinez_be<mode>): Likewise.
	* config/aarch64/aarch64.cc (aarch64_classify_address): Handle
	8-byte modes for ADDR_QUERY_LDP_STP_N.
	(aarch64_print_operand): Likewise for %y.

gcc/testsuite/
	* gcc.target/aarch64/vec-init-13.c: New test.
	* gcc.target/aarch64/vec-init-14.c: Likewise.
	* gcc.target/aarch64/vec-init-15.c: Likewise.
	* gcc.target/aarch64/vec-init-16.c: Likewise.
	* gcc.target/aarch64/vec-init-17.c: Likewise.
---
 gcc/config/aarch64/aarch64-simd.md            |  72 +++++-----
 gcc/config/aarch64/aarch64.cc                 |  16 ++-
 gcc/config/aarch64/iterators.md               |  38 +++++-
 .../gcc.target/aarch64/vec-init-13.c          | 123 ++++++++++++++++++
 .../gcc.target/aarch64/vec-init-14.c          | 123 ++++++++++++++++++
 .../gcc.target/aarch64/vec-init-15.c          |  15 +++
 .../gcc.target/aarch64/vec-init-16.c          |  12 ++
 .../gcc.target/aarch64/vec-init-17.c          |  73 +++++++++++
 8 files changed, 430 insertions(+), 42 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-init-13.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-init-14.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-init-15.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-init-16.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-init-17.c
  

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ef6e772503d..18733428f3f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4243,12 +4243,12 @@  (define_insn_and_split "aarch64_get_lane<mode>"
 (define_insn "load_pair_lanes<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand" "=w")
 	(vec_concat:<VDBL>
-	   (match_operand:VDC 1 "memory_operand" "Utq")
-	   (match_operand:VDC 2 "memory_operand" "m")))]
+	   (match_operand:VDCSIF 1 "memory_operand" "Utq")
+	   (match_operand:VDCSIF 2 "memory_operand" "m")))]
   "TARGET_SIMD
    && aarch64_mergeable_load_pair_p (<VDBL>mode, operands[1], operands[2])"
-  "ldr\\t%q0, %1"
-  [(set_attr "type" "neon_load1_1reg_q")]
+  "ldr\\t%<single_dtype>0, %1"
+  [(set_attr "type" "neon_load1_1reg<dblq>")]
 )
 
 ;; This STP pattern is a partial duplicate of the general vec_concat patterns
@@ -4273,12 +4273,12 @@  (define_insn "load_pair_lanes<mode>"
 (define_insn "store_pair_lanes<mode>"
   [(set (match_operand:<VDBL> 0 "aarch64_mem_pair_lanes_operand" "=Umn, Umn")
 	(vec_concat:<VDBL>
-	   (match_operand:VDC 1 "register_operand" "w, r")
-	   (match_operand:VDC 2 "register_operand" "w, r")))]
+	   (match_operand:VDCSIF 1 "register_operand" "w, r")
+	   (match_operand:VDCSIF 2 "register_operand" "w, r")))]
   "TARGET_SIMD"
   "@
-   stp\\t%d1, %d2, %y0
-   stp\\t%x1, %x2, %y0"
+   stp\t%<single_type>1, %<single_type>2, %y0
+   stp\t%<single_wx>1, %<single_wx>2, %y0"
   [(set_attr "type" "neon_stp, store_16")]
 )
 
@@ -4292,37 +4292,37 @@  (define_insn "store_pair_lanes<mode>"
 (define_insn "*aarch64_combine_internal<mode>"
   [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn")
 	(vec_concat:<VDBL>
-	  (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r")
-	  (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, w, ?r")))]
+	  (match_operand:VDCSIF 1 "register_operand" "0, 0, 0, ?w, ?r")
+	  (match_operand:VDCSIF 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, w, ?r")))]
   "TARGET_SIMD
    && !BYTES_BIG_ENDIAN
    && (register_operand (operands[0], <VDBL>mode)
        || register_operand (operands[2], <MODE>mode))"
   "@
-   ins\t%0.d[1], %2.d[0]
-   ins\t%0.d[1], %2
-   ld1\t{%0.d}[1], %2
-   stp\t%d1, %d2, %y0
-   stp\t%x1, %x2, %y0"
-  [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")]
+   ins\t%0.<single_type>[1], %2.<single_type>[0]
+   ins\t%0.<single_type>[1], %<single_wx>2
+   ld1\t{%0.<single_type>}[1], %2
+   stp\t%<single_type>1, %<single_type>2, %y0
+   stp\t%<single_wx>1, %<single_wx>2, %y0"
+  [(set_attr "type" "neon_ins<dblq>, neon_from_gp<dblq>, neon_load1_one_lane<dblq>, neon_stp, store_16")]
 )
 
 (define_insn "*aarch64_combine_internal_be<mode>"
   [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn")
 	(vec_concat:<VDBL>
-	  (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, ?w, ?r")
-	  (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r")))]
+	  (match_operand:VDCSIF 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, ?w, ?r")
+	  (match_operand:VDCSIF 1 "register_operand" "0, 0, 0, ?w, ?r")))]
   "TARGET_SIMD
    && BYTES_BIG_ENDIAN
    && (register_operand (operands[0], <VDBL>mode)
        || register_operand (operands[2], <MODE>mode))"
   "@
-   ins\t%0.d[1], %2.d[0]
-   ins\t%0.d[1], %2
-   ld1\t{%0.d}[1], %2
-   stp\t%d2, %d1, %y0
-   stp\t%x2, %x1, %y0"
-  [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")]
+   ins\t%0.<single_type>[1], %2.<single_type>[0]
+   ins\t%0.<single_type>[1], %<single_wx>2
+   ld1\t{%0.<single_type>}[1], %2
+   stp\t%<single_type>2, %<single_type>1, %y0
+   stp\t%<single_wx>2, %<single_wx>1, %y0"
+  [(set_attr "type" "neon_ins<dblq>, neon_from_gp<dblq>, neon_load1_one_lane<dblq>, neon_stp, store_16")]
 )
 
 ;; In this insn, operand 1 should be low, and operand 2 the high part of the
@@ -4331,13 +4331,13 @@  (define_insn "*aarch64_combine_internal_be<mode>"
 (define_insn "*aarch64_combinez<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
 	(vec_concat:<VDBL>
-	  (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m")
-	  (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")))]
+	  (match_operand:VDCSIF 1 "nonimmediate_operand" "w,?r,m")
+	  (match_operand:VDCSIF 2 "aarch64_simd_or_scalar_imm_zero")))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
   "@
-   mov\\t%0.8b, %1.8b
-   fmov\t%d0, %1
-   ldr\\t%d0, %1"
+   fmov\\t%<single_type>0, %<single_type>1
+   fmov\t%<single_type>0, %<single_wx>1
+   ldr\\t%<single_type>0, %1"
   [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg")
    (set_attr "arch" "simd,fp,simd")]
 )
@@ -4345,13 +4345,13 @@  (define_insn "*aarch64_combinez<mode>"
 (define_insn "*aarch64_combinez_be<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
         (vec_concat:<VDBL>
-	  (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")
-	  (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m")))]
+	  (match_operand:VDCSIF 2 "aarch64_simd_or_scalar_imm_zero")
+	  (match_operand:VDCSIF 1 "nonimmediate_operand" "w,?r,m")))]
   "TARGET_SIMD && BYTES_BIG_ENDIAN"
   "@
-   mov\\t%0.8b, %1.8b
-   fmov\t%d0, %1
-   ldr\\t%d0, %1"
+   fmov\\t%<single_type>0, %<single_type>1
+   fmov\t%<single_type>0, %<single_wx>1
+   ldr\\t%<single_type>0, %1"
   [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg")
    (set_attr "arch" "simd,fp,simd")]
 )
@@ -4362,8 +4362,8 @@  (define_insn "*aarch64_combinez_be<mode>"
 (define_expand "@aarch64_vec_concat<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand")
 	(vec_concat:<VDBL>
-	  (match_operand:VDC 1 "general_operand")
-	  (match_operand:VDC 2 "general_operand")))]
+	  (match_operand:VDCSIF 1 "general_operand")
+	  (match_operand:VDCSIF 2 "general_operand")))]
   "TARGET_SIMD"
 {
   int lo = BYTES_BIG_ENDIAN ? 2 : 1;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index af42d1bedfe..7bb97bd48e4 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -9922,9 +9922,15 @@  aarch64_classify_address (struct aarch64_address_info *info,
   /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
      corresponds to the actual size of the memory being loaded/stored and the
      mode of the corresponding addressing mode is half of that.  */
-  if (type == ADDR_QUERY_LDP_STP_N
-      && known_eq (GET_MODE_SIZE (mode), 16))
-    mode = DFmode;
+  if (type == ADDR_QUERY_LDP_STP_N)
+    {
+      if (known_eq (GET_MODE_SIZE (mode), 16))
+	mode = DFmode;
+      else if (known_eq (GET_MODE_SIZE (mode), 8))
+	mode = SFmode;
+      else
+	return false;
+    }
 
   bool allow_reg_index_p = (!load_store_pair_p
 			    && ((vec_flags == 0
@@ -11404,7 +11410,9 @@  aarch64_print_operand (FILE *f, rtx x, int code)
 	machine_mode mode = GET_MODE (x);
 
 	if (!MEM_P (x)
-	    || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
+	    || (code == 'y'
+		&& maybe_ne (GET_MODE_SIZE (mode), 8)
+		&& maybe_ne (GET_MODE_SIZE (mode), 16)))
 	  {
 	    output_operand_lossage ("invalid operand for '%%%c'", code);
 	    return;
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index a0c02e4ac15..88067a3536a 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -236,6 +236,9 @@  (define_mode_iterator VQW [V16QI V8HI V4SI])
 ;; Double vector modes for combines.
 (define_mode_iterator VDC [V8QI V4HI V4BF V4HF V2SI V2SF DI DF])
 
+;; VDC plus SI and SF.
+(define_mode_iterator VDCSIF [V8QI V4HI V4BF V4HF V2SI V2SF SI SF DI DF])
+
 ;; Polynomial modes for vector combines.
 (define_mode_iterator VDC_P [V8QI V4HI DI])
 
@@ -1436,8 +1439,8 @@  (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
 (define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI")
 			(V4HF "V8HF")  (V4BF "V8BF")
 			(V2SI "V4SI")  (V2SF "V4SF")
-			(SI   "V2SI")  (DI   "V2DI")
-			(DF   "V2DF")])
+			(SI   "V2SI")  (SF   "V2SF")
+			(DI   "V2DI")  (DF   "V2DF")])
 
 ;; Register suffix for double-length mode.
 (define_mode_attr Vdtype [(V4HF "8h") (V2SF "4s")])
@@ -1557,6 +1560,30 @@  (define_mode_attr Vhalftype [(V16QI "8b") (V8HI "4h")
 			     (V4SI "2s") (V8HF "4h")
 			     (V4SF "2s")])
 
+;; Whether a mode fits in W or X registers (i.e. "w" for 32-bit modes
+;; and "x" for 64-bit modes).
+(define_mode_attr single_wx [(SI   "w") (SF   "w")
+			     (V8QI "x") (V4HI "x")
+			     (V4HF "x") (V4BF "x")
+			     (V2SI "x") (V2SF "x")
+			     (DI   "x") (DF   "x")])
+
+;; Whether a mode fits in S or D registers (i.e. "s" for 32-bit modes
+;; and "d" for 64-bit modes).
+(define_mode_attr single_type [(SI   "s") (SF   "s")
+			       (V8QI "d") (V4HI "d")
+			       (V4HF "d") (V4BF "d")
+			       (V2SI "d") (V2SF "d")
+			       (DI   "d") (DF   "d")])
+
+;; Whether a double-width mode fits in D or Q registers (i.e. "d" for
+;; 32-bit modes and "q" for 64-bit modes).
+(define_mode_attr single_dtype [(SI   "d") (SF   "d")
+			        (V8QI "q") (V4HI "q")
+			        (V4HF "q") (V4BF "q")
+			        (V2SI "q") (V2SF "q")
+			        (DI   "q") (DF   "q")])
+
 ;; Define corresponding core/FP element mode for each vector mode.
 (define_mode_attr vw [(V8QI "w") (V16QI "w")
 		      (V4HI "w") (V8HI "w")
@@ -1849,6 +1876,13 @@  (define_mode_attr q [(V8QI "") (V16QI "_q")
 		     (V4x1DF "") (V4x2DF "_q")
 		     (V4x4BF "") (V4x8BF "_q")])
 
+;; Equivalent of the "q" attribute for the <VDBL> mode.
+(define_mode_attr dblq [(SI   "") (SF   "")
+		        (V8QI "_q") (V4HI "_q")
+		        (V4HF "_q") (V4BF "_q")
+		        (V2SI "_q") (V2SF "_q")
+		        (DI   "_q") (DF   "_q")])
+
 (define_mode_attr vp [(V8QI "v") (V16QI "v")
 		      (V4HI "v") (V8HI  "v")
 		      (V2SI "p") (V4SI  "v")
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-13.c b/gcc/testsuite/gcc.target/aarch64/vec-init-13.c
new file mode 100644
index 00000000000..d0f88cbe71a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-13.c
@@ -0,0 +1,123 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+
+/*
+** s64q_1:
+**	fmov	d0, x0
+**	ret
+*/
+int64x2_t s64q_1(int64_t a0) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int64x2_t) { 0, a0 };
+  else
+    return (int64x2_t) { a0, 0 };
+}
+/*
+** s64q_2:
+**	ldr	d0, \[x0\]
+**	ret
+*/
+int64x2_t s64q_2(int64_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int64x2_t) { 0, ptr[0] };
+  else
+    return (int64x2_t) { ptr[0], 0 };
+}
+/*
+** s64q_3:
+**	ldr	d0, \[x0, #?8\]
+**	ret
+*/
+int64x2_t s64q_3(int64_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int64x2_t) { 0, ptr[1] };
+  else
+    return (int64x2_t) { ptr[1], 0 };
+}
+
+/*
+** f64q_1:
+**	fmov	d0, d0
+**	ret
+*/
+float64x2_t f64q_1(float64_t a0) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float64x2_t) { 0, a0 };
+  else
+    return (float64x2_t) { a0, 0 };
+}
+/*
+** f64q_2:
+**	ldr	d0, \[x0\]
+**	ret
+*/
+float64x2_t f64q_2(float64_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float64x2_t) { 0, ptr[0] };
+  else
+    return (float64x2_t) { ptr[0], 0 };
+}
+/*
+** f64q_3:
+**	ldr	d0, \[x0, #?8\]
+**	ret
+*/
+float64x2_t f64q_3(float64_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float64x2_t) { 0, ptr[1] };
+  else
+    return (float64x2_t) { ptr[1], 0 };
+}
+
+/*
+** s32q_1:
+**	fmov	d0, d0
+**	ret
+*/
+int32x4_t s32q_1(int32x2_t a0, int32x2_t a1) {
+  return vcombine_s32 (a0, (int32x2_t) { 0, 0 });
+}
+/*
+** s32q_2:
+**	ldr	d0, \[x0\]
+**	ret
+*/
+int32x4_t s32q_2(int32x2_t *ptr) {
+  return vcombine_s32 (ptr[0], (int32x2_t) { 0, 0 });
+}
+/*
+** s32q_3:
+**	ldr	d0, \[x0, #?8\]
+**	ret
+*/
+int32x4_t s32q_3(int32x2_t *ptr) {
+  return vcombine_s32 (ptr[1], (int32x2_t) { 0, 0 });
+}
+
+/*
+** f32q_1:
+**	fmov	d0, d0
+**	ret
+*/
+float32x4_t f32q_1(float32x2_t a0, float32x2_t a1) {
+  return vcombine_f32 (a0, (float32x2_t) { 0, 0 });
+}
+/*
+** f32q_2:
+**	ldr	d0, \[x0\]
+**	ret
+*/
+float32x4_t f32q_2(float32x2_t *ptr) {
+  return vcombine_f32 (ptr[0], (float32x2_t) { 0, 0 });
+}
+/*
+** f32q_3:
+**	ldr	d0, \[x0, #?8\]
+**	ret
+*/
+float32x4_t f32q_3(float32x2_t *ptr) {
+  return vcombine_f32 (ptr[1], (float32x2_t) { 0, 0 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
new file mode 100644
index 00000000000..02875088cd9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
@@ -0,0 +1,123 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+
+void ext();
+
+/*
+** s32_1:
+**	fmov	s0, w0
+**	ins	v0\.s\[1\], w1
+**	ret
+*/
+int32x2_t s32_1(int32_t a0, int32_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int32x2_t) { a1, a0 };
+  else
+    return (int32x2_t) { a0, a1 };
+}
+/*
+** s32_2:
+**	fmov	s0, w0
+**	ld1	{v0\.s}\[1\], \[x1\]
+**	ret
+*/
+int32x2_t s32_2(int32_t a0, int32_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int32x2_t) { ptr[0], a0 };
+  else
+    return (int32x2_t) { a0, ptr[0] };
+}
+/*
+** s32_3:
+**	ldr	s0, \[x0\]
+**	ins	v0\.s\[1\], w1
+**	ret
+*/
+int32x2_t s32_3(int32_t *ptr, int32_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int32x2_t) { a1, ptr[0] };
+  else
+    return (int32x2_t) { ptr[0], a1 };
+}
+/*
+** s32_4:
+**	stp	w1, w2, \[x0\]
+**	ret
+*/
+void s32_4(int32x2_t *res, int32_t a0, int32_t a1) {
+  res[0] = (int32x2_t) { a0, a1 };
+}
+/*
+** s32_5:
+**	stp	w1, w2, \[x0, #?4\]
+**	ret
+*/
+void s32_5(uintptr_t res, int32_t a0, int32_t a1) {
+  *(int32x2_t *)(res + 4) = (int32x2_t) { a0, a1 };
+}
+/* Currently uses d8 to hold res across the call.  */
+int32x2_t s32_6(int32_t a0, int32_t a1) {
+  int32x2_t res = { a0, a1 };
+  ext ();
+  return res;
+}
+
+/*
+** f32_1:
+**	ins	v0\.s\[1\], v1\.s\[0\]
+**	ret
+*/
+float32x2_t f32_1(float32_t a0, float32_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float32x2_t) { a1, a0 };
+  else
+    return (float32x2_t) { a0, a1 };
+}
+/*
+** f32_2:
+**	ld1	{v0\.s}\[1\], \[x0\]
+**	ret
+*/
+float32x2_t f32_2(float32_t a0, float32_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float32x2_t) { ptr[0], a0 };
+  else
+    return (float32x2_t) { a0, ptr[0] };
+}
+/*
+** f32_3:
+**	ldr	s0, \[x0\]
+**	ins	v0\.s\[1\], v1\.s\[0\]
+**	ret
+*/
+float32x2_t f32_3(float32_t a0, float32_t a1, float32_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float32x2_t) { a1, ptr[0] };
+  else
+    return (float32x2_t) { ptr[0], a1 };
+}
+/*
+** f32_4:
+**	stp	s0, s1, \[x0\]
+**	ret
+*/
+void f32_4(float32x2_t *res, float32_t a0, float32_t a1) {
+  res[0] = (float32x2_t) { a0, a1 };
+}
+/*
+** f32_5:
+**	stp	s0, s1, \[x0, #?4\]
+**	ret
+*/
+void f32_5(uintptr_t res, float32_t a0, float32_t a1) {
+  *(float32x2_t *)(res + 4) = (float32x2_t) { a0, a1 };
+}
+/* Currently uses d8 to hold res across the call.  */
+float32x2_t f32_6(float32_t a0, float32_t a1) {
+  float32x2_t res = { a0, a1 };
+  ext ();
+  return res;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-15.c b/gcc/testsuite/gcc.target/aarch64/vec-init-15.c
new file mode 100644
index 00000000000..82f0a8f55ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-15.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+int32x2_t f1(int32_t *x, int c) {
+  return c ? (int32x2_t) { x[0], x[2] } : (int32x2_t) { 0, 0 };
+}
+
+int32x2_t f2(int32_t *x, int i0, int i1, int c) {
+  return c ? (int32x2_t) { x[i0], x[i1] } : (int32x2_t) { 0, 0 };
+}
+
+/* { dg-final { scan-assembler-times {\t(?:ldr\ts[0-9]+|ld1\t)} 4 } } */
+/* { dg-final { scan-assembler-not {\tldr\tw} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-16.c b/gcc/testsuite/gcc.target/aarch64/vec-init-16.c
new file mode 100644
index 00000000000..e00aec7a32c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-16.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+void f1(int32x2_t *res, int32_t *x, int c0, int c1) {
+  res[0] = (int32x2_t) { c0 ? x[0] : 0, c1 ? x[2] : 0 };
+}
+
+/* { dg-final { scan-assembler-times {\tldr\tw[0-9]+} 2 } } */
+/* { dg-final { scan-assembler {\tstp\tw[0-9]+, w[0-9]+} } } */
+/* { dg-final { scan-assembler-not {\tldr\ts} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-17.c b/gcc/testsuite/gcc.target/aarch64/vec-init-17.c
new file mode 100644
index 00000000000..86191b3ca1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-17.c
@@ -0,0 +1,73 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+
+/*
+** s32_1:
+**	fmov	s0, w0
+**	ret
+*/
+int32x2_t s32_1(int32_t a0) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int32x2_t) { 0, a0 };
+  else
+    return (int32x2_t) { a0, 0 };
+}
+/*
+** s32_2:
+**	ldr	s0, \[x0\]
+**	ret
+*/
+int32x2_t s32_2(int32_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int32x2_t) { 0, ptr[0] };
+  else
+    return (int32x2_t) { ptr[0], 0 };
+}
+/*
+** s32_3:
+**	ldr	s0, \[x0, #?4\]
+**	ret
+*/
+int32x2_t s32_3(int32_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int32x2_t) { 0, ptr[1] };
+  else
+    return (int32x2_t) { ptr[1], 0 };
+}
+
+/*
+** f32_1:
+**	fmov	s0, s0
+**	ret
+*/
+float32x2_t f32_1(float32_t a0) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float32x2_t) { 0, a0 };
+  else
+    return (float32x2_t) { a0, 0 };
+}
+/*
+** f32_2:
+**	ldr	s0, \[x0\]
+**	ret
+*/
+float32x2_t f32_2(float32_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float32x2_t) { 0, ptr[0] };
+  else
+    return (float32x2_t) { ptr[0], 0 };
+}
+/*
+** f32_3:
+**	ldr	s0, \[x0, #?4\]
+**	ret
+*/
+float32x2_t f32_3(float32_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float32x2_t) { 0, ptr[1] };
+  else
+    return (float32x2_t) { ptr[1], 0 };
+}