AArch64 relax predicate on structure load instructions

Message ID patch-15781-tamar@arm.com
State Dropped
Series AArch64 relax predicate on structure load instructions

Commit Message

Tamar Christina June 8, 2022, 9:38 a.m. UTC
Hi All,

At some point we started lowering the ld1r instructions in gimple.

That is:

uint8x8_t f1(const uint8_t *in) {
    return vld1_dup_u8(&in[1]);
}

generates at gimple:

  _3 = MEM[(const uint8_t *)in_1(D) + 1B];
  _4 = {_3, _3, _3, _3, _3, _3, _3, _3};

Which is good, but we then generate:

f1:
	ldr     b0, [x0, 1]
	dup     v0.8b, v0.b[0]
	ret

instead of ld1r.
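
The expected sequence (as checked by the new test below) is:

f1:
	add	x0, x0, 1
	ld1r	{v0.8b}, [x0]
	ret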

The reason for the dup sequence is that the load instructions have too
restrictive a predicate, which prevents combine from merging the
instructions since the predicate only accepts simple addressing modes.

This patch relaxes the predicate to accept any memory operand and relies on
LRA to legitimize the address when needed, since the constraint still only
allows the simple addressing mode.  Reload is always able to legitimize
addresses to this form.
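
Concretely, for *aarch64_simd_ld1r<mode> the memory operand goes from

    (match_operand:<VEL> 1 "aarch64_simd_struct_operand" "Utv")

to

    (match_operand:<VEL> 1 "memory_operand" "Utv")

i.e. any MEM is now accepted before RA, while the "Utv" constraint still
forces the simple addressing mode at reload time.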

Secondly, since we now generate more ld1r instructions, it became clear that
the lane instructions suffer from a similar issue.

i.e.

float32x4_t f2(const float32_t *in, float32x4_t a) {
    float32x4_t dup = vld1q_dup_f32(&in[1]);
    return vfmaq_laneq_f32 (a, a, dup, 1);
}

would generate ld1r + vector fmla instead of ldr + lane fmla.
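
i.e. with the predicate relaxed the test below expects:

f2:
	ldr	s1, [x0, 4]
	fmla	v0.4s, v0.4s, v1.s[0]
	ret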

The reason for this is similar to the ld1r issue: the predicate is too
restrictive in that it only accepts register operands but not memory.

This relaxes it to accept both register and memory operands while leaving the
constraint to only accept registers.  This will have LRA generate a reload if
needed, forcing the memory into a register using the standard patterns.
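
For example, in mul_n<mode>3 operand 2 changes from

    (match_operand:<VEL> 2 "register_operand" "<h_con>")

to

    (match_operand:<VEL> 2 "nonimmediate_operand" "<h_con>")

with the "<h_con>" register constraint unchanged.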

These two changes allow combine and reload to generate the right sequences.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (mul_lane<mode>3, mul_laneq<mode>3,
	mul_n<mode>3, *aarch64_mul3_elt_to_64v2df, *aarch64_mla_elt<mode>,
	*aarch64_mla_elt_<vswap_width_name><mode>, aarch64_mla_n<mode>,
	*aarch64_mls_elt<mode>, *aarch64_mls_elt_<vswap_width_name><mode>,
	aarch64_mls_n<mode>, *aarch64_fma4_elt<mode>,
	*aarch64_fma4_elt_<vswap_width_name><mode>,
	*aarch64_fma4_elt_from_dup<mode>, *aarch64_fma4_elt_to_64v2df,
	*aarch64_fnma4_elt<mode>, *aarch64_fnma4_elt_<vswap_width_name><mode>,
	*aarch64_fnma4_elt_from_dup<mode>, *aarch64_fnma4_elt_to_64v2df,
	*aarch64_mulx_elt_<vswap_width_name><mode>,
	*aarch64_mulx_elt<mode>, *aarch64_mulx_elt_from_dup<mode>,
	*aarch64_vgetfmulx<mode>): Relax register_operand to
	nonimmediate_operand.
	(aarch64_simd_ld2<vstruct_elt>, aarch64_simd_ld2r<vstruct_elt>,
	aarch64_vec_load_lanes<mode>_lane<vstruct_elt>,
	vec_load_lanes<mode><vstruct_elt>, aarch64_simd_st2<vstruct_elt>,
	aarch64_vec_store_lanes<mode>_lane<vstruct_elt>,
	vec_store_lanes<mode><vstruct_elt>, aarch64_simd_ld3<vstruct_elt>,
	aarch64_simd_ld3r<vstruct_elt>,
	aarch64_vec_load_lanes<mode>_lane<vstruct_elt>,
	vec_load_lanes<mode><vstruct_elt>, aarch64_simd_st3<vstruct_elt>,
	aarch64_vec_store_lanes<mode>_lane<vstruct_elt>,
	vec_store_lanes<mode><vstruct_elt>, aarch64_simd_ld4<vstruct_elt>,
	aarch64_simd_ld4r<vstruct_elt>,
	aarch64_vec_load_lanes<mode>_lane<vstruct_elt>,
	vec_load_lanes<mode><vstruct_elt>, aarch64_simd_st4<vstruct_elt>,
	aarch64_vec_store_lanes<mode>_lane<vstruct_elt>,
	vec_store_lanes<mode><vstruct_elt>, aarch64_ld1_x3_<vstruct_elt>,
	aarch64_ld1_x4_<vstruct_elt>, aarch64_st1_x2_<vstruct_elt>,
	aarch64_st1_x3_<vstruct_elt>, aarch64_st1_x4_<vstruct_elt>,
	aarch64_be_ld1<mode>, aarch64_be_st1<mode>,
	aarch64_ld2<vstruct_elt>_dreg, aarch64_ld2<vstruct_elt>_dreg,
	aarch64_ld3<vstruct_elt>_dreg, aarch64_ld3<vstruct_elt>_dreg,
	aarch64_ld4<vstruct_elt>_dreg, aarch64_ld4<vstruct_elt>_dreg,
	aarch64_st2<vstruct_elt>_dreg, aarch64_st2<vstruct_elt>_dreg,
	aarch64_st3<vstruct_elt>_dreg, aarch64_st3<vstruct_elt>_dreg,
	aarch64_st4<vstruct_elt>_dreg, aarch64_st4<vstruct_elt>_dreg,
	*aarch64_simd_ld1r<mode>, aarch64_simd_ld1<vstruct_elt>_x2): Relax
	aarch64_simd_struct_operand to memory_operand.
	* config/aarch64/predicates.md (aarch64_simd_struct_operand): Remove.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vld1r.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index be5c70bbb7520ae93d19c4a432ce34863e5b9a64..24e3274ddda2ea76c83571fada8ff4c953b752a1 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -712,7 +712,7 @@ (define_insn "mul_lane<mode>3"
        (mult:VMULD
 	 (vec_duplicate:VMULD
 	   (vec_select:<VEL>
-	     (match_operand:<VCOND> 2 "register_operand" "<h_con>")
+	     (match_operand:<VCOND> 2 "nonimmediate_operand" "<h_con>")
 	     (parallel [(match_operand:SI 3 "immediate_operand" "i")])))
 	 (match_operand:VMULD 1 "register_operand" "w")))]
   "TARGET_SIMD"
@@ -728,7 +728,7 @@ (define_insn "mul_laneq<mode>3"
      (mult:VMUL
        (vec_duplicate:VMUL
 	  (vec_select:<VEL>
-	    (match_operand:<VCONQ> 2 "register_operand" "<h_con>")
+	    (match_operand:<VCONQ> 2 "nonimmediate_operand" "<h_con>")
 	    (parallel [(match_operand:SI 3 "immediate_operand")])))
       (match_operand:VMUL 1 "register_operand" "w")))]
   "TARGET_SIMD"
@@ -743,7 +743,7 @@ (define_insn "mul_n<mode>3"
  [(set (match_operand:VMUL 0 "register_operand" "=w")
        (mult:VMUL
 	 (vec_duplicate:VMUL
-	   (match_operand:<VEL> 2 "register_operand" "<h_con>"))
+	   (match_operand:<VEL> 2 "nonimmediate_operand" "<h_con>"))
 	 (match_operand:VMUL 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]";
@@ -789,7 +789,7 @@ (define_insn "*aarch64_mul3_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
      (mult:DF
        (vec_select:DF
-	 (match_operand:V2DF 1 "register_operand" "w")
+	 (match_operand:V2DF 1 "nonimmediate_operand" "w")
 	 (parallel [(match_operand:SI 2 "immediate_operand")]))
        (match_operand:DF 3 "register_operand" "w")))]
   "TARGET_SIMD"
@@ -1406,7 +1406,7 @@ (define_insn "*aarch64_mla_elt<mode>"
 	 (mult:VDQHS
 	   (vec_duplicate:VDQHS
 	      (vec_select:<VEL>
-		(match_operand:VDQHS 1 "register_operand" "<h_con>")
+		(match_operand:VDQHS 1 "nonimmediate_operand" "<h_con>")
 		  (parallel [(match_operand:SI 2 "immediate_operand")])))
 	   (match_operand:VDQHS 3 "register_operand" "w"))
 	 (match_operand:VDQHS 4 "register_operand" "0")))]
@@ -1424,7 +1424,7 @@ (define_insn "*aarch64_mla_elt_<vswap_width_name><mode>"
 	 (mult:VDQHS
 	   (vec_duplicate:VDQHS
 	      (vec_select:<VEL>
-		(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+		(match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand" "<h_con>")
 		  (parallel [(match_operand:SI 2 "immediate_operand")])))
 	   (match_operand:VDQHS 3 "register_operand" "w"))
 	 (match_operand:VDQHS 4 "register_operand" "0")))]
@@ -1441,7 +1441,7 @@ (define_insn "aarch64_mla_n<mode>"
 	(plus:VDQHS
 	  (mult:VDQHS
 	    (vec_duplicate:VDQHS
-	      (match_operand:<VEL> 3 "register_operand" "<h_con>"))
+	      (match_operand:<VEL> 3 "nonimmediate_operand" "<h_con>"))
 	    (match_operand:VDQHS 2 "register_operand" "w"))
 	  (match_operand:VDQHS 1 "register_operand" "0")))]
  "TARGET_SIMD"
@@ -1466,7 +1466,7 @@ (define_insn "*aarch64_mls_elt<mode>"
 	 (mult:VDQHS
 	   (vec_duplicate:VDQHS
 	      (vec_select:<VEL>
-		(match_operand:VDQHS 1 "register_operand" "<h_con>")
+		(match_operand:VDQHS 1 "nonimmediate_operand" "<h_con>")
 		  (parallel [(match_operand:SI 2 "immediate_operand")])))
 	   (match_operand:VDQHS 3 "register_operand" "w"))))]
  "TARGET_SIMD"
@@ -1484,7 +1484,7 @@ (define_insn "*aarch64_mls_elt_<vswap_width_name><mode>"
 	 (mult:VDQHS
 	   (vec_duplicate:VDQHS
 	      (vec_select:<VEL>
-		(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+		(match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand" "<h_con>")
 		  (parallel [(match_operand:SI 2 "immediate_operand")])))
 	   (match_operand:VDQHS 3 "register_operand" "w"))))]
  "TARGET_SIMD"
@@ -1501,7 +1501,7 @@ (define_insn "aarch64_mls_n<mode>"
 	  (match_operand:VDQHS 1 "register_operand" "0")
 	  (mult:VDQHS
 	    (vec_duplicate:VDQHS
-	      (match_operand:<VEL> 3 "register_operand" "<h_con>"))
+	      (match_operand:<VEL> 3 "nonimmediate_operand" "<h_con>"))
 	    (match_operand:VDQHS 2 "register_operand" "w"))))]
   "TARGET_SIMD"
   "mls\t%0.<Vtype>, %2.<Vtype>, %3.<Vetype>[0]"
@@ -2882,7 +2882,7 @@ (define_insn "*aarch64_fma4_elt<mode>"
     (fma:VDQF
       (vec_duplicate:VDQF
 	(vec_select:<VEL>
-	  (match_operand:VDQF 1 "register_operand" "<h_con>")
+	  (match_operand:VDQF 1 "nonimmediate_operand" "<h_con>")
 	  (parallel [(match_operand:SI 2 "immediate_operand")])))
       (match_operand:VDQF 3 "register_operand" "w")
       (match_operand:VDQF 4 "register_operand" "0")))]
@@ -2899,7 +2899,7 @@ (define_insn "*aarch64_fma4_elt_<vswap_width_name><mode>"
     (fma:VDQSF
       (vec_duplicate:VDQSF
 	(vec_select:<VEL>
-	  (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+	  (match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand" "<h_con>")
 	  (parallel [(match_operand:SI 2 "immediate_operand")])))
       (match_operand:VDQSF 3 "register_operand" "w")
       (match_operand:VDQSF 4 "register_operand" "0")))]
@@ -2915,7 +2915,7 @@ (define_insn "*aarch64_fma4_elt_from_dup<mode>"
   [(set (match_operand:VMUL 0 "register_operand" "=w")
     (fma:VMUL
       (vec_duplicate:VMUL
-	  (match_operand:<VEL> 1 "register_operand" "<h_con>"))
+	  (match_operand:<VEL> 1 "nonimmediate_operand" "<h_con>"))
       (match_operand:VMUL 2 "register_operand" "w")
       (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
@@ -2927,7 +2927,7 @@ (define_insn "*aarch64_fma4_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
     (fma:DF
 	(vec_select:DF
-	  (match_operand:V2DF 1 "register_operand" "w")
+	  (match_operand:V2DF 1 "nonimmediate_operand" "w")
 	  (parallel [(match_operand:SI 2 "immediate_operand")]))
       (match_operand:DF 3 "register_operand" "w")
       (match_operand:DF 4 "register_operand" "0")))]
@@ -2957,7 +2957,7 @@ (define_insn "*aarch64_fnma4_elt<mode>"
         (match_operand:VDQF 3 "register_operand" "w"))
       (vec_duplicate:VDQF
 	(vec_select:<VEL>
-	  (match_operand:VDQF 1 "register_operand" "<h_con>")
+	  (match_operand:VDQF 1 "nonimmediate_operand" "<h_con>")
 	  (parallel [(match_operand:SI 2 "immediate_operand")])))
       (match_operand:VDQF 4 "register_operand" "0")))]
   "TARGET_SIMD"
@@ -2975,7 +2975,7 @@ (define_insn "*aarch64_fnma4_elt_<vswap_width_name><mode>"
         (match_operand:VDQSF 3 "register_operand" "w"))
       (vec_duplicate:VDQSF
 	(vec_select:<VEL>
-	  (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+	  (match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand" "<h_con>")
 	  (parallel [(match_operand:SI 2 "immediate_operand")])))
       (match_operand:VDQSF 4 "register_operand" "0")))]
   "TARGET_SIMD"
@@ -2992,7 +2992,7 @@ (define_insn "*aarch64_fnma4_elt_from_dup<mode>"
       (neg:VMUL
         (match_operand:VMUL 2 "register_operand" "w"))
       (vec_duplicate:VMUL
-	(match_operand:<VEL> 1 "register_operand" "<h_con>"))
+	(match_operand:<VEL> 1 "nonimmediate_operand" "<h_con>"))
       (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
   "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
@@ -3003,7 +3003,7 @@ (define_insn "*aarch64_fnma4_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
     (fma:DF
       (vec_select:DF
-	(match_operand:V2DF 1 "register_operand" "w")
+	(match_operand:V2DF 1 "nonimmediate_operand" "w")
 	(parallel [(match_operand:SI 2 "immediate_operand")]))
       (neg:DF
         (match_operand:DF 3 "register_operand" "w"))
@@ -4934,7 +4934,7 @@ (define_insn "*aarch64_mulx_elt_<vswap_width_name><mode>"
 	 [(match_operand:VDQSF 1 "register_operand" "w")
 	  (vec_duplicate:VDQSF
 	   (vec_select:<VEL>
-	    (match_operand:<VSWAP_WIDTH> 2 "register_operand" "w")
+	    (match_operand:<VSWAP_WIDTH> 2 "nonimmediate_operand" "w")
 	    (parallel [(match_operand:SI 3 "immediate_operand" "i")])))]
 	 UNSPEC_FMULX))]
   "TARGET_SIMD"
@@ -4953,7 +4953,7 @@ (define_insn "*aarch64_mulx_elt<mode>"
 	 [(match_operand:VDQF 1 "register_operand" "w")
 	  (vec_duplicate:VDQF
 	   (vec_select:<VEL>
-	    (match_operand:VDQF 2 "register_operand" "w")
+	    (match_operand:VDQF 2 "nonimmediate_operand" "w")
 	    (parallel [(match_operand:SI 3 "immediate_operand" "i")])))]
 	 UNSPEC_FMULX))]
   "TARGET_SIMD"
@@ -4971,7 +4971,7 @@ (define_insn "*aarch64_mulx_elt_from_dup<mode>"
 	(unspec:VHSDF
 	 [(match_operand:VHSDF 1 "register_operand" "w")
 	  (vec_duplicate:VHSDF
-	    (match_operand:<VEL> 2 "register_operand" "<h_con>"))]
+	    (match_operand:<VEL> 2 "nonimmediate_operand" "<h_con>"))]
 	 UNSPEC_FMULX))]
   "TARGET_SIMD"
   "fmulx\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]";
@@ -4987,7 +4987,7 @@ (define_insn "*aarch64_vgetfmulx<mode>"
 	(unspec:<VEL>
 	 [(match_operand:<VEL> 1 "register_operand" "w")
 	  (vec_select:<VEL>
-	   (match_operand:VDQF 2 "register_operand" "w")
+	   (match_operand:VDQF 2 "nonimmediate_operand" "w")
 	    (parallel [(match_operand:SI 3 "immediate_operand" "i")]))]
 	 UNSPEC_FMULX))]
   "TARGET_SIMD"
@@ -6768,7 +6768,7 @@ (define_insn "*sqrt<mode>2"
 (define_insn "aarch64_simd_ld2<vstruct_elt>"
   [(set (match_operand:VSTRUCT_2Q 0 "register_operand" "=w")
 	(unspec:VSTRUCT_2Q [
-	  (match_operand:VSTRUCT_2Q 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_2Q 1 "memory_operand" "Utv")]
 	  UNSPEC_LD2))]
   "TARGET_SIMD"
   "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
@@ -6778,7 +6778,7 @@ (define_insn "aarch64_simd_ld2<vstruct_elt>"
 (define_insn "aarch64_simd_ld2r<vstruct_elt>"
   [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_2QD [
-	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:BLK 1 "memory_operand" "Utv")]
           UNSPEC_LD2_DUP))]
   "TARGET_SIMD"
   "ld2r\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
@@ -6788,7 +6788,7 @@ (define_insn "aarch64_simd_ld2r<vstruct_elt>"
 (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
   [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_2QD [
-		(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
+		(match_operand:BLK 1 "memory_operand" "Utv")
 		(match_operand:VSTRUCT_2QD 2 "register_operand" "0")
 		(match_operand:SI 3 "immediate_operand" "i")]
 		UNSPEC_LD2_LANE))]
@@ -6804,7 +6804,7 @@ (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
 (define_expand "vec_load_lanes<mode><vstruct_elt>"
   [(set (match_operand:VSTRUCT_2Q 0 "register_operand")
 	(unspec:VSTRUCT_2Q [
-		(match_operand:VSTRUCT_2Q 1 "aarch64_simd_struct_operand")]
+		(match_operand:VSTRUCT_2Q 1 "memory_operand")]
 		UNSPEC_LD2))]
   "TARGET_SIMD"
 {
@@ -6822,7 +6822,7 @@ (define_expand "vec_load_lanes<mode><vstruct_elt>"
 })
 
 (define_insn "aarch64_simd_st2<vstruct_elt>"
-  [(set (match_operand:VSTRUCT_2Q 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_2Q 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_2Q [
 		(match_operand:VSTRUCT_2Q 1 "register_operand" "w")]
                 UNSPEC_ST2))]
@@ -6833,7 +6833,7 @@ (define_insn "aarch64_simd_st2<vstruct_elt>"
 
 ;; RTL uses GCC vector extension indices, so flip only for assembly.
 (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
-  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
 	(unspec:BLK [(match_operand:VSTRUCT_2QD 1 "register_operand" "w")
 		     (match_operand:SI 2 "immediate_operand" "i")]
 		     UNSPEC_ST2_LANE))]
@@ -6847,7 +6847,7 @@ (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
 )
 
 (define_expand "vec_store_lanes<mode><vstruct_elt>"
-  [(set (match_operand:VSTRUCT_2Q 0 "aarch64_simd_struct_operand")
+  [(set (match_operand:VSTRUCT_2Q 0 "memory_operand")
 	(unspec:VSTRUCT_2Q [(match_operand:VSTRUCT_2Q 1 "register_operand")]
                    UNSPEC_ST2))]
   "TARGET_SIMD"
@@ -6868,7 +6868,7 @@ (define_expand "vec_store_lanes<mode><vstruct_elt>"
 (define_insn "aarch64_simd_ld3<vstruct_elt>"
   [(set (match_operand:VSTRUCT_3Q 0 "register_operand" "=w")
 	(unspec:VSTRUCT_3Q [
-	  (match_operand:VSTRUCT_3Q 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_3Q 1 "memory_operand" "Utv")]
 	  UNSPEC_LD3))]
   "TARGET_SIMD"
   "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
@@ -6878,7 +6878,7 @@ (define_insn "aarch64_simd_ld3<vstruct_elt>"
 (define_insn "aarch64_simd_ld3r<vstruct_elt>"
   [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_3QD [
-	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:BLK 1 "memory_operand" "Utv")]
           UNSPEC_LD3_DUP))]
   "TARGET_SIMD"
   "ld3r\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
@@ -6888,7 +6888,7 @@ (define_insn "aarch64_simd_ld3r<vstruct_elt>"
 (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
   [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_3QD [
-		(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
+		(match_operand:BLK 1 "memory_operand" "Utv")
 		(match_operand:VSTRUCT_3QD 2 "register_operand" "0")
 		(match_operand:SI 3 "immediate_operand" "i")]
 		UNSPEC_LD3_LANE))]
@@ -6904,7 +6904,7 @@ (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
 (define_expand "vec_load_lanes<mode><vstruct_elt>"
   [(set (match_operand:VSTRUCT_3Q 0 "register_operand")
 	(unspec:VSTRUCT_3Q [
-		(match_operand:VSTRUCT_3Q 1 "aarch64_simd_struct_operand")]
+		(match_operand:VSTRUCT_3Q 1 "memory_operand")]
 		UNSPEC_LD3))]
   "TARGET_SIMD"
 {
@@ -6922,7 +6922,7 @@ (define_expand "vec_load_lanes<mode><vstruct_elt>"
 })
 
 (define_insn "aarch64_simd_st3<vstruct_elt>"
-  [(set (match_operand:VSTRUCT_3Q 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_3Q 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_3Q [(match_operand:VSTRUCT_3Q 1 "register_operand" "w")]
                    UNSPEC_ST3))]
   "TARGET_SIMD"
@@ -6932,7 +6932,7 @@ (define_insn "aarch64_simd_st3<vstruct_elt>"
 
 ;; RTL uses GCC vector extension indices, so flip only for assembly.
 (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
-  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
 	(unspec:BLK [(match_operand:VSTRUCT_3QD 1 "register_operand" "w")
 		     (match_operand:SI 2 "immediate_operand" "i")]
 		     UNSPEC_ST3_LANE))]
@@ -6946,7 +6946,7 @@ (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
 )
 
 (define_expand "vec_store_lanes<mode><vstruct_elt>"
-  [(set (match_operand:VSTRUCT_3Q 0 "aarch64_simd_struct_operand")
+  [(set (match_operand:VSTRUCT_3Q 0 "memory_operand")
 	(unspec:VSTRUCT_3Q [
 		(match_operand:VSTRUCT_3Q 1 "register_operand")]
                 UNSPEC_ST3))]
@@ -6968,7 +6968,7 @@ (define_expand "vec_store_lanes<mode><vstruct_elt>"
 (define_insn "aarch64_simd_ld4<vstruct_elt>"
   [(set (match_operand:VSTRUCT_4Q 0 "register_operand" "=w")
 	(unspec:VSTRUCT_4Q [
-	  (match_operand:VSTRUCT_4Q 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_4Q 1 "memory_operand" "Utv")]
 	  UNSPEC_LD4))]
   "TARGET_SIMD"
   "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
@@ -6978,7 +6978,7 @@ (define_insn "aarch64_simd_ld4<vstruct_elt>"
 (define_insn "aarch64_simd_ld4r<vstruct_elt>"
   [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_4QD [
-	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:BLK 1 "memory_operand" "Utv")]
           UNSPEC_LD4_DUP))]
   "TARGET_SIMD"
   "ld4r\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
@@ -6988,7 +6988,7 @@ (define_insn "aarch64_simd_ld4r<vstruct_elt>"
 (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
   [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_4QD [
-		(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
+		(match_operand:BLK 1 "memory_operand" "Utv")
 		(match_operand:VSTRUCT_4QD 2 "register_operand" "0")
 		(match_operand:SI 3 "immediate_operand" "i")]
 		UNSPEC_LD4_LANE))]
@@ -7004,7 +7004,7 @@ (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
 (define_expand "vec_load_lanes<mode><vstruct_elt>"
   [(set (match_operand:VSTRUCT_4Q 0 "register_operand")
 	(unspec:VSTRUCT_4Q [
-		(match_operand:VSTRUCT_4Q 1 "aarch64_simd_struct_operand")]
+		(match_operand:VSTRUCT_4Q 1 "memory_operand")]
 		UNSPEC_LD4))]
   "TARGET_SIMD"
 {
@@ -7022,7 +7022,7 @@ (define_expand "vec_load_lanes<mode><vstruct_elt>"
 })
 
 (define_insn "aarch64_simd_st4<vstruct_elt>"
-  [(set (match_operand:VSTRUCT_4Q 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_4Q 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_4Q [
 		(match_operand:VSTRUCT_4Q 1 "register_operand" "w")]
                 UNSPEC_ST4))]
@@ -7033,7 +7033,7 @@ (define_insn "aarch64_simd_st4<vstruct_elt>"
 
 ;; RTL uses GCC vector extension indices, so flip only for assembly.
 (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
-  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
 	(unspec:BLK [(match_operand:VSTRUCT_4QD 1 "register_operand" "w")
 		     (match_operand:SI 2 "immediate_operand" "i")]
 		     UNSPEC_ST4_LANE))]
@@ -7047,7 +7047,7 @@ (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
 )
 
 (define_expand "vec_store_lanes<mode><vstruct_elt>"
-  [(set (match_operand:VSTRUCT_4Q 0 "aarch64_simd_struct_operand")
+  [(set (match_operand:VSTRUCT_4Q 0 "memory_operand")
 	(unspec:VSTRUCT_4Q [(match_operand:VSTRUCT_4Q 1 "register_operand")]
                    UNSPEC_ST4))]
   "TARGET_SIMD"
@@ -7138,7 +7138,7 @@ (define_expand "aarch64_ld1x3<vstruct_elt>"
 (define_insn "aarch64_ld1_x3_<vstruct_elt>"
   [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
         (unspec:VSTRUCT_3QD
-	  [(match_operand:VSTRUCT_3QD 1 "aarch64_simd_struct_operand" "Utv")]
+	  [(match_operand:VSTRUCT_3QD 1 "memory_operand" "Utv")]
 	  UNSPEC_LD1))]
   "TARGET_SIMD"
   "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
@@ -7158,7 +7158,7 @@ (define_expand "aarch64_ld1x4<vstruct_elt>"
 (define_insn "aarch64_ld1_x4_<vstruct_elt>"
   [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_4QD
-	  [(match_operand:VSTRUCT_4QD 1 "aarch64_simd_struct_operand" "Utv")]
+	  [(match_operand:VSTRUCT_4QD 1 "memory_operand" "Utv")]
 	UNSPEC_LD1))]
   "TARGET_SIMD"
   "ld1\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
@@ -7176,7 +7176,7 @@ (define_expand "aarch64_st1x2<vstruct_elt>"
 })
 
 (define_insn "aarch64_st1_x2_<vstruct_elt>"
-  [(set (match_operand:VSTRUCT_2QD 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_2QD 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_2QD
 		[(match_operand:VSTRUCT_2QD 1 "register_operand" "w")]
 		UNSPEC_ST1))]
@@ -7196,7 +7196,7 @@ (define_expand "aarch64_st1x3<vstruct_elt>"
 })
 
 (define_insn "aarch64_st1_x3_<vstruct_elt>"
-  [(set (match_operand:VSTRUCT_3QD 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_3QD 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_3QD
 		[(match_operand:VSTRUCT_3QD 1 "register_operand" "w")]
 		UNSPEC_ST1))]
@@ -7216,7 +7216,7 @@ (define_expand "aarch64_st1x4<vstruct_elt>"
 })
 
 (define_insn "aarch64_st1_x4_<vstruct_elt>"
-  [(set (match_operand:VSTRUCT_4QD 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_4QD 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_4QD
 		[(match_operand:VSTRUCT_4QD 1 "register_operand" "w")]
 		UNSPEC_ST1))]
@@ -7268,7 +7268,7 @@ (define_insn "*aarch64_movv8di"
 (define_insn "aarch64_be_ld1<mode>"
   [(set (match_operand:VALLDI_F16 0	"register_operand" "=w")
 	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1
-			     "aarch64_simd_struct_operand" "Utv")]
+			     "memory_operand" "Utv")]
 	UNSPEC_LD1))]
   "TARGET_SIMD"
   "ld1\\t{%0<Vmtype>}, %1"
@@ -7276,7 +7276,7 @@ (define_insn "aarch64_be_ld1<mode>"
 )
 
 (define_insn "aarch64_be_st1<mode>"
-  [(set (match_operand:VALLDI_F16 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VALLDI_F16 0 "memory_operand" "=Utv")
 	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1 "register_operand" "w")]
 	UNSPEC_ST1))]
   "TARGET_SIMD"
@@ -7551,7 +7551,7 @@ (define_expand "aarch64_ld<nregs>r<vstruct_elt>"
 (define_insn "aarch64_ld2<vstruct_elt>_dreg"
   [(set (match_operand:VSTRUCT_2DNX 0 "register_operand" "=w")
 	(unspec:VSTRUCT_2DNX [
-	  (match_operand:VSTRUCT_2DNX 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_2DNX 1 "memory_operand" "Utv")]
 	  UNSPEC_LD2_DREG))]
   "TARGET_SIMD"
   "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
@@ -7561,7 +7561,7 @@ (define_insn "aarch64_ld2<vstruct_elt>_dreg"
 (define_insn "aarch64_ld2<vstruct_elt>_dreg"
   [(set (match_operand:VSTRUCT_2DX 0 "register_operand" "=w")
 	(unspec:VSTRUCT_2DX [
-	  (match_operand:VSTRUCT_2DX 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_2DX 1 "memory_operand" "Utv")]
 	  UNSPEC_LD2_DREG))]
   "TARGET_SIMD"
   "ld1\\t{%S0.1d - %T0.1d}, %1"
@@ -7571,7 +7571,7 @@ (define_insn "aarch64_ld2<vstruct_elt>_dreg"
 (define_insn "aarch64_ld3<vstruct_elt>_dreg"
   [(set (match_operand:VSTRUCT_3DNX 0 "register_operand" "=w")
 	(unspec:VSTRUCT_3DNX [
-	  (match_operand:VSTRUCT_3DNX 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_3DNX 1 "memory_operand" "Utv")]
 	  UNSPEC_LD3_DREG))]
   "TARGET_SIMD"
   "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
@@ -7581,7 +7581,7 @@ (define_insn "aarch64_ld3<vstruct_elt>_dreg"
 (define_insn "aarch64_ld3<vstruct_elt>_dreg"
   [(set (match_operand:VSTRUCT_3DX 0 "register_operand" "=w")
 	(unspec:VSTRUCT_3DX [
-	  (match_operand:VSTRUCT_3DX 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_3DX 1 "memory_operand" "Utv")]
 	  UNSPEC_LD3_DREG))]
   "TARGET_SIMD"
   "ld1\\t{%S0.1d - %U0.1d}, %1"
@@ -7591,7 +7591,7 @@ (define_insn "aarch64_ld3<vstruct_elt>_dreg"
 (define_insn "aarch64_ld4<vstruct_elt>_dreg"
   [(set (match_operand:VSTRUCT_4DNX 0 "register_operand" "=w")
 	(unspec:VSTRUCT_4DNX [
-	  (match_operand:VSTRUCT_4DNX 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_4DNX 1 "memory_operand" "Utv")]
 	  UNSPEC_LD4_DREG))]
   "TARGET_SIMD"
   "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
@@ -7601,7 +7601,7 @@ (define_insn "aarch64_ld4<vstruct_elt>_dreg"
 (define_insn "aarch64_ld4<vstruct_elt>_dreg"
   [(set (match_operand:VSTRUCT_4DX 0 "register_operand" "=w")
 	(unspec:VSTRUCT_4DX [
-	  (match_operand:VSTRUCT_4DX 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_4DX 1 "memory_operand" "Utv")]
 	  UNSPEC_LD4_DREG))]
   "TARGET_SIMD"
   "ld1\\t{%S0.1d - %V0.1d}, %1"
@@ -7841,7 +7841,7 @@ (define_insn "aarch64_rev<REVERSE:rev_op><mode>"
 )
 
 (define_insn "aarch64_st2<vstruct_elt>_dreg"
-  [(set (match_operand:VSTRUCT_2DNX 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_2DNX 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_2DNX [
 		(match_operand:VSTRUCT_2DNX 1 "register_operand" "w")]
 		UNSPEC_ST2))]
@@ -7851,7 +7851,7 @@ (define_insn "aarch64_st2<vstruct_elt>_dreg"
 )
 
 (define_insn "aarch64_st2<vstruct_elt>_dreg"
-  [(set (match_operand:VSTRUCT_2DX 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_2DX 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_2DX [
 		(match_operand:VSTRUCT_2DX 1 "register_operand" "w")]
 		UNSPEC_ST2))]
@@ -7861,7 +7861,7 @@ (define_insn "aarch64_st2<vstruct_elt>_dreg"
 )
 
 (define_insn "aarch64_st3<vstruct_elt>_dreg"
-  [(set (match_operand:VSTRUCT_3DNX 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_3DNX 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_3DNX [
 		(match_operand:VSTRUCT_3DNX 1 "register_operand" "w")]
 		UNSPEC_ST3))]
@@ -7871,7 +7871,7 @@ (define_insn "aarch64_st3<vstruct_elt>_dreg"
 )
 
 (define_insn "aarch64_st3<vstruct_elt>_dreg"
-  [(set (match_operand:VSTRUCT_3DX 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_3DX 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_3DX [
 		(match_operand:VSTRUCT_3DX 1 "register_operand" "w")]
 		UNSPEC_ST3))]
@@ -7881,7 +7881,7 @@ (define_insn "aarch64_st3<vstruct_elt>_dreg"
 )
 
 (define_insn "aarch64_st4<vstruct_elt>_dreg"
-  [(set (match_operand:VSTRUCT_4DNX 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_4DNX 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_4DNX [
 		(match_operand:VSTRUCT_4DNX 1 "register_operand" "w")]
 		UNSPEC_ST4))]
@@ -7891,7 +7891,7 @@ (define_insn "aarch64_st4<vstruct_elt>_dreg"
 )
 
 (define_insn "aarch64_st4<vstruct_elt>_dreg"
-  [(set (match_operand:VSTRUCT_4DX 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_4DX 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_4DX [
 		(match_operand:VSTRUCT_4DX 1 "register_operand" "w")]
 		UNSPEC_ST4))]
@@ -7974,7 +7974,7 @@ (define_expand "vec_init<mode><Vhalf>"
 (define_insn "*aarch64_simd_ld1r<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
 	(vec_duplicate:VALL_F16
-	  (match_operand:<VEL> 1 "aarch64_simd_struct_operand" "Utv")))]
+	  (match_operand:<VEL> 1 "memory_operand" "Utv")))]
   "TARGET_SIMD"
   "ld1r\\t{%0.<Vtype>}, %1"
   [(set_attr "type" "neon_load1_all_lanes")]
@@ -7983,7 +7983,7 @@ (define_insn "*aarch64_simd_ld1r<mode>"
 (define_insn "aarch64_simd_ld1<vstruct_elt>_x2"
   [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_2QD [
-	    (match_operand:VSTRUCT_2QD 1 "aarch64_simd_struct_operand" "Utv")]
+	    (match_operand:VSTRUCT_2QD 1 "memory_operand" "Utv")]
 	    UNSPEC_LD1))]
   "TARGET_SIMD"
   "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index c308015ac2c13d24cd6bcec71247ec45df8cf5e6..6b70a364530c8108457091bfec12fe549f722149 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -494,10 +494,6 @@ (define_predicate "aarch64_simd_reg_or_minus_one"
   (ior (match_operand 0 "register_operand")
        (match_operand 0 "aarch64_simd_imm_minus_one")))
 
-(define_predicate "aarch64_simd_struct_operand"
-  (and (match_code "mem")
-       (match_test "TARGET_SIMD && aarch64_simd_mem_operand_p (op)")))
-
 ;; Like general_operand but allow only valid SIMD addressing modes.
 (define_predicate "aarch64_simd_general_operand"
   (and (match_operand 0 "general_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/vld1r.c b/gcc/testsuite/gcc.target/aarch64/vld1r.c
new file mode 100644
index 0000000000000000000000000000000000000000..72c505c403e9e239771379b7cadd8a9473f06113
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vld1r.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** f1:
+** 	add	x0, x0, 1
+** 	ld1r	{v0.8b}, \[x0\]
+** 	ret
+*/
+uint8x8_t f1(const uint8_t *in) {
+    return vld1_dup_u8(&in[1]);
+}
+
+/*
+** f2:
+** 	ldr	s1, \[x0, 4\]
+** 	fmla	v0.4s, v0.4s, v1.s\[0\]
+** 	ret
+*/
+float32x4_t f2(const float32_t *in, float32x4_t a) {
+    float32x4_t dup = vld1q_dup_f32(&in[1]);
+    return vfmaq_laneq_f32 (a, a, dup, 1);
+}
  

Comments

Richard Sandiford June 8, 2022, 10:31 a.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> At some point we started lowering the ld1r instructions in gimple.
>
> That is:
>
> uint8x8_t f1(const uint8_t *in) {
>     return vld1_dup_u8(&in[1]);
> }
>
> generates at gimple:
>
>   _3 = MEM[(const uint8_t *)in_1(D) + 1B];
>   _4 = {_3, _3, _3, _3, _3, _3, _3, _3};
>
> Which is good, but we then generate:
>
> f1:
> 	ldr     b0, [x0, 1]
> 	dup     v0.8b, v0.b[0]
> 	ret
>
> instead of ld1r.
>
> The reason for the dup sequence is that the load instructions have too
> restrictive a predicate, which prevents combine from merging the
> instructions since the predicate only accepts simple addressing modes.
>
> This patch relaxes the predicate to accept any memory operand and relies on
> LRA to legitimize the address when needed, since the constraint still only
> allows the simple addressing mode.  Reload is always able to legitimize
> addresses to this form.
>
> Secondly, since we now generate more ld1r instructions, it became clear that
> the lane instructions suffer from a similar issue.
>
> i.e.
>
> float32x4_t f2(const float32_t *in, float32x4_t a) {
>     float32x4_t dup = vld1q_dup_f32(&in[1]);
>     return vfmaq_laneq_f32 (a, a, dup, 1);
> }
>
> would generate ld1r + vector fmla instead of ldr + lane fmla.
>
> The reason for this is similar to the ld1r issue: the predicate is too
> restrictive in that it only accepts register operands but not memory.
>
> This relaxes it to accept both register and memory operands while leaving
> the constraint to only accept registers.  This will have LRA generate a
> reload if needed, forcing the memory into a register using the standard
> patterns.
>
> These two changes allow combine and reload to generate the right sequences.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

This is going against the general direction of travel, which is to make
the instruction's predicates and conditions enforce the constraints as
much as possible (making optimistic assumptions about pseudo registers).

The RA *can* deal with things like:

  (match_operand:M N "general_operand" "r")

but it's best avoided, for a few reasons:

(1) The fix-up will be done in LRA, so IRA will not see the temporary
    registers.  This can make the allocation of those temporaries
    suboptimal but (more importantly) it might require other
    previously-allocated registers to be spilled late due to the
    unexpected increase in register pressure.

(2) It ends up hiding instructions from the pre-RA optimisers.

(3) It can also prevent combine opportunities (as well as create them),
    unless the loose predicates in an insn I are propagated to all
    patterns that might result from combining I with something else.

It sounds like the first problem (not generating ld1r) could be fixed
by (a) combining aarch64_simd_dup<mode> and *aarch64_simd_ld1r<mode>,
so that the register and memory alternatives are in the same pattern
and (b) using the merged instruction(s) to implement the vec_duplicate
optab.  Target-independent code should then make the address satisfy
the predicate, simplifying the address where necessary.
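
A rough sketch of such a merged pattern (the name, iterator choice and
attributes here are illustrative assumptions, not a worked-out
implementation) might be:

;; Illustrative only: one vec_duplicate pattern with both register and
;; memory alternatives, which could then back the vec_duplicate optab.
(define_insn "*aarch64_simd_dup_ld1r<mode>"
  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w")
	(vec_duplicate:VALL_F16
	  (match_operand:<VEL> 1 "nonimmediate_operand" "w,Utv")))]
  "TARGET_SIMD"
  "@
   dup\\t%0.<Vtype>, %1.<Vetype>[0]
   ld1r\\t{%0.<Vtype>}, %1"
  [(set_attr "type" "neon_dup<q>,neon_load1_all_lanes")]
)

with vec_duplicate<mode> as the optab entry point, so that
target-independent code legitimizes the address up front.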

I'm not sure whether fixing the ld1r problem that way will avoid the
vfmaq_laneq_f32 problem; let me know if not.

Thanks,
Richard

>    "TARGET_SIMD"
>    "ld1\\t{%S0.1d - %U0.1d}, %1"
> @@ -7591,7 +7591,7 @@ (define_insn "aarch64_ld3<vstruct_elt>_dreg"
>  (define_insn "aarch64_ld4<vstruct_elt>_dreg"
>    [(set (match_operand:VSTRUCT_4DNX 0 "register_operand" "=w")
>  	(unspec:VSTRUCT_4DNX [
> -	  (match_operand:VSTRUCT_4DNX 1 "aarch64_simd_struct_operand" "Utv")]
> +	  (match_operand:VSTRUCT_4DNX 1 "memory_operand" "Utv")]
>  	  UNSPEC_LD4_DREG))]
>    "TARGET_SIMD"
>    "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
> @@ -7601,7 +7601,7 @@ (define_insn "aarch64_ld4<vstruct_elt>_dreg"
>  (define_insn "aarch64_ld4<vstruct_elt>_dreg"
>    [(set (match_operand:VSTRUCT_4DX 0 "register_operand" "=w")
>  	(unspec:VSTRUCT_4DX [
> -	  (match_operand:VSTRUCT_4DX 1 "aarch64_simd_struct_operand" "Utv")]
> +	  (match_operand:VSTRUCT_4DX 1 "memory_operand" "Utv")]
>  	  UNSPEC_LD4_DREG))]
>    "TARGET_SIMD"
>    "ld1\\t{%S0.1d - %V0.1d}, %1"
> @@ -7841,7 +7841,7 @@ (define_insn "aarch64_rev<REVERSE:rev_op><mode>"
>  )
>  
>  (define_insn "aarch64_st2<vstruct_elt>_dreg"
> -  [(set (match_operand:VSTRUCT_2DNX 0 "aarch64_simd_struct_operand" "=Utv")
> +  [(set (match_operand:VSTRUCT_2DNX 0 "memory_operand" "=Utv")
>  	(unspec:VSTRUCT_2DNX [
>  		(match_operand:VSTRUCT_2DNX 1 "register_operand" "w")]
>  		UNSPEC_ST2))]
> @@ -7851,7 +7851,7 @@ (define_insn "aarch64_st2<vstruct_elt>_dreg"
>  )
>  
>  (define_insn "aarch64_st2<vstruct_elt>_dreg"
> -  [(set (match_operand:VSTRUCT_2DX 0 "aarch64_simd_struct_operand" "=Utv")
> +  [(set (match_operand:VSTRUCT_2DX 0 "memory_operand" "=Utv")
>  	(unspec:VSTRUCT_2DX [
>  		(match_operand:VSTRUCT_2DX 1 "register_operand" "w")]
>  		UNSPEC_ST2))]
> @@ -7861,7 +7861,7 @@ (define_insn "aarch64_st2<vstruct_elt>_dreg"
>  )
>  
>  (define_insn "aarch64_st3<vstruct_elt>_dreg"
> -  [(set (match_operand:VSTRUCT_3DNX 0 "aarch64_simd_struct_operand" "=Utv")
> +  [(set (match_operand:VSTRUCT_3DNX 0 "memory_operand" "=Utv")
>  	(unspec:VSTRUCT_3DNX [
>  		(match_operand:VSTRUCT_3DNX 1 "register_operand" "w")]
>  		UNSPEC_ST3))]
> @@ -7871,7 +7871,7 @@ (define_insn "aarch64_st3<vstruct_elt>_dreg"
>  )
>  
>  (define_insn "aarch64_st3<vstruct_elt>_dreg"
> -  [(set (match_operand:VSTRUCT_3DX 0 "aarch64_simd_struct_operand" "=Utv")
> +  [(set (match_operand:VSTRUCT_3DX 0 "memory_operand" "=Utv")
>  	(unspec:VSTRUCT_3DX [
>  		(match_operand:VSTRUCT_3DX 1 "register_operand" "w")]
>  		UNSPEC_ST3))]
> @@ -7881,7 +7881,7 @@ (define_insn "aarch64_st3<vstruct_elt>_dreg"
>  )
>  
>  (define_insn "aarch64_st4<vstruct_elt>_dreg"
> -  [(set (match_operand:VSTRUCT_4DNX 0 "aarch64_simd_struct_operand" "=Utv")
> +  [(set (match_operand:VSTRUCT_4DNX 0 "memory_operand" "=Utv")
>  	(unspec:VSTRUCT_4DNX [
>  		(match_operand:VSTRUCT_4DNX 1 "register_operand" "w")]
>  		UNSPEC_ST4))]
> @@ -7891,7 +7891,7 @@ (define_insn "aarch64_st4<vstruct_elt>_dreg"
>  )
>  
>  (define_insn "aarch64_st4<vstruct_elt>_dreg"
> -  [(set (match_operand:VSTRUCT_4DX 0 "aarch64_simd_struct_operand" "=Utv")
> +  [(set (match_operand:VSTRUCT_4DX 0 "memory_operand" "=Utv")
>  	(unspec:VSTRUCT_4DX [
>  		(match_operand:VSTRUCT_4DX 1 "register_operand" "w")]
>  		UNSPEC_ST4))]
> @@ -7974,7 +7974,7 @@ (define_expand "vec_init<mode><Vhalf>"
>  (define_insn "*aarch64_simd_ld1r<mode>"
>    [(set (match_operand:VALL_F16 0 "register_operand" "=w")
>  	(vec_duplicate:VALL_F16
> -	  (match_operand:<VEL> 1 "aarch64_simd_struct_operand" "Utv")))]
> +	  (match_operand:<VEL> 1 "memory_operand" "Utv")))]
>    "TARGET_SIMD"
>    "ld1r\\t{%0.<Vtype>}, %1"
>    [(set_attr "type" "neon_load1_all_lanes")]
> @@ -7983,7 +7983,7 @@ (define_insn "*aarch64_simd_ld1r<mode>"
>  (define_insn "aarch64_simd_ld1<vstruct_elt>_x2"
>    [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
>  	(unspec:VSTRUCT_2QD [
> -	    (match_operand:VSTRUCT_2QD 1 "aarch64_simd_struct_operand" "Utv")]
> +	    (match_operand:VSTRUCT_2QD 1 "memory_operand" "Utv")]
>  	    UNSPEC_LD1))]
>    "TARGET_SIMD"
>    "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index c308015ac2c13d24cd6bcec71247ec45df8cf5e6..6b70a364530c8108457091bfec12fe549f722149 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -494,10 +494,6 @@ (define_predicate "aarch64_simd_reg_or_minus_one"
>    (ior (match_operand 0 "register_operand")
>         (match_operand 0 "aarch64_simd_imm_minus_one")))
>  
> -(define_predicate "aarch64_simd_struct_operand"
> -  (and (match_code "mem")
> -       (match_test "TARGET_SIMD && aarch64_simd_mem_operand_p (op)")))
> -
>  ;; Like general_operand but allow only valid SIMD addressing modes.
>  (define_predicate "aarch64_simd_general_operand"
>    (and (match_operand 0 "general_operand")
> diff --git a/gcc/testsuite/gcc.target/aarch64/vld1r.c b/gcc/testsuite/gcc.target/aarch64/vld1r.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..72c505c403e9e239771379b7cadd8a9473f06113
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vld1r.c
> @@ -0,0 +1,26 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** f1:
> +** 	add	x0, x0, 1
> +** 	ld1r	{v0.8b}, \[x0\]
> +** 	ret
> +*/
> +uint8x8_t f1(const uint8_t *in) {
> +    return vld1_dup_u8(&in[1]);
> +}
> +
> +/*
> +** f2:
> +** 	ldr	s1, \[x0, 4\]
> +** 	fmla	v0.4s, v0.4s, v1.s\[0\]
> +** 	ret
> +*/
> +float32x4_t f2(const float32_t *in, float32x4_t a) {
> +    float32x4_t dup = vld1q_dup_f32(&in[1]);
> +    return vfmaq_laneq_f32 (a, a, dup, 1);
> +}
  
Tamar Christina June 8, 2022, 1:51 p.m. UTC | #2
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Wednesday, June 8, 2022 11:31 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
> instructions
> 
> Tamar Christina <tamar.christina@arm.com> writes:
> > [...]
> 
> This is going against the general direction of travel, which is to make the
> instruction's predicates and conditions enforce the constraints as much as
> possible (making optimistic assumptions about pseudo registers).
> 
> The RA *can* deal with things like:
> 
>   (match_operand:M N "general_operand" "r")
> 
> but it's best avoided, for a few reasons:
> 
> (1) The fix-up will be done in LRA, so IRA will not see the temporary
>     registers.  This can make the allocation of those temporaries
>     suboptimal but (more importantly) it might require other
>     previously-allocated registers to be spilled late due to the
>     unexpected increase in register pressure.
> 
> (2) It ends up hiding instructions from the pre-RA optimisers.
> 
> (3) It can also prevent combine opportunities (as well as create them),
>     unless the loose predicates in an insn I are propagated to all
>     patterns that might result from combining I with something else.
> 
> It sounds like the first problem (not generating ld1r) could be fixed by (a)
> combining aarch64_simd_dup<mode> and *aarch64_simd_ld1r<mode>, so
> that the register and memory alternatives are in the same pattern and (b)
> using the merged instruction(s) to implement the vec_duplicate optab.
> Target-independent code should then make the address satisfy the
> predicate, simplifying the address where necessary.
> 
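
For concreteness, suggestion (a)/(b) would presumably end up as a single
pattern named after the vec_duplicate optab, along these lines (a sketch
only; the iterator, alternatives and attributes are guesses, not code from
this thread):

;; Sketch: merge dup-from-register and ld1r into one insn, so the RA
;; chooses between the alternatives, and name it after the optab so
;; expand can use it directly.  A real patch would probably also want a
;; predicate that accepts only registers and Utv-style addresses, so
;; that target-independent code legitimizes the address up front.
(define_insn "vec_duplicate<mode>"
  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w")
	(vec_duplicate:VALL_F16
	  (match_operand:<VEL> 1 "nonimmediate_operand" "w,Utv")))]
  "TARGET_SIMD"
  "@
   dup\\t%0.<Vtype>, %1.<Vetype>[0]
   ld1r\\t{%0.<Vtype>}, %1"
  [(set_attr "type" "neon_dup<q>, neon_load1_all_lanes")]
)

(Whether expand ever sees a vec_duplicate for fixed-length vectors is
exactly the sticking point discussed below.)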

I think I am likely missing something here. I would assume that you wanted
to use the optab to split the addressing off from the mem expression so the
combined insn matches.

But in that case, why do you need to combine the two instructions?
I've tried and it doesn't work since the vec_duplicate optab doesn't see the
mem as op1, because in gimple the mem is not part of the duplicate.

So you still just see:

>>> dbgrtx (ops[1].value)
(subreg/s/v:QI (reg:SI 92 [ _3 ]) 0)

The operand passed as the argument to the dup is just an SSA_NAME.

If not, and you wanted the combined insn to accept

(set (reg:SI 92 [ _3 ])
    (zero_extend:SI (mem:QI (plus:DI (reg:DI 97)
                (const_int 1 [0x1])) [0 MEM[(const uint8_tD.4561 *)in_1(D) + 1B]+0 S1 A8])))

Then that's also not possible without relaxing the combined predicates.  As far as I can tell,
if I'm not allowed to use LRA for this, then the only thing that could work is an early split?

Or do I have to modify store_constructor to try a variant where it pushes in the
decl of an SSA_NAME first?

I guess this also only really works for ld1r; whenever we lower ld2(r) etc. we'll have the same
issue again... But I suppose that's for the next person?

Thanks,
Tamar

> I'm not sure whether fixing the ld1r problem that way will avoid the
> vfmaq_laneq_f32 problem; let me know if not.
> 
> Thanks,
> Richard
> 
> > [...]
  
Richard Sandiford June 8, 2022, 2:35 p.m. UTC | #3
Tamar Christina <Tamar.Christina@arm.com> writes:
>> [...]
>> 
>> This is going against the general direction of travel, which is to make the
>> instruction's predicates and conditions enforce the constraints as much as
>> possible (making optimistic assumptions about pseudo registers).
>> 
>> The RA *can* deal with things like:
>> 
>>   (match_operand:M N "general_operand" "r")
>> 
>> but it's best avoided, for a few reasons:
>> 
>> (1) The fix-up will be done in LRA, so IRA will not see the temporary
>>     registers.  This can make the allocation of those temporaries
>>     suboptimal but (more importantly) it might require other
>>     previously-allocated registers to be spilled late due to the
>>     unexpected increase in register pressure.
>> 
>> (2) It ends up hiding instructions from the pre-RA optimisers.
>> 
>> (3) It can also prevent combine opportunities (as well as create them),
>>     unless the loose predicates in an insn I are propagated to all
>>     patterns that might result from combining I with something else.
>> 
>> It sounds like the first problem (not generating ld1r) could be fixed by (a)
>> combining aarch64_simd_dup<mode> and *aarch64_simd_ld1r<mode>, so
>> that the register and memory alternatives are in the same pattern and (b)
>> using the merged instruction(s) to implement the vec_duplicate optab.
>> Target-independent code should then make the address satisfy the
>> predicate, simplifying the address where necessary.
>> 
>
> I think I am likely missing something here. I would assume that you wanted
> to use the optab to split the addressing off from the mem expression so the
> combined insn matches.
>
> But in that case, why do you need to combine the two instructions?
> I've tried and it doesn't work since the vec_duplicate optab doesn't see the
> mem as op1, because in gimple the mem is not part of the duplicate.
>
> So you still just see:
>
>>>> dbgrtx (ops[1].value)
> (subreg/s/v:QI (reg:SI 92 [ _3 ]) 0)
>
> The operand passed as the argument to the dup is just an SSA_NAME.

Ah, yeah, I'd forgotten that fixed-length vec_duplicates would
come from a constructor rather than a vec_duplicate_expr, so we don't
get the usual benefit of folding single-use mems during expand.

https://gcc.gnu.org/pipermail/gcc-patches/2022-May/595362.html
moves towards using vec_duplicate even for fixed-length vectors.
If we take that approach, then I suppose a plain constructor
should be folded to a vec_duplicate where possible.

(Alternatively, we could use an extended vec_perm_expr with
scalar inputs, as Richi suggested in that thread.)
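
To make that concrete, such a fold might look roughly like this (a sketch
only; the function name, its placement and the exact gating are
assumptions, not something proposed in this thread):

/* Sketch, for gimple-isel.cc or similar: rewrite a uniform CONSTRUCTOR
   of a single SSA name into a VEC_DUPLICATE_EXPR when the target can
   expand vec_duplicate for the mode (for Advanced SIMD that relies on
   the target actually providing vec_duplicate<mode>, as in the earlier
   sketch), so that expand sees the scalar operand directly and can fold
   a single-use load into it.  */
static bool
maybe_fold_ctor_to_vec_duplicate (gimple_stmt_iterator *gsi)
{
  gassign *stmt = dyn_cast <gassign *> (gsi_stmt (*gsi));
  if (!stmt || gimple_assign_rhs_code (stmt) != CONSTRUCTOR)
    return false;

  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
  if (!VECTOR_TYPE_P (type)
      || optab_handler (vec_duplicate_optab,
			TYPE_MODE (type)) == CODE_FOR_nothing)
    return false;

  /* uniform_vector_p returns the repeated element, or NULL_TREE.  */
  tree elt = uniform_vector_p (gimple_assign_rhs1 (stmt));
  if (!elt || TREE_CODE (elt) != SSA_NAME)
    return false;

  gimple_assign_set_rhs_with_ops (gsi, VEC_DUPLICATE_EXPR, elt);
  update_stmt (gsi_stmt (*gsi));
  return true;
}

With the f1 example, expand would then see the load as the direct operand
of the vec_duplicate and could emit ld1r without combine's help.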

If we don't do that, or don't do it yet, then…

> If not, and you wanted the combined insn to accept
>
> (set (reg:SI 92 [ _3 ])
>     (zero_extend:SI (mem:QI (plus:DI (reg:DI 97)
>                 (const_int 1 [0x1])) [0 MEM[(const uint8_tD.4561 *)in_1(D) + 1B]+0 S1 A8])))
>
> Then that's also not possible without relaxing the combined predicates.  As far as I can tell,
> if I'm not allowed to use LRA for this, then the only thing that could work is an early split?
>
> Or do I have to modify store_constructor to try a variant where it pushes in the
> decl of an SSA_NAME first?

…yeah, something like this would be needed.  But the vec_duplicate_expr/
vec_perm_expr thing seems better, even if we only introduce it during isel.

Not my call either way though :-)  Let's see what Richi (cc:ed) thinks.

Thanks,
Richard

> [...]
>> > Ok for master?
>> >
>> > Thanks,
>> > Tamar
>> >
>> > gcc/ChangeLog:
>> >
>> > 	* config/aarch64/aarch64-simd.md (mul_lane<mode>3,
>> mul_laneq<mode>3,
>> > 	mul_n<mode>3, *aarch64_mul3_elt_to_64v2df,
>> *aarch64_mla_elt<mode>,
>> > 	*aarch64_mla_elt_<vswap_width_name><mode>,
>> aarch64_mla_n<mode>,
>> > 	*aarch64_mls_elt<mode>,
>> *aarch64_mls_elt_<vswap_width_name><mode>,
>> > 	aarch64_mls_n<mode>, *aarch64_fma4_elt<mode>,
>> > 	*aarch64_fma4_elt_<vswap_width_name><mode>,
>> > 	*aarch64_fma4_elt_from_dup<mode>,
>> *aarch64_fma4_elt_to_64v2df,
>> > 	*aarch64_fnma4_elt<mode>,
>> *aarch64_fnma4_elt_<vswap_width_name><mode>,
>> > 	*aarch64_fnma4_elt_from_dup<mode>,
>> *aarch64_fnma4_elt_to_64v2df,
>> > 	*aarch64_mulx_elt_<vswap_width_name><mode>,
>> > 	*aarch64_mulx_elt<mode>, *aarch64_mulx_elt_from_dup<mode>,
>> > 	*aarch64_vgetfmulx<mode>): Relax register_operand to
>> > 	nonimmediate_operand.
>> > 	(aarch64_simd_ld2<vstruct_elt>, aarch64_simd_ld2r<vstruct_elt>,
>> > 	aarch64_vec_load_lanes<mode>_lane<vstruct_elt>,
>> > 	vec_load_lanes<mode><vstruct_elt>,
>> aarch64_simd_st2<vstruct_elt>,
>> > 	aarch64_vec_store_lanes<mode>_lane<vstruct_elt>,
>> > 	vec_store_lanes<mode><vstruct_elt>,
>> aarch64_simd_ld3<vstruct_elt>,
>> > 	aarch64_simd_ld3r<vstruct_elt>,
>> > 	aarch64_vec_load_lanes<mode>_lane<vstruct_elt>,
>> > 	vec_load_lanes<mode><vstruct_elt>,
>> aarch64_simd_st3<vstruct_elt>,
>> > 	aarch64_vec_store_lanes<mode>_lane<vstruct_elt>,
>> > 	vec_store_lanes<mode><vstruct_elt>,
>> aarch64_simd_ld4<vstruct_elt>,
>> > 	aarch64_simd_ld4r<vstruct_elt>,
>> > 	aarch64_vec_load_lanes<mode>_lane<vstruct_elt>,
>> > 	vec_load_lanes<mode><vstruct_elt>,
>> aarch64_simd_st4<vstruct_elt>,
>> > 	aarch64_vec_store_lanes<mode>_lane<vstruct_elt>,
>> > 	vec_store_lanes<mode><vstruct_elt>,
>> aarch64_ld1_x3_<vstruct_elt>,
>> > 	aarch64_ld1_x4_<vstruct_elt>, aarch64_st1_x2_<vstruct_elt>,
>> > 	aarch64_st1_x3_<vstruct_elt>, aarch64_st1_x4_<vstruct_elt>,
>> > 	aarch64_be_ld1<mode>, aarch64_be_st1<mode>,
>> > 	aarch64_ld2<vstruct_elt>_dreg, aarch64_ld2<vstruct_elt>_dreg,
>> > 	aarch64_ld3<vstruct_elt>_dreg, aarch64_ld3<vstruct_elt>_dreg,
>> > 	aarch64_ld4<vstruct_elt>_dreg, aarch64_ld4<vstruct_elt>_dreg,
>> > 	aarch64_st2<vstruct_elt>_dreg, aarch64_st2<vstruct_elt>_dreg,
>> > 	aarch64_st3<vstruct_elt>_dreg, aarch64_st3<vstruct_elt>_dreg,
>> > 	aarch64_st4<vstruct_elt>_dreg, aarch64_st4<vstruct_elt>_dreg,
>> > 	*aarch64_simd_ld1r<mode>, aarch64_simd_ld1<vstruct_elt>_x2):
>> Relax
>> > 	aarch64_simd_struct_operand to memory_operand.
>> > 	* config/aarch64/predicates.md (aarch64_simd_struct_operand):
>> Remove.
>> >
>> > gcc/testsuite/ChangeLog:
>> >
>> > 	* gcc.target/aarch64/vld1r.c: New test.
>> >
>> > --- inline copy of patch --
>> > diff --git a/gcc/config/aarch64/aarch64-simd.md
>> > b/gcc/config/aarch64/aarch64-simd.md
>> > index
>> >
>> be5c70bbb7520ae93d19c4a432ce34863e5b9a64..24e3274ddda2ea76c83571fa
>> da8f
>> > f4c953b752a1 100644
>> > --- a/gcc/config/aarch64/aarch64-simd.md
>> > +++ b/gcc/config/aarch64/aarch64-simd.md
>> > @@ -712,7 +712,7 @@ (define_insn "mul_lane<mode>3"
>> >         (mult:VMULD
>> >  	 (vec_duplicate:VMULD
>> >  	   (vec_select:<VEL>
>> > -	     (match_operand:<VCOND> 2 "register_operand" "<h_con>")
>> > +	     (match_operand:<VCOND> 2 "nonimmediate_operand"
>> "<h_con>")
>> >  	     (parallel [(match_operand:SI 3 "immediate_operand" "i")])))
>> >  	 (match_operand:VMULD 1 "register_operand" "w")))]
>> >    "TARGET_SIMD"
>> > @@ -728,7 +728,7 @@ (define_insn "mul_laneq<mode>3"
>> >       (mult:VMUL
>> >         (vec_duplicate:VMUL
>> >  	  (vec_select:<VEL>
>> > -	    (match_operand:<VCONQ> 2 "register_operand" "<h_con>")
>> > +	    (match_operand:<VCONQ> 2 "nonimmediate_operand"
>> "<h_con>")
>> >  	    (parallel [(match_operand:SI 3 "immediate_operand")])))
>> >        (match_operand:VMUL 1 "register_operand" "w")))]
>> >    "TARGET_SIMD"
>> > @@ -743,7 +743,7 @@ (define_insn "mul_n<mode>3"
>> >   [(set (match_operand:VMUL 0 "register_operand" "=w")
>> >         (mult:VMUL
>> >  	 (vec_duplicate:VMUL
>> > -	   (match_operand:<VEL> 2 "register_operand" "<h_con>"))
>> > +	   (match_operand:<VEL> 2 "nonimmediate_operand" "<h_con>"))
>> >  	 (match_operand:VMUL 1 "register_operand" "w")))]
>> >    "TARGET_SIMD"
>> >    "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"; @@ -789,7 +789,7
>> > @@ (define_insn "*aarch64_mul3_elt_to_64v2df"
>> >    [(set (match_operand:DF 0 "register_operand" "=w")
>> >       (mult:DF
>> >         (vec_select:DF
>> > -	 (match_operand:V2DF 1 "register_operand" "w")
>> > +	 (match_operand:V2DF 1 "nonimmediate_operand" "w")
>> >  	 (parallel [(match_operand:SI 2 "immediate_operand")]))
>> >         (match_operand:DF 3 "register_operand" "w")))]
>> >    "TARGET_SIMD"
>> > @@ -1406,7 +1406,7 @@ (define_insn "*aarch64_mla_elt<mode>"
>> >  	 (mult:VDQHS
>> >  	   (vec_duplicate:VDQHS
>> >  	      (vec_select:<VEL>
>> > -		(match_operand:VDQHS 1 "register_operand" "<h_con>")
>> > +		(match_operand:VDQHS 1 "nonimmediate_operand"
>> "<h_con>")
>> >  		  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >  	   (match_operand:VDQHS 3 "register_operand" "w"))
>> >  	 (match_operand:VDQHS 4 "register_operand" "0")))]
>> > @@ -1424,7 +1424,7 @@ (define_insn "*aarch64_mla_elt_<vswap_width_name><mode>"
>> >  	 (mult:VDQHS
>> >  	   (vec_duplicate:VDQHS
>> >  	      (vec_select:<VEL>
>> > -		(match_operand:<VSWAP_WIDTH> 1 "register_operand"
>> "<h_con>")
>> > +		(match_operand:<VSWAP_WIDTH> 1
>> "nonimmediate_operand" "<h_con>")
>> >  		  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >  	   (match_operand:VDQHS 3 "register_operand" "w"))
>> >  	 (match_operand:VDQHS 4 "register_operand" "0")))]
>> > @@ -1441,7 +1441,7 @@ (define_insn "aarch64_mla_n<mode>"
>> >  	(plus:VDQHS
>> >  	  (mult:VDQHS
>> >  	    (vec_duplicate:VDQHS
>> > -	      (match_operand:<VEL> 3 "register_operand" "<h_con>"))
>> > +	      (match_operand:<VEL> 3 "nonimmediate_operand" "<h_con>"))
>> >  	    (match_operand:VDQHS 2 "register_operand" "w"))
>> >  	  (match_operand:VDQHS 1 "register_operand" "0")))]
>> >   "TARGET_SIMD"
>> > @@ -1466,7 +1466,7 @@ (define_insn "*aarch64_mls_elt<mode>"
>> >  	 (mult:VDQHS
>> >  	   (vec_duplicate:VDQHS
>> >  	      (vec_select:<VEL>
>> > -		(match_operand:VDQHS 1 "register_operand" "<h_con>")
>> > +		(match_operand:VDQHS 1 "nonimmediate_operand"
>> "<h_con>")
>> >  		  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >  	   (match_operand:VDQHS 3 "register_operand" "w"))))]
>> >   "TARGET_SIMD"
>> > @@ -1484,7 +1484,7 @@ (define_insn
>> "*aarch64_mls_elt_<vswap_width_name><mode>"
>> >  	 (mult:VDQHS
>> >  	   (vec_duplicate:VDQHS
>> >  	      (vec_select:<VEL>
>> > -		(match_operand:<VSWAP_WIDTH> 1 "register_operand"
>> "<h_con>")
>> > +		(match_operand:<VSWAP_WIDTH> 1
>> "nonimmediate_operand" "<h_con>")
>> >  		  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >  	   (match_operand:VDQHS 3 "register_operand" "w"))))]
>> >   "TARGET_SIMD"
>> > @@ -1501,7 +1501,7 @@ (define_insn "aarch64_mls_n<mode>"
>> >  	  (match_operand:VDQHS 1 "register_operand" "0")
>> >  	  (mult:VDQHS
>> >  	    (vec_duplicate:VDQHS
>> > -	      (match_operand:<VEL> 3 "register_operand" "<h_con>"))
>> > +	      (match_operand:<VEL> 3 "nonimmediate_operand" "<h_con>"))
>> >  	    (match_operand:VDQHS 2 "register_operand" "w"))))]
>> >    "TARGET_SIMD"
>> >    "mls\t%0.<Vtype>, %2.<Vtype>, %3.<Vetype>[0]"
>> > @@ -2882,7 +2882,7 @@ (define_insn "*aarch64_fma4_elt<mode>"
>> >      (fma:VDQF
>> >        (vec_duplicate:VDQF
>> >  	(vec_select:<VEL>
>> > -	  (match_operand:VDQF 1 "register_operand" "<h_con>")
>> > +	  (match_operand:VDQF 1 "nonimmediate_operand" "<h_con>")
>> >  	  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >        (match_operand:VDQF 3 "register_operand" "w")
>> >        (match_operand:VDQF 4 "register_operand" "0")))]
>> > @@ -2899,7 +2899,7 @@ (define_insn "*aarch64_fma4_elt_<vswap_width_name><mode>"
>> >      (fma:VDQSF
>> >        (vec_duplicate:VDQSF
>> >  	(vec_select:<VEL>
>> > -	  (match_operand:<VSWAP_WIDTH> 1 "register_operand"
>> "<h_con>")
>> > +	  (match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand"
>> "<h_con>")
>> >  	  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >        (match_operand:VDQSF 3 "register_operand" "w")
>> >        (match_operand:VDQSF 4 "register_operand" "0")))]
>> > @@ -2915,7 +2915,7 @@ (define_insn "*aarch64_fma4_elt_from_dup<mode>"
>> >    [(set (match_operand:VMUL 0 "register_operand" "=w")
>> >      (fma:VMUL
>> >        (vec_duplicate:VMUL
>> > -	  (match_operand:<VEL> 1 "register_operand" "<h_con>"))
>> > +	  (match_operand:<VEL> 1 "nonimmediate_operand" "<h_con>"))
>> >        (match_operand:VMUL 2 "register_operand" "w")
>> >        (match_operand:VMUL 3 "register_operand" "0")))]
>> >    "TARGET_SIMD"
>> > @@ -2927,7 +2927,7 @@ (define_insn "*aarch64_fma4_elt_to_64v2df"
>> >    [(set (match_operand:DF 0 "register_operand" "=w")
>> >      (fma:DF
>> >  	(vec_select:DF
>> > -	  (match_operand:V2DF 1 "register_operand" "w")
>> > +	  (match_operand:V2DF 1 "nonimmediate_operand" "w")
>> >  	  (parallel [(match_operand:SI 2 "immediate_operand")]))
>> >        (match_operand:DF 3 "register_operand" "w")
>> >        (match_operand:DF 4 "register_operand" "0")))]
>> > @@ -2957,7 +2957,7 @@ (define_insn "*aarch64_fnma4_elt<mode>"
>> >          (match_operand:VDQF 3 "register_operand" "w"))
>> >        (vec_duplicate:VDQF
>> >  	(vec_select:<VEL>
>> > -	  (match_operand:VDQF 1 "register_operand" "<h_con>")
>> > +	  (match_operand:VDQF 1 "nonimmediate_operand" "<h_con>")
>> >  	  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >        (match_operand:VDQF 4 "register_operand" "0")))]
>> >    "TARGET_SIMD"
>> > @@ -2975,7 +2975,7 @@ (define_insn
>> "*aarch64_fnma4_elt_<vswap_width_name><mode>"
>> >          (match_operand:VDQSF 3 "register_operand" "w"))
>> >        (vec_duplicate:VDQSF
>> >  	(vec_select:<VEL>
>> > -	  (match_operand:<VSWAP_WIDTH> 1 "register_operand"
>> "<h_con>")
>> > +	  (match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand"
>> "<h_con>")
>> >  	  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >        (match_operand:VDQSF 4 "register_operand" "0")))]
>> >    "TARGET_SIMD"
>> > @@ -2992,7 +2992,7 @@ (define_insn
>> "*aarch64_fnma4_elt_from_dup<mode>"
>> >        (neg:VMUL
>> >          (match_operand:VMUL 2 "register_operand" "w"))
>> >        (vec_duplicate:VMUL
>> > -	(match_operand:<VEL> 1 "register_operand" "<h_con>"))
>> > +	(match_operand:<VEL> 1 "nonimmediate_operand" "<h_con>"))
>> >        (match_operand:VMUL 3 "register_operand" "0")))]
>> >    "TARGET_SIMD"
>> >    "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
>> > @@ -3003,7 +3003,7 @@ (define_insn "*aarch64_fnma4_elt_to_64v2df"
>> >    [(set (match_operand:DF 0 "register_operand" "=w")
>> >      (fma:DF
>> >        (vec_select:DF
>> > -	(match_operand:V2DF 1 "register_operand" "w")
>> > +	(match_operand:V2DF 1 "nonimmediate_operand" "w")
>> >  	(parallel [(match_operand:SI 2 "immediate_operand")]))
>> >        (neg:DF
>> >          (match_operand:DF 3 "register_operand" "w"))
>> > @@ -4934,7 +4934,7 @@ (define_insn "*aarch64_mulx_elt_<vswap_width_name><mode>"
>> >  	 [(match_operand:VDQSF 1 "register_operand" "w")
>> >  	  (vec_duplicate:VDQSF
>> >  	   (vec_select:<VEL>
>> > -	    (match_operand:<VSWAP_WIDTH> 2 "register_operand" "w")
>> > +	    (match_operand:<VSWAP_WIDTH> 2 "nonimmediate_operand"
>> "w")
>> >  	    (parallel [(match_operand:SI 3 "immediate_operand" "i")])))]
>> >  	 UNSPEC_FMULX))]
>> >    "TARGET_SIMD"
>> > @@ -4953,7 +4953,7 @@ (define_insn "*aarch64_mulx_elt<mode>"
>> >  	 [(match_operand:VDQF 1 "register_operand" "w")
>> >  	  (vec_duplicate:VDQF
>> >  	   (vec_select:<VEL>
>> > -	    (match_operand:VDQF 2 "register_operand" "w")
>> > +	    (match_operand:VDQF 2 "nonimmediate_operand" "w")
>> >  	    (parallel [(match_operand:SI 3 "immediate_operand" "i")])))]
>> >  	 UNSPEC_FMULX))]
>> >    "TARGET_SIMD"
>> > @@ -4971,7 +4971,7 @@ (define_insn
>> "*aarch64_mulx_elt_from_dup<mode>"
>> >  	(unspec:VHSDF
>> >  	 [(match_operand:VHSDF 1 "register_operand" "w")
>> >  	  (vec_duplicate:VHSDF
>> > -	    (match_operand:<VEL> 2 "register_operand" "<h_con>"))]
>> > +	    (match_operand:<VEL> 2 "nonimmediate_operand" "<h_con>"))]
>> >  	 UNSPEC_FMULX))]
>> >    "TARGET_SIMD"
>> >    "fmulx\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"; @@ -4987,7
>> +4987,7
>> > @@ (define_insn "*aarch64_vgetfmulx<mode>"
>> >  	(unspec:<VEL>
>> >  	 [(match_operand:<VEL> 1 "register_operand" "w")
>> >  	  (vec_select:<VEL>
>> > -	   (match_operand:VDQF 2 "register_operand" "w")
>> > +	   (match_operand:VDQF 2 "nonimmediate_operand" "w")
>> >  	    (parallel [(match_operand:SI 3 "immediate_operand" "i")]))]
>> >  	 UNSPEC_FMULX))]
>> >    "TARGET_SIMD"
>> > @@ -6768,7 +6768,7 @@ (define_insn "*sqrt<mode>2"
>> >  (define_insn "aarch64_simd_ld2<vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_2Q 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_2Q [
>> > -	  (match_operand:VSTRUCT_2Q 1 "aarch64_simd_struct_operand"
>> "Utv")]
>> > +	  (match_operand:VSTRUCT_2Q 1 "memory_operand" "Utv")]
>> >  	  UNSPEC_LD2))]
>> >    "TARGET_SIMD"
>> >    "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
>> > @@ -6778,7 +6778,7 @@ (define_insn "aarch64_simd_ld2<vstruct_elt>"
>> >  (define_insn "aarch64_simd_ld2r<vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_2QD [
>> > -	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
>> > +	  (match_operand:BLK 1 "memory_operand" "Utv")]
>> >            UNSPEC_LD2_DUP))]
>> >    "TARGET_SIMD"
>> >    "ld2r\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
>> > @@ -6788,7 +6788,7 @@ (define_insn "aarch64_simd_ld2r<vstruct_elt>"
>> >  (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_2QD [
>> > -		(match_operand:BLK 1 "aarch64_simd_struct_operand"
>> "Utv")
>> > +		(match_operand:BLK 1 "memory_operand" "Utv")
>> >  		(match_operand:VSTRUCT_2QD 2 "register_operand" "0")
>> >  		(match_operand:SI 3 "immediate_operand" "i")]
>> >  		UNSPEC_LD2_LANE))]
>> > @@ -6804,7 +6804,7 @@ (define_insn
>> "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
>> >  (define_expand "vec_load_lanes<mode><vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_2Q 0 "register_operand")
>> >  	(unspec:VSTRUCT_2Q [
>> > -		(match_operand:VSTRUCT_2Q 1
>> "aarch64_simd_struct_operand")]
>> > +		(match_operand:VSTRUCT_2Q 1 "memory_operand")]
>> >  		UNSPEC_LD2))]
>> >    "TARGET_SIMD"
>> >  {
>> > @@ -6822,7 +6822,7 @@ (define_expand
>> "vec_load_lanes<mode><vstruct_elt>"
>> >  })
>> >
>> >  (define_insn "aarch64_simd_st2<vstruct_elt>"
>> > -  [(set (match_operand:VSTRUCT_2Q 0 "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VSTRUCT_2Q 0 "memory_operand" "=Utv")
>> >  	(unspec:VSTRUCT_2Q [
>> >  		(match_operand:VSTRUCT_2Q 1 "register_operand" "w")]
>> >                  UNSPEC_ST2))]
>> > @@ -6833,7 +6833,7 @@ (define_insn "aarch64_simd_st2<vstruct_elt>"
>> >
>> >  ;; RTL uses GCC vector extension indices, so flip only for assembly.
>> >  (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
>> > -  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
>> > +  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
>> >  	(unspec:BLK [(match_operand:VSTRUCT_2QD 1 "register_operand"
>> "w")
>> >  		     (match_operand:SI 2 "immediate_operand" "i")]
>> >  		     UNSPEC_ST2_LANE))]
>> > @@ -6847,7 +6847,7 @@ (define_insn
>> "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
>> >  )
>> >
>> >  (define_expand "vec_store_lanes<mode><vstruct_elt>"
>> > -  [(set (match_operand:VSTRUCT_2Q 0 "aarch64_simd_struct_operand")
>> > +  [(set (match_operand:VSTRUCT_2Q 0 "memory_operand")
>> >  	(unspec:VSTRUCT_2Q [(match_operand:VSTRUCT_2Q 1
>> "register_operand")]
>> >                     UNSPEC_ST2))]
>> >    "TARGET_SIMD"
>> > @@ -6868,7 +6868,7 @@ (define_expand
>> "vec_store_lanes<mode><vstruct_elt>"
>> >  (define_insn "aarch64_simd_ld3<vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_3Q 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_3Q [
>> > -	  (match_operand:VSTRUCT_3Q 1 "aarch64_simd_struct_operand"
>> "Utv")]
>> > +	  (match_operand:VSTRUCT_3Q 1 "memory_operand" "Utv")]
>> >  	  UNSPEC_LD3))]
>> >    "TARGET_SIMD"
>> >    "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
>> > @@ -6878,7 +6878,7 @@ (define_insn "aarch64_simd_ld3<vstruct_elt>"
>> >  (define_insn "aarch64_simd_ld3r<vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_3QD [
>> > -	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
>> > +	  (match_operand:BLK 1 "memory_operand" "Utv")]
>> >            UNSPEC_LD3_DUP))]
>> >    "TARGET_SIMD"
>> >    "ld3r\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
>> > @@ -6888,7 +6888,7 @@ (define_insn "aarch64_simd_ld3r<vstruct_elt>"
>> >  (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_3QD [
>> > -		(match_operand:BLK 1 "aarch64_simd_struct_operand"
>> "Utv")
>> > +		(match_operand:BLK 1 "memory_operand" "Utv")
>> >  		(match_operand:VSTRUCT_3QD 2 "register_operand" "0")
>> >  		(match_operand:SI 3 "immediate_operand" "i")]
>> >  		UNSPEC_LD3_LANE))]
>> > @@ -6904,7 +6904,7 @@ (define_insn
>> "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
>> >  (define_expand "vec_load_lanes<mode><vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_3Q 0 "register_operand")
>> >  	(unspec:VSTRUCT_3Q [
>> > -		(match_operand:VSTRUCT_3Q 1
>> "aarch64_simd_struct_operand")]
>> > +		(match_operand:VSTRUCT_3Q 1 "memory_operand")]
>> >  		UNSPEC_LD3))]
>> >    "TARGET_SIMD"
>> >  {
>> > @@ -6922,7 +6922,7 @@ (define_expand
>> "vec_load_lanes<mode><vstruct_elt>"
>> >  })
>> >
>> >  (define_insn "aarch64_simd_st3<vstruct_elt>"
>> > -  [(set (match_operand:VSTRUCT_3Q 0 "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VSTRUCT_3Q 0 "memory_operand" "=Utv")
>> >  	(unspec:VSTRUCT_3Q [(match_operand:VSTRUCT_3Q 1
>> "register_operand" "w")]
>> >                     UNSPEC_ST3))]
>> >    "TARGET_SIMD"
>> > @@ -6932,7 +6932,7 @@ (define_insn "aarch64_simd_st3<vstruct_elt>"
>> >
>> >  ;; RTL uses GCC vector extension indices, so flip only for assembly.
>> >  (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
>> > -  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
>> > +  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
>> >  	(unspec:BLK [(match_operand:VSTRUCT_3QD 1 "register_operand"
>> "w")
>> >  		     (match_operand:SI 2 "immediate_operand" "i")]
>> >  		     UNSPEC_ST3_LANE))]
>> > @@ -6946,7 +6946,7 @@ (define_insn
>> "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
>> >  )
>> >
>> >  (define_expand "vec_store_lanes<mode><vstruct_elt>"
>> > -  [(set (match_operand:VSTRUCT_3Q 0 "aarch64_simd_struct_operand")
>> > +  [(set (match_operand:VSTRUCT_3Q 0 "memory_operand")
>> >  	(unspec:VSTRUCT_3Q [
>> >  		(match_operand:VSTRUCT_3Q 1 "register_operand")]
>> >                  UNSPEC_ST3))]
>> > @@ -6968,7 +6968,7 @@ (define_expand
>> "vec_store_lanes<mode><vstruct_elt>"
>> >  (define_insn "aarch64_simd_ld4<vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_4Q 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_4Q [
>> > -	  (match_operand:VSTRUCT_4Q 1 "aarch64_simd_struct_operand"
>> "Utv")]
>> > +	  (match_operand:VSTRUCT_4Q 1 "memory_operand" "Utv")]
>> >  	  UNSPEC_LD4))]
>> >    "TARGET_SIMD"
>> >    "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
>> > @@ -6978,7 +6978,7 @@ (define_insn "aarch64_simd_ld4<vstruct_elt>"
>> >  (define_insn "aarch64_simd_ld4r<vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_4QD [
>> > -	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
>> > +	  (match_operand:BLK 1 "memory_operand" "Utv")]
>> >            UNSPEC_LD4_DUP))]
>> >    "TARGET_SIMD"
>> >    "ld4r\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
>> > @@ -6988,7 +6988,7 @@ (define_insn "aarch64_simd_ld4r<vstruct_elt>"
>> >  (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_4QD [
>> > -		(match_operand:BLK 1 "aarch64_simd_struct_operand"
>> "Utv")
>> > +		(match_operand:BLK 1 "memory_operand" "Utv")
>> >  		(match_operand:VSTRUCT_4QD 2 "register_operand" "0")
>> >  		(match_operand:SI 3 "immediate_operand" "i")]
>> >  		UNSPEC_LD4_LANE))]
>> > @@ -7004,7 +7004,7 @@ (define_insn
>> "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
>> >  (define_expand "vec_load_lanes<mode><vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_4Q 0 "register_operand")
>> >  	(unspec:VSTRUCT_4Q [
>> > -		(match_operand:VSTRUCT_4Q 1
>> "aarch64_simd_struct_operand")]
>> > +		(match_operand:VSTRUCT_4Q 1 "memory_operand")]
>> >  		UNSPEC_LD4))]
>> >    "TARGET_SIMD"
>> >  {
>> > @@ -7022,7 +7022,7 @@ (define_expand
>> "vec_load_lanes<mode><vstruct_elt>"
>> >  })
>> >
>> >  (define_insn "aarch64_simd_st4<vstruct_elt>"
>> > -  [(set (match_operand:VSTRUCT_4Q 0 "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VSTRUCT_4Q 0 "memory_operand" "=Utv")
>> >  	(unspec:VSTRUCT_4Q [
>> >  		(match_operand:VSTRUCT_4Q 1 "register_operand" "w")]
>> >                  UNSPEC_ST4))]
>> > @@ -7033,7 +7033,7 @@ (define_insn "aarch64_simd_st4<vstruct_elt>"
>> >
>> >  ;; RTL uses GCC vector extension indices, so flip only for assembly.
>> >  (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
>> > -  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
>> > +  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
>> >  	(unspec:BLK [(match_operand:VSTRUCT_4QD 1 "register_operand"
>> "w")
>> >  		     (match_operand:SI 2 "immediate_operand" "i")]
>> >  		     UNSPEC_ST4_LANE))]
>> > @@ -7047,7 +7047,7 @@ (define_insn
>> "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
>> >  )
>> >
>> >  (define_expand "vec_store_lanes<mode><vstruct_elt>"
>> > -  [(set (match_operand:VSTRUCT_4Q 0 "aarch64_simd_struct_operand")
>> > +  [(set (match_operand:VSTRUCT_4Q 0 "memory_operand")
>> >  	(unspec:VSTRUCT_4Q [(match_operand:VSTRUCT_4Q 1
>> "register_operand")]
>> >                     UNSPEC_ST4))]
>> >    "TARGET_SIMD"
>> > @@ -7138,7 +7138,7 @@ (define_expand "aarch64_ld1x3<vstruct_elt>"
>> >  (define_insn "aarch64_ld1_x3_<vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
>> >          (unspec:VSTRUCT_3QD
>> > -	  [(match_operand:VSTRUCT_3QD 1 "aarch64_simd_struct_operand"
>> "Utv")]
>> > +	  [(match_operand:VSTRUCT_3QD 1 "memory_operand" "Utv")]
>> >  	  UNSPEC_LD1))]
>> >    "TARGET_SIMD"
>> >    "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
>> > @@ -7158,7 +7158,7 @@ (define_expand "aarch64_ld1x4<vstruct_elt>"
>> >  (define_insn "aarch64_ld1_x4_<vstruct_elt>"
>> >    [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_4QD
>> > -	  [(match_operand:VSTRUCT_4QD 1 "aarch64_simd_struct_operand"
>> "Utv")]
>> > +	  [(match_operand:VSTRUCT_4QD 1 "memory_operand" "Utv")]
>> >  	UNSPEC_LD1))]
>> >    "TARGET_SIMD"
>> >    "ld1\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
>> > @@ -7176,7 +7176,7 @@ (define_expand "aarch64_st1x2<vstruct_elt>"
>> >  })
>> >
>> >  (define_insn "aarch64_st1_x2_<vstruct_elt>"
>> > -  [(set (match_operand:VSTRUCT_2QD 0 "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VSTRUCT_2QD 0 "memory_operand" "=Utv")
>> >  	(unspec:VSTRUCT_2QD
>> >  		[(match_operand:VSTRUCT_2QD 1 "register_operand" "w")]
>> >  		UNSPEC_ST1))]
>> > @@ -7196,7 +7196,7 @@ (define_expand "aarch64_st1x3<vstruct_elt>"
>> >  })
>> >
>> >  (define_insn "aarch64_st1_x3_<vstruct_elt>"
>> > -  [(set (match_operand:VSTRUCT_3QD 0 "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VSTRUCT_3QD 0 "memory_operand" "=Utv")
>> >  	(unspec:VSTRUCT_3QD
>> >  		[(match_operand:VSTRUCT_3QD 1 "register_operand" "w")]
>> >  		UNSPEC_ST1))]
>> > @@ -7216,7 +7216,7 @@ (define_expand "aarch64_st1x4<vstruct_elt>"
>> >  })
>> >
>> >  (define_insn "aarch64_st1_x4_<vstruct_elt>"
>> > -  [(set (match_operand:VSTRUCT_4QD 0 "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VSTRUCT_4QD 0 "memory_operand" "=Utv")
>> >  	(unspec:VSTRUCT_4QD
>> >  		[(match_operand:VSTRUCT_4QD 1 "register_operand" "w")]
>> >  		UNSPEC_ST1))]
>> > @@ -7268,7 +7268,7 @@ (define_insn "*aarch64_movv8di"
>> >  (define_insn "aarch64_be_ld1<mode>"
>> >    [(set (match_operand:VALLDI_F16 0	"register_operand" "=w")
>> >  	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1
>> > -			     "aarch64_simd_struct_operand" "Utv")]
>> > +			     "memory_operand" "Utv")]
>> >  	UNSPEC_LD1))]
>> >    "TARGET_SIMD"
>> >    "ld1\\t{%0<Vmtype>}, %1"
>> > @@ -7276,7 +7276,7 @@ (define_insn "aarch64_be_ld1<mode>"
>> >  )
>> >
>> >  (define_insn "aarch64_be_st1<mode>"
>> > -  [(set (match_operand:VALLDI_F16 0 "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VALLDI_F16 0 "memory_operand" "=Utv")
>> >  	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1
>> "register_operand" "w")]
>> >  	UNSPEC_ST1))]
>> >    "TARGET_SIMD"
>> > @@ -7551,7 +7551,7 @@ (define_expand
>> "aarch64_ld<nregs>r<vstruct_elt>"
>> >  (define_insn "aarch64_ld2<vstruct_elt>_dreg"
>> >    [(set (match_operand:VSTRUCT_2DNX 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_2DNX [
>> > -	  (match_operand:VSTRUCT_2DNX 1
>> "aarch64_simd_struct_operand" "Utv")]
>> > +	  (match_operand:VSTRUCT_2DNX 1 "memory_operand" "Utv")]
>> >  	  UNSPEC_LD2_DREG))]
>> >    "TARGET_SIMD"
>> >    "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
>> > @@ -7561,7 +7561,7 @@ (define_insn "aarch64_ld2<vstruct_elt>_dreg"
>> >  (define_insn "aarch64_ld2<vstruct_elt>_dreg"
>> >    [(set (match_operand:VSTRUCT_2DX 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_2DX [
>> > -	  (match_operand:VSTRUCT_2DX 1 "aarch64_simd_struct_operand"
>> "Utv")]
>> > +	  (match_operand:VSTRUCT_2DX 1 "memory_operand" "Utv")]
>> >  	  UNSPEC_LD2_DREG))]
>> >    "TARGET_SIMD"
>> >    "ld1\\t{%S0.1d - %T0.1d}, %1"
>> > @@ -7571,7 +7571,7 @@ (define_insn "aarch64_ld2<vstruct_elt>_dreg"
>> >  (define_insn "aarch64_ld3<vstruct_elt>_dreg"
>> >    [(set (match_operand:VSTRUCT_3DNX 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_3DNX [
>> > -	  (match_operand:VSTRUCT_3DNX 1
>> "aarch64_simd_struct_operand" "Utv")]
>> > +	  (match_operand:VSTRUCT_3DNX 1 "memory_operand" "Utv")]
>> >  	  UNSPEC_LD3_DREG))]
>> >    "TARGET_SIMD"
>> >    "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
>> > @@ -7581,7 +7581,7 @@ (define_insn "aarch64_ld3<vstruct_elt>_dreg"
>> >  (define_insn "aarch64_ld3<vstruct_elt>_dreg"
>> >    [(set (match_operand:VSTRUCT_3DX 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_3DX [
>> > -	  (match_operand:VSTRUCT_3DX 1 "aarch64_simd_struct_operand"
>> "Utv")]
>> > +	  (match_operand:VSTRUCT_3DX 1 "memory_operand" "Utv")]
>> >  	  UNSPEC_LD3_DREG))]
>> >    "TARGET_SIMD"
>> >    "ld1\\t{%S0.1d - %U0.1d}, %1"
>> > @@ -7591,7 +7591,7 @@ (define_insn "aarch64_ld3<vstruct_elt>_dreg"
>> >  (define_insn "aarch64_ld4<vstruct_elt>_dreg"
>> >    [(set (match_operand:VSTRUCT_4DNX 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_4DNX [
>> > -	  (match_operand:VSTRUCT_4DNX 1
>> "aarch64_simd_struct_operand" "Utv")]
>> > +	  (match_operand:VSTRUCT_4DNX 1 "memory_operand" "Utv")]
>> >  	  UNSPEC_LD4_DREG))]
>> >    "TARGET_SIMD"
>> >    "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
>> > @@ -7601,7 +7601,7 @@ (define_insn "aarch64_ld4<vstruct_elt>_dreg"
>> >  (define_insn "aarch64_ld4<vstruct_elt>_dreg"
>> >    [(set (match_operand:VSTRUCT_4DX 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_4DX [
>> > -	  (match_operand:VSTRUCT_4DX 1 "aarch64_simd_struct_operand"
>> "Utv")]
>> > +	  (match_operand:VSTRUCT_4DX 1 "memory_operand" "Utv")]
>> >  	  UNSPEC_LD4_DREG))]
>> >    "TARGET_SIMD"
>> >    "ld1\\t{%S0.1d - %V0.1d}, %1"
>> > @@ -7841,7 +7841,7 @@ (define_insn
>> "aarch64_rev<REVERSE:rev_op><mode>"
>> >  )
>> >
>> >  (define_insn "aarch64_st2<vstruct_elt>_dreg"
>> > -  [(set (match_operand:VSTRUCT_2DNX 0
>> "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VSTRUCT_2DNX 0 "memory_operand" "=Utv")
>> >  	(unspec:VSTRUCT_2DNX [
>> >  		(match_operand:VSTRUCT_2DNX 1 "register_operand" "w")]
>> >  		UNSPEC_ST2))]
>> > @@ -7851,7 +7851,7 @@ (define_insn "aarch64_st2<vstruct_elt>_dreg"
>> >  )
>> >
>> >  (define_insn "aarch64_st2<vstruct_elt>_dreg"
>> > -  [(set (match_operand:VSTRUCT_2DX 0 "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VSTRUCT_2DX 0 "memory_operand" "=Utv")
>> >  	(unspec:VSTRUCT_2DX [
>> >  		(match_operand:VSTRUCT_2DX 1 "register_operand" "w")]
>> >  		UNSPEC_ST2))]
>> > @@ -7861,7 +7861,7 @@ (define_insn "aarch64_st2<vstruct_elt>_dreg"
>> >  )
>> >
>> >  (define_insn "aarch64_st3<vstruct_elt>_dreg"
>> > -  [(set (match_operand:VSTRUCT_3DNX 0
>> "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VSTRUCT_3DNX 0 "memory_operand" "=Utv")
>> >  	(unspec:VSTRUCT_3DNX [
>> >  		(match_operand:VSTRUCT_3DNX 1 "register_operand" "w")]
>> >  		UNSPEC_ST3))]
>> > @@ -7871,7 +7871,7 @@ (define_insn "aarch64_st3<vstruct_elt>_dreg"
>> >  )
>> >
>> >  (define_insn "aarch64_st3<vstruct_elt>_dreg"
>> > -  [(set (match_operand:VSTRUCT_3DX 0 "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VSTRUCT_3DX 0 "memory_operand" "=Utv")
>> >  	(unspec:VSTRUCT_3DX [
>> >  		(match_operand:VSTRUCT_3DX 1 "register_operand" "w")]
>> >  		UNSPEC_ST3))]
>> > @@ -7881,7 +7881,7 @@ (define_insn "aarch64_st3<vstruct_elt>_dreg"
>> >  )
>> >
>> >  (define_insn "aarch64_st4<vstruct_elt>_dreg"
>> > -  [(set (match_operand:VSTRUCT_4DNX 0
>> "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VSTRUCT_4DNX 0 "memory_operand" "=Utv")
>> >  	(unspec:VSTRUCT_4DNX [
>> >  		(match_operand:VSTRUCT_4DNX 1 "register_operand" "w")]
>> >  		UNSPEC_ST4))]
>> > @@ -7891,7 +7891,7 @@ (define_insn "aarch64_st4<vstruct_elt>_dreg"
>> >  )
>> >
>> >  (define_insn "aarch64_st4<vstruct_elt>_dreg"
>> > -  [(set (match_operand:VSTRUCT_4DX 0 "aarch64_simd_struct_operand"
>> > "=Utv")
>> > +  [(set (match_operand:VSTRUCT_4DX 0 "memory_operand" "=Utv")
>> >  	(unspec:VSTRUCT_4DX [
>> >  		(match_operand:VSTRUCT_4DX 1 "register_operand" "w")]
>> >  		UNSPEC_ST4))]
>> > @@ -7974,7 +7974,7 @@ (define_expand "vec_init<mode><Vhalf>"
>> >  (define_insn "*aarch64_simd_ld1r<mode>"
>> >    [(set (match_operand:VALL_F16 0 "register_operand" "=w")
>> >  	(vec_duplicate:VALL_F16
>> > -	  (match_operand:<VEL> 1 "aarch64_simd_struct_operand" "Utv")))]
>> > +	  (match_operand:<VEL> 1 "memory_operand" "Utv")))]
>> >    "TARGET_SIMD"
>> >    "ld1r\\t{%0.<Vtype>}, %1"
>> >    [(set_attr "type" "neon_load1_all_lanes")]
>> > @@ -7983,7 +7983,7 @@ (define_insn "*aarch64_simd_ld1r<mode>"
>> >  (define_insn "aarch64_simd_ld1<vstruct_elt>_x2"
>> >    [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
>> >  	(unspec:VSTRUCT_2QD [
>> > -	    (match_operand:VSTRUCT_2QD 1 "aarch64_simd_struct_operand"
>> "Utv")]
>> > +	    (match_operand:VSTRUCT_2QD 1 "memory_operand" "Utv")]
>> >  	    UNSPEC_LD1))]
>> >    "TARGET_SIMD"
>> >    "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
>> > diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
>> > index c308015ac2c13d24cd6bcec71247ec45df8cf5e6..6b70a364530c8108457091bfec12fe549f722149 100644
>> > --- a/gcc/config/aarch64/predicates.md
>> > +++ b/gcc/config/aarch64/predicates.md
>> > @@ -494,10 +494,6 @@ (define_predicate
>> "aarch64_simd_reg_or_minus_one"
>> >    (ior (match_operand 0 "register_operand")
>> >         (match_operand 0 "aarch64_simd_imm_minus_one")))
>> >
>> > -(define_predicate "aarch64_simd_struct_operand"
>> > -  (and (match_code "mem")
>> > -       (match_test "TARGET_SIMD && aarch64_simd_mem_operand_p
>> (op)")))
>> > -
>> >  ;; Like general_operand but allow only valid SIMD addressing modes.
>> >  (define_predicate "aarch64_simd_general_operand"
>> >    (and (match_operand 0 "general_operand")
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/vld1r.c b/gcc/testsuite/gcc.target/aarch64/vld1r.c
>> > new file mode 100644
>> > index 0000000000000000000000000000000000000000..72c505c403e9e239771379b7cadd8a9473f06113
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/vld1r.c
>> > @@ -0,0 +1,26 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-additional-options "-O" } */
>> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
>> > +
>> > +#include <arm_neon.h>
>> > +
>> > +/*
>> > +** f1:
>> > +** 	add	x0, x0, 1
>> > +** 	ld1r	{v0.8b}, \[x0\]
>> > +** 	ret
>> > +*/
>> > +uint8x8_t f1(const uint8_t *in) {
>> > +    return vld1_dup_u8(&in[1]);
>> > +}
>> > +
>> > +/*
>> > +** f2:
>> > +** 	ldr	s1, \[x0, 4\]
>> > +** 	fmla	v0.4s, v0.4s, v1.s\[0\]
>> > +** 	ret
>> > +*/
>> > +float32x4_t f2(const float32_t *in, float32x4_t a) {
>> > +    float32x4_t dup = vld1q_dup_f32(&in[1]);
>> > +    return vfmaq_laneq_f32 (a, a, dup, 1);
>> > +}
  
Tamar Christina June 9, 2022, 7:42 a.m. UTC | #4
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Wednesday, June 8, 2022 3:36 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; rguenther@suse.de; roger@eyesopen.com
> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
> instructions
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Wednesday, June 8, 2022 11:31 AM
> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> >> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
> >> instructions
> >>
> >> Tamar Christina <tamar.christina@arm.com> writes:
> >> > Hi All,
> >> >
> >> > At some point in time we started lowering the ld1r instructions in gimple.
> >> >
> >> > That is:
> >> >
> >> > uint8x8_t f1(const uint8_t *in) {
> >> >     return vld1_dup_u8(&in[1]);
> >> > }
> >> >
> >> > generates at gimple:
> >> >
> >> >   _3 = MEM[(const uint8_t *)in_1(D) + 1B];
> >> >   _4 = {_3, _3, _3, _3, _3, _3, _3, _3};
> >> >
> >> > Which is good, but we then generate:
> >> >
> >> > f1:
> >> > 	ldr     b0, [x0, 1]
> >> > 	dup     v0.8b, v0.b[0]
> >> > 	ret
> >> >
> >> > instead of ld1r.
> >> >
> >> > The reason for this is because the load instructions have a too
> >> > restrictive predicate on them which causes combine not to be able
> >> > to combine the instructions due to the predicate only accepting
> >> > simple addressing modes.
> >> >
> >> > This patch relaxes the predicate to accept any memory operand and
> >> > relies on LRA to legitimize the address when it needs to as the
> >> > constraint still only allows the simple addressing mode.  Reload is
> >> > always able to legitimize to these.
> >> >
> >> > Secondly since we are now actually generating more ld1r it became
> >> > clear that the lane instructions suffer from a similar issue.
> >> >
> >> > i.e.
> >> >
> >> > float32x4_t f2(const float32_t *in, float32x4_t a) {
> >> >     float32x4_t dup = vld1q_dup_f32(&in[1]);
> >> >     return vfmaq_laneq_f32 (a, a, dup, 1);
> >> > }
> >> >
> >> > would generate ld1r + vector fmla instead of ldr + lane fmla.
> >> >
> >> > The reason for this is similar to the ld1r issue.  The predicate is
> >> > too restrictive in only accepting register operands but not memory.
> >> >
> >> > This relaxes it to accept register and/or memory while leaving the
> >> > constraint to only accept registers.  This will have LRA generate a
> >> > reload if needed forcing the memory to registers using the standard
> >> > patterns.
> >> >
> >> > These two changes allow combine and reload to generate the right
> >> > sequences.
> >> >
> >> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >>
> >> This is going against the general direction of travel, which is to
> >> make the instruction's predicates and conditions enforce the
> >> constraints as much as possible (making optimistic assumptions about
> >> pseudo registers).
> >>
> >> The RA *can* deal with things like:
> >>
> >>   (match_operand:M N "general_operand" "r")
> >>
> >> but it's best avoided, for a few reasons:
> >>
> >> (1) The fix-up will be done in LRA, so IRA will not see the temporary
> >>     registers.  This can make the allocation of those temporaries
> >>     suboptimal but (more importantly) it might require other
> >>     previously-allocated registers to be spilled late due to the
> >>     unexpected increase in register pressure.
> >>
> >> (2) It ends up hiding instructions from the pre-RA optimisers.
> >>
> >> (3) It can also prevent combine opportunities (as well as create them),
> >>     unless the loose predicates in an insn I are propagated to all
> >>     patterns that might result from combining I with something else.
> >>
> >> It sounds like the first problem (not generating ld1r) could be fixed
> >> by (a) combining aarch64_simd_dup<mode> and *aarch64_simd_ld1r<mode>,
> >> so that the register and memory alternatives are in the same pattern
> >> and (b) using the merged instruction(s) to implement the vec_duplicate
> >> optab.
> >> Target-independent code should then make the address satisfy the
> >> predicate, simplifying the address where necessary.
> >>
> >
> > I think I am likely missing something here. I would assume that you
> > wanted to use the optab to split the addressing off from the mem
> > expression so the combined insn matches.
> >
> > But in that case, why do you need to combine the two instructions?
> > I've tried and it doesn't work since the vec_duplicate optab doesn't
> > see the mem as op1, because in gimple the mem is not part of the
> > duplicate.
> >
> > So you still just see:
> >
> >>>> dbgrtx (ops[1].value)
> > (subreg/s/v:QI (reg:SI 92 [ _3 ]) 0)
> >
> > As the operand as the argument to the dup is just an SSA_NAME.
> 
> Ah, yeah, I'd forgotten that fixed-length vec_duplicates would come from a
> constructor rather than a vec_duplicate_expr, so we don't get the usual
> benefit of folding single-use mems during expand.
> 
> https://gcc.gnu.org/pipermail/gcc-patches/2022-May/595362.html
> moves towards using vec_duplicate even for fixed-length vectors.
> If we take that approach, then I suppose a plain constructor should be folded
> to a vec_duplicate where possible.
> 
> (Alternatively, we could use an extended vec_perm_expr with scalar inputs,
> as Richi suggested in that thread.)
> 
> If we don't do that, or don't do it yet, then…
> 
> > If not and you wanted the combined insn to accept
> >
> > (set (reg:SI 92 [ _3 ])
> >     (zero_extend:SI (mem:QI (plus:DI (reg:DI 97)
> >                 (const_int 1 [0x1])) [0 MEM[(const uint8_tD.4561
> > *)in_1(D) + 1B]+0 S1 A8])))
> >
> > Then that's also not possible without relaxing the combined
> > predicates.  As far as I can tell, if I'm not allowed to use LRA for
> > this, then the only thing that could work is an early split?
> >
> > Or do I have to modify store_constructor to try a variant where it
> > tries pushing in the Decl of an SSA_NAME first?
> 
> …yeah, something like this would be needed.  But the vec_duplicate_expr/
> vec_perm_expr thing seems better, even if we only introduce it during isel.
> 
> Not my call either way though :-)  Let's see what Richi (cc:ed) thinks.

FWIW, since my inner "would Richard like this patch" detector still needs
tuning :-) I did a quick experiment.  Teaching gimple_build_vector_from_val
to allow the non-constant case and then teaching simplify_vector_constructor
to use it for the non-constant case gets the vec_duplicates generated.
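
As a sketch, that fold rewrites the gimple from the f1 example above into an
explicit duplicate, roughly (the exact dump syntax here is from memory and
may differ):

  _3 = MEM[(const uint8_t *)in_1(D) + 1B];
  _4 = VEC_DUPLICATE_EXPR <_3>;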

Then I had to teach aarch64_expand_vector_init to generate vec_duplicate_expr
when the value is non-constant, which works.
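
That relies on the dup pattern having a memory alternative, i.e. the merged
dup/ld1r pattern Richard suggested in (a).  A rough sketch of what that could
look like (the iterator, constraints and attributes here are my guesses, not
the final form):

(define_insn "aarch64_simd_dup<mode>"
  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w")
	(vec_duplicate:VALL_F16
	  (match_operand:<VEL> 1 "nonimmediate_operand" "w,Utv")))]
  "TARGET_SIMD"
  "@
   dup\\t%0.<Vtype>, %1.<Vetype>[0]
   ld1r\\t{%0.<Vtype>}, %1"
  [(set_attr "type" "neon_dup<q>,neon_load1_all_lanes")]
)

With something like that in place, expansion can leave a single-use load as
the dup operand and let the pattern itself emit ld1r.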

I thought about skipping vec_init entirely in this case during expansion;
however, there doesn't seem to be a way to test for vec_duplicate_expr
since, as Richi mentioned, it doesn't seem to have an associated optab.

This approach does fix the problem, but I'll hold off on cleaning it up
until I hear whether it's acceptable.

Cheers,
Tamar

> 
> Thanks,
> Richard
> 
> > I guess this also only really works for ld1r; whenever we lower ld2(r)
> > etc. we'll have the same issue again... But I suppose that's for the
> > next person :-)
> >
> > Thanks,
> > Tamar
> >
> >> I'm not sure whether fixing the ld1r problem that way will avoid the
> >> vfmaq_laneq_f32 problem; let me know if not.
> >>
> >> Thanks,
> >> Richard
> >> >  		     (match_operand:SI 2 "immediate_operand" "i")]
> >> >  		     UNSPEC_ST4_LANE))]
> >> > @@ -7047,7 +7047,7 @@ (define_insn
> >> "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
> >> >  )
> >> >
> >> >  (define_expand "vec_store_lanes<mode><vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_4Q 0
> "aarch64_simd_struct_operand")
> >> > +  [(set (match_operand:VSTRUCT_4Q 0 "memory_operand")
> >> >  	(unspec:VSTRUCT_4Q [(match_operand:VSTRUCT_4Q 1
> >> "register_operand")]
> >> >                     UNSPEC_ST4))]
> >> >    "TARGET_SIMD"
> >> > @@ -7138,7 +7138,7 @@ (define_expand "aarch64_ld1x3<vstruct_elt>"
> >> >  (define_insn "aarch64_ld1_x3_<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
> >> >          (unspec:VSTRUCT_3QD
> >> > -	  [(match_operand:VSTRUCT_3QD 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  [(match_operand:VSTRUCT_3QD 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD1))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
> >> > @@ -7158,7 +7158,7 @@ (define_expand "aarch64_ld1x4<vstruct_elt>"
> >> >  (define_insn "aarch64_ld1_x4_<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_4QD
> >> > -	  [(match_operand:VSTRUCT_4QD 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  [(match_operand:VSTRUCT_4QD 1 "memory_operand" "Utv")]
> >> >  	UNSPEC_LD1))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
> >> > @@ -7176,7 +7176,7 @@ (define_expand "aarch64_st1x2<vstruct_elt>"
> >> >  })
> >> >
> >> >  (define_insn "aarch64_st1_x2_<vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_2QD 0
> "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_2QD 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_2QD
> >> >  		[(match_operand:VSTRUCT_2QD 1 "register_operand" "w")]
> >> >  		UNSPEC_ST1))]
> >> > @@ -7196,7 +7196,7 @@ (define_expand "aarch64_st1x3<vstruct_elt>"
> >> >  })
> >> >
> >> >  (define_insn "aarch64_st1_x3_<vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_3QD 0
> "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_3QD 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_3QD
> >> >  		[(match_operand:VSTRUCT_3QD 1 "register_operand" "w")]
> >> >  		UNSPEC_ST1))]
> >> > @@ -7216,7 +7216,7 @@ (define_expand "aarch64_st1x4<vstruct_elt>"
> >> >  })
> >> >
> >> >  (define_insn "aarch64_st1_x4_<vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_4QD 0
> "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_4QD 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_4QD
> >> >  		[(match_operand:VSTRUCT_4QD 1 "register_operand" "w")]
> >> >  		UNSPEC_ST1))]
> >> > @@ -7268,7 +7268,7 @@ (define_insn "*aarch64_movv8di"
> >> >  (define_insn "aarch64_be_ld1<mode>"
> >> >    [(set (match_operand:VALLDI_F16 0	"register_operand" "=w")
> >> >  	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1
> >> > -			     "aarch64_simd_struct_operand" "Utv")]
> >> > +			     "memory_operand" "Utv")]
> >> >  	UNSPEC_LD1))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%0<Vmtype>}, %1"
> >> > @@ -7276,7 +7276,7 @@ (define_insn "aarch64_be_ld1<mode>"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_be_st1<mode>"
> >> > -  [(set (match_operand:VALLDI_F16 0 "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VALLDI_F16 0 "memory_operand" "=Utv")
> >> >  	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1
> >> "register_operand" "w")]
> >> >  	UNSPEC_ST1))]
> >> >    "TARGET_SIMD"
> >> > @@ -7551,7 +7551,7 @@ (define_expand
> >> "aarch64_ld<nregs>r<vstruct_elt>"
> >> >  (define_insn "aarch64_ld2<vstruct_elt>_dreg"
> >> >    [(set (match_operand:VSTRUCT_2DNX 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_2DNX [
> >> > -	  (match_operand:VSTRUCT_2DNX 1
> >> "aarch64_simd_struct_operand" "Utv")]
> >> > +	  (match_operand:VSTRUCT_2DNX 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD2_DREG))]
> >> >    "TARGET_SIMD"
> >> >    "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
> >> > @@ -7561,7 +7561,7 @@ (define_insn "aarch64_ld2<vstruct_elt>_dreg"
> >> >  (define_insn "aarch64_ld2<vstruct_elt>_dreg"
> >> >    [(set (match_operand:VSTRUCT_2DX 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_2DX [
> >> > -	  (match_operand:VSTRUCT_2DX 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  (match_operand:VSTRUCT_2DX 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD2_DREG))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%S0.1d - %T0.1d}, %1"
> >> > @@ -7571,7 +7571,7 @@ (define_insn "aarch64_ld2<vstruct_elt>_dreg"
> >> >  (define_insn "aarch64_ld3<vstruct_elt>_dreg"
> >> >    [(set (match_operand:VSTRUCT_3DNX 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_3DNX [
> >> > -	  (match_operand:VSTRUCT_3DNX 1
> >> "aarch64_simd_struct_operand" "Utv")]
> >> > +	  (match_operand:VSTRUCT_3DNX 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD3_DREG))]
> >> >    "TARGET_SIMD"
> >> >    "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
> >> > @@ -7581,7 +7581,7 @@ (define_insn "aarch64_ld3<vstruct_elt>_dreg"
> >> >  (define_insn "aarch64_ld3<vstruct_elt>_dreg"
> >> >    [(set (match_operand:VSTRUCT_3DX 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_3DX [
> >> > -	  (match_operand:VSTRUCT_3DX 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  (match_operand:VSTRUCT_3DX 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD3_DREG))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%S0.1d - %U0.1d}, %1"
> >> > @@ -7591,7 +7591,7 @@ (define_insn "aarch64_ld3<vstruct_elt>_dreg"
> >> >  (define_insn "aarch64_ld4<vstruct_elt>_dreg"
> >> >    [(set (match_operand:VSTRUCT_4DNX 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_4DNX [
> >> > -	  (match_operand:VSTRUCT_4DNX 1
> >> "aarch64_simd_struct_operand" "Utv")]
> >> > +	  (match_operand:VSTRUCT_4DNX 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD4_DREG))]
> >> >    "TARGET_SIMD"
> >> >    "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
> >> > @@ -7601,7 +7601,7 @@ (define_insn "aarch64_ld4<vstruct_elt>_dreg"
> >> >  (define_insn "aarch64_ld4<vstruct_elt>_dreg"
> >> >    [(set (match_operand:VSTRUCT_4DX 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_4DX [
> >> > -	  (match_operand:VSTRUCT_4DX 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  (match_operand:VSTRUCT_4DX 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD4_DREG))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%S0.1d - %V0.1d}, %1"
> >> > @@ -7841,7 +7841,7 @@ (define_insn
> >> "aarch64_rev<REVERSE:rev_op><mode>"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_st2<vstruct_elt>_dreg"
> >> > -  [(set (match_operand:VSTRUCT_2DNX 0
> >> "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_2DNX 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_2DNX [
> >> >  		(match_operand:VSTRUCT_2DNX 1 "register_operand" "w")]
> >> >  		UNSPEC_ST2))]
> >> > @@ -7851,7 +7851,7 @@ (define_insn "aarch64_st2<vstruct_elt>_dreg"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_st2<vstruct_elt>_dreg"
> >> > -  [(set (match_operand:VSTRUCT_2DX 0
> "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_2DX 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_2DX [
> >> >  		(match_operand:VSTRUCT_2DX 1 "register_operand" "w")]
> >> >  		UNSPEC_ST2))]
> >> > @@ -7861,7 +7861,7 @@ (define_insn "aarch64_st2<vstruct_elt>_dreg"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_st3<vstruct_elt>_dreg"
> >> > -  [(set (match_operand:VSTRUCT_3DNX 0
> >> "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_3DNX 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_3DNX [
> >> >  		(match_operand:VSTRUCT_3DNX 1 "register_operand" "w")]
> >> >  		UNSPEC_ST3))]
> >> > @@ -7871,7 +7871,7 @@ (define_insn "aarch64_st3<vstruct_elt>_dreg"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_st3<vstruct_elt>_dreg"
> >> > -  [(set (match_operand:VSTRUCT_3DX 0
> "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_3DX 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_3DX [
> >> >  		(match_operand:VSTRUCT_3DX 1 "register_operand" "w")]
> >> >  		UNSPEC_ST3))]
> >> > @@ -7881,7 +7881,7 @@ (define_insn "aarch64_st3<vstruct_elt>_dreg"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_st4<vstruct_elt>_dreg"
> >> > -  [(set (match_operand:VSTRUCT_4DNX 0
> >> "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_4DNX 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_4DNX [
> >> >  		(match_operand:VSTRUCT_4DNX 1 "register_operand" "w")]
> >> >  		UNSPEC_ST4))]
> >> > @@ -7891,7 +7891,7 @@ (define_insn "aarch64_st4<vstruct_elt>_dreg"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_st4<vstruct_elt>_dreg"
> >> > -  [(set (match_operand:VSTRUCT_4DX 0
> "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_4DX 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_4DX [
> >> >  		(match_operand:VSTRUCT_4DX 1 "register_operand" "w")]
> >> >  		UNSPEC_ST4))]
> >> > @@ -7974,7 +7974,7 @@ (define_expand "vec_init<mode><Vhalf>"
> >> >  (define_insn "*aarch64_simd_ld1r<mode>"
> >> >    [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> >> >  	(vec_duplicate:VALL_F16
> >> > -	  (match_operand:<VEL> 1 "aarch64_simd_struct_operand" "Utv")))]
> >> > +	  (match_operand:<VEL> 1 "memory_operand" "Utv")))]
> >> >    "TARGET_SIMD"
> >> >    "ld1r\\t{%0.<Vtype>}, %1"
> >> >    [(set_attr "type" "neon_load1_all_lanes")]
> >> > @@ -7983,7 +7983,7 @@ (define_insn "*aarch64_simd_ld1r<mode>"
> >> >  (define_insn "aarch64_simd_ld1<vstruct_elt>_x2"
> >> >    [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_2QD [
> >> > -	    (match_operand:VSTRUCT_2QD 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	    (match_operand:VSTRUCT_2QD 1 "memory_operand" "Utv")]
> >> >  	    UNSPEC_LD1))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
> >> > diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> >> > index c308015ac2c13d24cd6bcec71247ec45df8cf5e6..6b70a364530c8108457091bfec12fe549f722149 100644
> >> > --- a/gcc/config/aarch64/predicates.md
> >> > +++ b/gcc/config/aarch64/predicates.md
> >> > @@ -494,10 +494,6 @@ (define_predicate
> >> "aarch64_simd_reg_or_minus_one"
> >> >    (ior (match_operand 0 "register_operand")
> >> >         (match_operand 0 "aarch64_simd_imm_minus_one")))
> >> >
> >> > -(define_predicate "aarch64_simd_struct_operand"
> >> > -  (and (match_code "mem")
> >> > -       (match_test "TARGET_SIMD && aarch64_simd_mem_operand_p
> >> (op)")))
> >> > -
> >> >  ;; Like general_operand but allow only valid SIMD addressing modes.
> >> >  (define_predicate "aarch64_simd_general_operand"
> >> >    (and (match_operand 0 "general_operand")
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/vld1r.c b/gcc/testsuite/gcc.target/aarch64/vld1r.c
> >> > new file mode 100644
> >> > index 0000000000000000000000000000000000000000..72c505c403e9e239771379b7cadd8a9473f06113
> >> > --- /dev/null
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/vld1r.c
> >> > @@ -0,0 +1,26 @@
> >> > +/* { dg-do compile } */
> >> > +/* { dg-additional-options "-O" } */
> >> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> >> > +
> >> > +#include <arm_neon.h>
> >> > +
> >> > +/*
> >> > +** f1:
> >> > +** 	add	x0, x0, 1
> >> > +** 	ld1r	{v0.8b}, \[x0\]
> >> > +** 	ret
> >> > +*/
> >> > +uint8x8_t f1(const uint8_t *in) {
> >> > +    return vld1_dup_u8(&in[1]);
> >> > +}
> >> > +
> >> > +/*
> >> > +** f2:
> >> > +** 	ldr	s1, \[x0, 4\]
> >> > +** 	fmla	v0.4s, v0.4s, v1.s\[0\]
> >> > +** 	ret
> >> > +*/
> >> > +float32x4_t f2(const float32_t *in, float32x4_t a) {
> >> > +    float32x4_t dup = vld1q_dup_f32(&in[1]);
> >> > +    return vfmaq_laneq_f32 (a, a, dup, 1);
> >> > +}
  
Richard Sandiford June 9, 2022, 8:22 a.m. UTC | #5
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Wednesday, June 8, 2022 3:36 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> <Kyrylo.Tkachov@arm.com>; rguenther@suse.de; roger@eyesopen.com
>> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
>> instructions
>> 
>> Tamar Christina <Tamar.Christina@arm.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford <richard.sandiford@arm.com>
>> >> Sent: Wednesday, June 8, 2022 11:31 AM
>> >> To: Tamar Christina <Tamar.Christina@arm.com>
>> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> <Kyrylo.Tkachov@arm.com>
>> >> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
>> >> instructions
>> >>
>> >> Tamar Christina <tamar.christina@arm.com> writes:
>> >> > Hi All,
>> >> >
>> >> > At some point in time we started lowering the ld1r instructions in
>> gimple.
>> >> >
>> >> > That is:
>> >> >
>> >> > uint8x8_t f1(const uint8_t *in) {
>> >> >     return vld1_dup_u8(&in[1]);
>> >> > }
>> >> >
>> >> > generates at gimple:
>> >> >
>> >> >   _3 = MEM[(const uint8_t *)in_1(D) + 1B];
>> >> >   _4 = {_3, _3, _3, _3, _3, _3, _3, _3};
>> >> >
>> >> > Which is good, but we then generate:
>> >> >
>> >> > f1:
>> >> > 	ldr     b0, [x0, 1]
>> >> > 	dup     v0.8b, v0.b[0]
>> >> > 	ret
>> >> >
>> >> > instead of ld1r.
>> >> >
>> >> > The reason for this is because the load instructions have a too
>> >> > restrictive predicate on them which causes combine not to be able
>> >> > to combine the instructions due to the predicate only accepting
>> >> > simple
>> >> addressing modes.
>> >> >
>> >> > This patch relaxes the predicate to accept any memory operand and
>> >> > relies on LRA to legitimize the address when it needs to as the
>> >> > constraint still only allows the simple addressing mode.  Reload is
>> >> > always able to legitimize to these.
>> >> >
>> >> > Secondly since we are now actually generating more ld1r it became
>> >> > clear that the lane instructions suffer from a similar issue.
>> >> >
>> >> > i.e.
>> >> >
>> >> > float32x4_t f2(const float32_t *in, float32x4_t a) {
>> >> >     float32x4_t dup = vld1q_dup_f32(&in[1]);
>> >> >     return vfmaq_laneq_f32 (a, a, dup, 1); }
>> >> >
>> >> > would generate ld1r + vector fmla instead of ldr + lane fmla.
>> >> >
>> >> > The reason for this is similar to the ld1r issue.  The predicate is
>> >> > too restrictive in only accepting register operands but not memory.
>> >> >
>> >> > This relaxes it to accept register and/or memory while leaving the
>> >> > constraint to only accept registers.  This will have LRA generate a
>> >> > reload if needed forcing the memory to registers using the standard
>> >> patterns.
>> >> >
>> >> > These two changes allow combine and reload to generate the right
>> >> sequences.
>> >> >
>> >> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>> >>
>> >> This is going against the general direction of travel, which is to
>> >> make the instruction's predicates and conditions enforce the
>> >> constraints as much as possible (making optimistic assumptions about
>> >> pseudo registers).
>> >>
>> >> The RA *can* deal with things like:
>> >>
>> >>   (match_operand:M N "general_operand" "r")
>> >>
>> >> but it's best avoided, for a few reasons:
>> >>
>> >> (1) The fix-up will be done in LRA, so IRA will not see the temporary
>> >>     registers.  This can make the allocation of those temporaries
>> >>     suboptimal but (more importantly) it might require other
>> >>     previously-allocated registers to be spilled late due to the
>> >>     unexpected increase in register pressure.
>> >>
>> >> (2) It ends up hiding instructions from the pre-RA optimisers.
>> >>
>> >> (3) It can also prevent combine opportunities (as well as create them),
>> >>     unless the loose predicates in an insn I are propagated to all
>> >>     patterns that might result from combining I with something else.
>> >>
>> >> It sounds like the first problem (not generating ld1r) could be fixed
>> >> by (a) combining aarch64_simd_dup<mode> and
>> >> *aarch64_simd_ld1r<mode>,
>> >> so that the register and memory alternatives are in the same pattern
>> >> and (b) using the merged instruction(s) to implement the vec_duplicate
>> >> optab.
>> >> Target-independent code should then make the address satisfy the
>> >> predicate, simplifying the address where necessary.
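>> >>
>> >> For the record, a rough and completely untested sketch of (a) is below;
>> >> the pattern name, alternative order and attributes are made up for
>> >> illustration:
>> >>
>> >>   (define_insn "*aarch64_vec_duplicate<mode>"
>> >>     ;; Alternative 0 duplicates a scalar already in a SIMD register;
>> >>     ;; alternative 1 loads and duplicates directly from memory.
>> >>     [(set (match_operand:VALL_F16 0 "register_operand" "=w,w")
>> >>           (vec_duplicate:VALL_F16
>> >>             (match_operand:<VEL> 1 "nonimmediate_operand" "w,Utv")))]
>> >>     "TARGET_SIMD"
>> >>     "@
>> >>      dup\\t%0.<Vtype>, %1.<Vetype>[0]
>> >>      ld1r\\t{%0.<Vtype>}, %1"
>> >>     [(set_attr "type" "neon_dup<q>,neon_load1_all_lanes")]
>> >>   )
>> >>
>> >> With both alternatives in one insn, the choice between the register and
>> >> the memory form is left to the RA rather than to combine.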
>> >>
>> >
>> > I think I am likely missing something here. I would assume that you
>> > wanted to use the optab to split the addressing off from the mem
>> > expression so the combined insn matches.
>> >
>> > But in that case, why do you need to combine the two instructions?
>> > I've tried and it doesn't work since the vec_duplicate optab doesn't
>> > see the mem as op1, because in gimple the mem is not part of the
>> > duplicate.
>> >
>> > So you still just see:
>> >
>> > >>> dbgrtx (ops[1].value)
>> > (subreg/s/v:QI (reg:SI 92 [ _3 ]) 0)
>> >
>> > The operand that is the argument to the dup is just an SSA_NAME.
>> 
>> Ah, yeah, I'd forgotten that fixed-length vec_duplicates would come from a
>> constructor rather than a vec_duplicate_expr, so we don't get the usual
>> benefit of folding single-use mems during expand.
>> 
>> https://gcc.gnu.org/pipermail/gcc-patches/2022-May/595362.html
>> moves towards using vec_duplicate even for fixed-length vectors.
>> If we take that approach, then I suppose a plain constructor should be folded
>> to a vec_duplicate where possible.
>> 
>> (Alternatively, we could use an extended vec_perm_expr with scalar inputs,
>> as Richi suggested in that thread.)
>> 
>> If we don't do that, or don't do it yet, then…
>> 
>> > If not and you wanted the combined insn to accept
>> >
>> > (set (reg:SI 92 [ _3 ])
>> >     (zero_extend:SI (mem:QI (plus:DI (reg:DI 97)
>> >                 (const_int 1 [0x1])) [0 MEM[(const uint8_tD.4561
>> > *)in_1(D) + 1B]+0 S1 A8])))
>> >
>> > Then that's also not possible without relaxing the combined
>> > predicates.  As far as I can tell, if I'm not allowed to use LRA for this, then
>> > the only thing that could work is an early split?
>> >
>> > Or do I have to modify store_constructor to try a variant where it
>> > tries pushing in the Decl of an SSA_NAME first?
>> 
>> …yeah, something like this would be needed.  But the vec_duplicate_expr/
>> vec_perm_expr thing seems better, even if we only introduce it during isel.
>> 
>> Not my call either way though :-)  Let's see what Richi (cc:ed) thinks.
>
> FWIW, since my inner "Richards like patch" detector still needs tuning :-)
> I did a quick experiment.  Teaching gimple_build_vector_from_val to allow the
> non-constant case and then teaching simplify_vector_constructor to use it for the
> non-constant case gets them generated.
>
> Then I had to teach aarch64_expand_vector_init to generate a vec_duplicate_expr
> when the value is non-constant, and that works.
>
> I thought about skipping vec_init entirely in this case during expansion; however,
> there doesn't seem to be a way to test for vec_duplicate_expr since, as Richi
> mentioned, it doesn't seem to have an associated optab.

I don't understand, sorry.  The optab is vec_duplicate_optab (generated
via expand_vector_broadcast), and although we don't implement that for
Advanced SIMD yet, the point of the above was that we would.
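
A minimal sketch of that, untested and with the iterator choice just an
assumption for illustration, would be to expose the pattern under the
standard optab name:

  ;; Standard-name expander; vec_duplicate_optab maps to this, so
  ;; expand_vector_broadcast can use it directly.
  (define_expand "vec_duplicate<mode>"
    [(set (match_operand:VALL_F16 0 "register_operand")
          (vec_duplicate:VALL_F16
            (match_operand:<VEL> 1 "nonimmediate_operand")))]
    "TARGET_SIMD"
  )

The merged dup/ld1r insn would then provide the implementation.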

Thanks,
Richard

>
> This approach does fix the problem, but I'll hold off on cleaning it up till I hear
> it's acceptable.
>
> Cheers,
> Tamar
>
>> 
>> Thanks,
>> Richard
>> 
>> > I guess this also only really works for ld1r; whenever we lower ld2(r)
>> > etc. we'll have the same issue again... But I suppose that's for the
>> > next person :-)
>> >
>> > Thanks,
>> > Tamar
>> >
>> >> I'm not sure whether fixing the ld1r problem that way will avoid the
>> >> vfmaq_laneq_f32 problem; let me know if not.
>> >>
>> >> Thanks,
>> >> Richard
>> >>
>> >> > Ok for master?
>> >> >
>> >> > Thanks,
>> >> > Tamar
>> >> >
>> >> > gcc/ChangeLog:
>> >> >
>> >> > 	* config/aarch64/aarch64-simd.md (mul_lane<mode>3,
>> >> mul_laneq<mode>3,
>> >> > 	mul_n<mode>3, *aarch64_mul3_elt_to_64v2df,
>> >> *aarch64_mla_elt<mode>,
>> >> > 	*aarch64_mla_elt_<vswap_width_name><mode>,
>> >> aarch64_mla_n<mode>,
>> >> > 	*aarch64_mls_elt<mode>,
>> >> *aarch64_mls_elt_<vswap_width_name><mode>,
>> >> > 	aarch64_mls_n<mode>, *aarch64_fma4_elt<mode>,
>> >> > 	*aarch64_fma4_elt_<vswap_width_name><mode>,
>> >> > 	*aarch64_fma4_elt_from_dup<mode>,
>> >> *aarch64_fma4_elt_to_64v2df,
>> >> > 	*aarch64_fnma4_elt<mode>,
>> >> *aarch64_fnma4_elt_<vswap_width_name><mode>,
>> >> > 	*aarch64_fnma4_elt_from_dup<mode>,
>> >> *aarch64_fnma4_elt_to_64v2df,
>> >> > 	*aarch64_mulx_elt_<vswap_width_name><mode>,
>> >> > 	*aarch64_mulx_elt<mode>, *aarch64_mulx_elt_from_dup<mode>,
>> >> > 	*aarch64_vgetfmulx<mode>): Relax register_operand to
>> >> > 	nonimmediate_operand.
>> >> > 	(aarch64_simd_ld2<vstruct_elt>, aarch64_simd_ld2r<vstruct_elt>,
>> >> > 	aarch64_vec_load_lanes<mode>_lane<vstruct_elt>,
>> >> > 	vec_load_lanes<mode><vstruct_elt>,
>> >> aarch64_simd_st2<vstruct_elt>,
>> >> > 	aarch64_vec_store_lanes<mode>_lane<vstruct_elt>,
>> >> > 	vec_store_lanes<mode><vstruct_elt>,
>> >> aarch64_simd_ld3<vstruct_elt>,
>> >> > 	aarch64_simd_ld3r<vstruct_elt>,
>> >> > 	aarch64_vec_load_lanes<mode>_lane<vstruct_elt>,
>> >> > 	vec_load_lanes<mode><vstruct_elt>,
>> >> aarch64_simd_st3<vstruct_elt>,
>> >> > 	aarch64_vec_store_lanes<mode>_lane<vstruct_elt>,
>> >> > 	vec_store_lanes<mode><vstruct_elt>,
>> >> aarch64_simd_ld4<vstruct_elt>,
>> >> > 	aarch64_simd_ld4r<vstruct_elt>,
>> >> > 	aarch64_vec_load_lanes<mode>_lane<vstruct_elt>,
>> >> > 	vec_load_lanes<mode><vstruct_elt>,
>> >> aarch64_simd_st4<vstruct_elt>,
>> >> > 	aarch64_vec_store_lanes<mode>_lane<vstruct_elt>,
>> >> > 	vec_store_lanes<mode><vstruct_elt>,
>> >> aarch64_ld1_x3_<vstruct_elt>,
>> >> > 	aarch64_ld1_x4_<vstruct_elt>, aarch64_st1_x2_<vstruct_elt>,
>> >> > 	aarch64_st1_x3_<vstruct_elt>, aarch64_st1_x4_<vstruct_elt>,
>> >> > 	aarch64_be_ld1<mode>, aarch64_be_st1<mode>,
>> >> > 	aarch64_ld2<vstruct_elt>_dreg, aarch64_ld2<vstruct_elt>_dreg,
>> >> > 	aarch64_ld3<vstruct_elt>_dreg, aarch64_ld3<vstruct_elt>_dreg,
>> >> > 	aarch64_ld4<vstruct_elt>_dreg, aarch64_ld4<vstruct_elt>_dreg,
>> >> > 	aarch64_st2<vstruct_elt>_dreg, aarch64_st2<vstruct_elt>_dreg,
>> >> > 	aarch64_st3<vstruct_elt>_dreg, aarch64_st3<vstruct_elt>_dreg,
>> >> > 	aarch64_st4<vstruct_elt>_dreg, aarch64_st4<vstruct_elt>_dreg,
>> >> > 	*aarch64_simd_ld1r<mode>, aarch64_simd_ld1<vstruct_elt>_x2):
>> >> Relax
>> >> > 	aarch64_simd_struct_operand to memory_operand.
>> >> > 	* config/aarch64/predicates.md (aarch64_simd_struct_operand):
>> >> Remove.
>> >> >
>> >> > gcc/testsuite/ChangeLog:
>> >> >
>> >> > 	* gcc.target/aarch64/vld1r.c: New test.
>> >> >
>> >> > --- inline copy of patch --
>> >> > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> >> > index be5c70bbb7520ae93d19c4a432ce34863e5b9a64..24e3274ddda2ea76c83571fada8ff4c953b752a1 100644
>> >> > --- a/gcc/config/aarch64/aarch64-simd.md
>> >> > +++ b/gcc/config/aarch64/aarch64-simd.md
>> >> > @@ -712,7 +712,7 @@ (define_insn "mul_lane<mode>3"
>> >> >         (mult:VMULD
>> >> >  	 (vec_duplicate:VMULD
>> >> >  	   (vec_select:<VEL>
>> >> > -	     (match_operand:<VCOND> 2 "register_operand" "<h_con>")
>> >> > +	     (match_operand:<VCOND> 2 "nonimmediate_operand"
>> >> "<h_con>")
>> >> >  	     (parallel [(match_operand:SI 3 "immediate_operand" "i")])))
>> >> >  	 (match_operand:VMULD 1 "register_operand" "w")))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -728,7 +728,7 @@ (define_insn "mul_laneq<mode>3"
>> >> >       (mult:VMUL
>> >> >         (vec_duplicate:VMUL
>> >> >  	  (vec_select:<VEL>
>> >> > -	    (match_operand:<VCONQ> 2 "register_operand" "<h_con>")
>> >> > +	    (match_operand:<VCONQ> 2 "nonimmediate_operand"
>> >> "<h_con>")
>> >> >  	    (parallel [(match_operand:SI 3 "immediate_operand")])))
>> >> >        (match_operand:VMUL 1 "register_operand" "w")))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -743,7 +743,7 @@ (define_insn "mul_n<mode>3"
>> >> >   [(set (match_operand:VMUL 0 "register_operand" "=w")
>> >> >         (mult:VMUL
>> >> >  	 (vec_duplicate:VMUL
>> >> > -	   (match_operand:<VEL> 2 "register_operand" "<h_con>"))
>> >> > +	   (match_operand:<VEL> 2 "nonimmediate_operand" "<h_con>"))
>> >> >  	 (match_operand:VMUL 1 "register_operand" "w")))]
>> >> >    "TARGET_SIMD"
>> >> >    "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]";
>> >> > @@ -789,7 +789,7 @@ (define_insn "*aarch64_mul3_elt_to_64v2df"
>> >> >    [(set (match_operand:DF 0 "register_operand" "=w")
>> >> >       (mult:DF
>> >> >         (vec_select:DF
>> >> > -	 (match_operand:V2DF 1 "register_operand" "w")
>> >> > +	 (match_operand:V2DF 1 "nonimmediate_operand" "w")
>> >> >  	 (parallel [(match_operand:SI 2 "immediate_operand")]))
>> >> >         (match_operand:DF 3 "register_operand" "w")))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -1406,7 +1406,7 @@ (define_insn "*aarch64_mla_elt<mode>"
>> >> >  	 (mult:VDQHS
>> >> >  	   (vec_duplicate:VDQHS
>> >> >  	      (vec_select:<VEL>
>> >> > -		(match_operand:VDQHS 1 "register_operand" "<h_con>")
>> >> > +		(match_operand:VDQHS 1 "nonimmediate_operand"
>> >> "<h_con>")
>> >> >  		  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >> >  	   (match_operand:VDQHS 3 "register_operand" "w"))
>> >> >  	 (match_operand:VDQHS 4 "register_operand" "0")))]
>> >> > @@ -1424,7 +1424,7 @@ (define_insn "*aarch64_mla_elt_<vswap_width_name><mode>"
>> >> >  	 (mult:VDQHS
>> >> >  	   (vec_duplicate:VDQHS
>> >> >  	      (vec_select:<VEL>
>> >> > -		(match_operand:<VSWAP_WIDTH> 1 "register_operand"
>> >> "<h_con>")
>> >> > +		(match_operand:<VSWAP_WIDTH> 1
>> >> "nonimmediate_operand" "<h_con>")
>> >> >  		  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >> >  	   (match_operand:VDQHS 3 "register_operand" "w"))
>> >> >  	 (match_operand:VDQHS 4 "register_operand" "0")))]
>> >> > @@ -1441,7 +1441,7 @@ (define_insn "aarch64_mla_n<mode>"
>> >> >  	(plus:VDQHS
>> >> >  	  (mult:VDQHS
>> >> >  	    (vec_duplicate:VDQHS
>> >> > -	      (match_operand:<VEL> 3 "register_operand" "<h_con>"))
>> >> > +	      (match_operand:<VEL> 3 "nonimmediate_operand" "<h_con>"))
>> >> >  	    (match_operand:VDQHS 2 "register_operand" "w"))
>> >> >  	  (match_operand:VDQHS 1 "register_operand" "0")))]
>> >> >   "TARGET_SIMD"
>> >> > @@ -1466,7 +1466,7 @@ (define_insn "*aarch64_mls_elt<mode>"
>> >> >  	 (mult:VDQHS
>> >> >  	   (vec_duplicate:VDQHS
>> >> >  	      (vec_select:<VEL>
>> >> > -		(match_operand:VDQHS 1 "register_operand" "<h_con>")
>> >> > +		(match_operand:VDQHS 1 "nonimmediate_operand"
>> >> "<h_con>")
>> >> >  		  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >> >  	   (match_operand:VDQHS 3 "register_operand" "w"))))]
>> >> >   "TARGET_SIMD"
>> >> > @@ -1484,7 +1484,7 @@ (define_insn
>> >> "*aarch64_mls_elt_<vswap_width_name><mode>"
>> >> >  	 (mult:VDQHS
>> >> >  	   (vec_duplicate:VDQHS
>> >> >  	      (vec_select:<VEL>
>> >> > -		(match_operand:<VSWAP_WIDTH> 1 "register_operand"
>> >> "<h_con>")
>> >> > +		(match_operand:<VSWAP_WIDTH> 1
>> >> "nonimmediate_operand" "<h_con>")
>> >> >  		  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >> >  	   (match_operand:VDQHS 3 "register_operand" "w"))))]
>> >> >   "TARGET_SIMD"
>> >> > @@ -1501,7 +1501,7 @@ (define_insn "aarch64_mls_n<mode>"
>> >> >  	  (match_operand:VDQHS 1 "register_operand" "0")
>> >> >  	  (mult:VDQHS
>> >> >  	    (vec_duplicate:VDQHS
>> >> > -	      (match_operand:<VEL> 3 "register_operand" "<h_con>"))
>> >> > +	      (match_operand:<VEL> 3 "nonimmediate_operand" "<h_con>"))
>> >> >  	    (match_operand:VDQHS 2 "register_operand" "w"))))]
>> >> >    "TARGET_SIMD"
>> >> >    "mls\t%0.<Vtype>, %2.<Vtype>, %3.<Vetype>[0]"
>> >> > @@ -2882,7 +2882,7 @@ (define_insn "*aarch64_fma4_elt<mode>"
>> >> >      (fma:VDQF
>> >> >        (vec_duplicate:VDQF
>> >> >  	(vec_select:<VEL>
>> >> > -	  (match_operand:VDQF 1 "register_operand" "<h_con>")
>> >> > +	  (match_operand:VDQF 1 "nonimmediate_operand" "<h_con>")
>> >> >  	  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >> >        (match_operand:VDQF 3 "register_operand" "w")
>> >> >        (match_operand:VDQF 4 "register_operand" "0")))]
>> >> > @@ -2899,7 +2899,7 @@ (define_insn "*aarch64_fma4_elt_<vswap_width_name><mode>"
>> >> >      (fma:VDQSF
>> >> >        (vec_duplicate:VDQSF
>> >> >  	(vec_select:<VEL>
>> >> > -	  (match_operand:<VSWAP_WIDTH> 1 "register_operand"
>> >> "<h_con>")
>> >> > +	  (match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand"
>> >> "<h_con>")
>> >> >  	  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >> >        (match_operand:VDQSF 3 "register_operand" "w")
>> >> >        (match_operand:VDQSF 4 "register_operand" "0")))]
>> >> > @@ -2915,7 +2915,7 @@ (define_insn "*aarch64_fma4_elt_from_dup<mode>"
>> >> >    [(set (match_operand:VMUL 0 "register_operand" "=w")
>> >> >      (fma:VMUL
>> >> >        (vec_duplicate:VMUL
>> >> > -	  (match_operand:<VEL> 1 "register_operand" "<h_con>"))
>> >> > +	  (match_operand:<VEL> 1 "nonimmediate_operand" "<h_con>"))
>> >> >        (match_operand:VMUL 2 "register_operand" "w")
>> >> >        (match_operand:VMUL 3 "register_operand" "0")))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -2927,7 +2927,7 @@ (define_insn "*aarch64_fma4_elt_to_64v2df"
>> >> >    [(set (match_operand:DF 0 "register_operand" "=w")
>> >> >      (fma:DF
>> >> >  	(vec_select:DF
>> >> > -	  (match_operand:V2DF 1 "register_operand" "w")
>> >> > +	  (match_operand:V2DF 1 "nonimmediate_operand" "w")
>> >> >  	  (parallel [(match_operand:SI 2 "immediate_operand")]))
>> >> >        (match_operand:DF 3 "register_operand" "w")
>> >> >        (match_operand:DF 4 "register_operand" "0")))]
>> >> > @@ -2957,7 +2957,7 @@ (define_insn "*aarch64_fnma4_elt<mode>"
>> >> >          (match_operand:VDQF 3 "register_operand" "w"))
>> >> >        (vec_duplicate:VDQF
>> >> >  	(vec_select:<VEL>
>> >> > -	  (match_operand:VDQF 1 "register_operand" "<h_con>")
>> >> > +	  (match_operand:VDQF 1 "nonimmediate_operand" "<h_con>")
>> >> >  	  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >> >        (match_operand:VDQF 4 "register_operand" "0")))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -2975,7 +2975,7 @@ (define_insn
>> >> "*aarch64_fnma4_elt_<vswap_width_name><mode>"
>> >> >          (match_operand:VDQSF 3 "register_operand" "w"))
>> >> >        (vec_duplicate:VDQSF
>> >> >  	(vec_select:<VEL>
>> >> > -	  (match_operand:<VSWAP_WIDTH> 1 "register_operand"
>> >> "<h_con>")
>> >> > +	  (match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand"
>> >> "<h_con>")
>> >> >  	  (parallel [(match_operand:SI 2 "immediate_operand")])))
>> >> >        (match_operand:VDQSF 4 "register_operand" "0")))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -2992,7 +2992,7 @@ (define_insn
>> >> "*aarch64_fnma4_elt_from_dup<mode>"
>> >> >        (neg:VMUL
>> >> >          (match_operand:VMUL 2 "register_operand" "w"))
>> >> >        (vec_duplicate:VMUL
>> >> > -	(match_operand:<VEL> 1 "register_operand" "<h_con>"))
>> >> > +	(match_operand:<VEL> 1 "nonimmediate_operand" "<h_con>"))
>> >> >        (match_operand:VMUL 3 "register_operand" "0")))]
>> >> >    "TARGET_SIMD"
>> >> >    "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
>> >> > @@ -3003,7 +3003,7 @@ (define_insn
>> "*aarch64_fnma4_elt_to_64v2df"
>> >> >    [(set (match_operand:DF 0 "register_operand" "=w")
>> >> >      (fma:DF
>> >> >        (vec_select:DF
>> >> > -	(match_operand:V2DF 1 "register_operand" "w")
>> >> > +	(match_operand:V2DF 1 "nonimmediate_operand" "w")
>> >> >  	(parallel [(match_operand:SI 2 "immediate_operand")]))
>> >> >        (neg:DF
>> >> >          (match_operand:DF 3 "register_operand" "w"))
>> >> > @@ -4934,7 +4934,7 @@ (define_insn "*aarch64_mulx_elt_<vswap_width_name><mode>"
>> >> >  	 [(match_operand:VDQSF 1 "register_operand" "w")
>> >> >  	  (vec_duplicate:VDQSF
>> >> >  	   (vec_select:<VEL>
>> >> > -	    (match_operand:<VSWAP_WIDTH> 2 "register_operand" "w")
>> >> > +	    (match_operand:<VSWAP_WIDTH> 2 "nonimmediate_operand"
>> >> "w")
>> >> >  	    (parallel [(match_operand:SI 3 "immediate_operand" "i")])))]
>> >> >  	 UNSPEC_FMULX))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -4953,7 +4953,7 @@ (define_insn "*aarch64_mulx_elt<mode>"
>> >> >  	 [(match_operand:VDQF 1 "register_operand" "w")
>> >> >  	  (vec_duplicate:VDQF
>> >> >  	   (vec_select:<VEL>
>> >> > -	    (match_operand:VDQF 2 "register_operand" "w")
>> >> > +	    (match_operand:VDQF 2 "nonimmediate_operand" "w")
>> >> >  	    (parallel [(match_operand:SI 3 "immediate_operand" "i")])))]
>> >> >  	 UNSPEC_FMULX))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -4971,7 +4971,7 @@ (define_insn
>> >> "*aarch64_mulx_elt_from_dup<mode>"
>> >> >  	(unspec:VHSDF
>> >> >  	 [(match_operand:VHSDF 1 "register_operand" "w")
>> >> >  	  (vec_duplicate:VHSDF
>> >> > -	    (match_operand:<VEL> 2 "register_operand" "<h_con>"))]
>> >> > +	    (match_operand:<VEL> 2 "nonimmediate_operand" "<h_con>"))]
>> >> >  	 UNSPEC_FMULX))]
>> >> >    "TARGET_SIMD"
>> >> >    "fmulx\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]";
>> >> > @@ -4987,7 +4987,7 @@ (define_insn "*aarch64_vgetfmulx<mode>"
>> >> >  	(unspec:<VEL>
>> >> >  	 [(match_operand:<VEL> 1 "register_operand" "w")
>> >> >  	  (vec_select:<VEL>
>> >> > -	   (match_operand:VDQF 2 "register_operand" "w")
>> >> > +	   (match_operand:VDQF 2 "nonimmediate_operand" "w")
>> >> >  	    (parallel [(match_operand:SI 3 "immediate_operand" "i")]))]
>> >> >  	 UNSPEC_FMULX))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -6768,7 +6768,7 @@ (define_insn "*sqrt<mode>2"
>> >> >  (define_insn "aarch64_simd_ld2<vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_2Q 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_2Q [
>> >> > -	  (match_operand:VSTRUCT_2Q 1 "aarch64_simd_struct_operand"
>> >> "Utv")]
>> >> > +	  (match_operand:VSTRUCT_2Q 1 "memory_operand" "Utv")]
>> >> >  	  UNSPEC_LD2))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
>> >> > @@ -6778,7 +6778,7 @@ (define_insn "aarch64_simd_ld2<vstruct_elt>"
>> >> >  (define_insn "aarch64_simd_ld2r<vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_2QD [
>> >> > -	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
>> >> > +	  (match_operand:BLK 1 "memory_operand" "Utv")]
>> >> >            UNSPEC_LD2_DUP))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld2r\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
>> >> > @@ -6788,7 +6788,7 @@ (define_insn
>> "aarch64_simd_ld2r<vstruct_elt>"
>> >> >  (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_2QD [
>> >> > -		(match_operand:BLK 1 "aarch64_simd_struct_operand"
>> >> "Utv")
>> >> > +		(match_operand:BLK 1 "memory_operand" "Utv")
>> >> >  		(match_operand:VSTRUCT_2QD 2 "register_operand" "0")
>> >> >  		(match_operand:SI 3 "immediate_operand" "i")]
>> >> >  		UNSPEC_LD2_LANE))]
>> >> > @@ -6804,7 +6804,7 @@ (define_insn
>> >> "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
>> >> >  (define_expand "vec_load_lanes<mode><vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_2Q 0 "register_operand")
>> >> >  	(unspec:VSTRUCT_2Q [
>> >> > -		(match_operand:VSTRUCT_2Q 1
>> >> "aarch64_simd_struct_operand")]
>> >> > +		(match_operand:VSTRUCT_2Q 1 "memory_operand")]
>> >> >  		UNSPEC_LD2))]
>> >> >    "TARGET_SIMD"
>> >> >  {
>> >> > @@ -6822,7 +6822,7 @@ (define_expand
>> >> "vec_load_lanes<mode><vstruct_elt>"
>> >> >  })
>> >> >
>> >> >  (define_insn "aarch64_simd_st2<vstruct_elt>"
>> >> > -  [(set (match_operand:VSTRUCT_2Q 0
>> "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VSTRUCT_2Q 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VSTRUCT_2Q [
>> >> >  		(match_operand:VSTRUCT_2Q 1 "register_operand" "w")]
>> >> >                  UNSPEC_ST2))]
>> >> > @@ -6833,7 +6833,7 @@ (define_insn "aarch64_simd_st2<vstruct_elt>"
>> >> >
>> >> >  ;; RTL uses GCC vector extension indices, so flip only for assembly.
>> >> >  (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
>> >> > -  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
>> >> > +  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
>> >> >  	(unspec:BLK [(match_operand:VSTRUCT_2QD 1 "register_operand"
>> >> "w")
>> >> >  		     (match_operand:SI 2 "immediate_operand" "i")]
>> >> >  		     UNSPEC_ST2_LANE))]
>> >> > @@ -6847,7 +6847,7 @@ (define_insn
>> >> "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
>> >> >  )
>> >> >
>> >> >  (define_expand "vec_store_lanes<mode><vstruct_elt>"
>> >> > -  [(set (match_operand:VSTRUCT_2Q 0
>> "aarch64_simd_struct_operand")
>> >> > +  [(set (match_operand:VSTRUCT_2Q 0 "memory_operand")
>> >> >  	(unspec:VSTRUCT_2Q [(match_operand:VSTRUCT_2Q 1
>> >> "register_operand")]
>> >> >                     UNSPEC_ST2))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -6868,7 +6868,7 @@ (define_expand
>> >> "vec_store_lanes<mode><vstruct_elt>"
>> >> >  (define_insn "aarch64_simd_ld3<vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_3Q 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_3Q [
>> >> > -	  (match_operand:VSTRUCT_3Q 1 "aarch64_simd_struct_operand"
>> >> "Utv")]
>> >> > +	  (match_operand:VSTRUCT_3Q 1 "memory_operand" "Utv")]
>> >> >  	  UNSPEC_LD3))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
>> >> > @@ -6878,7 +6878,7 @@ (define_insn "aarch64_simd_ld3<vstruct_elt>"
>> >> >  (define_insn "aarch64_simd_ld3r<vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_3QD [
>> >> > -	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
>> >> > +	  (match_operand:BLK 1 "memory_operand" "Utv")]
>> >> >            UNSPEC_LD3_DUP))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld3r\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
>> >> > @@ -6888,7 +6888,7 @@ (define_insn
>> "aarch64_simd_ld3r<vstruct_elt>"
>> >> >  (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_3QD [
>> >> > -		(match_operand:BLK 1 "aarch64_simd_struct_operand"
>> >> "Utv")
>> >> > +		(match_operand:BLK 1 "memory_operand" "Utv")
>> >> >  		(match_operand:VSTRUCT_3QD 2 "register_operand" "0")
>> >> >  		(match_operand:SI 3 "immediate_operand" "i")]
>> >> >  		UNSPEC_LD3_LANE))]
>> >> > @@ -6904,7 +6904,7 @@ (define_insn
>> >> "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
>> >> >  (define_expand "vec_load_lanes<mode><vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_3Q 0 "register_operand")
>> >> >  	(unspec:VSTRUCT_3Q [
>> >> > -		(match_operand:VSTRUCT_3Q 1
>> >> "aarch64_simd_struct_operand")]
>> >> > +		(match_operand:VSTRUCT_3Q 1 "memory_operand")]
>> >> >  		UNSPEC_LD3))]
>> >> >    "TARGET_SIMD"
>> >> >  {
>> >> > @@ -6922,7 +6922,7 @@ (define_expand
>> >> "vec_load_lanes<mode><vstruct_elt>"
>> >> >  })
>> >> >
>> >> >  (define_insn "aarch64_simd_st3<vstruct_elt>"
>> >> > -  [(set (match_operand:VSTRUCT_3Q 0
>> "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VSTRUCT_3Q 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VSTRUCT_3Q [(match_operand:VSTRUCT_3Q 1
>> >> "register_operand" "w")]
>> >> >                     UNSPEC_ST3))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -6932,7 +6932,7 @@ (define_insn "aarch64_simd_st3<vstruct_elt>"
>> >> >
>> >> >  ;; RTL uses GCC vector extension indices, so flip only for assembly.
>> >> >  (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
>> >> > -  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
>> >> > +  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
>> >> >  	(unspec:BLK [(match_operand:VSTRUCT_3QD 1 "register_operand"
>> >> "w")
>> >> >  		     (match_operand:SI 2 "immediate_operand" "i")]
>> >> >  		     UNSPEC_ST3_LANE))]
>> >> > @@ -6946,7 +6946,7 @@ (define_insn
>> >> "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
>> >> >  )
>> >> >
>> >> >  (define_expand "vec_store_lanes<mode><vstruct_elt>"
>> >> > -  [(set (match_operand:VSTRUCT_3Q 0
>> "aarch64_simd_struct_operand")
>> >> > +  [(set (match_operand:VSTRUCT_3Q 0 "memory_operand")
>> >> >  	(unspec:VSTRUCT_3Q [
>> >> >  		(match_operand:VSTRUCT_3Q 1 "register_operand")]
>> >> >                  UNSPEC_ST3))]
>> >> > @@ -6968,7 +6968,7 @@ (define_expand
>> >> "vec_store_lanes<mode><vstruct_elt>"
>> >> >  (define_insn "aarch64_simd_ld4<vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_4Q 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_4Q [
>> >> > -	  (match_operand:VSTRUCT_4Q 1 "aarch64_simd_struct_operand"
>> >> "Utv")]
>> >> > +	  (match_operand:VSTRUCT_4Q 1 "memory_operand" "Utv")]
>> >> >  	  UNSPEC_LD4))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
>> >> > @@ -6978,7 +6978,7 @@ (define_insn "aarch64_simd_ld4<vstruct_elt>"
>> >> >  (define_insn "aarch64_simd_ld4r<vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_4QD [
>> >> > -	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
>> >> > +	  (match_operand:BLK 1 "memory_operand" "Utv")]
>> >> >            UNSPEC_LD4_DUP))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld4r\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
>> >> > @@ -6988,7 +6988,7 @@ (define_insn
>> "aarch64_simd_ld4r<vstruct_elt>"
>> >> >  (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_4QD [
>> >> > -		(match_operand:BLK 1 "aarch64_simd_struct_operand"
>> >> "Utv")
>> >> > +		(match_operand:BLK 1 "memory_operand" "Utv")
>> >> >  		(match_operand:VSTRUCT_4QD 2 "register_operand" "0")
>> >> >  		(match_operand:SI 3 "immediate_operand" "i")]
>> >> >  		UNSPEC_LD4_LANE))]
>> >> > @@ -7004,7 +7004,7 @@ (define_insn
>> >> "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
>> >> >  (define_expand "vec_load_lanes<mode><vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_4Q 0 "register_operand")
>> >> >  	(unspec:VSTRUCT_4Q [
>> >> > -		(match_operand:VSTRUCT_4Q 1
>> >> "aarch64_simd_struct_operand")]
>> >> > +		(match_operand:VSTRUCT_4Q 1 "memory_operand")]
>> >> >  		UNSPEC_LD4))]
>> >> >    "TARGET_SIMD"
>> >> >  {
>> >> > @@ -7022,7 +7022,7 @@ (define_expand
>> >> "vec_load_lanes<mode><vstruct_elt>"
>> >> >  })
>> >> >
>> >> >  (define_insn "aarch64_simd_st4<vstruct_elt>"
>> >> > -  [(set (match_operand:VSTRUCT_4Q 0
>> "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VSTRUCT_4Q 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VSTRUCT_4Q [
>> >> >  		(match_operand:VSTRUCT_4Q 1 "register_operand" "w")]
>> >> >                  UNSPEC_ST4))]
>> >> > @@ -7033,7 +7033,7 @@ (define_insn "aarch64_simd_st4<vstruct_elt>"
>> >> >
>> >> >  ;; RTL uses GCC vector extension indices, so flip only for assembly.
>> >> >  (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
>> >> > -  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
>> >> > +  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
>> >> >  	(unspec:BLK [(match_operand:VSTRUCT_4QD 1 "register_operand"
>> >> "w")
>> >> >  		     (match_operand:SI 2 "immediate_operand" "i")]
>> >> >  		     UNSPEC_ST4_LANE))]
>> >> > @@ -7047,7 +7047,7 @@ (define_insn
>> >> "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
>> >> >  )
>> >> >
>> >> >  (define_expand "vec_store_lanes<mode><vstruct_elt>"
>> >> > -  [(set (match_operand:VSTRUCT_4Q 0
>> "aarch64_simd_struct_operand")
>> >> > +  [(set (match_operand:VSTRUCT_4Q 0 "memory_operand")
>> >> >  	(unspec:VSTRUCT_4Q [(match_operand:VSTRUCT_4Q 1
>> >> "register_operand")]
>> >> >                     UNSPEC_ST4))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -7138,7 +7138,7 @@ (define_expand "aarch64_ld1x3<vstruct_elt>"
>> >> >  (define_insn "aarch64_ld1_x3_<vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
>> >> >          (unspec:VSTRUCT_3QD
>> >> > -	  [(match_operand:VSTRUCT_3QD 1 "aarch64_simd_struct_operand"
>> >> "Utv")]
>> >> > +	  [(match_operand:VSTRUCT_3QD 1 "memory_operand" "Utv")]
>> >> >  	  UNSPEC_LD1))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
>> >> > @@ -7158,7 +7158,7 @@ (define_expand "aarch64_ld1x4<vstruct_elt>"
>> >> >  (define_insn "aarch64_ld1_x4_<vstruct_elt>"
>> >> >    [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_4QD
>> >> > -	  [(match_operand:VSTRUCT_4QD 1 "aarch64_simd_struct_operand"
>> >> "Utv")]
>> >> > +	  [(match_operand:VSTRUCT_4QD 1 "memory_operand" "Utv")]
>> >> >  	UNSPEC_LD1))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld1\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
>> >> > @@ -7176,7 +7176,7 @@ (define_expand "aarch64_st1x2<vstruct_elt>"
>> >> >  })
>> >> >
>> >> >  (define_insn "aarch64_st1_x2_<vstruct_elt>"
>> >> > -  [(set (match_operand:VSTRUCT_2QD 0
>> "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VSTRUCT_2QD 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VSTRUCT_2QD
>> >> >  		[(match_operand:VSTRUCT_2QD 1 "register_operand" "w")]
>> >> >  		UNSPEC_ST1))]
>> >> > @@ -7196,7 +7196,7 @@ (define_expand "aarch64_st1x3<vstruct_elt>"
>> >> >  })
>> >> >
>> >> >  (define_insn "aarch64_st1_x3_<vstruct_elt>"
>> >> > -  [(set (match_operand:VSTRUCT_3QD 0
>> "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VSTRUCT_3QD 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VSTRUCT_3QD
>> >> >  		[(match_operand:VSTRUCT_3QD 1 "register_operand" "w")]
>> >> >  		UNSPEC_ST1))]
>> >> > @@ -7216,7 +7216,7 @@ (define_expand "aarch64_st1x4<vstruct_elt>"
>> >> >  })
>> >> >
>> >> >  (define_insn "aarch64_st1_x4_<vstruct_elt>"
>> >> > -  [(set (match_operand:VSTRUCT_4QD 0
>> "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VSTRUCT_4QD 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VSTRUCT_4QD
>> >> >  		[(match_operand:VSTRUCT_4QD 1 "register_operand" "w")]
>> >> >  		UNSPEC_ST1))]
>> >> > @@ -7268,7 +7268,7 @@ (define_insn "*aarch64_movv8di"
>> >> >  (define_insn "aarch64_be_ld1<mode>"
>> >> >    [(set (match_operand:VALLDI_F16 0	"register_operand" "=w")
>> >> >  	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1
>> >> > -			     "aarch64_simd_struct_operand" "Utv")]
>> >> > +			     "memory_operand" "Utv")]
>> >> >  	UNSPEC_LD1))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld1\\t{%0<Vmtype>}, %1"
>> >> > @@ -7276,7 +7276,7 @@ (define_insn "aarch64_be_ld1<mode>"
>> >> >  )
>> >> >
>> >> >  (define_insn "aarch64_be_st1<mode>"
>> >> > -  [(set (match_operand:VALLDI_F16 0 "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VALLDI_F16 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1
>> >> "register_operand" "w")]
>> >> >  	UNSPEC_ST1))]
>> >> >    "TARGET_SIMD"
>> >> > @@ -7551,7 +7551,7 @@ (define_expand
>> >> "aarch64_ld<nregs>r<vstruct_elt>"
>> >> >  (define_insn "aarch64_ld2<vstruct_elt>_dreg"
>> >> >    [(set (match_operand:VSTRUCT_2DNX 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_2DNX [
>> >> > -	  (match_operand:VSTRUCT_2DNX 1
>> >> "aarch64_simd_struct_operand" "Utv")]
>> >> > +	  (match_operand:VSTRUCT_2DNX 1 "memory_operand" "Utv")]
>> >> >  	  UNSPEC_LD2_DREG))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
>> >> > @@ -7561,7 +7561,7 @@ (define_insn "aarch64_ld2<vstruct_elt>_dreg"
>> >> >  (define_insn "aarch64_ld2<vstruct_elt>_dreg"
>> >> >    [(set (match_operand:VSTRUCT_2DX 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_2DX [
>> >> > -	  (match_operand:VSTRUCT_2DX 1 "aarch64_simd_struct_operand"
>> >> "Utv")]
>> >> > +	  (match_operand:VSTRUCT_2DX 1 "memory_operand" "Utv")]
>> >> >  	  UNSPEC_LD2_DREG))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld1\\t{%S0.1d - %T0.1d}, %1"
>> >> > @@ -7571,7 +7571,7 @@ (define_insn "aarch64_ld2<vstruct_elt>_dreg"
>> >> >  (define_insn "aarch64_ld3<vstruct_elt>_dreg"
>> >> >    [(set (match_operand:VSTRUCT_3DNX 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_3DNX [
>> >> > -	  (match_operand:VSTRUCT_3DNX 1
>> >> "aarch64_simd_struct_operand" "Utv")]
>> >> > +	  (match_operand:VSTRUCT_3DNX 1 "memory_operand" "Utv")]
>> >> >  	  UNSPEC_LD3_DREG))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
>> >> > @@ -7581,7 +7581,7 @@ (define_insn "aarch64_ld3<vstruct_elt>_dreg"
>> >> >  (define_insn "aarch64_ld3<vstruct_elt>_dreg"
>> >> >    [(set (match_operand:VSTRUCT_3DX 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_3DX [
>> >> > -	  (match_operand:VSTRUCT_3DX 1 "aarch64_simd_struct_operand"
>> >> "Utv")]
>> >> > +	  (match_operand:VSTRUCT_3DX 1 "memory_operand" "Utv")]
>> >> >  	  UNSPEC_LD3_DREG))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld1\\t{%S0.1d - %U0.1d}, %1"
>> >> > @@ -7591,7 +7591,7 @@ (define_insn "aarch64_ld3<vstruct_elt>_dreg"
>> >> >  (define_insn "aarch64_ld4<vstruct_elt>_dreg"
>> >> >    [(set (match_operand:VSTRUCT_4DNX 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_4DNX [
>> >> > -	  (match_operand:VSTRUCT_4DNX 1
>> >> "aarch64_simd_struct_operand" "Utv")]
>> >> > +	  (match_operand:VSTRUCT_4DNX 1 "memory_operand" "Utv")]
>> >> >  	  UNSPEC_LD4_DREG))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
>> >> > @@ -7601,7 +7601,7 @@ (define_insn "aarch64_ld4<vstruct_elt>_dreg"
>> >> >  (define_insn "aarch64_ld4<vstruct_elt>_dreg"
>> >> >    [(set (match_operand:VSTRUCT_4DX 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_4DX [
>> >> > -	  (match_operand:VSTRUCT_4DX 1 "aarch64_simd_struct_operand"
>> >> "Utv")]
>> >> > +	  (match_operand:VSTRUCT_4DX 1 "memory_operand" "Utv")]
>> >> >  	  UNSPEC_LD4_DREG))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld1\\t{%S0.1d - %V0.1d}, %1"
>> >> > @@ -7841,7 +7841,7 @@ (define_insn
>> >> "aarch64_rev<REVERSE:rev_op><mode>"
>> >> >  )
>> >> >
>> >> >  (define_insn "aarch64_st2<vstruct_elt>_dreg"
>> >> > -  [(set (match_operand:VSTRUCT_2DNX 0
>> >> "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VSTRUCT_2DNX 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VSTRUCT_2DNX [
>> >> >  		(match_operand:VSTRUCT_2DNX 1 "register_operand" "w")]
>> >> >  		UNSPEC_ST2))]
>> >> > @@ -7851,7 +7851,7 @@ (define_insn "aarch64_st2<vstruct_elt>_dreg"
>> >> >  )
>> >> >
>> >> >  (define_insn "aarch64_st2<vstruct_elt>_dreg"
>> >> > -  [(set (match_operand:VSTRUCT_2DX 0
>> "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VSTRUCT_2DX 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VSTRUCT_2DX [
>> >> >  		(match_operand:VSTRUCT_2DX 1 "register_operand" "w")]
>> >> >  		UNSPEC_ST2))]
>> >> > @@ -7861,7 +7861,7 @@ (define_insn "aarch64_st2<vstruct_elt>_dreg"
>> >> >  )
>> >> >
>> >> >  (define_insn "aarch64_st3<vstruct_elt>_dreg"
>> >> > -  [(set (match_operand:VSTRUCT_3DNX 0
>> >> "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VSTRUCT_3DNX 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VSTRUCT_3DNX [
>> >> >  		(match_operand:VSTRUCT_3DNX 1 "register_operand" "w")]
>> >> >  		UNSPEC_ST3))]
>> >> > @@ -7871,7 +7871,7 @@ (define_insn "aarch64_st3<vstruct_elt>_dreg"
>> >> >  )
>> >> >
>> >> >  (define_insn "aarch64_st3<vstruct_elt>_dreg"
>> >> > -  [(set (match_operand:VSTRUCT_3DX 0
>> "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VSTRUCT_3DX 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VSTRUCT_3DX [
>> >> >  		(match_operand:VSTRUCT_3DX 1 "register_operand" "w")]
>> >> >  		UNSPEC_ST3))]
>> >> > @@ -7881,7 +7881,7 @@ (define_insn "aarch64_st3<vstruct_elt>_dreg"
>> >> >  )
>> >> >
>> >> >  (define_insn "aarch64_st4<vstruct_elt>_dreg"
>> >> > -  [(set (match_operand:VSTRUCT_4DNX 0
>> >> "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VSTRUCT_4DNX 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VSTRUCT_4DNX [
>> >> >  		(match_operand:VSTRUCT_4DNX 1 "register_operand" "w")]
>> >> >  		UNSPEC_ST4))]
>> >> > @@ -7891,7 +7891,7 @@ (define_insn "aarch64_st4<vstruct_elt>_dreg"
>> >> >  )
>> >> >
>> >> >  (define_insn "aarch64_st4<vstruct_elt>_dreg"
>> >> > -  [(set (match_operand:VSTRUCT_4DX 0
>> "aarch64_simd_struct_operand"
>> >> > "=Utv")
>> >> > +  [(set (match_operand:VSTRUCT_4DX 0 "memory_operand" "=Utv")
>> >> >  	(unspec:VSTRUCT_4DX [
>> >> >  		(match_operand:VSTRUCT_4DX 1 "register_operand" "w")]
>> >> >  		UNSPEC_ST4))]
>> >> > @@ -7974,7 +7974,7 @@ (define_expand "vec_init<mode><Vhalf>"
>> >> >  (define_insn "*aarch64_simd_ld1r<mode>"
>> >> >    [(set (match_operand:VALL_F16 0 "register_operand" "=w")
>> >> >  	(vec_duplicate:VALL_F16
>> >> > -	  (match_operand:<VEL> 1 "aarch64_simd_struct_operand" "Utv")))]
>> >> > +	  (match_operand:<VEL> 1 "memory_operand" "Utv")))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld1r\\t{%0.<Vtype>}, %1"
>> >> >    [(set_attr "type" "neon_load1_all_lanes")] @@ -7983,7 +7983,7 @@
>> >> > (define_insn "*aarch64_simd_ld1r<mode>"
>> >> >  (define_insn "aarch64_simd_ld1<vstruct_elt>_x2"
>> >> >    [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
>> >> >  	(unspec:VSTRUCT_2QD [
>> >> > -	    (match_operand:VSTRUCT_2QD 1 "aarch64_simd_struct_operand" "Utv")]
>> >> > +	    (match_operand:VSTRUCT_2QD 1 "memory_operand" "Utv")]
>> >> >  	    UNSPEC_LD1))]
>> >> >    "TARGET_SIMD"
>> >> >    "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
>> >> > diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
>> >> > index c308015ac2c13d24cd6bcec71247ec45df8cf5e6..6b70a364530c8108457091bfec12fe549f722149 100644
>> >> > --- a/gcc/config/aarch64/predicates.md
>> >> > +++ b/gcc/config/aarch64/predicates.md
>> >> > @@ -494,10 +494,6 @@ (define_predicate "aarch64_simd_reg_or_minus_one"
>> >> >    (ior (match_operand 0 "register_operand")
>> >> >         (match_operand 0 "aarch64_simd_imm_minus_one")))
>> >> >
>> >> > -(define_predicate "aarch64_simd_struct_operand"
>> >> > -  (and (match_code "mem")
>> >> > -       (match_test "TARGET_SIMD && aarch64_simd_mem_operand_p
>> >> (op)")))
>> >> > -
>> >> >  ;; Like general_operand but allow only valid SIMD addressing modes.
>> >> >  (define_predicate "aarch64_simd_general_operand"
>> >> >    (and (match_operand 0 "general_operand") diff --git
>> >> > a/gcc/testsuite/gcc.target/aarch64/vld1r.c
>> >> > b/gcc/testsuite/gcc.target/aarch64/vld1r.c
>> >> > new file mode 100644
>> >> > index 0000000000000000000000000000000000000000..72c505c403e9e239771379b7cadd8a9473f06113
>> >> > --- /dev/null
>> >> > +++ b/gcc/testsuite/gcc.target/aarch64/vld1r.c
>> >> > @@ -0,0 +1,26 @@
>> >> > +/* { dg-do compile } */
>> >> > +/* { dg-additional-options "-O" } */
>> >> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
>> >> > +
>> >> > +#include <arm_neon.h>
>> >> > +
>> >> > +/*
>> >> > +** f1:
>> >> > +** 	add	x0, x0, 1
>> >> > +** 	ld1r	{v0.8b}, \[x0\]
>> >> > +** 	ret
>> >> > +*/
>> >> > +uint8x8_t f1(const uint8_t *in) {
>> >> > +    return vld1_dup_u8(&in[1]);
>> >> > +}
>> >> > +
>> >> > +/*
>> >> > +** f2:
>> >> > +** 	ldr	s1, \[x0, 4\]
>> >> > +** 	fmla	v0.4s, v0.4s, v1.s\[0\]
>> >> > +** 	ret
>> >> > +*/
>> >> > +float32x4_t f2(const float32_t *in, float32x4_t a) {
>> >> > +    float32x4_t dup = vld1q_dup_f32(&in[1]);
>> >> > +    return vfmaq_laneq_f32 (a, a, dup, 1);
>> >> > +}
  
Tamar Christina June 9, 2022, 8:43 a.m. UTC | #6
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Thursday, June 9, 2022 9:22 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; rguenther@suse.de;
> roger@nextmovesoftware.com
> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
> instructions
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Wednesday, June 8, 2022 3:36 PM
> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>;
> >> rguenther@suse.de; roger@eyesopen.com
> >> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
> >> instructions
> >>
> >> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> >> -----Original Message-----
> >> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> >> Sent: Wednesday, June 8, 2022 11:31 AM
> >> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> >> <Kyrylo.Tkachov@arm.com>
> >> >> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
> >> >> instructions
> >> >>
> >> >> Tamar Christina <tamar.christina@arm.com> writes:
> >> >> > Hi All,
> >> >> >
> >> >> > At some point in time we started lowering the ld1r instructions
> >> >> > in gimple.
> >> >> >
> >> >> > That is:
> >> >> >
> >> >> > uint8x8_t f1(const uint8_t *in) {
> >> >> >     return vld1_dup_u8(&in[1]);
> >> >> > }
> >> >> >
> >> >> > generates at gimple:
> >> >> >
> >> >> >   _3 = MEM[(const uint8_t *)in_1(D) + 1B];
> >> >> >   _4 = {_3, _3, _3, _3, _3, _3, _3, _3};
> >> >> >
> >> >> > Which is good, but we then generate:
> >> >> >
> >> >> > f1:
> >> >> > 	ldr     b0, [x0, 1]
> >> >> > 	dup     v0.8b, v0.b[0]
> >> >> > 	ret
> >> >> >
> >> >> > instead of ld1r.
> >> >> >
> >> >> > The reason for this is because the load instructions have a too
> >> >> > restrictive predicate on them which causes combine not to be
> >> >> > able to combine the instructions due to the predicate only
> >> >> > accepting simple addressing modes.
> >> >> >
> >> >> > This patch relaxes the predicate to accept any memory operand
> >> >> > and relies on LRA to legitimize the address when it needs to as
> >> >> > the constraint still only allows the simple addressing mode.
> >> >> > Reload is always able to legitimize to these.
> >> >> >
> >> >> > Secondly since we are now actually generating more ld1r it
> >> >> > became clear that the lane instructions suffer from a similar issue.
> >> >> >
> >> >> > i.e.
> >> >> >
> >> >> > float32x4_t f2(const float32_t *in, float32x4_t a) {
> >> >> >     float32x4_t dup = vld1q_dup_f32(&in[1]);
> >> >> >     return vfmaq_laneq_f32 (a, a, dup, 1);
> >> >> > }
> >> >> >
> >> >> > would generate ld1r + vector fmla instead of ldr + lane fmla.
> >> >> >
> >> >> > The reason for this is similar to the ld1r issue.  The predicate
> >> >> > is too restrictive in only accepting register operands but not memory.
> >> >> >
> >> >> > This relaxes it to accept register and/or memory while leaving
> >> >> > the constraint to only accept registers.  This will have LRA
> >> >> > generate a reload if needed forcing the memory to registers
> >> >> > using the standard patterns.
> >> >> >
> >> >> > These two changes allow combine and reload to generate the right
> >> >> > sequences.
> >> >> >
> >> >> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >> >>
> >> >> This is going against the general direction of travel, which is to
> >> >> make the instruction's predicates and conditions enforce the
> >> >> constraints as much as possible (making optimistic assumptions about
> >> >> pseudo registers).
> >> >>
> >> >> The RA *can* deal with things like:
> >> >>
> >> >>   (match_operand:M N "general_operand" "r")
> >> >>
> >> >> but it's best avoided, for a few reasons:
> >> >>
> >> >> (1) The fix-up will be done in LRA, so IRA will not see the temporary
> >> >>     registers.  This can make the allocation of those temporaries
> >> >>     suboptimal but (more importantly) it might require other
> >> >>     previously-allocated registers to be spilled late due to the
> >> >>     unexpected increase in register pressure.
> >> >>
> >> >> (2) It ends up hiding instructions from the pre-RA optimisers.
> >> >>
> >> >> (3) It can also prevent combine opportunities (as well as create them),
> >> >>     unless the loose predicates in an insn I are propagated to all
> >> >>     patterns that might result from combining I with something else.
> >> >>
> >> >> It sounds like the first problem (not generating ld1r) could be
> >> >> fixed by (a) combining aarch64_simd_dup<mode> and
> >> >> *aarch64_simd_ld1r<mode>,
> >> >> so that the register and memory alternatives are in the same
> >> >> pattern and (b) using the merged instruction(s) to implement the
> >> >> vec_duplicate optab.
> >> >> Target-independent code should then make the address satisfy the
> >> >> predicate, simplifying the address where necessary.
> >> >>
> >> >
> >> > I think I am likely missing something here. I would assume that you
> >> > wanted to use the optab to split the addressing off from the mem
> >> > expression so the combined insn matches.
> >> >
> >> > But in that case, why do you need to combine the two instructions?
> >> > I've tried and it doesn't work since the vec_duplicate optab
> >> > doesn't see the mem as op1, because in gimple the mem is not part
> >> > of the duplicate.
> >> >
> >> > So you still just see:
> >> >
> >> >>>> dbgrtx (ops[1].value)
> >> > (subreg/s/v:QI (reg:SI 92 [ _3 ]) 0)
> >> >
> >> > As the operand passed as the argument to the dup is just an SSA_NAME.
> >>
> >> Ah, yeah, I'd forgotten that fixed-length vec_duplicates would come
> >> from a constructor rather than a vec_duplicate_expr, so we don't get
> >> the usual benefit of folding single-use mems during expand.
> >>
> >> https://gcc.gnu.org/pipermail/gcc-patches/2022-May/595362.html
> >> moves towards using vec_duplicate even for fixed-length vectors.
> >> If we take that approach, then I suppose a plain constructor should
> >> be folded to a vec_duplicate where possible.
> >>
> >> (Alternatively, we could use an extended vec_perm_expr with scalar
> >> inputs, as Richi suggested in that thread.)
> >>
> >> If we don't do that, or don't do it yet, then…
> >>
> >> > If not and you wanted the combined insn to accept
> >> >
> >> > (set (reg:SI 92 [ _3 ])
> >> >     (zero_extend:SI (mem:QI (plus:DI (reg:DI 97)
> >> >                 (const_int 1 [0x1])) [0 MEM[(const uint8_tD.4561 *)in_1(D) + 1B]+0 S1 A8])))
> >> >
> >> > Then that's also not possible without relaxing the combined
> >> > predicates.  As far as I can tell, if I'm not allowed to use LRA for
> >> > this, then the only thing that could work is an early split?
> >> >
> >> > Or do I have to modify store_constructor to try a variant where it
> >> > tries pushing in the Decl of an SSA_NAME first?
> >>
> >> …yeah, something like this would be needed.  But the
> >> vec_duplicate_expr/ vec_perm_expr thing seems better, even if we only
> >> introduce it during isel.
> >>
> >> Not my call either way though :-)  Let's see what Richi (cc:ed) thinks.
> >
> > FWIW, since my inner "Richards like patch" detector still needs
> > tuning :-) I did a quick experiment.  Teaching gimple_build_vector_from_val to
> > allow the non-constant case and then teaching
> > simplify_vector_constructor to use it for the non-constant case gets them
> > generated.
> >
> > Then I had to teach aarch64_expand_vector_init to generate
> > vec_duplicate_expr when the value is non-constant.
> >
> > I thought about skipping vec_init entirely in this case during
> > expansion; however, there doesn't seem to be a way to test for
> > vec_duplicate_expr since, as Richi mentioned, it doesn't seem to have an
> > associated optab.
> 
> I don't understand, sorry.  The optab is vec_duplicate_optab (generated via
> expand_vector_broadcast), and although we don't implement that for
> Advanced SIMD yet, the point of the above was that we would.

Ah sorry, you're right. I had mistakenly thought that since there were different
EXPR codes for them they would expand to different optabs. But indeed,
with the other changes above the expander expand_vector_broadcast
goes to it directly.

Though we should probably still change aarch64_expand_vector_init
to remove the forced copy_to_reg for any code that calls it.

Cheers,
Tamar
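
For concreteness, here is a rough sketch of the merged pattern being discussed:
one insn with a register alternative (dup) and a memory alternative (ld1r).
The name, iterator choice and type attributes below are illustrative
assumptions on my part, not a tested or committed pattern.

;; Sketch only: fold aarch64_simd_dup<mode> and *aarch64_simd_ld1r<mode>
;; into a single pattern, so a broadcast can take either a register or a
;; memory source and the pre-RA passes can pick between the two.
(define_insn "*aarch64_simd_dup_ld1r<mode>"
  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w")
	(vec_duplicate:VALL_F16
	  (match_operand:<VEL> 1 "nonimmediate_operand" "w,Utv")))]
  "TARGET_SIMD"
  "@
   dup\\t%0.<Vtype>, %1.<Vetype>[0]
   ld1r\\t{%0.<Vtype>}, %1"
  [(set_attr "type" "neon_dup<q>, neon_load1_all_lanes")]
)

Something along these lines, used to implement the vec_duplicate optab, would
let target-independent code legitimize the address instead of relying on LRA.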

> 
> Thanks,
> Richard
> 
> >
> > This approach does fix the problem, but I'll hold out on cleaning it
> > up till I hear it's acceptable.
> >
> > Cheers,
> > Tamar
> >
> >>
> >> Thanks,
> >> Richard
> >>
> >> > I guess this also only really works for ld1r; whenever we lower
> >> > ld2(r) etc. we'll have the same issue again... But I suppose that's
> >> > for the next person :-)
> >> >
> >> > Thanks,
> >> > Tamar
> >> >
> >> >> I'm not sure whether fixing the ld1r problem that way will avoid
> >> >> the vfmaq_laneq_f32 problem; let me know if not.
> >> >>
> >> >> Thanks,
> >> >> Richard
> >> >>
> >> >> > Ok for master?
> >> >> >
> >> >> > Thanks,
> >> >> > Tamar
> >> >> >
> >> >> > [...]
  
Richard Biener June 13, 2022, 8 a.m. UTC | #7
On Wed, 8 Jun 2022, Richard Sandiford wrote:

> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Wednesday, June 8, 2022 11:31 AM
> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> >> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
> >> instructions
> >> 
> >> Tamar Christina <tamar.christina@arm.com> writes:
> >> > [...]
> >> 
> >> This is going against the general direction of travel, which is to make the
> >> instruction's predicates and conditions enforce the constraints as much as
> >> possible (making optimistic assumptions about pseudo registers).
> >> 
> >> The RA *can* deal with things like:
> >> 
> >>   (match_operand:M N "general_operand" "r")
> >> 
> >> but it's best avoided, for a few reasons:
> >> 
> >> (1) The fix-up will be done in LRA, so IRA will not see the temporary
> >>     registers.  This can make the allocation of those temporaries
> >>     suboptimal but (more importantly) it might require other
> >>     previously-allocated registers to be spilled late due to the
> >>     unexpected increase in register pressure.
> >> 
> >> (2) It ends up hiding instructions from the pre-RA optimisers.
> >> 
> >> (3) It can also prevent combine opportunities (as well as create them),
> >>     unless the loose predicates in an insn I are propagated to all
> >>     patterns that might result from combining I with something else.
> >> 
> >> It sounds like the first problem (not generating ld1r) could be fixed by (a)
> >> combining aarch64_simd_dup<mode> and *aarch64_simd_ld1r<mode>, so
> >> that the register and memory alternatives are in the same pattern and (b)
> >> using the merged instruction(s) to implement the vec_duplicate optab.
> >> Target-independent code should then make the address satisfy the
> >> predicate, simplifying the address where necessary.
> >> 
> >
> > I think I am likely missing something here. I would assume that you wanted
> > to use the optab to split the addressing off from the mem expression so the
> > combined insn matches.
> >
> > But in that case, why do you need to combine the two instructions?
> > I've tried and it doesn't work since the vec_duplicate optab doesn't see the
> > mem as op1, because in gimple the mem is not part of the duplicate.
> >
> > So you still just see:
> >
> >>>> dbgrtx (ops[1].value)
> > (subreg/s/v:QI (reg:SI 92 [ _3 ]) 0)
> >
> > As the operand passed as the argument to the dup is just an SSA_NAME.
> 
> Ah, yeah, I'd forgotten that fixed-length vec_duplicates would
> come from a constructor rather than a vec_duplicate_expr, so we don't
> get the usual benefit of folding single-use mems during expand.
> 
> https://gcc.gnu.org/pipermail/gcc-patches/2022-May/595362.html
> moves towards using vec_duplicate even for fixed-length vectors.
> If we take that approach, then I suppose a plain constructor
> should be folded to a vec_duplicate where possible.
> 
> (Alternatively, we could use an extended vec_perm_expr with
> scalar inputs, as Richi suggested in that thread.)
> 
> If we don't do that, or don't do it yet, then…

I suppose since we already have vec_duplicate we can just use it ...
what was the reason to not do this originally?  I suppose the
vec_duplicate expander has a fallback via store_constructor?

Originally I wanted to avoid multiple ways to express the same thing
but vec_duplicate is a common enough special-case and it also
usually maps to a special instruction in vector ISAs.
There's VIEW_CONVERT vs. vec_duplicate for V1m modes then; I
suppose VIEW_CONVERT is more canonical here.

Richard.
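
As a concrete sketch of that (hedged: the iterator and the empty preparation
body are assumptions for illustration, not committed code), a named
vec_duplicate expander for Advanced SIMD would let expand_vector_broadcast
emit the broadcast directly instead of falling back to a CONSTRUCTOR:

;; Sketch only: expose vec_duplicate as a named pattern.  The RTL template
;; is directly emittable, so no preparation statements are needed; the
;; memory alternative of the backing insn then handles the ld1r case.
(define_expand "vec_duplicate<mode>"
  [(set (match_operand:VALL_F16 0 "register_operand")
	(vec_duplicate:VALL_F16
	  (match_operand:<VEL> 1 "nonimmediate_operand")))]
  "TARGET_SIMD"
)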

> > If not and you wanted the combined insn to accept
> >
> > (set (reg:SI 92 [ _3 ])
> >     (zero_extend:SI (mem:QI (plus:DI (reg:DI 97)
> >                 (const_int 1 [0x1])) [0 MEM[(const uint8_tD.4561 *)in_1(D) + 1B]+0 S1 A8])))
> >
> > Then that's also not possible without relaxing the combined predicates.  As far as I can tell,
> > if I'm not allowed to use LRA for this, then the only thing that could work is an early split?
> >
> > Or do I have to modify store_constructor to try a variant where it tries pushing in the
> > Decl of an SSA_NAME first?
> 
> …yeah, something like this would be needed.  But the vec_duplicate_expr/
> vec_perm_expr thing seems better, even if we only introduce it during isel.
> 
> Not my call either way though :-)  Let's see what Richi (cc:ed) thinks.
> 
> Thanks,
> Richard
> 
> > I guess this also only really works for ld1r; whenever we lower ld2(r) etc. we'll have the same
> > issue again... But I suppose that's for the next person :-)
> >
> > Thanks,
> > Tamar
> >
> >> I'm not sure whether fixing the ld1r problem that way will avoid the
> >> vfmaq_laneq_f32 problem; let me know if not.
> >> 
> >> Thanks,
> >> Richard
> >> 
> >> > Ok for master?
> >> >
> >> > Thanks,
> >> > Tamar
> >> >
> >> > gcc/ChangeLog:
> >> >
> >> > 	* config/aarch64/aarch64-simd.md (mul_lane<mode>3,
> >> mul_laneq<mode>3,
> >> > 	mul_n<mode>3, *aarch64_mul3_elt_to_64v2df,
> >> *aarch64_mla_elt<mode>,
> >> > 	*aarch64_mla_elt_<vswap_width_name><mode>,
> >> aarch64_mla_n<mode>,
> >> > 	*aarch64_mls_elt<mode>,
> >> *aarch64_mls_elt_<vswap_width_name><mode>,
> >> > 	aarch64_mls_n<mode>, *aarch64_fma4_elt<mode>,
> >> > 	*aarch64_fma4_elt_<vswap_width_name><mode>,
> >> > 	*aarch64_fma4_elt_from_dup<mode>,
> >> *aarch64_fma4_elt_to_64v2df,
> >> > 	*aarch64_fnma4_elt<mode>,
> >> *aarch64_fnma4_elt_<vswap_width_name><mode>,
> >> > 	*aarch64_fnma4_elt_from_dup<mode>,
> >> *aarch64_fnma4_elt_to_64v2df,
> >> > 	*aarch64_mulx_elt_<vswap_width_name><mode>,
> >> > 	*aarch64_mulx_elt<mode>, *aarch64_mulx_elt_from_dup<mode>,
> >> > 	*aarch64_vgetfmulx<mode>): Relax register_operand to
> >> > 	nonimmediate_operand.
> >> > 	(aarch64_simd_ld2<vstruct_elt>, aarch64_simd_ld2r<vstruct_elt>,
> >> > 	aarch64_vec_load_lanes<mode>_lane<vstruct_elt>,
> >> > 	vec_load_lanes<mode><vstruct_elt>,
> >> aarch64_simd_st2<vstruct_elt>,
> >> > 	aarch64_vec_store_lanes<mode>_lane<vstruct_elt>,
> >> > 	vec_store_lanes<mode><vstruct_elt>,
> >> aarch64_simd_ld3<vstruct_elt>,
> >> > 	aarch64_simd_ld3r<vstruct_elt>,
> >> > 	aarch64_vec_load_lanes<mode>_lane<vstruct_elt>,
> >> > 	vec_load_lanes<mode><vstruct_elt>,
> >> aarch64_simd_st3<vstruct_elt>,
> >> > 	aarch64_vec_store_lanes<mode>_lane<vstruct_elt>,
> >> > 	vec_store_lanes<mode><vstruct_elt>,
> >> aarch64_simd_ld4<vstruct_elt>,
> >> > 	aarch64_simd_ld4r<vstruct_elt>,
> >> > 	aarch64_vec_load_lanes<mode>_lane<vstruct_elt>,
> >> > 	vec_load_lanes<mode><vstruct_elt>,
> >> aarch64_simd_st4<vstruct_elt>,
> >> > 	aarch64_vec_store_lanes<mode>_lane<vstruct_elt>,
> >> > 	vec_store_lanes<mode><vstruct_elt>,
> >> aarch64_ld1_x3_<vstruct_elt>,
> >> > 	aarch64_ld1_x4_<vstruct_elt>, aarch64_st1_x2_<vstruct_elt>,
> >> > 	aarch64_st1_x3_<vstruct_elt>, aarch64_st1_x4_<vstruct_elt>,
> >> > 	aarch64_be_ld1<mode>, aarch64_be_st1<mode>,
> >> > 	aarch64_ld2<vstruct_elt>_dreg, aarch64_ld2<vstruct_elt>_dreg,
> >> > 	aarch64_ld3<vstruct_elt>_dreg, aarch64_ld3<vstruct_elt>_dreg,
> >> > 	aarch64_ld4<vstruct_elt>_dreg, aarch64_ld4<vstruct_elt>_dreg,
> >> > 	aarch64_st2<vstruct_elt>_dreg, aarch64_st2<vstruct_elt>_dreg,
> >> > 	aarch64_st3<vstruct_elt>_dreg, aarch64_st3<vstruct_elt>_dreg,
> >> > 	aarch64_st4<vstruct_elt>_dreg, aarch64_st4<vstruct_elt>_dreg,
> >> > 	*aarch64_simd_ld1r<mode>, aarch64_simd_ld1<vstruct_elt>_x2):
> >> Relax
> >> > 	aarch64_simd_struct_operand to memory_operand.
> >> > 	* config/aarch64/predicates.md (aarch64_simd_struct_operand):
> >> Remove.
> >> >
> >> > gcc/testsuite/ChangeLog:
> >> >
> >> > 	* gcc.target/aarch64/vld1r.c: New test.
> >> >
> >> > --- inline copy of patch --
> >> > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> >> > index be5c70bbb7520ae93d19c4a432ce34863e5b9a64..24e3274ddda2ea76c83571fada8ff4c953b752a1 100644
> >> > --- a/gcc/config/aarch64/aarch64-simd.md
> >> > +++ b/gcc/config/aarch64/aarch64-simd.md
> >> > @@ -712,7 +712,7 @@ (define_insn "mul_lane<mode>3"
> >> >         (mult:VMULD
> >> >  	 (vec_duplicate:VMULD
> >> >  	   (vec_select:<VEL>
> >> > -	     (match_operand:<VCOND> 2 "register_operand" "<h_con>")
> >> > +	     (match_operand:<VCOND> 2 "nonimmediate_operand"
> >> "<h_con>")
> >> >  	     (parallel [(match_operand:SI 3 "immediate_operand" "i")])))
> >> >  	 (match_operand:VMULD 1 "register_operand" "w")))]
> >> >    "TARGET_SIMD"
> >> > @@ -728,7 +728,7 @@ (define_insn "mul_laneq<mode>3"
> >> >       (mult:VMUL
> >> >         (vec_duplicate:VMUL
> >> >  	  (vec_select:<VEL>
> >> > -	    (match_operand:<VCONQ> 2 "register_operand" "<h_con>")
> >> > +	    (match_operand:<VCONQ> 2 "nonimmediate_operand"
> >> "<h_con>")
> >> >  	    (parallel [(match_operand:SI 3 "immediate_operand")])))
> >> >        (match_operand:VMUL 1 "register_operand" "w")))]
> >> >    "TARGET_SIMD"
> >> > @@ -743,7 +743,7 @@ (define_insn "mul_n<mode>3"
> >> >   [(set (match_operand:VMUL 0 "register_operand" "=w")
> >> >         (mult:VMUL
> >> >  	 (vec_duplicate:VMUL
> >> > -	   (match_operand:<VEL> 2 "register_operand" "<h_con>"))
> >> > +	   (match_operand:<VEL> 2 "nonimmediate_operand" "<h_con>"))
> >> >  	 (match_operand:VMUL 1 "register_operand" "w")))]
> >> >    "TARGET_SIMD"
> >> >    "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"; @@ -789,7 +789,7
> >> > @@ (define_insn "*aarch64_mul3_elt_to_64v2df"
> >> >    [(set (match_operand:DF 0 "register_operand" "=w")
> >> >       (mult:DF
> >> >         (vec_select:DF
> >> > -	 (match_operand:V2DF 1 "register_operand" "w")
> >> > +	 (match_operand:V2DF 1 "nonimmediate_operand" "w")
> >> >  	 (parallel [(match_operand:SI 2 "immediate_operand")]))
> >> >         (match_operand:DF 3 "register_operand" "w")))]
> >> >    "TARGET_SIMD"
> >> > @@ -1406,7 +1406,7 @@ (define_insn "*aarch64_mla_elt<mode>"
> >> >  	 (mult:VDQHS
> >> >  	   (vec_duplicate:VDQHS
> >> >  	      (vec_select:<VEL>
> >> > -		(match_operand:VDQHS 1 "register_operand" "<h_con>")
> >> > +		(match_operand:VDQHS 1 "nonimmediate_operand"
> >> "<h_con>")
> >> >  		  (parallel [(match_operand:SI 2 "immediate_operand")])))
> >> >  	   (match_operand:VDQHS 3 "register_operand" "w"))
> >> >  	 (match_operand:VDQHS 4 "register_operand" "0")))] @@ -1424,7
> >> > +1424,7 @@ (define_insn
> >> "*aarch64_mla_elt_<vswap_width_name><mode>"
> >> >  	 (mult:VDQHS
> >> >  	   (vec_duplicate:VDQHS
> >> >  	      (vec_select:<VEL>
> >> > -		(match_operand:<VSWAP_WIDTH> 1 "register_operand"
> >> "<h_con>")
> >> > +		(match_operand:<VSWAP_WIDTH> 1
> >> "nonimmediate_operand" "<h_con>")
> >> >  		  (parallel [(match_operand:SI 2 "immediate_operand")])))
> >> >  	   (match_operand:VDQHS 3 "register_operand" "w"))
> >> >  	 (match_operand:VDQHS 4 "register_operand" "0")))] @@ -1441,7
> >> > +1441,7 @@ (define_insn "aarch64_mla_n<mode>"
> >> >  	(plus:VDQHS
> >> >  	  (mult:VDQHS
> >> >  	    (vec_duplicate:VDQHS
> >> > -	      (match_operand:<VEL> 3 "register_operand" "<h_con>"))
> >> > +	      (match_operand:<VEL> 3 "nonimmediate_operand" "<h_con>"))
> >> >  	    (match_operand:VDQHS 2 "register_operand" "w"))
> >> >  	  (match_operand:VDQHS 1 "register_operand" "0")))]
> >> >   "TARGET_SIMD"
> >> > @@ -1466,7 +1466,7 @@ (define_insn "*aarch64_mls_elt<mode>"
> >> >  	 (mult:VDQHS
> >> >  	   (vec_duplicate:VDQHS
> >> >  	      (vec_select:<VEL>
> >> > -		(match_operand:VDQHS 1 "register_operand" "<h_con>")
> >> > +		(match_operand:VDQHS 1 "nonimmediate_operand"
> >> "<h_con>")
> >> >  		  (parallel [(match_operand:SI 2 "immediate_operand")])))
> >> >  	   (match_operand:VDQHS 3 "register_operand" "w"))))]
> >> >   "TARGET_SIMD"
> >> > @@ -1484,7 +1484,7 @@ (define_insn
> >> "*aarch64_mls_elt_<vswap_width_name><mode>"
> >> >  	 (mult:VDQHS
> >> >  	   (vec_duplicate:VDQHS
> >> >  	      (vec_select:<VEL>
> >> > -		(match_operand:<VSWAP_WIDTH> 1 "register_operand"
> >> "<h_con>")
> >> > +		(match_operand:<VSWAP_WIDTH> 1
> >> "nonimmediate_operand" "<h_con>")
> >> >  		  (parallel [(match_operand:SI 2 "immediate_operand")])))
> >> >  	   (match_operand:VDQHS 3 "register_operand" "w"))))]
> >> >   "TARGET_SIMD"
> >> > @@ -1501,7 +1501,7 @@ (define_insn "aarch64_mls_n<mode>"
> >> >  	  (match_operand:VDQHS 1 "register_operand" "0")
> >> >  	  (mult:VDQHS
> >> >  	    (vec_duplicate:VDQHS
> >> > -	      (match_operand:<VEL> 3 "register_operand" "<h_con>"))
> >> > +	      (match_operand:<VEL> 3 "nonimmediate_operand" "<h_con>"))
> >> >  	    (match_operand:VDQHS 2 "register_operand" "w"))))]
> >> >    "TARGET_SIMD"
> >> >    "mls\t%0.<Vtype>, %2.<Vtype>, %3.<Vetype>[0]"
> >> > @@ -2882,7 +2882,7 @@ (define_insn "*aarch64_fma4_elt<mode>"
> >> >      (fma:VDQF
> >> >        (vec_duplicate:VDQF
> >> >  	(vec_select:<VEL>
> >> > -	  (match_operand:VDQF 1 "register_operand" "<h_con>")
> >> > +	  (match_operand:VDQF 1 "nonimmediate_operand" "<h_con>")
> >> >  	  (parallel [(match_operand:SI 2 "immediate_operand")])))
> >> >        (match_operand:VDQF 3 "register_operand" "w")
> >> >        (match_operand:VDQF 4 "register_operand" "0")))] @@ -2899,7
> >> > +2899,7 @@ (define_insn
> >> "*aarch64_fma4_elt_<vswap_width_name><mode>"
> >> >      (fma:VDQSF
> >> >        (vec_duplicate:VDQSF
> >> >  	(vec_select:<VEL>
> >> > -	  (match_operand:<VSWAP_WIDTH> 1 "register_operand"
> >> "<h_con>")
> >> > +	  (match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand"
> >> "<h_con>")
> >> >  	  (parallel [(match_operand:SI 2 "immediate_operand")])))
> >> >        (match_operand:VDQSF 3 "register_operand" "w")
> >> >        (match_operand:VDQSF 4 "register_operand" "0")))] @@ -2915,7
> >> > +2915,7 @@ (define_insn "*aarch64_fma4_elt_from_dup<mode>"
> >> >    [(set (match_operand:VMUL 0 "register_operand" "=w")
> >> >      (fma:VMUL
> >> >        (vec_duplicate:VMUL
> >> > -	  (match_operand:<VEL> 1 "register_operand" "<h_con>"))
> >> > +	  (match_operand:<VEL> 1 "nonimmediate_operand" "<h_con>"))
> >> >        (match_operand:VMUL 2 "register_operand" "w")
> >> >        (match_operand:VMUL 3 "register_operand" "0")))]
> >> >    "TARGET_SIMD"
> >> > @@ -2927,7 +2927,7 @@ (define_insn "*aarch64_fma4_elt_to_64v2df"
> >> >    [(set (match_operand:DF 0 "register_operand" "=w")
> >> >      (fma:DF
> >> >  	(vec_select:DF
> >> > -	  (match_operand:V2DF 1 "register_operand" "w")
> >> > +	  (match_operand:V2DF 1 "nonimmediate_operand" "w")
> >> >  	  (parallel [(match_operand:SI 2 "immediate_operand")]))
> >> >        (match_operand:DF 3 "register_operand" "w")
> >> >        (match_operand:DF 4 "register_operand" "0")))] @@ -2957,7
> >> > +2957,7 @@ (define_insn "*aarch64_fnma4_elt<mode>"
> >> >          (match_operand:VDQF 3 "register_operand" "w"))
> >> >        (vec_duplicate:VDQF
> >> >  	(vec_select:<VEL>
> >> > -	  (match_operand:VDQF 1 "register_operand" "<h_con>")
> >> > +	  (match_operand:VDQF 1 "nonimmediate_operand" "<h_con>")
> >> >  	  (parallel [(match_operand:SI 2 "immediate_operand")])))
> >> >        (match_operand:VDQF 4 "register_operand" "0")))]
> >> >    "TARGET_SIMD"
> >> > @@ -2975,7 +2975,7 @@ (define_insn
> >> "*aarch64_fnma4_elt_<vswap_width_name><mode>"
> >> >          (match_operand:VDQSF 3 "register_operand" "w"))
> >> >        (vec_duplicate:VDQSF
> >> >  	(vec_select:<VEL>
> >> > -	  (match_operand:<VSWAP_WIDTH> 1 "register_operand"
> >> "<h_con>")
> >> > +	  (match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand"
> >> "<h_con>")
> >> >  	  (parallel [(match_operand:SI 2 "immediate_operand")])))
> >> >        (match_operand:VDQSF 4 "register_operand" "0")))]
> >> >    "TARGET_SIMD"
> >> > @@ -2992,7 +2992,7 @@ (define_insn
> >> "*aarch64_fnma4_elt_from_dup<mode>"
> >> >        (neg:VMUL
> >> >          (match_operand:VMUL 2 "register_operand" "w"))
> >> >        (vec_duplicate:VMUL
> >> > -	(match_operand:<VEL> 1 "register_operand" "<h_con>"))
> >> > +	(match_operand:<VEL> 1 "nonimmediate_operand" "<h_con>"))
> >> >        (match_operand:VMUL 3 "register_operand" "0")))]
> >> >    "TARGET_SIMD"
> >> >    "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
> >> > @@ -3003,7 +3003,7 @@ (define_insn "*aarch64_fnma4_elt_to_64v2df"
> >> >    [(set (match_operand:DF 0 "register_operand" "=w")
> >> >      (fma:DF
> >> >        (vec_select:DF
> >> > -	(match_operand:V2DF 1 "register_operand" "w")
> >> > +	(match_operand:V2DF 1 "nonimmediate_operand" "w")
> >> >  	(parallel [(match_operand:SI 2 "immediate_operand")]))
> >> >        (neg:DF
> >> >          (match_operand:DF 3 "register_operand" "w")) @@ -4934,7
> >> > +4934,7 @@ (define_insn
> >> "*aarch64_mulx_elt_<vswap_width_name><mode>"
> >> >  	 [(match_operand:VDQSF 1 "register_operand" "w")
> >> >  	  (vec_duplicate:VDQSF
> >> >  	   (vec_select:<VEL>
> >> > -	    (match_operand:<VSWAP_WIDTH> 2 "register_operand" "w")
> >> > +	    (match_operand:<VSWAP_WIDTH> 2 "nonimmediate_operand"
> >> "w")
> >> >  	    (parallel [(match_operand:SI 3 "immediate_operand" "i")])))]
> >> >  	 UNSPEC_FMULX))]
> >> >    "TARGET_SIMD"
> >> > @@ -4953,7 +4953,7 @@ (define_insn "*aarch64_mulx_elt<mode>"
> >> >  	 [(match_operand:VDQF 1 "register_operand" "w")
> >> >  	  (vec_duplicate:VDQF
> >> >  	   (vec_select:<VEL>
> >> > -	    (match_operand:VDQF 2 "register_operand" "w")
> >> > +	    (match_operand:VDQF 2 "nonimmediate_operand" "w")
> >> >  	    (parallel [(match_operand:SI 3 "immediate_operand" "i")])))]
> >> >  	 UNSPEC_FMULX))]
> >> >    "TARGET_SIMD"
> >> > @@ -4971,7 +4971,7 @@ (define_insn
> >> "*aarch64_mulx_elt_from_dup<mode>"
> >> >  	(unspec:VHSDF
> >> >  	 [(match_operand:VHSDF 1 "register_operand" "w")
> >> >  	  (vec_duplicate:VHSDF
> >> > -	    (match_operand:<VEL> 2 "register_operand" "<h_con>"))]
> >> > +	    (match_operand:<VEL> 2 "nonimmediate_operand" "<h_con>"))]
> >> >  	 UNSPEC_FMULX))]
> >> >    "TARGET_SIMD"
> >> >    "fmulx\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"; @@ -4987,7
> >> +4987,7
> >> > @@ (define_insn "*aarch64_vgetfmulx<mode>"
> >> >  	(unspec:<VEL>
> >> >  	 [(match_operand:<VEL> 1 "register_operand" "w")
> >> >  	  (vec_select:<VEL>
> >> > -	   (match_operand:VDQF 2 "register_operand" "w")
> >> > +	   (match_operand:VDQF 2 "nonimmediate_operand" "w")
> >> >  	    (parallel [(match_operand:SI 3 "immediate_operand" "i")]))]
> >> >  	 UNSPEC_FMULX))]
> >> >    "TARGET_SIMD"
> >> > @@ -6768,7 +6768,7 @@ (define_insn "*sqrt<mode>2"
> >> >  (define_insn "aarch64_simd_ld2<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_2Q 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_2Q [
> >> > -	  (match_operand:VSTRUCT_2Q 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  (match_operand:VSTRUCT_2Q 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD2))]
> >> >    "TARGET_SIMD"
> >> >    "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
> >> > @@ -6778,7 +6778,7 @@ (define_insn "aarch64_simd_ld2<vstruct_elt>"
> >> >  (define_insn "aarch64_simd_ld2r<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_2QD [
> >> > -	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
> >> > +	  (match_operand:BLK 1 "memory_operand" "Utv")]
> >> >            UNSPEC_LD2_DUP))]
> >> >    "TARGET_SIMD"
> >> >    "ld2r\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
> >> > @@ -6788,7 +6788,7 @@ (define_insn "aarch64_simd_ld2r<vstruct_elt>"
> >> >  (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_2QD [
> >> > -		(match_operand:BLK 1 "aarch64_simd_struct_operand"
> >> "Utv")
> >> > +		(match_operand:BLK 1 "memory_operand" "Utv")
> >> >  		(match_operand:VSTRUCT_2QD 2 "register_operand" "0")
> >> >  		(match_operand:SI 3 "immediate_operand" "i")]
> >> >  		UNSPEC_LD2_LANE))]
> >> > @@ -6804,7 +6804,7 @@ (define_insn
> >> "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
> >> >  (define_expand "vec_load_lanes<mode><vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_2Q 0 "register_operand")
> >> >  	(unspec:VSTRUCT_2Q [
> >> > -		(match_operand:VSTRUCT_2Q 1
> >> "aarch64_simd_struct_operand")]
> >> > +		(match_operand:VSTRUCT_2Q 1 "memory_operand")]
> >> >  		UNSPEC_LD2))]
> >> >    "TARGET_SIMD"
> >> >  {
> >> > @@ -6822,7 +6822,7 @@ (define_expand
> >> "vec_load_lanes<mode><vstruct_elt>"
> >> >  })
> >> >
> >> >  (define_insn "aarch64_simd_st2<vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_2Q 0 "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_2Q 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_2Q [
> >> >  		(match_operand:VSTRUCT_2Q 1 "register_operand" "w")]
> >> >                  UNSPEC_ST2))]
> >> > @@ -6833,7 +6833,7 @@ (define_insn "aarch64_simd_st2<vstruct_elt>"
> >> >
> >> >  ;; RTL uses GCC vector extension indices, so flip only for assembly.
> >> >  (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
> >> > -  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
> >> > +  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
> >> >  	(unspec:BLK [(match_operand:VSTRUCT_2QD 1 "register_operand"
> >> "w")
> >> >  		     (match_operand:SI 2 "immediate_operand" "i")]
> >> >  		     UNSPEC_ST2_LANE))]
> >> > @@ -6847,7 +6847,7 @@ (define_insn
> >> "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
> >> >  )
> >> >
> >> >  (define_expand "vec_store_lanes<mode><vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_2Q 0 "aarch64_simd_struct_operand")
> >> > +  [(set (match_operand:VSTRUCT_2Q 0 "memory_operand")
> >> >  	(unspec:VSTRUCT_2Q [(match_operand:VSTRUCT_2Q 1
> >> "register_operand")]
> >> >                     UNSPEC_ST2))]
> >> >    "TARGET_SIMD"
> >> > @@ -6868,7 +6868,7 @@ (define_expand
> >> "vec_store_lanes<mode><vstruct_elt>"
> >> >  (define_insn "aarch64_simd_ld3<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_3Q 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_3Q [
> >> > -	  (match_operand:VSTRUCT_3Q 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  (match_operand:VSTRUCT_3Q 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD3))]
> >> >    "TARGET_SIMD"
> >> >    "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
> >> > @@ -6878,7 +6878,7 @@ (define_insn "aarch64_simd_ld3<vstruct_elt>"
> >> >  (define_insn "aarch64_simd_ld3r<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_3QD [
> >> > -	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
> >> > +	  (match_operand:BLK 1 "memory_operand" "Utv")]
> >> >            UNSPEC_LD3_DUP))]
> >> >    "TARGET_SIMD"
> >> >    "ld3r\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
> >> > @@ -6888,7 +6888,7 @@ (define_insn "aarch64_simd_ld3r<vstruct_elt>"
> >> >  (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_3QD [
> >> > -		(match_operand:BLK 1 "aarch64_simd_struct_operand"
> >> "Utv")
> >> > +		(match_operand:BLK 1 "memory_operand" "Utv")
> >> >  		(match_operand:VSTRUCT_3QD 2 "register_operand" "0")
> >> >  		(match_operand:SI 3 "immediate_operand" "i")]
> >> >  		UNSPEC_LD3_LANE))]
> >> > @@ -6904,7 +6904,7 @@ (define_insn
> >> "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
> >> >  (define_expand "vec_load_lanes<mode><vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_3Q 0 "register_operand")
> >> >  	(unspec:VSTRUCT_3Q [
> >> > -		(match_operand:VSTRUCT_3Q 1
> >> "aarch64_simd_struct_operand")]
> >> > +		(match_operand:VSTRUCT_3Q 1 "memory_operand")]
> >> >  		UNSPEC_LD3))]
> >> >    "TARGET_SIMD"
> >> >  {
> >> > @@ -6922,7 +6922,7 @@ (define_expand
> >> "vec_load_lanes<mode><vstruct_elt>"
> >> >  })
> >> >
> >> >  (define_insn "aarch64_simd_st3<vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_3Q 0 "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_3Q 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_3Q [(match_operand:VSTRUCT_3Q 1
> >> "register_operand" "w")]
> >> >                     UNSPEC_ST3))]
> >> >    "TARGET_SIMD"
> >> > @@ -6932,7 +6932,7 @@ (define_insn "aarch64_simd_st3<vstruct_elt>"
> >> >
> >> >  ;; RTL uses GCC vector extension indices, so flip only for assembly.
> >> >  (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
> >> > -  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
> >> > +  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
> >> >  	(unspec:BLK [(match_operand:VSTRUCT_3QD 1 "register_operand"
> >> "w")
> >> >  		     (match_operand:SI 2 "immediate_operand" "i")]
> >> >  		     UNSPEC_ST3_LANE))]
> >> > @@ -6946,7 +6946,7 @@ (define_insn
> >> "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
> >> >  )
> >> >
> >> >  (define_expand "vec_store_lanes<mode><vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_3Q 0 "aarch64_simd_struct_operand")
> >> > +  [(set (match_operand:VSTRUCT_3Q 0 "memory_operand")
> >> >  	(unspec:VSTRUCT_3Q [
> >> >  		(match_operand:VSTRUCT_3Q 1 "register_operand")]
> >> >                  UNSPEC_ST3))]
> >> > @@ -6968,7 +6968,7 @@ (define_expand
> >> "vec_store_lanes<mode><vstruct_elt>"
> >> >  (define_insn "aarch64_simd_ld4<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_4Q 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_4Q [
> >> > -	  (match_operand:VSTRUCT_4Q 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  (match_operand:VSTRUCT_4Q 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD4))]
> >> >    "TARGET_SIMD"
> >> >    "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
> >> > @@ -6978,7 +6978,7 @@ (define_insn "aarch64_simd_ld4<vstruct_elt>"
> >> >  (define_insn "aarch64_simd_ld4r<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_4QD [
> >> > -	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
> >> > +	  (match_operand:BLK 1 "memory_operand" "Utv")]
> >> >            UNSPEC_LD4_DUP))]
> >> >    "TARGET_SIMD"
> >> >    "ld4r\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
> >> > @@ -6988,7 +6988,7 @@ (define_insn "aarch64_simd_ld4r<vstruct_elt>"
> >> >  (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_4QD [
> >> > -		(match_operand:BLK 1 "aarch64_simd_struct_operand"
> >> "Utv")
> >> > +		(match_operand:BLK 1 "memory_operand" "Utv")
> >> >  		(match_operand:VSTRUCT_4QD 2 "register_operand" "0")
> >> >  		(match_operand:SI 3 "immediate_operand" "i")]
> >> >  		UNSPEC_LD4_LANE))]
> >> > @@ -7004,7 +7004,7 @@ (define_insn
> >> "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
> >> >  (define_expand "vec_load_lanes<mode><vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_4Q 0 "register_operand")
> >> >  	(unspec:VSTRUCT_4Q [
> >> > -		(match_operand:VSTRUCT_4Q 1
> >> "aarch64_simd_struct_operand")]
> >> > +		(match_operand:VSTRUCT_4Q 1 "memory_operand")]
> >> >  		UNSPEC_LD4))]
> >> >    "TARGET_SIMD"
> >> >  {
> >> > @@ -7022,7 +7022,7 @@ (define_expand
> >> "vec_load_lanes<mode><vstruct_elt>"
> >> >  })
> >> >
> >> >  (define_insn "aarch64_simd_st4<vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_4Q 0 "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_4Q 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_4Q [
> >> >  		(match_operand:VSTRUCT_4Q 1 "register_operand" "w")]
> >> >                  UNSPEC_ST4))]
> >> > @@ -7033,7 +7033,7 @@ (define_insn "aarch64_simd_st4<vstruct_elt>"
> >> >
> >> >  ;; RTL uses GCC vector extension indices, so flip only for assembly.
> >> >  (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
> >> > -  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
> >> > +  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
> >> >  	(unspec:BLK [(match_operand:VSTRUCT_4QD 1 "register_operand"
> >> "w")
> >> >  		     (match_operand:SI 2 "immediate_operand" "i")]
> >> >  		     UNSPEC_ST4_LANE))]
> >> > @@ -7047,7 +7047,7 @@ (define_insn
> >> "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
> >> >  )
> >> >
> >> >  (define_expand "vec_store_lanes<mode><vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_4Q 0 "aarch64_simd_struct_operand")
> >> > +  [(set (match_operand:VSTRUCT_4Q 0 "memory_operand")
> >> >  	(unspec:VSTRUCT_4Q [(match_operand:VSTRUCT_4Q 1
> >> "register_operand")]
> >> >                     UNSPEC_ST4))]
> >> >    "TARGET_SIMD"
> >> > @@ -7138,7 +7138,7 @@ (define_expand "aarch64_ld1x3<vstruct_elt>"
> >> >  (define_insn "aarch64_ld1_x3_<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
> >> >          (unspec:VSTRUCT_3QD
> >> > -	  [(match_operand:VSTRUCT_3QD 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  [(match_operand:VSTRUCT_3QD 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD1))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
> >> > @@ -7158,7 +7158,7 @@ (define_expand "aarch64_ld1x4<vstruct_elt>"
> >> >  (define_insn "aarch64_ld1_x4_<vstruct_elt>"
> >> >    [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_4QD
> >> > -	  [(match_operand:VSTRUCT_4QD 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  [(match_operand:VSTRUCT_4QD 1 "memory_operand" "Utv")]
> >> >  	UNSPEC_LD1))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
> >> > @@ -7176,7 +7176,7 @@ (define_expand "aarch64_st1x2<vstruct_elt>"
> >> >  })
> >> >
> >> >  (define_insn "aarch64_st1_x2_<vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_2QD 0 "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_2QD 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_2QD
> >> >  		[(match_operand:VSTRUCT_2QD 1 "register_operand" "w")]
> >> >  		UNSPEC_ST1))]
> >> > @@ -7196,7 +7196,7 @@ (define_expand "aarch64_st1x3<vstruct_elt>"
> >> >  })
> >> >
> >> >  (define_insn "aarch64_st1_x3_<vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_3QD 0 "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_3QD 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_3QD
> >> >  		[(match_operand:VSTRUCT_3QD 1 "register_operand" "w")]
> >> >  		UNSPEC_ST1))]
> >> > @@ -7216,7 +7216,7 @@ (define_expand "aarch64_st1x4<vstruct_elt>"
> >> >  })
> >> >
> >> >  (define_insn "aarch64_st1_x4_<vstruct_elt>"
> >> > -  [(set (match_operand:VSTRUCT_4QD 0 "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_4QD 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_4QD
> >> >  		[(match_operand:VSTRUCT_4QD 1 "register_operand" "w")]
> >> >  		UNSPEC_ST1))]
> >> > @@ -7268,7 +7268,7 @@ (define_insn "*aarch64_movv8di"
> >> >  (define_insn "aarch64_be_ld1<mode>"
> >> >    [(set (match_operand:VALLDI_F16 0	"register_operand" "=w")
> >> >  	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1
> >> > -			     "aarch64_simd_struct_operand" "Utv")]
> >> > +			     "memory_operand" "Utv")]
> >> >  	UNSPEC_LD1))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%0<Vmtype>}, %1"
> >> > @@ -7276,7 +7276,7 @@ (define_insn "aarch64_be_ld1<mode>"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_be_st1<mode>"
> >> > -  [(set (match_operand:VALLDI_F16 0 "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VALLDI_F16 0 "memory_operand" "=Utv")
> >> >  	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1
> >> "register_operand" "w")]
> >> >  	UNSPEC_ST1))]
> >> >    "TARGET_SIMD"
> >> > @@ -7551,7 +7551,7 @@ (define_expand
> >> "aarch64_ld<nregs>r<vstruct_elt>"
> >> >  (define_insn "aarch64_ld2<vstruct_elt>_dreg"
> >> >    [(set (match_operand:VSTRUCT_2DNX 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_2DNX [
> >> > -	  (match_operand:VSTRUCT_2DNX 1
> >> "aarch64_simd_struct_operand" "Utv")]
> >> > +	  (match_operand:VSTRUCT_2DNX 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD2_DREG))]
> >> >    "TARGET_SIMD"
> >> >    "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
> >> > @@ -7561,7 +7561,7 @@ (define_insn "aarch64_ld2<vstruct_elt>_dreg"
> >> >  (define_insn "aarch64_ld2<vstruct_elt>_dreg"
> >> >    [(set (match_operand:VSTRUCT_2DX 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_2DX [
> >> > -	  (match_operand:VSTRUCT_2DX 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  (match_operand:VSTRUCT_2DX 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD2_DREG))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%S0.1d - %T0.1d}, %1"
> >> > @@ -7571,7 +7571,7 @@ (define_insn "aarch64_ld2<vstruct_elt>_dreg"
> >> >  (define_insn "aarch64_ld3<vstruct_elt>_dreg"
> >> >    [(set (match_operand:VSTRUCT_3DNX 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_3DNX [
> >> > -	  (match_operand:VSTRUCT_3DNX 1
> >> "aarch64_simd_struct_operand" "Utv")]
> >> > +	  (match_operand:VSTRUCT_3DNX 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD3_DREG))]
> >> >    "TARGET_SIMD"
> >> >    "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
> >> > @@ -7581,7 +7581,7 @@ (define_insn "aarch64_ld3<vstruct_elt>_dreg"
> >> >  (define_insn "aarch64_ld3<vstruct_elt>_dreg"
> >> >    [(set (match_operand:VSTRUCT_3DX 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_3DX [
> >> > -	  (match_operand:VSTRUCT_3DX 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  (match_operand:VSTRUCT_3DX 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD3_DREG))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%S0.1d - %U0.1d}, %1"
> >> > @@ -7591,7 +7591,7 @@ (define_insn "aarch64_ld3<vstruct_elt>_dreg"
> >> >  (define_insn "aarch64_ld4<vstruct_elt>_dreg"
> >> >    [(set (match_operand:VSTRUCT_4DNX 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_4DNX [
> >> > -	  (match_operand:VSTRUCT_4DNX 1
> >> "aarch64_simd_struct_operand" "Utv")]
> >> > +	  (match_operand:VSTRUCT_4DNX 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD4_DREG))]
> >> >    "TARGET_SIMD"
> >> >    "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
> >> > @@ -7601,7 +7601,7 @@ (define_insn "aarch64_ld4<vstruct_elt>_dreg"
> >> >  (define_insn "aarch64_ld4<vstruct_elt>_dreg"
> >> >    [(set (match_operand:VSTRUCT_4DX 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_4DX [
> >> > -	  (match_operand:VSTRUCT_4DX 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	  (match_operand:VSTRUCT_4DX 1 "memory_operand" "Utv")]
> >> >  	  UNSPEC_LD4_DREG))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%S0.1d - %V0.1d}, %1"
> >> > @@ -7841,7 +7841,7 @@ (define_insn
> >> "aarch64_rev<REVERSE:rev_op><mode>"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_st2<vstruct_elt>_dreg"
> >> > -  [(set (match_operand:VSTRUCT_2DNX 0
> >> "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_2DNX 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_2DNX [
> >> >  		(match_operand:VSTRUCT_2DNX 1 "register_operand" "w")]
> >> >  		UNSPEC_ST2))]
> >> > @@ -7851,7 +7851,7 @@ (define_insn "aarch64_st2<vstruct_elt>_dreg"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_st2<vstruct_elt>_dreg"
> >> > -  [(set (match_operand:VSTRUCT_2DX 0 "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_2DX 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_2DX [
> >> >  		(match_operand:VSTRUCT_2DX 1 "register_operand" "w")]
> >> >  		UNSPEC_ST2))]
> >> > @@ -7861,7 +7861,7 @@ (define_insn "aarch64_st2<vstruct_elt>_dreg"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_st3<vstruct_elt>_dreg"
> >> > -  [(set (match_operand:VSTRUCT_3DNX 0
> >> "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_3DNX 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_3DNX [
> >> >  		(match_operand:VSTRUCT_3DNX 1 "register_operand" "w")]
> >> >  		UNSPEC_ST3))]
> >> > @@ -7871,7 +7871,7 @@ (define_insn "aarch64_st3<vstruct_elt>_dreg"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_st3<vstruct_elt>_dreg"
> >> > -  [(set (match_operand:VSTRUCT_3DX 0 "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_3DX 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_3DX [
> >> >  		(match_operand:VSTRUCT_3DX 1 "register_operand" "w")]
> >> >  		UNSPEC_ST3))]
> >> > @@ -7881,7 +7881,7 @@ (define_insn "aarch64_st3<vstruct_elt>_dreg"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_st4<vstruct_elt>_dreg"
> >> > -  [(set (match_operand:VSTRUCT_4DNX 0
> >> "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_4DNX 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_4DNX [
> >> >  		(match_operand:VSTRUCT_4DNX 1 "register_operand" "w")]
> >> >  		UNSPEC_ST4))]
> >> > @@ -7891,7 +7891,7 @@ (define_insn "aarch64_st4<vstruct_elt>_dreg"
> >> >  )
> >> >
> >> >  (define_insn "aarch64_st4<vstruct_elt>_dreg"
> >> > -  [(set (match_operand:VSTRUCT_4DX 0 "aarch64_simd_struct_operand"
> >> > "=Utv")
> >> > +  [(set (match_operand:VSTRUCT_4DX 0 "memory_operand" "=Utv")
> >> >  	(unspec:VSTRUCT_4DX [
> >> >  		(match_operand:VSTRUCT_4DX 1 "register_operand" "w")]
> >> >  		UNSPEC_ST4))]
> >> > @@ -7974,7 +7974,7 @@ (define_expand "vec_init<mode><Vhalf>"
> >> >  (define_insn "*aarch64_simd_ld1r<mode>"
> >> >    [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> >> >  	(vec_duplicate:VALL_F16
> >> > -	  (match_operand:<VEL> 1 "aarch64_simd_struct_operand" "Utv")))]
> >> > +	  (match_operand:<VEL> 1 "memory_operand" "Utv")))]
> >> >    "TARGET_SIMD"
> >> >    "ld1r\\t{%0.<Vtype>}, %1"
> >> >    [(set_attr "type" "neon_load1_all_lanes")] @@ -7983,7 +7983,7 @@
> >> > (define_insn "*aarch64_simd_ld1r<mode>"
> >> >  (define_insn "aarch64_simd_ld1<vstruct_elt>_x2"
> >> >    [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
> >> >  	(unspec:VSTRUCT_2QD [
> >> > -	    (match_operand:VSTRUCT_2QD 1 "aarch64_simd_struct_operand"
> >> "Utv")]
> >> > +	    (match_operand:VSTRUCT_2QD 1 "memory_operand" "Utv")]
> >> >  	    UNSPEC_LD1))]
> >> >    "TARGET_SIMD"
> >> >    "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
> >> > diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> >> > index c308015ac2c13d24cd6bcec71247ec45df8cf5e6..6b70a364530c8108457091bfec12fe549f722149 100644
> >> > --- a/gcc/config/aarch64/predicates.md
> >> > +++ b/gcc/config/aarch64/predicates.md
> >> > @@ -494,10 +494,6 @@ (define_predicate
> >> "aarch64_simd_reg_or_minus_one"
> >> >    (ior (match_operand 0 "register_operand")
> >> >         (match_operand 0 "aarch64_simd_imm_minus_one")))
> >> >
> >> > -(define_predicate "aarch64_simd_struct_operand"
> >> > -  (and (match_code "mem")
> >> > -       (match_test "TARGET_SIMD && aarch64_simd_mem_operand_p
> >> (op)")))
> >> > -
> >> >  ;; Like general_operand but allow only valid SIMD addressing modes.
> >> >  (define_predicate "aarch64_simd_general_operand"
> >> >    (and (match_operand 0 "general_operand")
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/vld1r.c b/gcc/testsuite/gcc.target/aarch64/vld1r.c
> >> > new file mode 100644
> >> > index 0000000000000000000000000000000000000000..72c505c403e9e239771379b7cadd8a9473f06113
> >> > --- /dev/null
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/vld1r.c
> >> > @@ -0,0 +1,26 @@
> >> > +/* { dg-do compile } */
> >> > +/* { dg-additional-options "-O" } */
> >> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> >> > +} */
> >> > +
> >> > +#include <arm_neon.h>
> >> > +
> >> > +/*
> >> > +** f1:
> >> > +** 	add	x0, x0, 1
> >> > +** 	ld1r	{v0.8b}, \[x0\]
> >> > +** 	ret
> >> > +*/
> >> > +uint8x8_t f1(const uint8_t *in) {
> >> > +    return vld1_dup_u8(&in[1]);
> >> > +}
> >> > +
> >> > +/*
> >> > +** f2:
> >> > +** 	ldr	s1, \[x0, 4\]
> >> > +** 	fmla	v0.4s, v0.4s, v1.s\[0\]
> >> > +** 	ret
> >> > +*/
> >> > +float32x4_t f2(const float32_t *in, float32x4_t a) {
> >> > +    float32x4_t dup = vld1q_dup_f32(&in[1]);
> >> > +    return vfmaq_laneq_f32 (a, a, dup, 1); }
>
  
Richard Sandiford June 13, 2022, 8:26 a.m. UTC | #8
Richard Biener <rguenther@suse.de> writes:
> On Wed, 8 Jun 2022, Richard Sandiford wrote:
>> Tamar Christina <Tamar.Christina@arm.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford <richard.sandiford@arm.com>
>> >> Sent: Wednesday, June 8, 2022 11:31 AM
>> >> To: Tamar Christina <Tamar.Christina@arm.com>
>> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> >> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
>> >> instructions
>> >> 
>> >> Tamar Christina <tamar.christina@arm.com> writes:
>> >> > Hi All,
>> >> >
>> >> > At some point in time we started lowering the ld1r instructions in gimple.
>> >> >
>> >> > That is:
>> >> >
>> >> > uint8x8_t f1(const uint8_t *in) {
>> >> >     return vld1_dup_u8(&in[1]);
>> >> > }
>> >> >
>> >> > generates at gimple:
>> >> >
>> >> >   _3 = MEM[(const uint8_t *)in_1(D) + 1B];
>> >> >   _4 = {_3, _3, _3, _3, _3, _3, _3, _3};
>> >> >
>> >> > Which is good, but we then generate:
>> >> >
>> >> > f1:
>> >> > 	ldr     b0, [x0, 1]
>> >> > 	dup     v0.8b, v0.b[0]
>> >> > 	ret
>> >> >
>> >> > instead of ld1r.
>> >> >
>> >> > The reason for this is because the load instructions have a too
>> >> > restrictive predicate on them which causes combine not to be able to
>> >> > combine the instructions due to the predicate only accepting simple
>> >> addressing modes.
>> >> >
>> >> > This patch relaxes the predicate to accept any memory operand and
>> >> > relies on LRA to legitimize the address when it needs to as the
>> >> > constraint still only allows the simple addressing mode.  Reload is
>> >> > always able to legitimize to these.
>> >> >
>> >> > Secondly since we are now actually generating more ld1r it became
>> >> > clear that the lane instructions suffer from a similar issue.
>> >> >
>> >> > i.e.
>> >> >
>> >> > float32x4_t f2(const float32_t *in, float32x4_t a) {
>> >> >     float32x4_t dup = vld1q_dup_f32(&in[1]);
>> >> >     return vfmaq_laneq_f32 (a, a, dup, 1); }
>> >> >
>> >> > would generate ld1r + vector fmla instead of ldr + lane fmla.
>> >> >
>> >> > The reason for this is similar to the ld1r issue.  The predicate is
>> >> > too restrictive in only accepting register operands but not memory.
>> >> >
>> >> > This relaxes it to accept register and/or memory while leaving the
>> >> > constraint to only accept registers.  This will have LRA generate a
>> >> > reload if needed forcing the memory to registers using the standard
>> >> patterns.
>> >> >
>> >> > These two changes allow combine and reload to generate the right
>> >> sequences.
>> >> >
>> >> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>> >> 
>> >> This is going against the general direction of travel, which is to make the
>> >> instruction's predicates and conditions enforce the constraints as much as
>> >> possible (making optimistic assumptions about pseudo registers).
>> >> 
>> >> The RA *can* deal with things like:
>> >> 
>> >>   (match_operand:M N "general_operand" "r")
>> >> 
>> >> but it's best avoided, for a few reasons:
>> >> 
>> >> (1) The fix-up will be done in LRA, so IRA will not see the temporary
>> >>     registers.  This can make the allocation of those temporaries
>> >>     suboptimal but (more importantly) it might require other
>> >>     previously-allocated registers to be spilled late due to the
>> >>     unexpected increase in register pressure.
>> >> 
>> >> (2) It ends up hiding instructions from the pre-RA optimisers.
>> >> 
>> >> (3) It can also prevent combine opportunities (as well as create them),
>> >>     unless the loose predicates in an insn I are propagated to all
>> >>     patterns that might result from combining I with something else.
>> >> 
>> >> It sounds like the first problem (not generating ld1r) could be fixed by (a)
>> >> combining aarch64_simd_dup<mode> and *aarch64_simd_ld1r<mode>, so
>> >> that the register and memory alternatives are in the same pattern and (b)
>> >> using the merged instruction(s) to implement the vec_duplicate optab.
>> >> Target-independent code should then make the address satisfy the
>> >> predicate, simplifying the address where necessary.
>> >> 
>> >
>> > I think I am likely missing something here. I would assume that you wanted
>> > to use the optab to split the addressing off from the mem expression so the
>> > combined insn matches.
>> >
>> > But in that case, why do you need to combine the two instructions?
>> > I've tried and it doesn't work since the vec_duplicate optab doesn't see the
>> > mem as op1, because in gimple the mem is not part of the duplicate.
>> >
>> > So you still just see:
>> >
>> >>>> dbgrtx (ops[1].value)
>> > (subreg/s/v:QI (reg:SI 92 [ _3 ]) 0)
>> >
>> > As the operand, since the argument to the dup is just an SSA_NAME.
>> 
>> Ah, yeah, I'd forgotten that fixed-length vec_duplicates would
>> come from a constructor rather than a vec_duplicate_expr, so we don't
>> get the usual benefit of folding single-use mems during expand.
>> 
>> https://gcc.gnu.org/pipermail/gcc-patches/2022-May/595362.html
>> moves towards using vec_duplicate even for fixed-length vectors.
>> If we take that approach, then I suppose a plain constructor
>> should be folded to a vec_duplicate where possible.
>> 
>> (Alternatively, we could use an extended vec_perm_expr with
>> scalar inputs, as Richi suggested in that thread.)
>> 
>> If we don't do that, or don't do it yet, then…
>
> I suppose since we already have vec_duplicate we can just use it ...
> what was the reason to not do this originally?

There just wasn't any specific benefit for fixed-length vectors at the
time, and there were obvious potential problems: introducing
VEC_DUPLICATE_EXPRs too early would lose out on existing CONSTRUCTOR-based folds.

Also, isel didn't exist at the time that vec_duplicate was added, but it
seems like it might be a good place to do the replacement.
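
As a rough sketch (names and signatures quoted from memory, so treat
the exact APIs as illustrative rather than authoritative), the
isel-time replacement could look something like:

  /* Sketch only: replace a uniform CONSTRUCTOR by a VEC_DUPLICATE_EXPR
     when the target implements the vec_duplicate optab.  */
  static void
  maybe_use_vec_duplicate (gimple_stmt_iterator *gsi)
  {
    gassign *stmt = dyn_cast <gassign *> (gsi_stmt (*gsi));
    if (!stmt || gimple_assign_rhs_code (stmt) != CONSTRUCTOR)
      return;

    tree rhs = gimple_assign_rhs1 (stmt);
    tree type = TREE_TYPE (rhs);
    /* uniform_vector_p returns the repeated element or NULL_TREE.  */
    tree elt = uniform_vector_p (rhs);
    if (!elt
        || !VECTOR_TYPE_P (type)
        || (optab_handler (vec_duplicate_optab, TYPE_MODE (type))
            == CODE_FOR_nothing))
      return;

    gimple_assign_set_rhs_from_tree (gsi, build1 (VEC_DUPLICATE_EXPR,
                                                  type, elt));
    update_stmt (gsi_stmt (*gsi));
  }

with the per-statement walk left to the existing isel machinery.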

Match rules that want to test for a uniform vector operand can already
use vec_same_elem_p to handle all representations, but perhaps we also
need a way of generating the “right” form of duplicate for the current
stage in the pass pipeline?
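
E.g. (sketch only; if I remember correctly build_vector_from_val
already does most of this, returning a VECTOR_CST for constant
elements and otherwise a CONSTRUCTOR, or a VEC_DUPLICATE_EXPR for
VLA vectors):

  /* Sketch: build a uniform vector of TYPE from ELT, picking the
     representation by pipeline position so that early
     CONSTRUCTOR-based folds still apply.  */
  static tree
  build_uniform_vector_for_stage (tree type, tree elt, bool post_isel)
  {
    if (post_isel)
      return build1 (VEC_DUPLICATE_EXPR, type, elt);
    return build_vector_from_val (type, elt);
  }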

> I suppose the
> vec_duplicate expander has a fallback via store_constructor?
>
> Originally I wanted to avoid multiple ways to express the same thing
> but vec_duplicate is a common enough special-case and it also
> usually maps to a special instruction in vector ISAs.
> There's VIEW_CONVERT vs. vec_duplicate for V1m modes then, I
> suppose VIEW_CONVERT is more canonical here.

Is that already true for V1m constructors?  (view_convert being
canonical and constructors not, I mean.)

What do you think about the suggestion in the other thread of making
VEC_PERM_EXPR take an arbitrary number of inputs, with (as you suggested)
the inputs allowed to be scalars rather than vectors?  VEC_PERM_EXPR
could then replace both CONSTRUCTOR and VEC_DUPLICATE_EXPR and “optimising”
a normal constructor to a duplicate would just be a case of removing
repeated scalar inputs.
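
Concretely, with invented syntax (today's VEC_PERM_EXPR is strictly
ternary, so none of this is valid as-is), the constructor

  _4 = {_3, _3, _3, _3};

would become a generalised permute of scalar inputs:

  _4 = VEC_PERM_EXPR <_3, _3, _3, _3, { 0, 1, 2, 3 }>;

and removing the repeated inputs would leave the duplicate itself:

  _4 = VEC_PERM_EXPR <_3, { 0, 0, 0, 0 }>;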

Thanks,
Richard
  
Richard Biener June 13, 2022, 8:38 a.m. UTC | #9
On Mon, 13 Jun 2022, Richard Sandiford wrote:

> Richard Biener <rguenther@suse.de> writes:
> > On Wed, 8 Jun 2022, Richard Sandiford wrote:
> >> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> >> -----Original Message-----
> >> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> >> Sent: Wednesday, June 8, 2022 11:31 AM
> >> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> >> >> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
> >> >> instructions
> >> >> 
> >> >> Tamar Christina <tamar.christina@arm.com> writes:
> >> >> > Hi All,
> >> >> >
> >> >> > At some point in time we started lowering the ld1r instructions in gimple.
> >> >> >
> >> >> > That is:
> >> >> >
> >> >> > uint8x8_t f1(const uint8_t *in) {
> >> >> >     return vld1_dup_u8(&in[1]);
> >> >> > }
> >> >> >
> >> >> > generates at gimple:
> >> >> >
> >> >> >   _3 = MEM[(const uint8_t *)in_1(D) + 1B];
> >> >> >   _4 = {_3, _3, _3, _3, _3, _3, _3, _3};
> >> >> >
> >> >> > Which is good, but we then generate:
> >> >> >
> >> >> > f1:
> >> >> > 	ldr     b0, [x0, 1]
> >> >> > 	dup     v0.8b, v0.b[0]
> >> >> > 	ret
> >> >> >
> >> >> > instead of ld1r.
> >> >> >
> >> >> > The reason for this is because the load instructions have a too
> >> >> > restrictive predicate on them which causes combine not to be able to
> >> >> > combine the instructions due to the predicate only accepting simple
> >> >> addressing modes.
> >> >> >
> >> >> > This patch relaxes the predicate to accept any memory operand and
> >> >> > relies on LRA to legitimize the address when it needs to as the
> >> >> > constraint still only allows the simple addressing mode.  Reload is
> >> >> > always able to legitimize to these.
> >> >> >
> >> >> > Secondly since we are now actually generating more ld1r it became
> >> >> > clear that the lane instructions suffer from a similar issue.
> >> >> >
> >> >> > i.e.
> >> >> >
> >> >> > float32x4_t f2(const float32_t *in, float32x4_t a) {
> >> >> >     float32x4_t dup = vld1q_dup_f32(&in[1]);
> >> >> >     return vfmaq_laneq_f32 (a, a, dup, 1); }
> >> >> >
> >> >> > would generate ld1r + vector fmla instead of ldr + lane fmla.
> >> >> >
> >> >> > The reason for this is similar to the ld1r issue.  The predicate is
> >> >> > too restrictive in only acception register operands but not memory.
> >> >> >
> >> >> > This relaxes it to accept register and/or memory while leaving the
> >> >> > constraint to only accept registers.  This will have LRA generate a
> >> >> > reload if needed forcing the memory to registers using the standard
> >> >> patterns.
> >> >> >
> >> >> > These two changes allow combine and reload to generate the right
> >> >> sequences.
> >> >> >
> >> >> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >> >> 
> >> >> This is going against the general direction of travel, which is to make the
> >> >> instruction's predicates and conditions enforce the constraints as much as
> >> >> possible (making optimistic assumptions about pseudo registers).
> >> >> 
> >> >> The RA *can* deal with things like:
> >> >> 
> >> >>   (match_operand:M N "general_operand" "r")
> >> >> 
> >> >> but it's best avoided, for a few reasons:
> >> >> 
> >> >> (1) The fix-up will be done in LRA, so IRA will not see the temporary
> >> >>     registers.  This can make the allocation of those temporaries
> >> >>     suboptimal but (more importantly) it might require other
> >> >>     previously-allocated registers to be spilled late due to the
> >> >>     unexpected increase in register pressure.
> >> >> 
> >> >> (2) It ends up hiding instructions from the pre-RA optimisers.
> >> >> 
> >> >> (3) It can also prevent combine opportunities (as well as create them),
> >> >>     unless the loose predicates in an insn I are propagated to all
> >> >>     patterns that might result from combining I with something else.
> >> >> 
> >> >> It sounds like the first problem (not generating ld1r) could be fixed by (a)
> >> >> combining aarch64_simd_dup<mode> and *aarch64_simd_ld1r<mode>, so
> >> >> that the register and memory alternatives are in the same pattern and (b)
> >> >> using the merged instruction(s) to implement the vec_duplicate optab.
> >> >> Target-independent code should then make the address satisfy the
> >> >> predicate, simplifying the address where necessary.
> >> >> 
> >> >
> >> > I think I am likely missing something here. I would assume that you wanted
> >> > to use the optab to split the addressing off from the mem expression so the
> >> > combined insn matches.
> >> >
> >> > But in that case, why do you need to combine the two instructions?
> >> > I've tried and it doesn't work since the vec_duplicate optab doesn't see the
> >> > mem as op1, because in gimple the mem is not part of the duplicate.
> >> >
> >> > So you still just see:
> >> >
> >> >>>> dbgrtx (ops[1].value)
> >> > (subreg/s/v:QI (reg:SI 92 [ _3 ]) 0)
> >> >
> >> > As the operand, since the argument to the dup is just an SSA_NAME.
> >> 
> >> Ah, yeah, I'd forgotten that fixed-length vec_duplicates would
> >> come from a constructor rather than a vec_duplicate_expr, so we don't
> >> get the usual benefit of folding single-use mems during expand.
> >> 
> >> https://gcc.gnu.org/pipermail/gcc-patches/2022-May/595362.html
> >> moves towards using vec_duplicate even for fixed-length vectors.
> >> If we take that approach, then I suppose a plain constructor
> >> should be folded to a vec_duplicate where possible.
> >> 
> >> (Alternatively, we could use an extended vec_perm_expr with
> >> scalar inputs, as Richi suggested in that thread.)
> >> 
> >> If we don't do that, or don't do it yet, then…
> >
> > I suppose since we already have vec_duplicate we can just use it ...
> > what was the reason to not do this originally?
> 
> There just wasn't any specific benefit for fixed-length vectors at the
> time, and obvious potential problems -- introducing VEC_DUPLICATE_EXPRs
> too early would lose out on existing CONSTRUCTOR-based folds.
> 
> Also, isel didn't exist at the time that vec_duplicate was added, but it
> seems like it might be a good place to do the replacement.
> 
> Match rules that want to test for a uniform vector operand can already
> use vec_same_elem_p to handle all representations, but perhaps we also
> need a way of generating the “right” form of duplicate for the current
> stage in the pass pipeline?

I think we can have vec_duplicate without native target support by
expanding via CONSTRUCTOR, so vec_duplicate would be the correct
form at all stages and we'd fix things up during RTL expansion directly.
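
Roughly (sketch only; IIRC expand_vector_broadcast already tries
vec_duplicate_optab first and then a vec_init-based fallback):

  /* Sketch: expand a VEC_DUPLICATE_EXPR rhs without requiring native
     target support; ELT is the already-expanded scalar.  */
  static rtx
  expand_vec_duplicate (tree type, rtx elt)
  {
    machine_mode vmode = TYPE_MODE (type);
    rtx res = expand_vector_broadcast (vmode, elt);
    /* A store_constructor-style element-by-element fallback would go
       here; for the sketch just assert one of the optabs existed.  */
    gcc_assert (res != NULL_RTX);
    return res;
  }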

As you noted, most targets don't implement vec_duplicate yet.

> > I suppose the
> > vec_duplicate expander has a fallback via store_constructor?
> >
> > Originally I wanted to avoid multiple ways to express the same thing
> > but vec_duplicate is a common enough special-case and it also
> > usually maps to a special instruction in vector ISAs.
> > There's VIEW_CONVERT vs. vec_duplicate for V1m modes then, I
> > suppose VIEW_CONVERT is more canonical here.
> 
> Is that already true for V1m constructors?  (view_convert being
> canonical and constructors not, I mean.)

I think so, yes.

> What do you think about the suggestion in the other thread of making
> VEC_PERM_EXPR take an arbitrary number of inputs, with (as you suggested)
> the inputs allowed to be scalars rather than vectors?  VEC_PERM_EXPR
> could then replace both CONSTRUCTOR and VEC_DUPLICATE_EXPR and “optimising”
> a normal constructor to a duplicate would just be a case of removing
> repeated scalar inputs.

It's indeed somewhat appealing to make VEC_PERM a Swiss army knife.
I'm not sure about making it a variable-length tree though; currently
it's a nice GIMPLE ternary, while a variable-length form would make it
a SINGLE RHS with a GENERIC tree (unless we introduce a special
gimple_vec_perm gimple node).  That said, allowing scalars as VEC_PERM
inputs only to get rid of VEC_DUPLICATE will still leave us with the
VIEW_CONVERT special case.

At some point we might want to help targets with "interesting"
ISAs by lowering VEC_PERM to supported .VEC_PERM_CONSTs and
relaxing what permutes we allow earlier in the pipeline
(I'm thinking of x86 with its many special permutation ops
and the open-coded vec-perm-const expander).
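
For example (invented syntax, since there's no .VEC_PERM_CONST
internal function today), a two-input permute like

  _5 = VEC_PERM_EXPR <a_1, b_2, { 0, 9, 2, 11 }>;

could be lowered for a target that only has single-input shuffles
plus a blend to something like:

  _6 = .VEC_PERM_CONST (a_1, a_1, { 0, 0, 2, 2 });
  _7 = .VEC_PERM_CONST (b_2, b_2, { 1, 1, 3, 3 });
  _5 = .VEC_PERM_CONST (_6, _7, { 0, 5, 2, 7 });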

So I'm not sure what to do, but I'm happy to widen VEC_DUPLICATE_EXPR use.

Richard.
  
Tamar Christina June 13, 2022, 9:51 a.m. UTC | #10
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Monday, June 13, 2022 9:38 AM
> To: Richard Sandiford <Richard.Sandiford@arm.com>
> Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org;
> nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; roger@eyesopen.com
> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
> instructions
> 
> On Mon, 13 Jun 2022, Richard Sandiford wrote:
> 
> > Richard Biener <rguenther@suse.de> writes:
> > > On Wed, 8 Jun 2022, Richard Sandiford wrote:
> > >> Tamar Christina <Tamar.Christina@arm.com> writes:
> > >> >> -----Original Message-----
> > >> >> From: Richard Sandiford <richard.sandiford@arm.com>
> > >> >> Sent: Wednesday, June 8, 2022 11:31 AM
> > >> >> To: Tamar Christina <Tamar.Christina@arm.com>
> > >> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> > >> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > >> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > >> >> <Kyrylo.Tkachov@arm.com>
> > >> >> Subject: Re: [PATCH]AArch64 relax predicate on load structure
> > >> >> load instructions
> > >> >>
> > >> >> Tamar Christina <tamar.christina@arm.com> writes:
> > >> >> > Hi All,
> > >> >> >
> > >> >> > At some point in time we started lowering the ld1r instructions in
> gimple.
> > >> >> >
> > >> >> > That is:
> > >> >> >
> > >> >> > uint8x8_t f1(const uint8_t *in) {
> > >> >> >     return vld1_dup_u8(&in[1]); }
> > >> >> >
> > >> >> > generates at gimple:
> > >> >> >
> > >> >> >   _3 = MEM[(const uint8_t *)in_1(D) + 1B];
> > >> >> >   _4 = {_3, _3, _3, _3, _3, _3, _3, _3};
> > >> >> >
> > >> >> > Which is good, but we then generate:
> > >> >> >
> > >> >> > f1:
> > >> >> > 	ldr     b0, [x0, 1]
> > >> >> > 	dup     v0.8b, v0.b[0]
> > >> >> > 	ret
> > >> >> >
> > >> >> > instead of ld1r.
> > >> >> >
> > >> >> > The reason for this is because the load instructions have a
> > >> >> > too restrictive predicate on them which causes combine not to
> > >> >> > be able to combine the instructions due to the predicate only
> > >> >> > accepting simple
> > >> >> addressing modes.
> > >> >> >
> > >> >> > This patch relaxes the predicate to accept any memory operand
> > >> >> > and relies on LRA to legitimize the address when it needs to
> > >> >> > as the constraint still only allows the simple addressing
> > >> >> > mode.  Reload is always able to legitimize to these.
> > >> >> >
> > >> >> > Secondly since we are now actually generating more ld1r it
> > >> >> > became clear that the lane instructions suffer from a similar issue.
> > >> >> >
> > >> >> > i.e.
> > >> >> >
> > >> >> > float32x4_t f2(const float32_t *in, float32x4_t a) {
> > >> >> >     float32x4_t dup = vld1q_dup_f32(&in[1]);
> > >> >> >     return vfmaq_laneq_f32 (a, a, dup, 1); }
> > >> >> >
> > >> >> > would generate ld1r + vector fmla instead of ldr + lane fmla.
> > >> >> >
> > >> >> > The reason for this is similar to the ld1r issue.  The
> > >> >> > predicate is too restrictive in only accepting register operands but
> not memory.
> > >> >> >
> > >> >> > This relaxes it to accept register and/or memory while leaving
> > >> >> > the constraint to only accept registers.  This will have LRA
> > >> >> > generate a reload if needed forcing the memory to registers
> > >> >> > using the standard
> > >> >> patterns.
> > >> >> >
> > >> >> > These two changes allow combine and reload to generate the
> > >> >> > right
> > >> >> sequences.
> > >> >> >
> > >> >> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >> >>
> > >> >> This is going against the general direction of travel, which is
> > >> >> to make the instruction's predicates and conditions enforce the
> > >> >> constraints as much as possible (making optimistic assumptions
> about pseudo registers).
> > >> >>
> > >> >> The RA *can* deal with things like:
> > >> >>
> > >> >>   (match_operand:M N "general_operand" "r")
> > >> >>
> > >> >> but it's best avoided, for a few reasons:
> > >> >>
> > >> >> (1) The fix-up will be done in LRA, so IRA will not see the temporary
> > >> >>     registers.  This can make the allocation of those temporaries
> > >> >>     suboptimal but (more importantly) it might require other
> > >> >>     previously-allocated registers to be spilled late due to the
> > >> >>     unexpected increase in register pressure.
> > >> >>
> > >> >> (2) It ends up hiding instructions from the pre-RA optimisers.
> > >> >>
> > >> >> (3) It can also prevent combine opportunities (as well as create
> them),
> > >> >>     unless the loose predicates in an insn I are propagated to all
> > >> >>     patterns that might result from combining I with something else.
> > >> >>
> > >> >> It sounds like the first problem (not generating ld1r) could be
> > >> >> fixed by (a) combining aarch64_simd_dup<mode> and
> > >> >> *aarch64_simd_ld1r<mode>, so that the register and memory
> > >> >> alternatives are in the same pattern and (b) using the merged
> instruction(s) to implement the vec_duplicate optab.
> > >> >> Target-independent code should then make the address satisfy the
> > >> >> predicate, simplifying the address where necessary.
> > >> >>
> > >> >
> > >> > I think I am likely missing something here. I would assume that
> > >> > you wanted to use the optab to split the addressing off from the
> > >> > mem expression so the combined insn matches.
> > >> >
> > >> > But in that case, why do you need to combine the two instructions?
> > >> > I've tried and it doesn't work since the vec_duplicate optab
> > >> > doesn't see the mem as op1, because in gimple the mem is not part of
> the duplicate.
> > >> >
> > >> > So you still just see:
> > >> >
> > >> >>>> dbgrtx (ops[1].value)
> > >> > (subreg/s/v:QI (reg:SI 92 [ _3 ]) 0)
> > >> >
> > >> > As the operand, since the argument to the dup is just an SSA_NAME.
> > >>
> > >> Ah, yeah, I'd forgotten that fixed-length vec_duplicates would come
> > >> from a constructor rather than a vec_duplicate_expr, so we don't
> > >> get the usual benefit of folding single-use mems during expand.
> > >>
> > >> https://gcc.gnu.org/pipermail/gcc-patches/2022-May/595362.html
> > >> moves towards using vec_duplicate even for fixed-length vectors.
> > >> If we take that approach, then I suppose a plain constructor should
> > >> be folded to a vec_duplicate where possible.
> > >>
> > >> (Alternatively, we could use an extended vec_perm_expr with scalar
> > >> inputs, as Richi suggested in that thread.)
> > >>
> > >> If we don't do that, or don't do it yet, then…
> > >
> > > I suppose since we already have vec_duplicate we can just use it ...
> > > what was the reason to not do this originally?
> >
> > There just wasn't any specific benefit for fixed-length vectors at the
> > time, and there were obvious potential problems -- introducing
> > VEC_DUPLICATE_EXPRs too early would lose out on existing
> > CONSTRUCTOR-based folds.
> >
> > Also, isel didn't exist at the time that vec_duplicate was added, but
> > it seems like it might be a good place to do the replacement.
> >
> > Match rules that want to test for a uniform vector operand can already
> > use vec_same_elem_p to handle all representations, but perhaps we also
> > need a way of generating the “right” form of duplicate for the current
> > stage in the pass pipeline?
> 
> I think we can have vec_duplicate without native target support by
> expanding via CONSTRUCTOR, so vec_duplicate would be the correct one at
> all stages and we'd fix up during RTL expansion directly.
> 
> As you noted most targets don't implement vec_duplicate yet.
> 
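[The RTL-expansion fallback described above might look roughly like
this -- an editor's sketch, not committed code.  expand_vector_broadcast
is the real helper in GCC's optabs.cc; the surrounding function and its
name are hypothetical:

/* Expand a VEC_DUPLICATE_EXPR of ELT into TARGET (or a new pseudo) of
   vector type VECTYPE, whether or not the target defines a
   vec_duplicate<mode> pattern.  */
static rtx
sketch_expand_vec_duplicate (tree vectype, tree elt, rtx target)
{
  machine_mode vmode = TYPE_MODE (vectype);
  rtx op = expand_normal (elt);
  /* expand_vector_broadcast tries the vec_duplicate optab first and
     otherwise builds the vector element by element via vec_init,
     i.e. the CONSTRUCTOR-style path; it returns NULL_RTX if the
     target supports neither.  */
  rtx ret = expand_vector_broadcast (vmode, op);
  if (ret && target)
    {
      emit_move_insn (target, ret);
      return target;
    }
  return ret;
}
]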
> > > I suppose the
> > > vec_duplicate expander has a fallback via store_constructor?
> > >
> > > Originally I wanted to avoid multiple ways to express the same thing
> > > but vec_duplicate is a common enough special-case and it also
> > > usually maps to a special instruction in vector ISAs.
> > > There's VIEW_CONVERT vs. vec_duplicate for V1m modes then, I suppose
> > > VIEW_CONVERT is more canonical here.
> >
> > Is that already true for V1m constructors?  (view_convert being
> > canonical and constructors not, I mean.)
> 
> I think so, yes.
> 
> > What do you think about the suggestion in the other thread of making
> > VEC_PERM_EXPR take an arbitrary number of inputs, with (as you
> > suggested) the inputs allowed to be scalars rather than vectors?
> > > VEC_PERM_EXPR could then replace both CONSTRUCTOR and
> > > VEC_DUPLICATE_EXPR, and “optimising” a normal constructor to a
> > > duplicate would just be a case of removing repeated scalar inputs.
> 
> It's indeed somewhat appealing to make VEC_PERM a Swiss army knife.
> I'm not sure about making it a VL tree though; currently it's a nice GIMPLE
> ternary while VL would make it a SINGLE RHS with a GENERIC tree (unless we
> introduce a gimple_vec_perm special gimple node).  That said, allowing
> scalars as VEC_PERM inputs to get rid of VEC_DUPLICATE only will still leave
> us with the VIEW_CONVERT special case.
> 
> At some point we might want to help targets with "interesting"
> ISAs by lowering VEC_PERM to supported .VEC_PERM_CONSTs and relaxing
> what permutes we allow earlier in the pipeline (I'm thinking of x86 with its
> many special permutation ops and the open-coded vec-perm-const
> expander).
> 
> So not sure what to do, but I'm happy to widen VEC_DUPLICATE_EXPR use.

Just to check, this means detect VEC_DUPLICATE_EXPR during isel and convert
the CONSTRUCTOR to it?

Cheers,
Tamar

> 
> Richard.
  
Richard Biener June 13, 2022, 11:50 a.m. UTC | #11
On Mon, 13 Jun 2022, Tamar Christina wrote:

> [full quote of the preceding thread trimmed -- it repeats the exchange
> shown above verbatim]
> Just to check, this means detect VEC_DUPLICATE_EXPR during isel and convert
> the CONSTRUCTOR to it?

Hmm, that's a possibility, but I had thought of using VEC_DUPLICATE_EXPR
from the start (build_vector_from_val, some CTOR-to-duplicate folding,
etc.).  Clearly, detecting VEC_DUPLICATE_EXPR opportunities at ISEL time
would be the least intrusive option in case we want to resort to further
IL streamlining during this stage1.  So yes, this should work to fix your
optimization problem.

Richard.
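
[For concreteness, the isel-time conversion discussed here might look
roughly like the sketch below -- an editor's illustration, not the
committed fix.  The helper name is hypothetical; uniform_vector_p,
vec_duplicate_optab and the gimple accessors are existing GCC-internal
APIs (tree.h, fold-const.h, gimple.h, gimple-iterator.h,
optabs-query.h):

/* If GSI is an assignment whose RHS is a CONSTRUCTOR with all-equal
   elements, rewrite it to a VEC_DUPLICATE_EXPR, provided the target
   can expand vec_duplicate natively.  Returns true on change.  */
static bool
sketch_fold_ctor_to_vec_duplicate (gimple_stmt_iterator *gsi)
{
  gassign *stmt = dyn_cast <gassign *> (gsi_stmt (*gsi));
  if (!stmt || gimple_assign_rhs_code (stmt) != CONSTRUCTOR)
    return false;

  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
  if (!VECTOR_TYPE_P (type)
      || optab_handler (vec_duplicate_optab, TYPE_MODE (type))
         == CODE_FOR_nothing)
    return false;

  /* uniform_vector_p returns the repeated element if every element
     of the vector value is the same, and NULL_TREE otherwise.  */
  tree elt = uniform_vector_p (gimple_assign_rhs1 (stmt));
  if (!elt)
    return false;

  gimple_assign_set_rhs_with_ops (gsi, VEC_DUPLICATE_EXPR, elt);
  update_stmt (gsi_stmt (*gsi));
  return true;
}
]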
  

Patch

--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -712,7 +712,7 @@  (define_insn "mul_lane<mode>3"
        (mult:VMULD
 	 (vec_duplicate:VMULD
 	   (vec_select:<VEL>
-	     (match_operand:<VCOND> 2 "register_operand" "<h_con>")
+	     (match_operand:<VCOND> 2 "nonimmediate_operand" "<h_con>")
 	     (parallel [(match_operand:SI 3 "immediate_operand" "i")])))
 	 (match_operand:VMULD 1 "register_operand" "w")))]
   "TARGET_SIMD"
@@ -728,7 +728,7 @@  (define_insn "mul_laneq<mode>3"
      (mult:VMUL
        (vec_duplicate:VMUL
 	  (vec_select:<VEL>
-	    (match_operand:<VCONQ> 2 "register_operand" "<h_con>")
+	    (match_operand:<VCONQ> 2 "nonimmediate_operand" "<h_con>")
 	    (parallel [(match_operand:SI 3 "immediate_operand")])))
       (match_operand:VMUL 1 "register_operand" "w")))]
   "TARGET_SIMD"
@@ -743,7 +743,7 @@  (define_insn "mul_n<mode>3"
  [(set (match_operand:VMUL 0 "register_operand" "=w")
        (mult:VMUL
 	 (vec_duplicate:VMUL
-	   (match_operand:<VEL> 2 "register_operand" "<h_con>"))
+	   (match_operand:<VEL> 2 "nonimmediate_operand" "<h_con>"))
 	 (match_operand:VMUL 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]";
@@ -789,7 +789,7 @@  (define_insn "*aarch64_mul3_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
      (mult:DF
        (vec_select:DF
-	 (match_operand:V2DF 1 "register_operand" "w")
+	 (match_operand:V2DF 1 "nonimmediate_operand" "w")
 	 (parallel [(match_operand:SI 2 "immediate_operand")]))
        (match_operand:DF 3 "register_operand" "w")))]
   "TARGET_SIMD"
@@ -1406,7 +1406,7 @@  (define_insn "*aarch64_mla_elt<mode>"
 	 (mult:VDQHS
 	   (vec_duplicate:VDQHS
 	      (vec_select:<VEL>
-		(match_operand:VDQHS 1 "register_operand" "<h_con>")
+		(match_operand:VDQHS 1 "nonimmediate_operand" "<h_con>")
 		  (parallel [(match_operand:SI 2 "immediate_operand")])))
 	   (match_operand:VDQHS 3 "register_operand" "w"))
 	 (match_operand:VDQHS 4 "register_operand" "0")))]
@@ -1424,7 +1424,7 @@  (define_insn "*aarch64_mla_elt_<vswap_width_name><mode>"
 	 (mult:VDQHS
 	   (vec_duplicate:VDQHS
 	      (vec_select:<VEL>
-		(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+		(match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand" "<h_con>")
 		  (parallel [(match_operand:SI 2 "immediate_operand")])))
 	   (match_operand:VDQHS 3 "register_operand" "w"))
 	 (match_operand:VDQHS 4 "register_operand" "0")))]
@@ -1441,7 +1441,7 @@  (define_insn "aarch64_mla_n<mode>"
 	(plus:VDQHS
 	  (mult:VDQHS
 	    (vec_duplicate:VDQHS
-	      (match_operand:<VEL> 3 "register_operand" "<h_con>"))
+	      (match_operand:<VEL> 3 "nonimmediate_operand" "<h_con>"))
 	    (match_operand:VDQHS 2 "register_operand" "w"))
 	  (match_operand:VDQHS 1 "register_operand" "0")))]
  "TARGET_SIMD"
@@ -1466,7 +1466,7 @@  (define_insn "*aarch64_mls_elt<mode>"
 	 (mult:VDQHS
 	   (vec_duplicate:VDQHS
 	      (vec_select:<VEL>
-		(match_operand:VDQHS 1 "register_operand" "<h_con>")
+		(match_operand:VDQHS 1 "nonimmediate_operand" "<h_con>")
 		  (parallel [(match_operand:SI 2 "immediate_operand")])))
 	   (match_operand:VDQHS 3 "register_operand" "w"))))]
  "TARGET_SIMD"
@@ -1484,7 +1484,7 @@  (define_insn "*aarch64_mls_elt_<vswap_width_name><mode>"
 	 (mult:VDQHS
 	   (vec_duplicate:VDQHS
 	      (vec_select:<VEL>
-		(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+		(match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand" "<h_con>")
 		  (parallel [(match_operand:SI 2 "immediate_operand")])))
 	   (match_operand:VDQHS 3 "register_operand" "w"))))]
  "TARGET_SIMD"
@@ -1501,7 +1501,7 @@  (define_insn "aarch64_mls_n<mode>"
 	  (match_operand:VDQHS 1 "register_operand" "0")
 	  (mult:VDQHS
 	    (vec_duplicate:VDQHS
-	      (match_operand:<VEL> 3 "register_operand" "<h_con>"))
+	      (match_operand:<VEL> 3 "nonimmediate_operand" "<h_con>"))
 	    (match_operand:VDQHS 2 "register_operand" "w"))))]
   "TARGET_SIMD"
   "mls\t%0.<Vtype>, %2.<Vtype>, %3.<Vetype>[0]"
@@ -2882,7 +2882,7 @@  (define_insn "*aarch64_fma4_elt<mode>"
     (fma:VDQF
       (vec_duplicate:VDQF
 	(vec_select:<VEL>
-	  (match_operand:VDQF 1 "register_operand" "<h_con>")
+	  (match_operand:VDQF 1 "nonimmediate_operand" "<h_con>")
 	  (parallel [(match_operand:SI 2 "immediate_operand")])))
       (match_operand:VDQF 3 "register_operand" "w")
       (match_operand:VDQF 4 "register_operand" "0")))]
@@ -2899,7 +2899,7 @@  (define_insn "*aarch64_fma4_elt_<vswap_width_name><mode>"
     (fma:VDQSF
       (vec_duplicate:VDQSF
 	(vec_select:<VEL>
-	  (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+	  (match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand" "<h_con>")
 	  (parallel [(match_operand:SI 2 "immediate_operand")])))
       (match_operand:VDQSF 3 "register_operand" "w")
       (match_operand:VDQSF 4 "register_operand" "0")))]
@@ -2915,7 +2915,7 @@  (define_insn "*aarch64_fma4_elt_from_dup<mode>"
   [(set (match_operand:VMUL 0 "register_operand" "=w")
     (fma:VMUL
       (vec_duplicate:VMUL
-	  (match_operand:<VEL> 1 "register_operand" "<h_con>"))
+	  (match_operand:<VEL> 1 "nonimmediate_operand" "<h_con>"))
       (match_operand:VMUL 2 "register_operand" "w")
       (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
@@ -2927,7 +2927,7 @@  (define_insn "*aarch64_fma4_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
     (fma:DF
 	(vec_select:DF
-	  (match_operand:V2DF 1 "register_operand" "w")
+	  (match_operand:V2DF 1 "nonimmediate_operand" "w")
 	  (parallel [(match_operand:SI 2 "immediate_operand")]))
       (match_operand:DF 3 "register_operand" "w")
       (match_operand:DF 4 "register_operand" "0")))]
@@ -2957,7 +2957,7 @@  (define_insn "*aarch64_fnma4_elt<mode>"
         (match_operand:VDQF 3 "register_operand" "w"))
       (vec_duplicate:VDQF
 	(vec_select:<VEL>
-	  (match_operand:VDQF 1 "register_operand" "<h_con>")
+	  (match_operand:VDQF 1 "nonimmediate_operand" "<h_con>")
 	  (parallel [(match_operand:SI 2 "immediate_operand")])))
       (match_operand:VDQF 4 "register_operand" "0")))]
   "TARGET_SIMD"
@@ -2975,7 +2975,7 @@  (define_insn "*aarch64_fnma4_elt_<vswap_width_name><mode>"
         (match_operand:VDQSF 3 "register_operand" "w"))
       (vec_duplicate:VDQSF
 	(vec_select:<VEL>
-	  (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+	  (match_operand:<VSWAP_WIDTH> 1 "nonimmediate_operand" "<h_con>")
 	  (parallel [(match_operand:SI 2 "immediate_operand")])))
       (match_operand:VDQSF 4 "register_operand" "0")))]
   "TARGET_SIMD"
@@ -2992,7 +2992,7 @@  (define_insn "*aarch64_fnma4_elt_from_dup<mode>"
       (neg:VMUL
         (match_operand:VMUL 2 "register_operand" "w"))
       (vec_duplicate:VMUL
-	(match_operand:<VEL> 1 "register_operand" "<h_con>"))
+	(match_operand:<VEL> 1 "nonimmediate_operand" "<h_con>"))
       (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
   "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
@@ -3003,7 +3003,7 @@  (define_insn "*aarch64_fnma4_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
     (fma:DF
       (vec_select:DF
-	(match_operand:V2DF 1 "register_operand" "w")
+	(match_operand:V2DF 1 "nonimmediate_operand" "w")
 	(parallel [(match_operand:SI 2 "immediate_operand")]))
       (neg:DF
         (match_operand:DF 3 "register_operand" "w"))
@@ -4934,7 +4934,7 @@  (define_insn "*aarch64_mulx_elt_<vswap_width_name><mode>"
 	 [(match_operand:VDQSF 1 "register_operand" "w")
 	  (vec_duplicate:VDQSF
 	   (vec_select:<VEL>
-	    (match_operand:<VSWAP_WIDTH> 2 "register_operand" "w")
+	    (match_operand:<VSWAP_WIDTH> 2 "nonimmediate_operand" "w")
 	    (parallel [(match_operand:SI 3 "immediate_operand" "i")])))]
 	 UNSPEC_FMULX))]
   "TARGET_SIMD"
@@ -4953,7 +4953,7 @@  (define_insn "*aarch64_mulx_elt<mode>"
 	 [(match_operand:VDQF 1 "register_operand" "w")
 	  (vec_duplicate:VDQF
 	   (vec_select:<VEL>
-	    (match_operand:VDQF 2 "register_operand" "w")
+	    (match_operand:VDQF 2 "nonimmediate_operand" "w")
 	    (parallel [(match_operand:SI 3 "immediate_operand" "i")])))]
 	 UNSPEC_FMULX))]
   "TARGET_SIMD"
@@ -4971,7 +4971,7 @@  (define_insn "*aarch64_mulx_elt_from_dup<mode>"
 	(unspec:VHSDF
 	 [(match_operand:VHSDF 1 "register_operand" "w")
 	  (vec_duplicate:VHSDF
-	    (match_operand:<VEL> 2 "register_operand" "<h_con>"))]
+	    (match_operand:<VEL> 2 "nonimmediate_operand" "<h_con>"))]
 	 UNSPEC_FMULX))]
   "TARGET_SIMD"
   "fmulx\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]";
@@ -4987,7 +4987,7 @@  (define_insn "*aarch64_vgetfmulx<mode>"
 	(unspec:<VEL>
 	 [(match_operand:<VEL> 1 "register_operand" "w")
 	  (vec_select:<VEL>
-	   (match_operand:VDQF 2 "register_operand" "w")
+	   (match_operand:VDQF 2 "nonimmediate_operand" "w")
 	    (parallel [(match_operand:SI 3 "immediate_operand" "i")]))]
 	 UNSPEC_FMULX))]
   "TARGET_SIMD"
@@ -6768,7 +6768,7 @@  (define_insn "*sqrt<mode>2"
 (define_insn "aarch64_simd_ld2<vstruct_elt>"
   [(set (match_operand:VSTRUCT_2Q 0 "register_operand" "=w")
 	(unspec:VSTRUCT_2Q [
-	  (match_operand:VSTRUCT_2Q 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_2Q 1 "memory_operand" "Utv")]
 	  UNSPEC_LD2))]
   "TARGET_SIMD"
   "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
@@ -6778,7 +6778,7 @@  (define_insn "aarch64_simd_ld2<vstruct_elt>"
 (define_insn "aarch64_simd_ld2r<vstruct_elt>"
   [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_2QD [
-	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:BLK 1 "memory_operand" "Utv")]
           UNSPEC_LD2_DUP))]
   "TARGET_SIMD"
   "ld2r\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
@@ -6788,7 +6788,7 @@  (define_insn "aarch64_simd_ld2r<vstruct_elt>"
 (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
   [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_2QD [
-		(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
+		(match_operand:BLK 1 "memory_operand" "Utv")
 		(match_operand:VSTRUCT_2QD 2 "register_operand" "0")
 		(match_operand:SI 3 "immediate_operand" "i")]
 		UNSPEC_LD2_LANE))]
@@ -6804,7 +6804,7 @@  (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
 (define_expand "vec_load_lanes<mode><vstruct_elt>"
   [(set (match_operand:VSTRUCT_2Q 0 "register_operand")
 	(unspec:VSTRUCT_2Q [
-		(match_operand:VSTRUCT_2Q 1 "aarch64_simd_struct_operand")]
+		(match_operand:VSTRUCT_2Q 1 "memory_operand")]
 		UNSPEC_LD2))]
   "TARGET_SIMD"
 {
@@ -6822,7 +6822,7 @@  (define_expand "vec_load_lanes<mode><vstruct_elt>"
 })
 
 (define_insn "aarch64_simd_st2<vstruct_elt>"
-  [(set (match_operand:VSTRUCT_2Q 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_2Q 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_2Q [
 		(match_operand:VSTRUCT_2Q 1 "register_operand" "w")]
                 UNSPEC_ST2))]
@@ -6833,7 +6833,7 @@  (define_insn "aarch64_simd_st2<vstruct_elt>"
 
 ;; RTL uses GCC vector extension indices, so flip only for assembly.
 (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
-  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
 	(unspec:BLK [(match_operand:VSTRUCT_2QD 1 "register_operand" "w")
 		     (match_operand:SI 2 "immediate_operand" "i")]
 		     UNSPEC_ST2_LANE))]
@@ -6847,7 +6847,7 @@  (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
 )
 
 (define_expand "vec_store_lanes<mode><vstruct_elt>"
-  [(set (match_operand:VSTRUCT_2Q 0 "aarch64_simd_struct_operand")
+  [(set (match_operand:VSTRUCT_2Q 0 "memory_operand")
 	(unspec:VSTRUCT_2Q [(match_operand:VSTRUCT_2Q 1 "register_operand")]
                    UNSPEC_ST2))]
   "TARGET_SIMD"
@@ -6868,7 +6868,7 @@  (define_expand "vec_store_lanes<mode><vstruct_elt>"
 (define_insn "aarch64_simd_ld3<vstruct_elt>"
   [(set (match_operand:VSTRUCT_3Q 0 "register_operand" "=w")
 	(unspec:VSTRUCT_3Q [
-	  (match_operand:VSTRUCT_3Q 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_3Q 1 "memory_operand" "Utv")]
 	  UNSPEC_LD3))]
   "TARGET_SIMD"
   "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
@@ -6878,7 +6878,7 @@  (define_insn "aarch64_simd_ld3<vstruct_elt>"
 (define_insn "aarch64_simd_ld3r<vstruct_elt>"
   [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_3QD [
-	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:BLK 1 "memory_operand" "Utv")]
           UNSPEC_LD3_DUP))]
   "TARGET_SIMD"
   "ld3r\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
@@ -6888,7 +6888,7 @@  (define_insn "aarch64_simd_ld3r<vstruct_elt>"
 (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
   [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_3QD [
-		(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
+		(match_operand:BLK 1 "memory_operand" "Utv")
 		(match_operand:VSTRUCT_3QD 2 "register_operand" "0")
 		(match_operand:SI 3 "immediate_operand" "i")]
 		UNSPEC_LD3_LANE))]
@@ -6904,7 +6904,7 @@  (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
 (define_expand "vec_load_lanes<mode><vstruct_elt>"
   [(set (match_operand:VSTRUCT_3Q 0 "register_operand")
 	(unspec:VSTRUCT_3Q [
-		(match_operand:VSTRUCT_3Q 1 "aarch64_simd_struct_operand")]
+		(match_operand:VSTRUCT_3Q 1 "memory_operand")]
 		UNSPEC_LD3))]
   "TARGET_SIMD"
 {
@@ -6922,7 +6922,7 @@  (define_expand "vec_load_lanes<mode><vstruct_elt>"
 })
 
 (define_insn "aarch64_simd_st3<vstruct_elt>"
-  [(set (match_operand:VSTRUCT_3Q 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_3Q 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_3Q [(match_operand:VSTRUCT_3Q 1 "register_operand" "w")]
                    UNSPEC_ST3))]
   "TARGET_SIMD"
@@ -6932,7 +6932,7 @@  (define_insn "aarch64_simd_st3<vstruct_elt>"
 
 ;; RTL uses GCC vector extension indices, so flip only for assembly.
 (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
-  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
 	(unspec:BLK [(match_operand:VSTRUCT_3QD 1 "register_operand" "w")
 		     (match_operand:SI 2 "immediate_operand" "i")]
 		     UNSPEC_ST3_LANE))]
@@ -6946,7 +6946,7 @@  (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
 )
 
 (define_expand "vec_store_lanes<mode><vstruct_elt>"
-  [(set (match_operand:VSTRUCT_3Q 0 "aarch64_simd_struct_operand")
+  [(set (match_operand:VSTRUCT_3Q 0 "memory_operand")
 	(unspec:VSTRUCT_3Q [
 		(match_operand:VSTRUCT_3Q 1 "register_operand")]
                 UNSPEC_ST3))]
@@ -6968,7 +6968,7 @@  (define_expand "vec_store_lanes<mode><vstruct_elt>"
 (define_insn "aarch64_simd_ld4<vstruct_elt>"
   [(set (match_operand:VSTRUCT_4Q 0 "register_operand" "=w")
 	(unspec:VSTRUCT_4Q [
-	  (match_operand:VSTRUCT_4Q 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_4Q 1 "memory_operand" "Utv")]
 	  UNSPEC_LD4))]
   "TARGET_SIMD"
   "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
@@ -6978,7 +6978,7 @@  (define_insn "aarch64_simd_ld4<vstruct_elt>"
 (define_insn "aarch64_simd_ld4r<vstruct_elt>"
   [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_4QD [
-	  (match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:BLK 1 "memory_operand" "Utv")]
           UNSPEC_LD4_DUP))]
   "TARGET_SIMD"
   "ld4r\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
@@ -6988,7 +6988,7 @@  (define_insn "aarch64_simd_ld4r<vstruct_elt>"
 (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
   [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_4QD [
-		(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
+		(match_operand:BLK 1 "memory_operand" "Utv")
 		(match_operand:VSTRUCT_4QD 2 "register_operand" "0")
 		(match_operand:SI 3 "immediate_operand" "i")]
 		UNSPEC_LD4_LANE))]
@@ -7004,7 +7004,7 @@  (define_insn "aarch64_vec_load_lanes<mode>_lane<vstruct_elt>"
 (define_expand "vec_load_lanes<mode><vstruct_elt>"
   [(set (match_operand:VSTRUCT_4Q 0 "register_operand")
 	(unspec:VSTRUCT_4Q [
-		(match_operand:VSTRUCT_4Q 1 "aarch64_simd_struct_operand")]
+		(match_operand:VSTRUCT_4Q 1 "memory_operand")]
 		UNSPEC_LD4))]
   "TARGET_SIMD"
 {
@@ -7022,7 +7022,7 @@  (define_expand "vec_load_lanes<mode><vstruct_elt>"
 })
 
 (define_insn "aarch64_simd_st4<vstruct_elt>"
-  [(set (match_operand:VSTRUCT_4Q 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_4Q 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_4Q [
 		(match_operand:VSTRUCT_4Q 1 "register_operand" "w")]
                 UNSPEC_ST4))]
@@ -7033,7 +7033,7 @@  (define_insn "aarch64_simd_st4<vstruct_elt>"
 
 ;; RTL uses GCC vector extension indices, so flip only for assembly.
 (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
-  [(set (match_operand:BLK 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:BLK 0 "memory_operand" "=Utv")
 	(unspec:BLK [(match_operand:VSTRUCT_4QD 1 "register_operand" "w")
 		     (match_operand:SI 2 "immediate_operand" "i")]
 		     UNSPEC_ST4_LANE))]
@@ -7047,7 +7047,7 @@  (define_insn "aarch64_vec_store_lanes<mode>_lane<vstruct_elt>"
 )
 
 (define_expand "vec_store_lanes<mode><vstruct_elt>"
-  [(set (match_operand:VSTRUCT_4Q 0 "aarch64_simd_struct_operand")
+  [(set (match_operand:VSTRUCT_4Q 0 "memory_operand")
 	(unspec:VSTRUCT_4Q [(match_operand:VSTRUCT_4Q 1 "register_operand")]
                    UNSPEC_ST4))]
   "TARGET_SIMD"
@@ -7138,7 +7138,7 @@  (define_expand "aarch64_ld1x3<vstruct_elt>"
 (define_insn "aarch64_ld1_x3_<vstruct_elt>"
   [(set (match_operand:VSTRUCT_3QD 0 "register_operand" "=w")
         (unspec:VSTRUCT_3QD
-	  [(match_operand:VSTRUCT_3QD 1 "aarch64_simd_struct_operand" "Utv")]
+	  [(match_operand:VSTRUCT_3QD 1 "memory_operand" "Utv")]
 	  UNSPEC_LD1))]
   "TARGET_SIMD"
   "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
@@ -7158,7 +7158,7 @@  (define_expand "aarch64_ld1x4<vstruct_elt>"
 (define_insn "aarch64_ld1_x4_<vstruct_elt>"
   [(set (match_operand:VSTRUCT_4QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_4QD
-	  [(match_operand:VSTRUCT_4QD 1 "aarch64_simd_struct_operand" "Utv")]
+	  [(match_operand:VSTRUCT_4QD 1 "memory_operand" "Utv")]
 	UNSPEC_LD1))]
   "TARGET_SIMD"
   "ld1\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
@@ -7176,7 +7176,7 @@  (define_expand "aarch64_st1x2<vstruct_elt>"
 })
 
 (define_insn "aarch64_st1_x2_<vstruct_elt>"
-  [(set (match_operand:VSTRUCT_2QD 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_2QD 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_2QD
 		[(match_operand:VSTRUCT_2QD 1 "register_operand" "w")]
 		UNSPEC_ST1))]
@@ -7196,7 +7196,7 @@  (define_expand "aarch64_st1x3<vstruct_elt>"
 })
 
 (define_insn "aarch64_st1_x3_<vstruct_elt>"
-  [(set (match_operand:VSTRUCT_3QD 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_3QD 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_3QD
 		[(match_operand:VSTRUCT_3QD 1 "register_operand" "w")]
 		UNSPEC_ST1))]
@@ -7216,7 +7216,7 @@  (define_expand "aarch64_st1x4<vstruct_elt>"
 })
 
 (define_insn "aarch64_st1_x4_<vstruct_elt>"
-  [(set (match_operand:VSTRUCT_4QD 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_4QD 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_4QD
 		[(match_operand:VSTRUCT_4QD 1 "register_operand" "w")]
 		UNSPEC_ST1))]
@@ -7268,7 +7268,7 @@  (define_insn "*aarch64_movv8di"
 (define_insn "aarch64_be_ld1<mode>"
   [(set (match_operand:VALLDI_F16 0	"register_operand" "=w")
 	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1
-			     "aarch64_simd_struct_operand" "Utv")]
+			     "memory_operand" "Utv")]
 	UNSPEC_LD1))]
   "TARGET_SIMD"
   "ld1\\t{%0<Vmtype>}, %1"
@@ -7276,7 +7276,7 @@  (define_insn "aarch64_be_ld1<mode>"
 )
 
 (define_insn "aarch64_be_st1<mode>"
-  [(set (match_operand:VALLDI_F16 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VALLDI_F16 0 "memory_operand" "=Utv")
 	(unspec:VALLDI_F16 [(match_operand:VALLDI_F16 1 "register_operand" "w")]
 	UNSPEC_ST1))]
   "TARGET_SIMD"
@@ -7551,7 +7551,7 @@  (define_expand "aarch64_ld<nregs>r<vstruct_elt>"
 (define_insn "aarch64_ld2<vstruct_elt>_dreg"
   [(set (match_operand:VSTRUCT_2DNX 0 "register_operand" "=w")
 	(unspec:VSTRUCT_2DNX [
-	  (match_operand:VSTRUCT_2DNX 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_2DNX 1 "memory_operand" "Utv")]
 	  UNSPEC_LD2_DREG))]
   "TARGET_SIMD"
   "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
@@ -7561,7 +7561,7 @@  (define_insn "aarch64_ld2<vstruct_elt>_dreg"
 (define_insn "aarch64_ld2<vstruct_elt>_dreg"
   [(set (match_operand:VSTRUCT_2DX 0 "register_operand" "=w")
 	(unspec:VSTRUCT_2DX [
-	  (match_operand:VSTRUCT_2DX 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_2DX 1 "memory_operand" "Utv")]
 	  UNSPEC_LD2_DREG))]
   "TARGET_SIMD"
   "ld1\\t{%S0.1d - %T0.1d}, %1"
@@ -7571,7 +7571,7 @@  (define_insn "aarch64_ld2<vstruct_elt>_dreg"
 (define_insn "aarch64_ld3<vstruct_elt>_dreg"
   [(set (match_operand:VSTRUCT_3DNX 0 "register_operand" "=w")
 	(unspec:VSTRUCT_3DNX [
-	  (match_operand:VSTRUCT_3DNX 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_3DNX 1 "memory_operand" "Utv")]
 	  UNSPEC_LD3_DREG))]
   "TARGET_SIMD"
   "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
@@ -7581,7 +7581,7 @@  (define_insn "aarch64_ld3<vstruct_elt>_dreg"
 (define_insn "aarch64_ld3<vstruct_elt>_dreg"
   [(set (match_operand:VSTRUCT_3DX 0 "register_operand" "=w")
 	(unspec:VSTRUCT_3DX [
-	  (match_operand:VSTRUCT_3DX 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_3DX 1 "memory_operand" "Utv")]
 	  UNSPEC_LD3_DREG))]
   "TARGET_SIMD"
   "ld1\\t{%S0.1d - %U0.1d}, %1"
@@ -7591,7 +7591,7 @@  (define_insn "aarch64_ld3<vstruct_elt>_dreg"
 (define_insn "aarch64_ld4<vstruct_elt>_dreg"
   [(set (match_operand:VSTRUCT_4DNX 0 "register_operand" "=w")
 	(unspec:VSTRUCT_4DNX [
-	  (match_operand:VSTRUCT_4DNX 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_4DNX 1 "memory_operand" "Utv")]
 	  UNSPEC_LD4_DREG))]
   "TARGET_SIMD"
   "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
@@ -7601,7 +7601,7 @@  (define_insn "aarch64_ld4<vstruct_elt>_dreg"
 (define_insn "aarch64_ld4<vstruct_elt>_dreg"
   [(set (match_operand:VSTRUCT_4DX 0 "register_operand" "=w")
 	(unspec:VSTRUCT_4DX [
-	  (match_operand:VSTRUCT_4DX 1 "aarch64_simd_struct_operand" "Utv")]
+	  (match_operand:VSTRUCT_4DX 1 "memory_operand" "Utv")]
 	  UNSPEC_LD4_DREG))]
   "TARGET_SIMD"
   "ld1\\t{%S0.1d - %V0.1d}, %1"
@@ -7841,7 +7841,7 @@  (define_insn "aarch64_rev<REVERSE:rev_op><mode>"
 )
 
 (define_insn "aarch64_st2<vstruct_elt>_dreg"
-  [(set (match_operand:VSTRUCT_2DNX 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_2DNX 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_2DNX [
 		(match_operand:VSTRUCT_2DNX 1 "register_operand" "w")]
 		UNSPEC_ST2))]
@@ -7851,7 +7851,7 @@  (define_insn "aarch64_st2<vstruct_elt>_dreg"
 )
 
 (define_insn "aarch64_st2<vstruct_elt>_dreg"
-  [(set (match_operand:VSTRUCT_2DX 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_2DX 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_2DX [
 		(match_operand:VSTRUCT_2DX 1 "register_operand" "w")]
 		UNSPEC_ST2))]
@@ -7861,7 +7861,7 @@  (define_insn "aarch64_st2<vstruct_elt>_dreg"
 )
 
 (define_insn "aarch64_st3<vstruct_elt>_dreg"
-  [(set (match_operand:VSTRUCT_3DNX 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_3DNX 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_3DNX [
 		(match_operand:VSTRUCT_3DNX 1 "register_operand" "w")]
 		UNSPEC_ST3))]
@@ -7871,7 +7871,7 @@  (define_insn "aarch64_st3<vstruct_elt>_dreg"
 )
 
 (define_insn "aarch64_st3<vstruct_elt>_dreg"
-  [(set (match_operand:VSTRUCT_3DX 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_3DX 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_3DX [
 		(match_operand:VSTRUCT_3DX 1 "register_operand" "w")]
 		UNSPEC_ST3))]
@@ -7881,7 +7881,7 @@  (define_insn "aarch64_st3<vstruct_elt>_dreg"
 )
 
 (define_insn "aarch64_st4<vstruct_elt>_dreg"
-  [(set (match_operand:VSTRUCT_4DNX 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_4DNX 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_4DNX [
 		(match_operand:VSTRUCT_4DNX 1 "register_operand" "w")]
 		UNSPEC_ST4))]
@@ -7891,7 +7891,7 @@  (define_insn "aarch64_st4<vstruct_elt>_dreg"
 )
 
 (define_insn "aarch64_st4<vstruct_elt>_dreg"
-  [(set (match_operand:VSTRUCT_4DX 0 "aarch64_simd_struct_operand" "=Utv")
+  [(set (match_operand:VSTRUCT_4DX 0 "memory_operand" "=Utv")
 	(unspec:VSTRUCT_4DX [
 		(match_operand:VSTRUCT_4DX 1 "register_operand" "w")]
 		UNSPEC_ST4))]
@@ -7974,7 +7974,7 @@  (define_expand "vec_init<mode><Vhalf>"
 (define_insn "*aarch64_simd_ld1r<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
 	(vec_duplicate:VALL_F16
-	  (match_operand:<VEL> 1 "aarch64_simd_struct_operand" "Utv")))]
+	  (match_operand:<VEL> 1 "memory_operand" "Utv")))]
   "TARGET_SIMD"
   "ld1r\\t{%0.<Vtype>}, %1"
   [(set_attr "type" "neon_load1_all_lanes")]
@@ -7983,7 +7983,7 @@  (define_insn "*aarch64_simd_ld1r<mode>"
 (define_insn "aarch64_simd_ld1<vstruct_elt>_x2"
   [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
 	(unspec:VSTRUCT_2QD [
-	    (match_operand:VSTRUCT_2QD 1 "aarch64_simd_struct_operand" "Utv")]
+	    (match_operand:VSTRUCT_2QD 1 "memory_operand" "Utv")]
 	    UNSPEC_LD1))]
   "TARGET_SIMD"
   "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index c308015ac2c13d24cd6bcec71247ec45df8cf5e6..6b70a364530c8108457091bfec12fe549f722149 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -494,10 +494,6 @@  (define_predicate "aarch64_simd_reg_or_minus_one"
   (ior (match_operand 0 "register_operand")
        (match_operand 0 "aarch64_simd_imm_minus_one")))
 
-(define_predicate "aarch64_simd_struct_operand"
-  (and (match_code "mem")
-       (match_test "TARGET_SIMD && aarch64_simd_mem_operand_p (op)")))
-
 ;; Like general_operand but allow only valid SIMD addressing modes.
 (define_predicate "aarch64_simd_general_operand"
   (and (match_operand 0 "general_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/vld1r.c b/gcc/testsuite/gcc.target/aarch64/vld1r.c
new file mode 100644
index 0000000000000000000000000000000000000000..72c505c403e9e239771379b7cadd8a9473f06113
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vld1r.c
@@ -0,0 +1,26 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** f1:
+** 	add	x0, x0, 1
+** 	ld1r	{v0.8b}, \[x0\]
+** 	ret
+*/
+uint8x8_t f1(const uint8_t *in) {
+    return vld1_dup_u8(&in[1]);
+}
+
+/*
+** f2:
+** 	ldr	s1, \[x0, 4\]
+** 	fmla	v0.4s, v0.4s, v1.s\[0\]
+** 	ret
+*/
+float32x4_t f2(const float32_t *in, float32x4_t a) {
+    float32x4_t dup = vld1q_dup_f32(&in[1]);
+    return vfmaq_laneq_f32 (a, a, dup, 1);
+}