[5/8] AArch64 aarch64: Make existing V2HF be usable.

Message ID Y1+4euF0rUwFIjTL@arm.com
State Deferred
Headers
Series [1/8] middle-end: Recognize scalar reductions from bitfields and array_refs |

Commit Message

Tamar Christina Oct. 31, 2022, 11:58 a.m. UTC
  Hi All,

The backend has an existing V2HFmode that is used by pairwise operations.
This mode was however never made fully functional.  Amongst other things it was
never declared as a vector type which made it unusable from the mid-end.

It's also lacking an implementation for load/stores so reload ICEs if this mode
is ever used.  This finishes the implementation by providing the above.

Note that I have created a new iterator VHSDF_P instead of extending VHSDF
because the previous iterator is used in far more things than just load/stores.

It's also used for instance in intrinsics and extending this would force me to
provide support for mangling the type while we never expose it through
intrinsics.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
	(mov<mode>, movmisalign<mode>, aarch64_dup_lane<mode>,
	aarch64_store_lane0<mode>, aarch64_simd_vec_set<mode>,
	@aarch64_simd_vec_copy_lane<mode>, vec_set<mode>,
	reduc_<optab>_scal_<mode>, reduc_<fmaxmin>_scal_<mode>,
	aarch64_reduc_<optab>_internal<mode>, aarch64_get_lane<mode>,
	vec_init<mode><Vel>, vec_extract<mode><Vel>): Support V2HF.
	* config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
	Add E_V2HFmode.
	* config/aarch64/iterators.md (VHSDF_P): New.
	(V2F, VALL_F16_FULL, nunits, Vtype, Vmtype, Vetype, stype, VEL,
	Vel, q, vp): Add V2HF.
	* config/arm/types.md (neon_fp_reduc_add_h): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/slp_1.c: Update testcase.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 25aed74f8cf939562ed65a578fe32ca76605b58a..93a2888f567460ad10ec050ea7d4f701df4729d1 100644




--
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 25aed74f8cf939562ed65a578fe32ca76605b58a..93a2888f567460ad10ec050ea7d4f701df4729d1 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,10 +19,10 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
+	(match_operand:VALL_F16_FULL 1 "general_operand"))]
   "TARGET_SIMD"
-  "
+{
   /* Force the operand into a register if it is not an
      immediate whose use can be replaced with xzr.
      If the mode is 16 bytes wide, then we will be doing
@@ -46,12 +46,11 @@ (define_expand "mov<mode>"
       aarch64_expand_vector_init (operands[0], operands[1]);
       DONE;
     }
-  "
-)
+})
 
 (define_expand "movmisalign<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-        (match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
+        (match_operand:VALL_F16_FULL 1 "general_operand"))]
   "TARGET_SIMD && !STRICT_ALIGNMENT"
 {
   /* This pattern is not permitted to fail during expansion: if both arguments
@@ -85,10 +84,10 @@ (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_dup_lane<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_duplicate:VALL_F16
+  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w")
+	(vec_duplicate:VALL_F16_FULL
 	  (vec_select:<VEL>
-	    (match_operand:VALL_F16 1 "register_operand" "w")
+	    (match_operand:VALL_F16_FULL 1 "register_operand" "w")
 	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
           )))]
   "TARGET_SIMD"
@@ -142,6 +141,29 @@ (define_insn "*aarch64_simd_mov<VDMOV:mode>"
 		     mov_reg, neon_move<q>")]
 )
 
+(define_insn "*aarch64_simd_movv2hf"
+  [(set (match_operand:V2HF 0 "nonimmediate_operand"
+		"=w, m,  m,  w, ?r, ?w, ?r, w, w")
+	(match_operand:V2HF 1 "general_operand"
+		"m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
+  "TARGET_SIMD_F16INST
+   && (register_operand (operands[0], V2HFmode)
+       || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
+   "@
+    ldr\\t%s0, %1
+    str\\twzr, %0
+    str\\t%s1, %0
+    mov\\t%0.2s[0], %1.2s[0]
+    umov\\t%w0, %1.s[0]
+    fmov\\t%s0, %1
+    mov\\t%0, %1
+    movi\\t%d0, 0
+    * return aarch64_output_simd_mov_immediate (operands[1], 32);"
+  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
+		     neon_logic, neon_to_gp, f_mcr,\
+		     mov_reg, neon_move, neon_move")]
+)
+
 (define_insn "*aarch64_simd_mov<VQMOV:mode>"
   [(set (match_operand:VQMOV 0 "nonimmediate_operand"
 		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
@@ -182,7 +204,7 @@ (define_insn "*aarch64_simd_mov<VQMOV:mode>"
 
 (define_insn "aarch64_store_lane0<mode>"
   [(set (match_operand:<VEL> 0 "memory_operand" "=m")
-	(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w")
+	(vec_select:<VEL> (match_operand:VALL_F16_FULL 1 "register_operand" "w")
 			(parallel [(match_operand 2 "const_int_operand" "n")])))]
   "TARGET_SIMD
    && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
@@ -1035,11 +1057,11 @@ (define_insn "one_cmpl<mode>2"
 )
 
 (define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
-	(vec_merge:VALL_F16
-	    (vec_duplicate:VALL_F16
+  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w,w,w")
+	(vec_merge:VALL_F16_FULL
+	    (vec_duplicate:VALL_F16_FULL
 		(match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "w,?r,Utv"))
-	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
+	    (match_operand:VALL_F16_FULL 3 "register_operand" "0,0,0")
 	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
   "TARGET_SIMD"
   {
@@ -1061,14 +1083,14 @@ (define_insn "aarch64_simd_vec_set<mode>"
 )
 
 (define_insn "@aarch64_simd_vec_copy_lane<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_merge:VALL_F16
-	    (vec_duplicate:VALL_F16
+  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w")
+	(vec_merge:VALL_F16_FULL
+	    (vec_duplicate:VALL_F16_FULL
 	      (vec_select:<VEL>
-		(match_operand:VALL_F16 3 "register_operand" "w")
+		(match_operand:VALL_F16_FULL 3 "register_operand" "w")
 		(parallel
 		  [(match_operand:SI 4 "immediate_operand" "i")])))
-	    (match_operand:VALL_F16 1 "register_operand" "0")
+	    (match_operand:VALL_F16_FULL 1 "register_operand" "0")
 	    (match_operand:SI 2 "immediate_operand" "i")))]
   "TARGET_SIMD"
   {
@@ -1376,7 +1398,7 @@ (define_insn "vec_shr_<mode>"
 )
 
 (define_expand "vec_set<mode>"
-  [(match_operand:VALL_F16 0 "register_operand")
+  [(match_operand:VALL_F16_FULL 0 "register_operand")
    (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand")
    (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
@@ -3503,7 +3525,7 @@ (define_insn "popcount<mode>2"
 ;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function.  (This is FP smax/smin).
 (define_expand "reduc_<optab>_scal_<mode>"
   [(match_operand:<VEL> 0 "register_operand")
-   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
+   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
 		 FMAXMINV)]
   "TARGET_SIMD"
   {
@@ -3518,7 +3540,7 @@ (define_expand "reduc_<optab>_scal_<mode>"
 
 (define_expand "reduc_<fmaxmin>_scal_<mode>"
   [(match_operand:<VEL> 0 "register_operand")
-   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
+   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
 		 FMAXMINNMV)]
   "TARGET_SIMD"
   {
@@ -3562,8 +3584,8 @@ (define_insn "aarch64_reduc_<optab>_internalv2si"
 )
 
 (define_insn "aarch64_reduc_<optab>_internal<mode>"
- [(set (match_operand:VHSDF 0 "register_operand" "=w")
-       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
+ [(set (match_operand:VHSDF_P 0 "register_operand" "=w")
+       (unspec:VHSDF_P [(match_operand:VHSDF_P 1 "register_operand" "w")]
 		      FMAXMINV))]
  "TARGET_SIMD"
  "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
@@ -4208,7 +4230,7 @@ (define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
 (define_insn_and_split "aarch64_get_lane<mode>"
   [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv")
 	(vec_select:<VEL>
-	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
+	  (match_operand:VALL_F16_FULL 1 "register_operand" "w, w, w")
 	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
   "TARGET_SIMD"
   {
@@ -7989,7 +8011,7 @@ (define_expand "aarch64_st1<VALL_F16:mode>"
 ;; Standard pattern name vec_init<mode><Vel>.
 
 (define_expand "vec_init<mode><Vel>"
-  [(match_operand:VALL_F16 0 "register_operand")
+  [(match_operand:VALL_F16_FULL 0 "register_operand")
    (match_operand 1 "" "")]
   "TARGET_SIMD"
 {
@@ -8068,7 +8090,7 @@ (define_insn "aarch64_urecpe<mode>"
 
 (define_expand "vec_extract<mode><Vel>"
   [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand")
-   (match_operand:VALL_F16 1 "register_operand")
+   (match_operand:VALL_F16_FULL 1 "register_operand")
    (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
 {
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f05bac713e88ea8c7feaa2367d55bd523ca66f57..1e08f8453688210afe1566092b19b59c9bdd0c97 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3566,6 +3566,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V8BFmode:
     case E_V4SFmode:
     case E_V2DFmode:
+    case E_V2HFmode:
       return TARGET_SIMD ? VEC_ADVSIMD : 0;
 
     default:
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 37d8161a33b1c399d80be82afa67613a087389d4..1df09f7fe2eb35aed96113476541e0faa5393551 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -160,6 +160,10 @@ (define_mode_iterator VDQF [V2SF V4SF V2DF])
 (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
 			     (V8HF "TARGET_SIMD_F16INST")
 			     V2SF V4SF V2DF])
+;; Advanced SIMD Float modes suitable for pairwise operations.
+(define_mode_iterator VHSDF_P [(V4HF "TARGET_SIMD_F16INST")
+			       (V8HF "TARGET_SIMD_F16INST")
+			       V2SF V4SF V2DF (V2HF "TARGET_SIMD_F16INST")])
 
 ;; Advanced SIMD Float modes, and DF.
 (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF])
@@ -188,15 +192,23 @@ (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])
 (define_mode_iterator VALLF [V2SF V4SF V2DF SF DF])
 
 ;; Advanced SIMD Float modes with 2 elements.
-(define_mode_iterator V2F [V2SF V2DF])
+(define_mode_iterator V2F [V2SF V2DF V2HF])
 
 ;; All Advanced SIMD modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
-;; All Advanced SIMD modes suitable for moving, loading, and storing.
+;; All Advanced SIMD modes suitable for moving, loading, and storing
+;; except V2HF.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing
+;; including V2HF.
+(define_mode_iterator VALL_F16_FULL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				     V4HF V8HF V4BF V8BF V2SF V4SF V2DF
+				     (V2HF "TARGET_SIMD_F16INST")])
+
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
 				V4HF V8HF V2SF V4SF])
@@ -1076,7 +1088,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI "16")
 			  (V2SF "2") (V4SF "4")
 			  (V1DF "1") (V2DF "2")
 			  (DI "1") (DF "1")
-			  (V8DI "8")])
+			  (V8DI "8") (V2HF "2")])
 
 ;; Map a mode to the number of bits in it, if the size of the mode
 ;; is constant.
@@ -1090,6 +1102,7 @@ (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
 
 ;; Give the length suffix letter for a sign- or zero-extension.
 (define_mode_attr size [(QI "b") (HI "h") (SI "w")])
+(define_mode_attr sizel [(QI "b") (HI "h") (SI "")])
 
 ;; Give the number of bits in the mode
 (define_mode_attr sizen [(QI "8") (HI "16") (SI "32") (DI "64")])
@@ -1134,8 +1147,9 @@ (define_mode_attr Vtype [(V8QI "8b") (V16QI "16b")
                          (V2SI "2s") (V4SI  "4s")
                          (DI   "1d") (DF    "1d")
                          (V2DI "2d") (V2SF "2s")
-			 (V4SF "4s") (V2DF "2d")
-			 (V4HF "4h") (V8HF "8h")
+			 (V2HF "2h") (V4SF "4s")
+			 (V2DF "2d") (V4HF "4h")
+			 (V8HF "8h")
 			 (V2x8QI "8b") (V2x4HI "4h")
 			 (V2x2SI "2s") (V2x1DI  "1d")
 			 (V2x4HF "4h") (V2x2SF "2s")
@@ -1175,9 +1189,10 @@ (define_mode_attr Vmtype [(V8QI ".8b") (V16QI ".16b")
 			 (V4HI ".4h") (V8HI  ".8h")
 			 (V2SI ".2s") (V4SI  ".4s")
 			 (V2DI ".2d") (V4HF ".4h")
-			 (V8HF ".8h") (V4BF ".4h")
-			 (V8BF ".8h") (V2SF ".2s")
-			 (V4SF ".4s") (V2DF ".2d")
+			 (V8HF ".8h") (V2HF ".2h")
+			 (V4BF ".4h") (V8BF ".8h")
+			 (V2SF ".2s") (V4SF ".4s")
+			 (V2DF ".2d")
 			 (DI   "")    (SI   "")
 			 (HI   "")    (QI   "")
 			 (TI   "")    (HF   "")
@@ -1193,7 +1208,7 @@ (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h")
 (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
 			  (V4HI "h") (V8HI  "h")
 			  (V2SI "s") (V4SI  "s")
-			  (V2DI "d")
+			  (V2DI "d") (V2HF  "h")
 			  (V4HF "h") (V8HF  "h")
 			  (V2SF "s") (V4SF  "s")
 			  (V2DF "d")
@@ -1285,7 +1300,7 @@ (define_mode_attr Vcwtype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "w") (VNx2QI "d")
 ;; more accurately.
 (define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
 			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
-			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
+			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d") (V2HF "s")
 			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
 			 (SI "s") (DI "d")])
 
@@ -1360,8 +1375,8 @@ (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
 		       (V4HF "HF") (V8HF  "HF")
 		       (V2SF "SF") (V4SF  "SF")
 		       (DF   "DF") (V2DF  "DF")
-		       (SI   "SI") (HI    "HI")
-		       (QI   "QI")
+		       (SI   "SI") (V2HF  "HF")
+		       (QI   "QI") (HI    "HI")
 		       (V4BF "BF") (V8BF "BF")
 		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
 		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
@@ -1381,7 +1396,7 @@ (define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
 		       (V2SF "sf") (V4SF "sf")
 		       (V2DF "df") (DF   "df")
 		       (SI   "si") (HI   "hi")
-		       (QI   "qi")
+		       (QI   "qi") (V2HF "hf")
 		       (V4BF "bf") (V8BF "bf")
 		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
 		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
@@ -1866,7 +1881,7 @@ (define_mode_attr q [(V8QI "") (V16QI "_q")
 		     (V4HF "") (V8HF "_q")
 		     (V4BF "") (V8BF "_q")
 		     (V2SF "") (V4SF  "_q")
-			       (V2DF  "_q")
+		     (V2HF "") (V2DF  "_q")
 		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")
 		     (V2x8QI "") (V2x16QI "_q")
 		     (V2x4HI "") (V2x8HI "_q")
@@ -1905,6 +1920,7 @@ (define_mode_attr vp [(V8QI "v") (V16QI "v")
 		      (V2SI "p") (V4SI  "v")
 		      (V2DI "p") (V2DF  "p")
 		      (V2SF "p") (V4SF  "v")
+		      (V2HF "p")
 		      (V4HF "v") (V8HF  "v")])
 
 (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index 7d0504bdd944e9c0d1b545b0b66a9a1adc808714..3cfbc7a93cca1bea4925853e51d0a147c5722247 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -483,6 +483,7 @@ (define_attr "autodetect_type"
 ; neon_fp_minmax_s_q
 ; neon_fp_minmax_d
 ; neon_fp_minmax_d_q
+; neon_fp_reduc_add_h
 ; neon_fp_reduc_add_s
 ; neon_fp_reduc_add_s_q
 ; neon_fp_reduc_add_d
@@ -1033,6 +1034,7 @@ (define_attr "type"
   neon_fp_minmax_d,\
   neon_fp_minmax_d_q,\
 \
+  neon_fp_reduc_add_h,\
   neon_fp_reduc_add_s,\
   neon_fp_reduc_add_s_q,\
   neon_fp_reduc_add_d,\
@@ -1257,8 +1259,8 @@ (define_attr "is_neon_type" "yes,no"
           neon_fp_compare_d, neon_fp_compare_d_q, neon_fp_minmax_s,\
           neon_fp_minmax_s_q, neon_fp_minmax_d, neon_fp_minmax_d_q,\
           neon_fp_neg_s, neon_fp_neg_s_q, neon_fp_neg_d, neon_fp_neg_d_q,\
-          neon_fp_reduc_add_s, neon_fp_reduc_add_s_q, neon_fp_reduc_add_d,\
-          neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,
+          neon_fp_reduc_add_h, neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
+          neon_fp_reduc_add_d, neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,\
           neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d,\
           neon_fp_reduc_minmax_d_q,\
           neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
index 07d71a63414b1066ea431e287286ad048515711a..8e35e0b574d49913b43c7d8d4f4ba75f127f42e9 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
@@ -30,11 +30,9 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)	\
 TEST_ALL (VEC_PERM)
 
 /* We should use one DUP for each of the 8-, 16- and 32-bit types,
-   although we currently use LD1RW for _Float16.  We should use two
-   DUPs for each of the three 64-bit types.  */
+   and two DUPs for each of the three 64-bit types.  */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } } */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
 /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tzip2\t} } } */
  

Comments

Richard Sandiford Nov. 1, 2022, 2:58 p.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> The backend has an existing V2HFmode that is used by pairwise operations.
> This mode was however never made fully functional.  Amongst other things it was
> never declared as a vector type which made it unusable from the mid-end.
>
> It's also lacking an implementation for load/stores so reload ICEs if this mode
> is every used.  This finishes the implementation by providing the above.
>
> Note that I have created a new iterator VHSDF_P instead of extending VHSDF
> because the previous iterator is used in far more things than just load/stores.
>
> It's also used for instance in intrinsics and extending this would force me to
> provide support for mangling the type while we never expose it through
> intrinsics.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
> 	(mov<mode>, movmisalign<mode>, aarch64_dup_lane<mode>,
> 	aarch64_store_lane0<mode>, aarch64_simd_vec_set<mode>,
> 	@aarch64_simd_vec_copy_lane<mode>, vec_set<mode>,
> 	reduc_<optab>_scal_<mode>, reduc_<fmaxmin>_scal_<mode>,
> 	aarch64_reduc_<optab>_internal<mode>, aarch64_get_lane<mode>,
> 	vec_init<mode><Vel>, vec_extract<mode><Vel>): Support V2HF.
> 	* config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
> 	Add E_V2HFmode.
> 	* config/aarch64/iterators.md (VHSDF_P): New.
> 	(V2F, VALL_F16_FULL, nunits, Vtype, Vmtype, Vetype, stype, VEL,
> 	Vel, q, vp): Add V2HF.
> 	* config/arm/types.md (neon_fp_reduc_add_h): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/sve/slp_1.c: Update testcase.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 25aed74f8cf939562ed65a578fe32ca76605b58a..93a2888f567460ad10ec050ea7d4f701df4729d1 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -19,10 +19,10 @@
>  ;; <http://www.gnu.org/licenses/>.
>  
>  (define_expand "mov<mode>"
> -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> -	(match_operand:VALL_F16 1 "general_operand"))]
> +  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
> +	(match_operand:VALL_F16_FULL 1 "general_operand"))]
>    "TARGET_SIMD"
> -  "
> +{
>    /* Force the operand into a register if it is not an
>       immediate whose use can be replaced with xzr.
>       If the mode is 16 bytes wide, then we will be doing
> @@ -46,12 +46,11 @@ (define_expand "mov<mode>"
>        aarch64_expand_vector_init (operands[0], operands[1]);
>        DONE;
>      }
> -  "
> -)
> +})
>  
>  (define_expand "movmisalign<mode>"
> -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> -        (match_operand:VALL_F16 1 "general_operand"))]
> +  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
> +        (match_operand:VALL_F16_FULL 1 "general_operand"))]
>    "TARGET_SIMD && !STRICT_ALIGNMENT"
>  {
>    /* This pattern is not permitted to fail during expansion: if both arguments
> @@ -85,10 +84,10 @@ (define_insn "aarch64_simd_dup<mode>"
>  )
>  
>  (define_insn "aarch64_dup_lane<mode>"
> -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> -	(vec_duplicate:VALL_F16
> +  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w")
> +	(vec_duplicate:VALL_F16_FULL
>  	  (vec_select:<VEL>
> -	    (match_operand:VALL_F16 1 "register_operand" "w")
> +	    (match_operand:VALL_F16_FULL 1 "register_operand" "w")
>  	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
>            )))]
>    "TARGET_SIMD"
> @@ -142,6 +141,29 @@ (define_insn "*aarch64_simd_mov<VDMOV:mode>"
>  		     mov_reg, neon_move<q>")]
>  )
>  
> +(define_insn "*aarch64_simd_movv2hf"
> +  [(set (match_operand:V2HF 0 "nonimmediate_operand"
> +		"=w, m,  m,  w, ?r, ?w, ?r, w, w")
> +	(match_operand:V2HF 1 "general_operand"
> +		"m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
> +  "TARGET_SIMD_F16INST
> +   && (register_operand (operands[0], V2HFmode)
> +       || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
> +   "@
> +    ldr\\t%s0, %1
> +    str\\twzr, %0
> +    str\\t%s1, %0
> +    mov\\t%0.2s[0], %1.2s[0]
> +    umov\\t%w0, %1.s[0]
> +    fmov\\t%s0, %1
> +    mov\\t%0, %1
> +    movi\\t%d0, 0
> +    * return aarch64_output_simd_mov_immediate (operands[1], 32);"
> +  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
> +		     neon_logic, neon_to_gp, f_mcr,\
> +		     mov_reg, neon_move, neon_move")]
> +)
> +
>  (define_insn "*aarch64_simd_mov<VQMOV:mode>"
>    [(set (match_operand:VQMOV 0 "nonimmediate_operand"
>  		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
> @@ -182,7 +204,7 @@ (define_insn "*aarch64_simd_mov<VQMOV:mode>"
>  
>  (define_insn "aarch64_store_lane0<mode>"
>    [(set (match_operand:<VEL> 0 "memory_operand" "=m")
> -	(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w")
> +	(vec_select:<VEL> (match_operand:VALL_F16_FULL 1 "register_operand" "w")
>  			(parallel [(match_operand 2 "const_int_operand" "n")])))]
>    "TARGET_SIMD
>     && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
> @@ -1035,11 +1057,11 @@ (define_insn "one_cmpl<mode>2"
>  )
>  
>  (define_insn "aarch64_simd_vec_set<mode>"
> -  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
> -	(vec_merge:VALL_F16
> -	    (vec_duplicate:VALL_F16
> +  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w,w,w")
> +	(vec_merge:VALL_F16_FULL
> +	    (vec_duplicate:VALL_F16_FULL
>  		(match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "w,?r,Utv"))
> -	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
> +	    (match_operand:VALL_F16_FULL 3 "register_operand" "0,0,0")
>  	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
>    "TARGET_SIMD"
>    {
> @@ -1061,14 +1083,14 @@ (define_insn "aarch64_simd_vec_set<mode>"
>  )
>  
>  (define_insn "@aarch64_simd_vec_copy_lane<mode>"
> -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> -	(vec_merge:VALL_F16
> -	    (vec_duplicate:VALL_F16
> +  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w")
> +	(vec_merge:VALL_F16_FULL
> +	    (vec_duplicate:VALL_F16_FULL
>  	      (vec_select:<VEL>
> -		(match_operand:VALL_F16 3 "register_operand" "w")
> +		(match_operand:VALL_F16_FULL 3 "register_operand" "w")
>  		(parallel
>  		  [(match_operand:SI 4 "immediate_operand" "i")])))
> -	    (match_operand:VALL_F16 1 "register_operand" "0")
> +	    (match_operand:VALL_F16_FULL 1 "register_operand" "0")
>  	    (match_operand:SI 2 "immediate_operand" "i")))]
>    "TARGET_SIMD"
>    {
> @@ -1376,7 +1398,7 @@ (define_insn "vec_shr_<mode>"
>  )
>  
>  (define_expand "vec_set<mode>"
> -  [(match_operand:VALL_F16 0 "register_operand")
> +  [(match_operand:VALL_F16_FULL 0 "register_operand")
>     (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand")
>     (match_operand:SI 2 "immediate_operand")]
>    "TARGET_SIMD"
> @@ -3503,7 +3525,7 @@ (define_insn "popcount<mode>2"
>  ;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function.  (This is FP smax/smin).
>  (define_expand "reduc_<optab>_scal_<mode>"
>    [(match_operand:<VEL> 0 "register_operand")
> -   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
> +   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
>  		 FMAXMINV)]
>    "TARGET_SIMD"
>    {
> @@ -3518,7 +3540,7 @@ (define_expand "reduc_<optab>_scal_<mode>"
>  
>  (define_expand "reduc_<fmaxmin>_scal_<mode>"
>    [(match_operand:<VEL> 0 "register_operand")
> -   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
> +   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
>  		 FMAXMINNMV)]
>    "TARGET_SIMD"
>    {
> @@ -3562,8 +3584,8 @@ (define_insn "aarch64_reduc_<optab>_internalv2si"
>  )
>  
>  (define_insn "aarch64_reduc_<optab>_internal<mode>"
> - [(set (match_operand:VHSDF 0 "register_operand" "=w")
> -       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
> + [(set (match_operand:VHSDF_P 0 "register_operand" "=w")
> +       (unspec:VHSDF_P [(match_operand:VHSDF_P 1 "register_operand" "w")]
>  		      FMAXMINV))]
>   "TARGET_SIMD"
>   "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
> @@ -4208,7 +4230,7 @@ (define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
>  (define_insn_and_split "aarch64_get_lane<mode>"
>    [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv")
>  	(vec_select:<VEL>
> -	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
> +	  (match_operand:VALL_F16_FULL 1 "register_operand" "w, w, w")
>  	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
>    "TARGET_SIMD"
>    {
> @@ -7989,7 +8011,7 @@ (define_expand "aarch64_st1<VALL_F16:mode>"
>  ;; Standard pattern name vec_init<mode><Vel>.
>  
>  (define_expand "vec_init<mode><Vel>"
> -  [(match_operand:VALL_F16 0 "register_operand")
> +  [(match_operand:VALL_F16_FULL 0 "register_operand")
>     (match_operand 1 "" "")]
>    "TARGET_SIMD"
>  {
> @@ -8068,7 +8090,7 @@ (define_insn "aarch64_urecpe<mode>"
>  
>  (define_expand "vec_extract<mode><Vel>"
>    [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand")
> -   (match_operand:VALL_F16 1 "register_operand")
> +   (match_operand:VALL_F16_FULL 1 "register_operand")
>     (match_operand:SI 2 "immediate_operand")]
>    "TARGET_SIMD"
>  {
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f05bac713e88ea8c7feaa2367d55bd523ca66f57..1e08f8453688210afe1566092b19b59c9bdd0c97 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -3566,6 +3566,7 @@ aarch64_classify_vector_mode (machine_mode mode)
>      case E_V8BFmode:
>      case E_V4SFmode:
>      case E_V2DFmode:
> +    case E_V2HFmode:
>        return TARGET_SIMD ? VEC_ADVSIMD : 0;
>  
>      default:
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 37d8161a33b1c399d80be82afa67613a087389d4..1df09f7fe2eb35aed96113476541e0faa5393551 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -160,6 +160,10 @@ (define_mode_iterator VDQF [V2SF V4SF V2DF])
>  (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
>  			     (V8HF "TARGET_SIMD_F16INST")
>  			     V2SF V4SF V2DF])
> +;; Advanced SIMD Float modes suitable for pairwise operations.
> +(define_mode_iterator VHSDF_P [(V4HF "TARGET_SIMD_F16INST")
> +			       (V8HF "TARGET_SIMD_F16INST")
> +			       V2SF V4SF V2DF (V2HF "TARGET_SIMD_F16INST")])
>  
>  ;; Advanced SIMD Float modes, and DF.
>  (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF])
> @@ -188,15 +192,23 @@ (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])
>  (define_mode_iterator VALLF [V2SF V4SF V2DF SF DF])
>  
>  ;; Advanced SIMD Float modes with 2 elements.
> -(define_mode_iterator V2F [V2SF V2DF])
> +(define_mode_iterator V2F [V2SF V2DF V2HF])
>  
>  ;; All Advanced SIMD modes on which we support any arithmetic operations.
>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
>  
> -;; All Advanced SIMD modes suitable for moving, loading, and storing.
> +;; All Advanced SIMD modes suitable for moving, loading, and storing
> +;; except V2HF.
>  (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
>  				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
>  
> +;; All Advanced SIMD modes suitable for moving, loading, and storing
> +;; including V2HF
> +(define_mode_iterator VALL_F16_FULL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
> +				     V4HF V8HF V4BF V8BF V2SF V4SF V2DF
> +				     (V2HF "TARGET_SIMD_F16INST")])

This name might cause confusion with the SVE iterators, where FULL
means "every bit of the register is used".  How about something like
VMOVE instead?

With this change, I guess VALL_F16 represents "The set of all modes
for which the vld1 intrinsics are provided" and VMOVE or whatever
is "All Advanced SIMD modes suitable for moving, loading, and storing".
That is, VMOVE extends VALL_F16 with modes that are not manifested
via intrinsics.

> +
> +
>  ;; The VALL_F16 modes except the 128-bit 2-element ones.
>  (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
>  				V4HF V8HF V2SF V4SF])
> @@ -1076,7 +1088,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI "16")
>  			  (V2SF "2") (V4SF "4")
>  			  (V1DF "1") (V2DF "2")
>  			  (DI "1") (DF "1")
> -			  (V8DI "8")])
> +			  (V8DI "8") (V2HF "2")])
>  
>  ;; Map a mode to the number of bits in it, if the size of the mode
>  ;; is constant.
> @@ -1090,6 +1102,7 @@ (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
>  
>  ;; Give the length suffix letter for a sign- or zero-extension.
>  (define_mode_attr size [(QI "b") (HI "h") (SI "w")])
> +(define_mode_attr sizel [(QI "b") (HI "h") (SI "")])
>  
>  ;; Give the number of bits in the mode
>  (define_mode_attr sizen [(QI "8") (HI "16") (SI "32") (DI "64")])
> @@ -1134,8 +1147,9 @@ (define_mode_attr Vtype [(V8QI "8b") (V16QI "16b")
>                           (V2SI "2s") (V4SI  "4s")
>                           (DI   "1d") (DF    "1d")
>                           (V2DI "2d") (V2SF "2s")
> -			 (V4SF "4s") (V2DF "2d")
> -			 (V4HF "4h") (V8HF "8h")
> +			 (V2HF "2h") (V4SF "4s")
> +			 (V2DF "2d") (V4HF "4h")
> +			 (V8HF "8h")
>  			 (V2x8QI "8b") (V2x4HI "4h")
>  			 (V2x2SI "2s") (V2x1DI  "1d")
>  			 (V2x4HF "4h") (V2x2SF "2s")

Where is the 2h used, and is it valid syntax in that context?

Same for later instances of 2h.

Thanks,
Richard

> @@ -1175,9 +1189,10 @@ (define_mode_attr Vmtype [(V8QI ".8b") (V16QI ".16b")
>  			 (V4HI ".4h") (V8HI  ".8h")
>  			 (V2SI ".2s") (V4SI  ".4s")
>  			 (V2DI ".2d") (V4HF ".4h")
> -			 (V8HF ".8h") (V4BF ".4h")
> -			 (V8BF ".8h") (V2SF ".2s")
> -			 (V4SF ".4s") (V2DF ".2d")
> +			 (V8HF ".8h") (V2HF ".2h")
> +			 (V4BF ".4h") (V8BF ".8h")
> +			 (V2SF ".2s") (V4SF ".4s")
> +			 (V2DF ".2d")
>  			 (DI   "")    (SI   "")
>  			 (HI   "")    (QI   "")
>  			 (TI   "")    (HF   "")
> @@ -1193,7 +1208,7 @@ (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h")
>  (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
>  			  (V4HI "h") (V8HI  "h")
>  			  (V2SI "s") (V4SI  "s")
> -			  (V2DI "d")
> +			  (V2DI "d") (V2HF  "h")
>  			  (V4HF "h") (V8HF  "h")
>  			  (V2SF "s") (V4SF  "s")
>  			  (V2DF "d")
> @@ -1285,7 +1300,7 @@ (define_mode_attr Vcwtype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "w") (VNx2QI "d")
>  ;; more accurately.
>  (define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
>  			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
> -			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
> +			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d") (V2HF "s")
>  			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
>  			 (SI "s") (DI "d")])
>  
> @@ -1360,8 +1375,8 @@ (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
>  		       (V4HF "HF") (V8HF  "HF")
>  		       (V2SF "SF") (V4SF  "SF")
>  		       (DF   "DF") (V2DF  "DF")
> -		       (SI   "SI") (HI    "HI")
> -		       (QI   "QI")
> +		       (SI   "SI") (V2HF  "HF")
> +		       (QI   "QI") (HI    "HI")
>  		       (V4BF "BF") (V8BF "BF")
>  		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
>  		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
> @@ -1381,7 +1396,7 @@ (define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
>  		       (V2SF "sf") (V4SF "sf")
>  		       (V2DF "df") (DF   "df")
>  		       (SI   "si") (HI   "hi")
> -		       (QI   "qi")
> +		       (QI   "qi") (V2HF "hf")
>  		       (V4BF "bf") (V8BF "bf")
>  		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
>  		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
> @@ -1866,7 +1881,7 @@ (define_mode_attr q [(V8QI "") (V16QI "_q")
>  		     (V4HF "") (V8HF "_q")
>  		     (V4BF "") (V8BF "_q")
>  		     (V2SF "") (V4SF  "_q")
> -			       (V2DF  "_q")
> +		     (V2HF "") (V2DF  "_q")
>  		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")
>  		     (V2x8QI "") (V2x16QI "_q")
>  		     (V2x4HI "") (V2x8HI "_q")
> @@ -1905,6 +1920,7 @@ (define_mode_attr vp [(V8QI "v") (V16QI "v")
>  		      (V2SI "p") (V4SI  "v")
>  		      (V2DI "p") (V2DF  "p")
>  		      (V2SF "p") (V4SF  "v")
> +		      (V2HF "p")
>  		      (V4HF "v") (V8HF  "v")])
>  
>  (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")
> diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
> index 7d0504bdd944e9c0d1b545b0b66a9a1adc808714..3cfbc7a93cca1bea4925853e51d0a147c5722247 100644
> --- a/gcc/config/arm/types.md
> +++ b/gcc/config/arm/types.md
> @@ -483,6 +483,7 @@ (define_attr "autodetect_type"
>  ; neon_fp_minmax_s_q
>  ; neon_fp_minmax_d
>  ; neon_fp_minmax_d_q
> +; neon_fp_reduc_add_h
>  ; neon_fp_reduc_add_s
>  ; neon_fp_reduc_add_s_q
>  ; neon_fp_reduc_add_d
> @@ -1033,6 +1034,7 @@ (define_attr "type"
>    neon_fp_minmax_d,\
>    neon_fp_minmax_d_q,\
>  \
> +  neon_fp_reduc_add_h,\
>    neon_fp_reduc_add_s,\
>    neon_fp_reduc_add_s_q,\
>    neon_fp_reduc_add_d,\
> @@ -1257,8 +1259,8 @@ (define_attr "is_neon_type" "yes,no"
>            neon_fp_compare_d, neon_fp_compare_d_q, neon_fp_minmax_s,\
>            neon_fp_minmax_s_q, neon_fp_minmax_d, neon_fp_minmax_d_q,\
>            neon_fp_neg_s, neon_fp_neg_s_q, neon_fp_neg_d, neon_fp_neg_d_q,\
> -          neon_fp_reduc_add_s, neon_fp_reduc_add_s_q, neon_fp_reduc_add_d,\
> -          neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,
> +          neon_fp_reduc_add_h, neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
> +          neon_fp_reduc_add_d, neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,\
>            neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d,\
>            neon_fp_reduc_minmax_d_q,\
>            neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> index 07d71a63414b1066ea431e287286ad048515711a..8e35e0b574d49913b43c7d8d4f4ba75f127f42e9 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> @@ -30,11 +30,9 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)	\
>  TEST_ALL (VEC_PERM)
>  
>  /* We should use one DUP for each of the 8-, 16- and 32-bit types,
> -   although we currently use LD1RW for _Float16.  We should use two
> -   DUPs for each of the three 64-bit types.  */
> +   We should use two DUPs for each of the three 64-bit types.  */
>  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
> -/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } } */
> -/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
> +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
>  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
>  /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
>  /* { dg-final { scan-assembler-not {\tzip2\t} } } */
  
Tamar Christina Nov. 1, 2022, 3:11 p.m. UTC | #2
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Tuesday, November 1, 2022 2:59 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH 5/8]AArch64 aarch64: Make existing V2HF be usable.
> 
> Tamar Christina <tamar.christina@arm.com> writes:
> > Hi All,
> >
> > The backend has an existing V2HFmode that is used by pairwise operations.
> > This mode was however never made fully functional.  Amongst other
> > things it was never declared as a vector type which made it unusable from
> the mid-end.
> >
> > It's also lacking an implementation for load/stores so reload ICEs if
> > > this mode is ever used.  This finishes the implementation by providing the
> above.
> >
> > Note that I have created a new iterator VHSDF_P instead of extending
> > VHSDF because the previous iterator is used in far more things than just
> load/stores.
> >
> > It's also used for instance in intrinsics and extending this would
> > force me to provide support for mangling the type while we never
> > expose it through intrinsics.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
> > 	(mov<mode>, movmisalign<mode>, aarch64_dup_lane<mode>,
> > 	aarch64_store_lane0<mode>, aarch64_simd_vec_set<mode>,
> > 	@aarch64_simd_vec_copy_lane<mode>, vec_set<mode>,
> > 	reduc_<optab>_scal_<mode>, reduc_<fmaxmin>_scal_<mode>,
> > 	aarch64_reduc_<optab>_internal<mode>,
> aarch64_get_lane<mode>,
> > 	vec_init<mode><Vel>, vec_extract<mode><Vel>): Support V2HF.
> > 	* config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
> > 	Add E_V2HFmode.
> > 	* config/aarch64/iterators.md (VHSDF_P): New.
> > 	(V2F, VALL_F16_FULL, nunits, Vtype, Vmtype, Vetype, stype, VEL,
> > 	Vel, q, vp): Add V2HF.
> > 	* config/arm/types.md (neon_fp_reduc_add_h): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/sve/slp_1.c: Update testcase.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index
> >
> 25aed74f8cf939562ed65a578fe32ca76605b58a..93a2888f567460ad10ec050ea7
> d4
> > f701df4729d1 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -19,10 +19,10 @@
> >  ;; <http://www.gnu.org/licenses/>.
> >
> >  (define_expand "mov<mode>"
> > -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> > -	(match_operand:VALL_F16 1 "general_operand"))]
> > +  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
> > +	(match_operand:VALL_F16_FULL 1 "general_operand"))]
> >    "TARGET_SIMD"
> > -  "
> > +{
> >    /* Force the operand into a register if it is not an
> >       immediate whose use can be replaced with xzr.
> >       If the mode is 16 bytes wide, then we will be doing @@ -46,12
> > +46,11 @@ (define_expand "mov<mode>"
> >        aarch64_expand_vector_init (operands[0], operands[1]);
> >        DONE;
> >      }
> > -  "
> > -)
> > +})
> >
> >  (define_expand "movmisalign<mode>"
> > -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> > -        (match_operand:VALL_F16 1 "general_operand"))]
> > +  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
> > +        (match_operand:VALL_F16_FULL 1 "general_operand"))]
> >    "TARGET_SIMD && !STRICT_ALIGNMENT"
> >  {
> >    /* This pattern is not permitted to fail during expansion: if both
> > arguments @@ -85,10 +84,10 @@ (define_insn
> "aarch64_simd_dup<mode>"
> >  )
> >
> >  (define_insn "aarch64_dup_lane<mode>"
> > -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> > -	(vec_duplicate:VALL_F16
> > +  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w")
> > +	(vec_duplicate:VALL_F16_FULL
> >  	  (vec_select:<VEL>
> > -	    (match_operand:VALL_F16 1 "register_operand" "w")
> > +	    (match_operand:VALL_F16_FULL 1 "register_operand" "w")
> >  	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
> >            )))]
> >    "TARGET_SIMD"
> > @@ -142,6 +141,29 @@ (define_insn
> "*aarch64_simd_mov<VDMOV:mode>"
> >  		     mov_reg, neon_move<q>")]
> >  )
> >
> > +(define_insn "*aarch64_simd_movv2hf"
> > +  [(set (match_operand:V2HF 0 "nonimmediate_operand"
> > +		"=w, m,  m,  w, ?r, ?w, ?r, w, w")
> > +	(match_operand:V2HF 1 "general_operand"
> > +		"m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
> > +  "TARGET_SIMD_F16INST
> > +   && (register_operand (operands[0], V2HFmode)
> > +       || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
> > +   "@
> > +    ldr\\t%s0, %1
> > +    str\\twzr, %0
> > +    str\\t%s1, %0
> > +    mov\\t%0.2s[0], %1.2s[0]
> > +    umov\\t%w0, %1.s[0]
> > +    fmov\\t%s0, %1
> > +    mov\\t%0, %1
> > +    movi\\t%d0, 0
> > +    * return aarch64_output_simd_mov_immediate (operands[1], 32);"
> > +  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
> > +		     neon_logic, neon_to_gp, f_mcr,\
> > +		     mov_reg, neon_move, neon_move")]
> > +)
> > +
> >  (define_insn "*aarch64_simd_mov<VQMOV:mode>"
> >    [(set (match_operand:VQMOV 0 "nonimmediate_operand"
> >  		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
> > @@ -182,7 +204,7 @@ (define_insn
> "*aarch64_simd_mov<VQMOV:mode>"
> >
> >  (define_insn "aarch64_store_lane0<mode>"
> >    [(set (match_operand:<VEL> 0 "memory_operand" "=m")
> > -	(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand"
> "w")
> > +	(vec_select:<VEL> (match_operand:VALL_F16_FULL 1
> "register_operand"
> > +"w")
> >  			(parallel [(match_operand 2 "const_int_operand"
> "n")])))]
> >    "TARGET_SIMD
> >     && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
> > @@ -1035,11 +1057,11 @@ (define_insn "one_cmpl<mode>2"
> >  )
> >
> >  (define_insn "aarch64_simd_vec_set<mode>"
> > -  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
> > -	(vec_merge:VALL_F16
> > -	    (vec_duplicate:VALL_F16
> > +  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w,w,w")
> > +	(vec_merge:VALL_F16_FULL
> > +	    (vec_duplicate:VALL_F16_FULL
> >  		(match_operand:<VEL> 1
> "aarch64_simd_nonimmediate_operand" "w,?r,Utv"))
> > -	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
> > +	    (match_operand:VALL_F16_FULL 3 "register_operand" "0,0,0")
> >  	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
> >    "TARGET_SIMD"
> >    {
> > @@ -1061,14 +1083,14 @@ (define_insn "aarch64_simd_vec_set<mode>"
> >  )
> >
> >  (define_insn "@aarch64_simd_vec_copy_lane<mode>"
> > -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> > -	(vec_merge:VALL_F16
> > -	    (vec_duplicate:VALL_F16
> > +  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w")
> > +	(vec_merge:VALL_F16_FULL
> > +	    (vec_duplicate:VALL_F16_FULL
> >  	      (vec_select:<VEL>
> > -		(match_operand:VALL_F16 3 "register_operand" "w")
> > +		(match_operand:VALL_F16_FULL 3 "register_operand" "w")
> >  		(parallel
> >  		  [(match_operand:SI 4 "immediate_operand" "i")])))
> > -	    (match_operand:VALL_F16 1 "register_operand" "0")
> > +	    (match_operand:VALL_F16_FULL 1 "register_operand" "0")
> >  	    (match_operand:SI 2 "immediate_operand" "i")))]
> >    "TARGET_SIMD"
> >    {
> > @@ -1376,7 +1398,7 @@ (define_insn "vec_shr_<mode>"
> >  )
> >
> >  (define_expand "vec_set<mode>"
> > -  [(match_operand:VALL_F16 0 "register_operand")
> > +  [(match_operand:VALL_F16_FULL 0 "register_operand")
> >     (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand")
> >     (match_operand:SI 2 "immediate_operand")]
> >    "TARGET_SIMD"
> > @@ -3503,7 +3525,7 @@ (define_insn "popcount<mode>2"
> >  ;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function.  (This is FP
> smax/smin).
> >  (define_expand "reduc_<optab>_scal_<mode>"
> >    [(match_operand:<VEL> 0 "register_operand")
> > -   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
> > +   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
> >  		 FMAXMINV)]
> >    "TARGET_SIMD"
> >    {
> > @@ -3518,7 +3540,7 @@ (define_expand "reduc_<optab>_scal_<mode>"
> >
> >  (define_expand "reduc_<fmaxmin>_scal_<mode>"
> >    [(match_operand:<VEL> 0 "register_operand")
> > -   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
> > +   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
> >  		 FMAXMINNMV)]
> >    "TARGET_SIMD"
> >    {
> > @@ -3562,8 +3584,8 @@ (define_insn
> "aarch64_reduc_<optab>_internalv2si"
> >  )
> >
> >  (define_insn "aarch64_reduc_<optab>_internal<mode>"
> > - [(set (match_operand:VHSDF 0 "register_operand" "=w")
> > -       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
> > + [(set (match_operand:VHSDF_P 0 "register_operand" "=w")
> > +       (unspec:VHSDF_P [(match_operand:VHSDF_P 1 "register_operand"
> > + "w")]
> >  		      FMAXMINV))]
> >   "TARGET_SIMD"
> >   "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
> > @@ -4208,7 +4230,7 @@ (define_insn
> "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
> >  (define_insn_and_split "aarch64_get_lane<mode>"
> >    [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand"
> "=?r, w, Utv")
> >  	(vec_select:<VEL>
> > -	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
> > +	  (match_operand:VALL_F16_FULL 1 "register_operand" "w, w, w")
> >  	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
> >    "TARGET_SIMD"
> >    {
> > @@ -7989,7 +8011,7 @@ (define_expand "aarch64_st1<VALL_F16:mode>"
> >  ;; Standard pattern name vec_init<mode><Vel>.
> >
> >  (define_expand "vec_init<mode><Vel>"
> > -  [(match_operand:VALL_F16 0 "register_operand")
> > +  [(match_operand:VALL_F16_FULL 0 "register_operand")
> >     (match_operand 1 "" "")]
> >    "TARGET_SIMD"
> >  {
> > @@ -8068,7 +8090,7 @@ (define_insn "aarch64_urecpe<mode>"
> >
> >  (define_expand "vec_extract<mode><Vel>"
> >    [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand")
> > -   (match_operand:VALL_F16 1 "register_operand")
> > +   (match_operand:VALL_F16_FULL 1 "register_operand")
> >     (match_operand:SI 2 "immediate_operand")]
> >    "TARGET_SIMD"
> >  {
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc index
> >
> f05bac713e88ea8c7feaa2367d55bd523ca66f57..1e08f8453688210afe1566092b
> 19
> > b59c9bdd0c97 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -3566,6 +3566,7 @@ aarch64_classify_vector_mode (machine_mode
> mode)
> >      case E_V8BFmode:
> >      case E_V4SFmode:
> >      case E_V2DFmode:
> > +    case E_V2HFmode:
> >        return TARGET_SIMD ? VEC_ADVSIMD : 0;
> >
> >      default:
> > diff --git a/gcc/config/aarch64/iterators.md
> > b/gcc/config/aarch64/iterators.md index
> >
> 37d8161a33b1c399d80be82afa67613a087389d4..1df09f7fe2eb35aed96113476
> 541
> > e0faa5393551 100644
> > --- a/gcc/config/aarch64/iterators.md
> > +++ b/gcc/config/aarch64/iterators.md
> > @@ -160,6 +160,10 @@ (define_mode_iterator VDQF [V2SF V4SF V2DF])
> > (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
> >  			     (V8HF "TARGET_SIMD_F16INST")
> >  			     V2SF V4SF V2DF])
> > +;; Advanced SIMD Float modes suitable for pairwise operations.
> > +(define_mode_iterator VHSDF_P [(V4HF "TARGET_SIMD_F16INST")
> > +			       (V8HF "TARGET_SIMD_F16INST")
> > +			       V2SF V4SF V2DF (V2HF
> "TARGET_SIMD_F16INST")])
> >
> >  ;; Advanced SIMD Float modes, and DF.
> >  (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF]) @@ -188,15
> +192,23
> > @@ (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF
> V2DI])
> > (define_mode_iterator VALLF [V2SF V4SF V2DF SF DF])
> >
> >  ;; Advanced SIMD Float modes with 2 elements.
> > -(define_mode_iterator V2F [V2SF V2DF])
> > +(define_mode_iterator V2F [V2SF V2DF V2HF])
> >
> >  ;; All Advanced SIMD modes on which we support any arithmetic
> operations.
> >  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF
> > V4SF V2DF])
> >
> > -;; All Advanced SIMD modes suitable for moving, loading, and storing.
> > +;; All Advanced SIMD modes suitable for moving, loading, and storing
> > +;; except V2HF.
> >  (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
> >  				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
> >
> > +;; All Advanced SIMD modes suitable for moving, loading, and storing
> > +;; including V2HF (define_mode_iterator VALL_F16_FULL [V8QI V16QI
> > +V4HI V8HI V2SI V4SI V2DI
> > +				     V4HF V8HF V4BF V8BF V2SF V4SF V2DF
> > +				     (V2HF "TARGET_SIMD_F16INST")])
> 
> This name might cause confusion with the SVE iterators, where FULL means
> "every bit of the register is used".  How about something like VMOVE
> instead?
> 
> With this change, I guess VALL_F16 represents "The set of all modes for
> which the vld1 intrinsics are provided" and VMOVE or whatever is "All
> Advanced SIMD modes suitable for moving, loading, and storing".
> That is, VMOVE extends VALL_F16 with modes that are not manifested via
> intrinsics.
> 
> > +
> > +
> >  ;; The VALL_F16 modes except the 128-bit 2-element ones.
> >  (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI
> V4SI
> >  				V4HF V8HF V2SF V4SF])
> > @@ -1076,7 +1088,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI
> "16")
> >  			  (V2SF "2") (V4SF "4")
> >  			  (V1DF "1") (V2DF "2")
> >  			  (DI "1") (DF "1")
> > -			  (V8DI "8")])
> > +			  (V8DI "8") (V2HF "2")])
> >
> >  ;; Map a mode to the number of bits in it, if the size of the mode
> > ;; is constant.
> > @@ -1090,6 +1102,7 @@ (define_mode_attr s [(HF "h") (SF "s") (DF "d")
> > (SI "s") (DI "d")])
> >
> >  ;; Give the length suffix letter for a sign- or zero-extension.
> >  (define_mode_attr size [(QI "b") (HI "h") (SI "w")])
> > +(define_mode_attr sizel [(QI "b") (HI "h") (SI "")])
> >
> >  ;; Give the number of bits in the mode  (define_mode_attr sizen [(QI
> > "8") (HI "16") (SI "32") (DI "64")]) @@ -1134,8 +1147,9 @@
> > (define_mode_attr Vtype [(V8QI "8b") (V16QI "16b")
> >                           (V2SI "2s") (V4SI  "4s")
> >                           (DI   "1d") (DF    "1d")
> >                           (V2DI "2d") (V2SF "2s")
> > -			 (V4SF "4s") (V2DF "2d")
> > -			 (V4HF "4h") (V8HF "8h")
> > +			 (V2HF "2h") (V4SF "4s")
> > +			 (V2DF "2d") (V4HF "4h")
> > +			 (V8HF "8h")
> >  			 (V2x8QI "8b") (V2x4HI "4h")
> >  			 (V2x2SI "2s") (V2x1DI  "1d")
> >  			 (V2x4HF "4h") (V2x2SF "2s")
> 
> Where is the 2h used, and is it valid syntax in that context?
> 

The single instance in the ISA where 2h is valid syntax is for faddp.
I'll double check the usage contexts but it should be the only place.

I'll check and get back to you as I respin the patch.

Thanks,
Tamar


> Same for later instances of 2h.
> 
> Thanks,
> Richard
> 
> > @@ -1175,9 +1189,10 @@ (define_mode_attr Vmtype [(V8QI ".8b")
> (V16QI ".16b")
> >  			 (V4HI ".4h") (V8HI  ".8h")
> >  			 (V2SI ".2s") (V4SI  ".4s")
> >  			 (V2DI ".2d") (V4HF ".4h")
> > -			 (V8HF ".8h") (V4BF ".4h")
> > -			 (V8BF ".8h") (V2SF ".2s")
> > -			 (V4SF ".4s") (V2DF ".2d")
> > +			 (V8HF ".8h") (V2HF ".2h")
> > +			 (V4BF ".4h") (V8BF ".8h")
> > +			 (V2SF ".2s") (V4SF ".4s")
> > +			 (V2DF ".2d")
> >  			 (DI   "")    (SI   "")
> >  			 (HI   "")    (QI   "")
> >  			 (TI   "")    (HF   "")
> > @@ -1193,7 +1208,7 @@ (define_mode_attr Vmntype [(V8HI ".8b") (V4SI
> > ".4h")  (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
> >  			  (V4HI "h") (V8HI  "h")
> >  			  (V2SI "s") (V4SI  "s")
> > -			  (V2DI "d")
> > +			  (V2DI "d") (V2HF  "h")
> >  			  (V4HF "h") (V8HF  "h")
> >  			  (V2SF "s") (V4SF  "s")
> >  			  (V2DF "d")
> > @@ -1285,7 +1300,7 @@ (define_mode_attr Vcwtype [(VNx16QI "b")
> (VNx8QI
> > "h") (VNx4QI "w") (VNx2QI "d")  ;; more accurately.
> >  (define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
> >  			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
> > -			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
> > +			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d") (V2HF
> "s")
> >  			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
> >  			 (SI "s") (DI "d")])
> >
> > @@ -1360,8 +1375,8 @@ (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
> >  		       (V4HF "HF") (V8HF  "HF")
> >  		       (V2SF "SF") (V4SF  "SF")
> >  		       (DF   "DF") (V2DF  "DF")
> > -		       (SI   "SI") (HI    "HI")
> > -		       (QI   "QI")
> > +		       (SI   "SI") (V2HF  "HF")
> > +		       (QI   "QI") (HI    "HI")
> >  		       (V4BF "BF") (V8BF "BF")
> >  		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI
> "QI")
> >  		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI") @@ -1381,7
> +1396,7
> > @@ (define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
> >  		       (V2SF "sf") (V4SF "sf")
> >  		       (V2DF "df") (DF   "df")
> >  		       (SI   "si") (HI   "hi")
> > -		       (QI   "qi")
> > +		       (QI   "qi") (V2HF "hf")
> >  		       (V4BF "bf") (V8BF "bf")
> >  		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
> >  		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi") @@ -1866,7
> +1881,7
> > @@ (define_mode_attr q [(V8QI "") (V16QI "_q")
> >  		     (V4HF "") (V8HF "_q")
> >  		     (V4BF "") (V8BF "_q")
> >  		     (V2SF "") (V4SF  "_q")
> > -			       (V2DF  "_q")
> > +		     (V2HF "") (V2DF  "_q")
> >  		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")
> >  		     (V2x8QI "") (V2x16QI "_q")
> >  		     (V2x4HI "") (V2x8HI "_q")
> > @@ -1905,6 +1920,7 @@ (define_mode_attr vp [(V8QI "v") (V16QI "v")
> >  		      (V2SI "p") (V4SI  "v")
> >  		      (V2DI "p") (V2DF  "p")
> >  		      (V2SF "p") (V4SF  "v")
> > +		      (V2HF "p")
> >  		      (V4HF "v") (V8HF  "v")])
> >
> >  (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi") diff --git
> > a/gcc/config/arm/types.md b/gcc/config/arm/types.md index
> >
> 7d0504bdd944e9c0d1b545b0b66a9a1adc808714..3cfbc7a93cca1bea4925853e5
> 1d0
> > a147c5722247 100644
> > --- a/gcc/config/arm/types.md
> > +++ b/gcc/config/arm/types.md
> > @@ -483,6 +483,7 @@ (define_attr "autodetect_type"
> >  ; neon_fp_minmax_s_q
> >  ; neon_fp_minmax_d
> >  ; neon_fp_minmax_d_q
> > +; neon_fp_reduc_add_h
> >  ; neon_fp_reduc_add_s
> >  ; neon_fp_reduc_add_s_q
> >  ; neon_fp_reduc_add_d
> > @@ -1033,6 +1034,7 @@ (define_attr "type"
> >    neon_fp_minmax_d,\
> >    neon_fp_minmax_d_q,\
> >  \
> > +  neon_fp_reduc_add_h,\
> >    neon_fp_reduc_add_s,\
> >    neon_fp_reduc_add_s_q,\
> >    neon_fp_reduc_add_d,\
> > @@ -1257,8 +1259,8 @@ (define_attr "is_neon_type" "yes,no"
> >            neon_fp_compare_d, neon_fp_compare_d_q, neon_fp_minmax_s,\
> >            neon_fp_minmax_s_q, neon_fp_minmax_d,
> neon_fp_minmax_d_q,\
> >            neon_fp_neg_s, neon_fp_neg_s_q, neon_fp_neg_d,
> neon_fp_neg_d_q,\
> > -          neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,
> neon_fp_reduc_add_d,\
> > -          neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,
> > +          neon_fp_reduc_add_h, neon_fp_reduc_add_s,
> neon_fp_reduc_add_s_q,\
> > +          neon_fp_reduc_add_d, neon_fp_reduc_add_d_q,
> > + neon_fp_reduc_minmax_s,\
> >            neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d,\
> >            neon_fp_reduc_minmax_d_q,\
> >            neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\ diff --git
> > a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> > index
> >
> 07d71a63414b1066ea431e287286ad048515711a..8e35e0b574d49913b43c7d8d
> 4f4b
> > a75f127f42e9 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> > @@ -30,11 +30,9 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int
> n)	\
> >  TEST_ALL (VEC_PERM)
> >
> >  /* We should use one DUP for each of the 8-, 16- and 32-bit types,
> > -   although we currently use LD1RW for _Float16.  We should use two
> > -   DUPs for each of the three 64-bit types.  */
> > +   We should use two DUPs for each of the three 64-bit types.  */
> >  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } }
> > */
> > -/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } }
> > */
> > -/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
> > +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } }
> > +*/
> >  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } }
> > */
> >  /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d,
> > z[0-9]+\.d\n} 3 } } */
> >  /* { dg-final { scan-assembler-not {\tzip2\t} } } */
  
Tamar Christina Nov. 11, 2022, 2:39 p.m. UTC | #3
Hi,


> This name might cause confusion with the SVE iterators, where FULL means
> "every bit of the register is used".  How about something like VMOVE
> instead?
> 
> With this change, I guess VALL_F16 represents "The set of all modes for
> which the vld1 intrinsics are provided" and VMOVE or whatever is "All
> Advanced SIMD modes suitable for moving, loading, and storing".
> That is, VMOVE extends VALL_F16 with modes that are not manifested via
> intrinsics.
> 

Done.

> Where is the 2h used, and is it valid syntax in that context?
> 
> Same for later instances of 2h.

They are, but they weren't meant to be in this patch.  They belong in a separate FP16 series that
I won't get to finish for GCC 13 due to not being able to finish writing all the tests.  I have moved them
to that patch series though.

While the addp patch series has been killed, this patch is still good standalone and improves codegen
as shown in the updated testcase.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
	(mov<mode>, movmisalign<mode>, aarch64_dup_lane<mode>,
	aarch64_store_lane0<mode>, aarch64_simd_vec_set<mode>,
	@aarch64_simd_vec_copy_lane<mode>, vec_set<mode>,
	reduc_<optab>_scal_<mode>, reduc_<fmaxmin>_scal_<mode>,
	aarch64_reduc_<optab>_internal<mode>, aarch64_get_lane<mode>,
	vec_init<mode><Vel>, vec_extract<mode><Vel>): Support V2HF.
	(aarch64_simd_dupv2hf): New.
	* config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
	Add E_V2HFmode.
	* config/aarch64/iterators.md (VHSDF_P): New.
	(V2F, VMOVE, nunits, Vtype, Vmtype, Vetype, stype, VEL,
	Vel, q, vp): Add V2HF.
	* config/arm/types.md (neon_fp_reduc_add_h): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/slp_1.c: Update testcase.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f4152160084d6b6f34bd69f0ba6386c1ab50f77e..487a31010245accec28e779661e6c2d578fca4b7 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,10 +19,10 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
+	(match_operand:VMOVE 1 "general_operand"))]
   "TARGET_SIMD"
-  "
+{
   /* Force the operand into a register if it is not an
      immediate whose use can be replaced with xzr.
      If the mode is 16 bytes wide, then we will be doing
@@ -46,12 +46,11 @@ (define_expand "mov<mode>"
       aarch64_expand_vector_init (operands[0], operands[1]);
       DONE;
     }
-  "
-)
+})
 
 (define_expand "movmisalign<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-        (match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
+        (match_operand:VMOVE 1 "general_operand"))]
   "TARGET_SIMD && !STRICT_ALIGNMENT"
 {
   /* This pattern is not permitted to fail during expansion: if both arguments
@@ -73,6 +72,16 @@ (define_insn "aarch64_simd_dup<mode>"
   [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
+(define_insn "aarch64_simd_dupv2hf"
+  [(set (match_operand:V2HF 0 "register_operand" "=w")
+	(vec_duplicate:V2HF
+	  (match_operand:HF 1 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "@
+   sli\\t%d0, %d1, 16"
+  [(set_attr "type" "neon_shift_imm")]
+)
+
 (define_insn "aarch64_simd_dup<mode>"
   [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
 	(vec_duplicate:VDQF_F16
@@ -85,10 +94,10 @@ (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_dup_lane<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_duplicate:VALL_F16
+  [(set (match_operand:VMOVE 0 "register_operand" "=w")
+	(vec_duplicate:VMOVE
 	  (vec_select:<VEL>
-	    (match_operand:VALL_F16 1 "register_operand" "w")
+	    (match_operand:VMOVE 1 "register_operand" "w")
 	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
           )))]
   "TARGET_SIMD"
@@ -142,6 +151,29 @@ (define_insn "*aarch64_simd_mov<VDMOV:mode>"
 		     mov_reg, neon_move<q>")]
 )
 
+(define_insn "*aarch64_simd_movv2hf"
+  [(set (match_operand:V2HF 0 "nonimmediate_operand"
+		"=w, m,  m,  w, ?r, ?w, ?r, w, w")
+	(match_operand:V2HF 1 "general_operand"
+		"m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
+  "TARGET_SIMD_F16INST
+   && (register_operand (operands[0], V2HFmode)
+       || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
+   "@
+    ldr\\t%s0, %1
+    str\\twzr, %0
+    str\\t%s1, %0
+    mov\\t%0.2s[0], %1.2s[0]
+    umov\\t%w0, %1.s[0]
+    fmov\\t%s0, %1
+    mov\\t%0, %1
+    movi\\t%d0, 0
+    * return aarch64_output_simd_mov_immediate (operands[1], 32);"
+  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
+		     neon_logic, neon_to_gp, f_mcr,\
+		     mov_reg, neon_move, neon_move")]
+)
+
 (define_insn "*aarch64_simd_mov<VQMOV:mode>"
   [(set (match_operand:VQMOV 0 "nonimmediate_operand"
 		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
@@ -182,7 +214,7 @@ (define_insn "*aarch64_simd_mov<VQMOV:mode>"
 
 (define_insn "aarch64_store_lane0<mode>"
   [(set (match_operand:<VEL> 0 "memory_operand" "=m")
-	(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w")
+	(vec_select:<VEL> (match_operand:VMOVE 1 "register_operand" "w")
 			(parallel [(match_operand 2 "const_int_operand" "n")])))]
   "TARGET_SIMD
    && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
@@ -1035,11 +1067,11 @@ (define_insn "one_cmpl<mode>2"
 )
 
 (define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
-	(vec_merge:VALL_F16
-	    (vec_duplicate:VALL_F16
+  [(set (match_operand:VMOVE 0 "register_operand" "=w,w,w")
+	(vec_merge:VMOVE
+	    (vec_duplicate:VMOVE
 		(match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "w,?r,Utv"))
-	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
+	    (match_operand:VMOVE 3 "register_operand" "0,0,0")
 	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
   "TARGET_SIMD"
   {
@@ -1061,14 +1093,14 @@ (define_insn "aarch64_simd_vec_set<mode>"
 )
 
 (define_insn "@aarch64_simd_vec_copy_lane<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_merge:VALL_F16
-	    (vec_duplicate:VALL_F16
+  [(set (match_operand:VMOVE 0 "register_operand" "=w")
+	(vec_merge:VMOVE
+	    (vec_duplicate:VMOVE
 	      (vec_select:<VEL>
-		(match_operand:VALL_F16 3 "register_operand" "w")
+		(match_operand:VMOVE 3 "register_operand" "w")
 		(parallel
 		  [(match_operand:SI 4 "immediate_operand" "i")])))
-	    (match_operand:VALL_F16 1 "register_operand" "0")
+	    (match_operand:VMOVE 1 "register_operand" "0")
 	    (match_operand:SI 2 "immediate_operand" "i")))]
   "TARGET_SIMD"
   {
@@ -1376,7 +1408,7 @@ (define_insn "vec_shr_<mode>"
 )
 
 (define_expand "vec_set<mode>"
-  [(match_operand:VALL_F16 0 "register_operand")
+  [(match_operand:VMOVE 0 "register_operand")
    (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand")
    (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
@@ -3495,7 +3527,7 @@ (define_insn "popcount<mode>2"
 ;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function.  (This is FP smax/smin).
 (define_expand "reduc_<optab>_scal_<mode>"
   [(match_operand:<VEL> 0 "register_operand")
-   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
+   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
 		 FMAXMINV)]
   "TARGET_SIMD"
   {
@@ -3510,7 +3542,7 @@ (define_expand "reduc_<optab>_scal_<mode>"
 
 (define_expand "reduc_<fmaxmin>_scal_<mode>"
   [(match_operand:<VEL> 0 "register_operand")
-   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
+   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
 		 FMAXMINNMV)]
   "TARGET_SIMD"
   {
@@ -3554,8 +3586,8 @@ (define_insn "aarch64_reduc_<optab>_internalv2si"
 )
 
 (define_insn "aarch64_reduc_<optab>_internal<mode>"
- [(set (match_operand:VHSDF 0 "register_operand" "=w")
-       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
+ [(set (match_operand:VHSDF_P 0 "register_operand" "=w")
+       (unspec:VHSDF_P [(match_operand:VHSDF_P 1 "register_operand" "w")]
 		      FMAXMINV))]
  "TARGET_SIMD"
  "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
@@ -4200,7 +4232,7 @@ (define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
 (define_insn_and_split "aarch64_get_lane<mode>"
   [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv")
 	(vec_select:<VEL>
-	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
+	  (match_operand:VMOVE 1 "register_operand" "w, w, w")
 	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
   "TARGET_SIMD"
   {
@@ -7981,7 +8013,7 @@ (define_expand "aarch64_st1<VALL_F16:mode>"
 ;; Standard pattern name vec_init<mode><Vel>.
 
 (define_expand "vec_init<mode><Vel>"
-  [(match_operand:VALL_F16 0 "register_operand")
+  [(match_operand:VMOVE 0 "register_operand")
    (match_operand 1 "" "")]
   "TARGET_SIMD"
 {
@@ -8060,7 +8092,7 @@ (define_insn "aarch64_urecpe<mode>"
 
 (define_expand "vec_extract<mode><Vel>"
   [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand")
-   (match_operand:VALL_F16 1 "register_operand")
+   (match_operand:VMOVE 1 "register_operand")
    (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
 {
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 84dbe2f4ea7d03b424602ed98a34e7824217dc91..35671cb86e374f9ded21d0e4944c63bc2cbc0901 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3566,6 +3566,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V8BFmode:
     case E_V4SFmode:
     case E_V2DFmode:
+    case E_V2HFmode:
       return TARGET_SIMD ? VEC_ADVSIMD : 0;
 
     default:
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 37d8161a33b1c399d80be82afa67613a087389d4..dfcf86a440e316c2abdbcc646363d39e458d1a91 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -160,6 +160,10 @@ (define_mode_iterator VDQF [V2SF V4SF V2DF])
 (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
 			     (V8HF "TARGET_SIMD_F16INST")
 			     V2SF V4SF V2DF])
+;; Advanced SIMD Float modes suitable for pairwise operations.
+(define_mode_iterator VHSDF_P [(V4HF "TARGET_SIMD_F16INST")
+			       (V8HF "TARGET_SIMD_F16INST")
+			       V2SF V4SF V2DF (V2HF "TARGET_SIMD_F16INST")])
 
 ;; Advanced SIMD Float modes, and DF.
 (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF])
@@ -188,15 +192,23 @@ (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])
 (define_mode_iterator VALLF [V2SF V4SF V2DF SF DF])
 
 ;; Advanced SIMD Float modes with 2 elements.
-(define_mode_iterator V2F [V2SF V2DF])
+(define_mode_iterator V2F [V2SF V2DF V2HF])
 
 ;; All Advanced SIMD modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
-;; All Advanced SIMD modes suitable for moving, loading, and storing.
+;; All Advanced SIMD modes suitable for moving, loading, and storing
+;; except V2HF.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing
+;; including V2HF
+(define_mode_iterator VMOVE [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+			     V4HF V8HF V4BF V8BF V2SF V4SF V2DF
+			     (V2HF "TARGET_SIMD_F16INST")])
+
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
 				V4HF V8HF V2SF V4SF])
@@ -1076,7 +1088,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI "16")
 			  (V2SF "2") (V4SF "4")
 			  (V1DF "1") (V2DF "2")
 			  (DI "1") (DF "1")
-			  (V8DI "8")])
+			  (V8DI "8") (V2HF "2")])
 
 ;; Map a mode to the number of bits in it, if the size of the mode
 ;; is constant.
@@ -1090,6 +1102,7 @@ (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
 
 ;; Give the length suffix letter for a sign- or zero-extension.
 (define_mode_attr size [(QI "b") (HI "h") (SI "w")])
+(define_mode_attr sizel [(QI "b") (HI "h") (SI "")])
 
 ;; Give the number of bits in the mode
 (define_mode_attr sizen [(QI "8") (HI "16") (SI "32") (DI "64")])
@@ -1193,7 +1206,7 @@ (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h")
 (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
 			  (V4HI "h") (V8HI  "h")
 			  (V2SI "s") (V4SI  "s")
-			  (V2DI "d")
+			  (V2DI "d") (V2HF  "h")
 			  (V4HF "h") (V8HF  "h")
 			  (V2SF "s") (V4SF  "s")
 			  (V2DF "d")
@@ -1285,7 +1298,7 @@ (define_mode_attr Vcwtype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "w") (VNx2QI "d")
 ;; more accurately.
 (define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
 			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
-			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
+			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d") (V2HF "s")
 			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
 			 (SI "s") (DI "d")])
 
@@ -1360,8 +1373,8 @@ (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
 		       (V4HF "HF") (V8HF  "HF")
 		       (V2SF "SF") (V4SF  "SF")
 		       (DF   "DF") (V2DF  "DF")
-		       (SI   "SI") (HI    "HI")
-		       (QI   "QI")
+		       (SI   "SI") (V2HF  "HF")
+		       (QI   "QI") (HI    "HI")
 		       (V4BF "BF") (V8BF "BF")
 		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
 		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
@@ -1381,7 +1394,7 @@ (define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
 		       (V2SF "sf") (V4SF "sf")
 		       (V2DF "df") (DF   "df")
 		       (SI   "si") (HI   "hi")
-		       (QI   "qi")
+		       (QI   "qi") (V2HF "hf")
 		       (V4BF "bf") (V8BF "bf")
 		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
 		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
@@ -1866,7 +1879,7 @@ (define_mode_attr q [(V8QI "") (V16QI "_q")
 		     (V4HF "") (V8HF "_q")
 		     (V4BF "") (V8BF "_q")
 		     (V2SF "") (V4SF  "_q")
-			       (V2DF  "_q")
+		     (V2HF "") (V2DF  "_q")
 		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")
 		     (V2x8QI "") (V2x16QI "_q")
 		     (V2x4HI "") (V2x8HI "_q")
@@ -1905,6 +1918,7 @@ (define_mode_attr vp [(V8QI "v") (V16QI "v")
 		      (V2SI "p") (V4SI  "v")
 		      (V2DI "p") (V2DF  "p")
 		      (V2SF "p") (V4SF  "v")
+		      (V2HF "p")
 		      (V4HF "v") (V8HF  "v")])
 
 (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index 7d0504bdd944e9c0d1b545b0b66a9a1adc808714..3cfbc7a93cca1bea4925853e51d0a147c5722247 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -483,6 +483,7 @@ (define_attr "autodetect_type"
 ; neon_fp_minmax_s_q
 ; neon_fp_minmax_d
 ; neon_fp_minmax_d_q
+; neon_fp_reduc_add_h
 ; neon_fp_reduc_add_s
 ; neon_fp_reduc_add_s_q
 ; neon_fp_reduc_add_d
@@ -1033,6 +1034,7 @@ (define_attr "type"
   neon_fp_minmax_d,\
   neon_fp_minmax_d_q,\
 \
+  neon_fp_reduc_add_h,\
   neon_fp_reduc_add_s,\
   neon_fp_reduc_add_s_q,\
   neon_fp_reduc_add_d,\
@@ -1257,8 +1259,8 @@ (define_attr "is_neon_type" "yes,no"
           neon_fp_compare_d, neon_fp_compare_d_q, neon_fp_minmax_s,\
           neon_fp_minmax_s_q, neon_fp_minmax_d, neon_fp_minmax_d_q,\
           neon_fp_neg_s, neon_fp_neg_s_q, neon_fp_neg_d, neon_fp_neg_d_q,\
-          neon_fp_reduc_add_s, neon_fp_reduc_add_s_q, neon_fp_reduc_add_d,\
-          neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,
+          neon_fp_reduc_add_h, neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
+          neon_fp_reduc_add_d, neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,\
           neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d,\
           neon_fp_reduc_minmax_d_q,\
           neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
index 07d71a63414b1066ea431e287286ad048515711a..e6021c5a42748701e5326a5c387a39a0bbadc9e5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
@@ -30,11 +30,9 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)	\
 TEST_ALL (VEC_PERM)
 
 /* We should use one DUP for each of the 8-, 16- and 32-bit types,
-   although we currently use LD1RW for _Float16.  We should use two
-   DUPs for each of the three 64-bit types.  */
+   and two DUPs for each of the three 64-bit types.  */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } } */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
 /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tzip2\t} } } */
@@ -53,7 +51,7 @@ TEST_ALL (VEC_PERM)
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
 /* { dg-final { scan-assembler-not {\tldr} } } */
-/* { dg-final { scan-assembler-times {\tstr} 2 } } */
-/* { dg-final { scan-assembler-times {\tstr\th[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-not {\tstr} } } */
+/* { dg-final { scan-assembler-not {\tstr\th[0-9]+} } } */
 
 /* { dg-final { scan-assembler-not {\tuqdec} } } */
  
Tamar Christina Nov. 22, 2022, 4:01 p.m. UTC | #4
Ping

> -----Original Message-----
> From: Gcc-patches <gcc-patches-
> bounces+tamar.christina=arm.com@gcc.gnu.org> On Behalf Of Tamar
> Christina via Gcc-patches
> Sent: Friday, November 11, 2022 2:40 PM
> To: Richard Sandiford <Richard.Sandiford@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: RE: [PATCH 5/8]AArch64 aarch64: Make existing V2HF be usable.
> 
> Hi,
> 
> 
> > This name might cause confusion with the SVE iterators, where FULL
> > means "every bit of the register is used".  How about something like
> > VMOVE instead?
> >
> > With this change, I guess VALL_F16 represents "The set of all modes
> > for which the vld1 intrinsics are provided" and VMOVE or whatever is
> > "All Advanced SIMD modes suitable for moving, loading, and storing".
> > That is, VMOVE extends VALL_F16 with modes that are not manifested via
> > intrinsics.
> >
> 
> Done.
> 
> > Where is the 2h used, and is it valid syntax in that context?
> >
> > Same for later instances of 2h.
> 
> They are, but they weren't meant to be in this patch.  They belong in a
> separate FP16 series that I won't get to finish for GCC 13 due to not being able
> to finish writing all the tests.  I have moved them to that patch series though.
> 
> While the addp patch series has been killed, this patch is still good standalone
> and improves codegen as shown in the updated testcase.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
> 	(mov<mode>, movmisalign<mode>, aarch64_dup_lane<mode>,
> 	aarch64_store_lane0<mode>, aarch64_simd_vec_set<mode>,
> 	@aarch64_simd_vec_copy_lane<mode>, vec_set<mode>,
> 	reduc_<optab>_scal_<mode>, reduc_<fmaxmin>_scal_<mode>,
> 	aarch64_reduc_<optab>_internal<mode>,
> aarch64_get_lane<mode>,
> 	vec_init<mode><Vel>, vec_extract<mode><Vel>): Support V2HF.
> 	(aarch64_simd_dupv2hf): New.
> 	* config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
> 	Add E_V2HFmode.
> 	* config/aarch64/iterators.md (VHSDF_P): New.
> 	(V2F, VMOVE, nunits, Vtype, Vmtype, Vetype, stype, VEL,
> 	Vel, q, vp): Add V2HF.
> 	* config/arm/types.md (neon_fp_reduc_add_h): New.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/sve/slp_1.c: Update testcase.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index
> f4152160084d6b6f34bd69f0ba6386c1ab50f77e..487a31010245accec28e779661
> e6c2d578fca4b7 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -19,10 +19,10 @@
>  ;; <http://www.gnu.org/licenses/>.
> 
>  (define_expand "mov<mode>"
> -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> -	(match_operand:VALL_F16 1 "general_operand"))]
> +  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
> +	(match_operand:VMOVE 1 "general_operand"))]
>    "TARGET_SIMD"
> -  "
> +{
>    /* Force the operand into a register if it is not an
>       immediate whose use can be replaced with xzr.
>       If the mode is 16 bytes wide, then we will be doing @@ -46,12 +46,11 @@
> (define_expand "mov<mode>"
>        aarch64_expand_vector_init (operands[0], operands[1]);
>        DONE;
>      }
> -  "
> -)
> +})
> 
>  (define_expand "movmisalign<mode>"
> -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> -        (match_operand:VALL_F16 1 "general_operand"))]
> +  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
> +        (match_operand:VMOVE 1 "general_operand"))]
>    "TARGET_SIMD && !STRICT_ALIGNMENT"
>  {
>    /* This pattern is not permitted to fail during expansion: if both arguments
> @@ -73,6 +72,16 @@ (define_insn "aarch64_simd_dup<mode>"
>    [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
>  )
> 
> +(define_insn "aarch64_simd_dupv2hf"
> +  [(set (match_operand:V2HF 0 "register_operand" "=w")
> +	(vec_duplicate:V2HF
> +	  (match_operand:HF 1 "register_operand" "0")))]
> +  "TARGET_SIMD"
> +  "@
> +   sli\\t%d0, %d1, 16"
> +  [(set_attr "type" "neon_shift_imm")]
> +)
> +
>  (define_insn "aarch64_simd_dup<mode>"
>    [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
>  	(vec_duplicate:VDQF_F16
> @@ -85,10 +94,10 @@ (define_insn "aarch64_simd_dup<mode>"
>  )
> 
>  (define_insn "aarch64_dup_lane<mode>"
> -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> -	(vec_duplicate:VALL_F16
> +  [(set (match_operand:VMOVE 0 "register_operand" "=w")
> +	(vec_duplicate:VMOVE
>  	  (vec_select:<VEL>
> -	    (match_operand:VALL_F16 1 "register_operand" "w")
> +	    (match_operand:VMOVE 1 "register_operand" "w")
>  	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
>            )))]
>    "TARGET_SIMD"
> @@ -142,6 +151,29 @@ (define_insn
> "*aarch64_simd_mov<VDMOV:mode>"
>  		     mov_reg, neon_move<q>")]
>  )
> 
> +(define_insn "*aarch64_simd_movv2hf"
> +  [(set (match_operand:V2HF 0 "nonimmediate_operand"
> +		"=w, m,  m,  w, ?r, ?w, ?r, w, w")
> +	(match_operand:V2HF 1 "general_operand"
> +		"m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
> +  "TARGET_SIMD_F16INST
> +   && (register_operand (operands[0], V2HFmode)
> +       || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
> +   "@
> +    ldr\\t%s0, %1
> +    str\\twzr, %0
> +    str\\t%s1, %0
> +    mov\\t%0.2s[0], %1.2s[0]
> +    umov\\t%w0, %1.s[0]
> +    fmov\\t%s0, %w1
> +    mov\\t%w0, %w1
> +    movi\\t%d0, 0
> +    * return aarch64_output_simd_mov_immediate (operands[1], 32);"
> +  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
> +		     neon_logic, neon_to_gp, f_mcr,\
> +		     mov_reg, neon_move, neon_move")]
> +)
> +
>  (define_insn "*aarch64_simd_mov<VQMOV:mode>"
>    [(set (match_operand:VQMOV 0 "nonimmediate_operand"
>  		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
> @@ -182,7 +214,7 @@ (define_insn "*aarch64_simd_mov<VQMOV:mode>"
> 
>  (define_insn "aarch64_store_lane0<mode>"
>    [(set (match_operand:<VEL> 0 "memory_operand" "=m")
> -	(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand"
> "w")
> +	(vec_select:<VEL> (match_operand:VMOVE 1 "register_operand"
> "w")
>  			(parallel [(match_operand 2 "const_int_operand"
> "n")])))]
>    "TARGET_SIMD
>     && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
> @@ -1035,11 +1067,11 @@ (define_insn "one_cmpl<mode>2"
>  )
> 
>  (define_insn "aarch64_simd_vec_set<mode>"
> -  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
> -	(vec_merge:VALL_F16
> -	    (vec_duplicate:VALL_F16
> +  [(set (match_operand:VMOVE 0 "register_operand" "=w,w,w")
> +	(vec_merge:VMOVE
> +	    (vec_duplicate:VMOVE
>  		(match_operand:<VEL> 1
> "aarch64_simd_nonimmediate_operand" "w,?r,Utv"))
> -	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
> +	    (match_operand:VMOVE 3 "register_operand" "0,0,0")
>  	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
>    "TARGET_SIMD"
>    {
> @@ -1061,14 +1093,14 @@ (define_insn "aarch64_simd_vec_set<mode>"
>  )
> 
>  (define_insn "@aarch64_simd_vec_copy_lane<mode>"
> -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> -	(vec_merge:VALL_F16
> -	    (vec_duplicate:VALL_F16
> +  [(set (match_operand:VMOVE 0 "register_operand" "=w")
> +	(vec_merge:VMOVE
> +	    (vec_duplicate:VMOVE
>  	      (vec_select:<VEL>
> -		(match_operand:VALL_F16 3 "register_operand" "w")
> +		(match_operand:VMOVE 3 "register_operand" "w")
>  		(parallel
>  		  [(match_operand:SI 4 "immediate_operand" "i")])))
> -	    (match_operand:VALL_F16 1 "register_operand" "0")
> +	    (match_operand:VMOVE 1 "register_operand" "0")
>  	    (match_operand:SI 2 "immediate_operand" "i")))]
>    "TARGET_SIMD"
>    {
> @@ -1376,7 +1408,7 @@ (define_insn "vec_shr_<mode>"
>  )
> 
>  (define_expand "vec_set<mode>"
> -  [(match_operand:VALL_F16 0 "register_operand")
> +  [(match_operand:VMOVE 0 "register_operand")
>     (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand")
>     (match_operand:SI 2 "immediate_operand")]
>    "TARGET_SIMD"
> @@ -3495,7 +3527,7 @@ (define_insn "popcount<mode>2"
>  ;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function.  (This is FP
> smax/smin).
>  (define_expand "reduc_<optab>_scal_<mode>"
>    [(match_operand:<VEL> 0 "register_operand")
> -   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
> +   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
>  		 FMAXMINV)]
>    "TARGET_SIMD"
>    {
> @@ -3510,7 +3542,7 @@ (define_expand "reduc_<optab>_scal_<mode>"
> 
>  (define_expand "reduc_<fmaxmin>_scal_<mode>"
>    [(match_operand:<VEL> 0 "register_operand")
> -   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
> +   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
>  		 FMAXMINNMV)]
>    "TARGET_SIMD"
>    {
> @@ -3554,8 +3586,8 @@ (define_insn
> "aarch64_reduc_<optab>_internalv2si"
>  )
> 
>  (define_insn "aarch64_reduc_<optab>_internal<mode>"
> - [(set (match_operand:VHSDF 0 "register_operand" "=w")
> -       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
> + [(set (match_operand:VHSDF_P 0 "register_operand" "=w")
> +       (unspec:VHSDF_P [(match_operand:VHSDF_P 1 "register_operand"
> + "w")]
>  		      FMAXMINV))]
>   "TARGET_SIMD"
>   "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
> @@ -4200,7 +4232,7 @@ (define_insn
> "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
>  (define_insn_and_split "aarch64_get_lane<mode>"
>    [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand"
> "=?r, w, Utv")
>  	(vec_select:<VEL>
> -	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
> +	  (match_operand:VMOVE 1 "register_operand" "w, w, w")
>  	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
>    "TARGET_SIMD"
>    {
> @@ -7981,7 +8013,7 @@ (define_expand "aarch64_st1<VALL_F16:mode>"
>  ;; Standard pattern name vec_init<mode><Vel>.
> 
>  (define_expand "vec_init<mode><Vel>"
> -  [(match_operand:VALL_F16 0 "register_operand")
> +  [(match_operand:VMOVE 0 "register_operand")
>     (match_operand 1 "" "")]
>    "TARGET_SIMD"
>  {
> @@ -8060,7 +8092,7 @@ (define_insn "aarch64_urecpe<mode>"
> 
>  (define_expand "vec_extract<mode><Vel>"
>    [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand")
> -   (match_operand:VALL_F16 1 "register_operand")
> +   (match_operand:VMOVE 1 "register_operand")
>     (match_operand:SI 2 "immediate_operand")]
>    "TARGET_SIMD"
>  {
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index
> 84dbe2f4ea7d03b424602ed98a34e7824217dc91..35671cb86e374f9ded21d0e4
> 944c63bc2cbc0901 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -3566,6 +3566,7 @@ aarch64_classify_vector_mode (machine_mode
> mode)
>      case E_V8BFmode:
>      case E_V4SFmode:
>      case E_V2DFmode:
> +    case E_V2HFmode:
>        return TARGET_SIMD ? VEC_ADVSIMD : 0;
> 
>      default:
> diff --git a/gcc/config/aarch64/iterators.md
> b/gcc/config/aarch64/iterators.md index
> 37d8161a33b1c399d80be82afa67613a087389d4..dfcf86a440e316c2abdbcc6463
> 63d39e458d1a91 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -160,6 +160,10 @@ (define_mode_iterator VDQF [V2SF V4SF V2DF])
> (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
>  			     (V8HF "TARGET_SIMD_F16INST")
>  			     V2SF V4SF V2DF])
> +;; Advanced SIMD Float modes suitable for pairwise operations.
> +(define_mode_iterator VHSDF_P [(V4HF "TARGET_SIMD_F16INST")
> +			       (V8HF "TARGET_SIMD_F16INST")
> +			       V2SF V4SF V2DF (V2HF
> "TARGET_SIMD_F16INST")])
> 
>  ;; Advanced SIMD Float modes, and DF.
>  (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF]) @@ -188,15 +192,23
> @@ (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])
> (define_mode_iterator VALLF [V2SF V4SF V2DF SF DF])
> 
>  ;; Advanced SIMD Float modes with 2 elements.
> -(define_mode_iterator V2F [V2SF V2DF])
> +(define_mode_iterator V2F [V2SF V2DF V2HF])
> 
>  ;; All Advanced SIMD modes on which we support any arithmetic operations.
>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF
> V4SF V2DF])
> 
> -;; All Advanced SIMD modes suitable for moving, loading, and storing.
> +;; All Advanced SIMD modes suitable for moving, loading, and storing
> +;; except V2HF.
>  (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
>  				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
> 
> +;; All Advanced SIMD modes suitable for moving, loading, and storing
> +;; including V2HF
> +(define_mode_iterator VMOVE [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
> +			     V4HF V8HF V4BF V8BF V2SF V4SF V2DF
> +			     (V2HF "TARGET_SIMD_F16INST")])
> +
> +
>  ;; The VALL_F16 modes except the 128-bit 2-element ones.
>  (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI
> V4SI
>  				V4HF V8HF V2SF V4SF])
> @@ -1076,7 +1088,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI "16")
>  			  (V2SF "2") (V4SF "4")
>  			  (V1DF "1") (V2DF "2")
>  			  (DI "1") (DF "1")
> -			  (V8DI "8")])
> +			  (V8DI "8") (V2HF "2")])
> 
>  ;; Map a mode to the number of bits in it, if the size of the mode  ;; is
> constant.
> @@ -1090,6 +1102,7 @@ (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI
> "s") (DI "d")])
> 
>  ;; Give the length suffix letter for a sign- or zero-extension.
>  (define_mode_attr size [(QI "b") (HI "h") (SI "w")])
> +(define_mode_attr sizel [(QI "b") (HI "h") (SI "")])
> 
>  ;; Give the number of bits in the mode
>  (define_mode_attr sizen [(QI "8") (HI "16") (SI "32") (DI "64")]) @@ -1193,7
> +1206,7 @@ (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h")
> (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
>  			  (V4HI "h") (V8HI  "h")
>  			  (V2SI "s") (V4SI  "s")
> -			  (V2DI "d")
> +			  (V2DI "d") (V2HF  "h")
>  			  (V4HF "h") (V8HF  "h")
>  			  (V2SF "s") (V4SF  "s")
>  			  (V2DF "d")
> @@ -1285,7 +1298,7 @@ (define_mode_attr Vcwtype [(VNx16QI "b")
> (VNx8QI "h") (VNx4QI "w") (VNx2QI "d")  ;; more accurately.
>  (define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
>  			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
> -			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
> +			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d") (V2HF
> "s")
>  			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
>  			 (SI "s") (DI "d")])
> 
> @@ -1360,8 +1373,8 @@ (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
>  		       (V4HF "HF") (V8HF  "HF")
>  		       (V2SF "SF") (V4SF  "SF")
>  		       (DF   "DF") (V2DF  "DF")
> -		       (SI   "SI") (HI    "HI")
> -		       (QI   "QI")
> +		       (SI   "SI") (V2HF  "HF")
> +		       (QI   "QI") (HI    "HI")
>  		       (V4BF "BF") (V8BF "BF")
>  		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI
> "QI")
>  		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI") @@ -1381,7
> +1394,7 @@ (define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
>  		       (V2SF "sf") (V4SF "sf")
>  		       (V2DF "df") (DF   "df")
>  		       (SI   "si") (HI   "hi")
> -		       (QI   "qi")
> +		       (QI   "qi") (V2HF "hf")
>  		       (V4BF "bf") (V8BF "bf")
>  		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
>  		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi") @@ -1866,7
> +1879,7 @@ (define_mode_attr q [(V8QI "") (V16QI "_q")
>  		     (V4HF "") (V8HF "_q")
>  		     (V4BF "") (V8BF "_q")
>  		     (V2SF "") (V4SF  "_q")
> -			       (V2DF  "_q")
> +		     (V2HF "") (V2DF  "_q")
>  		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")
>  		     (V2x8QI "") (V2x16QI "_q")
>  		     (V2x4HI "") (V2x8HI "_q")
> @@ -1905,6 +1918,7 @@ (define_mode_attr vp [(V8QI "v") (V16QI "v")
>  		      (V2SI "p") (V4SI  "v")
>  		      (V2DI "p") (V2DF  "p")
>  		      (V2SF "p") (V4SF  "v")
> +		      (V2HF "p")
>  		      (V4HF "v") (V8HF  "v")])
> 
>  (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi") diff --git
> a/gcc/config/arm/types.md b/gcc/config/arm/types.md index
> 7d0504bdd944e9c0d1b545b0b66a9a1adc808714..3cfbc7a93cca1bea4925853e5
> 1d0a147c5722247 100644
> --- a/gcc/config/arm/types.md
> +++ b/gcc/config/arm/types.md
> @@ -483,6 +483,7 @@ (define_attr "autodetect_type"
>  ; neon_fp_minmax_s_q
>  ; neon_fp_minmax_d
>  ; neon_fp_minmax_d_q
> +; neon_fp_reduc_add_h
>  ; neon_fp_reduc_add_s
>  ; neon_fp_reduc_add_s_q
>  ; neon_fp_reduc_add_d
> @@ -1033,6 +1034,7 @@ (define_attr "type"
>    neon_fp_minmax_d,\
>    neon_fp_minmax_d_q,\
>  \
> +  neon_fp_reduc_add_h,\
>    neon_fp_reduc_add_s,\
>    neon_fp_reduc_add_s_q,\
>    neon_fp_reduc_add_d,\
> @@ -1257,8 +1259,8 @@ (define_attr "is_neon_type" "yes,no"
>            neon_fp_compare_d, neon_fp_compare_d_q, neon_fp_minmax_s,\
>            neon_fp_minmax_s_q, neon_fp_minmax_d, neon_fp_minmax_d_q,\
>            neon_fp_neg_s, neon_fp_neg_s_q, neon_fp_neg_d,
> neon_fp_neg_d_q,\
> -          neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,
> neon_fp_reduc_add_d,\
> -          neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,
> +          neon_fp_reduc_add_h, neon_fp_reduc_add_s,
> neon_fp_reduc_add_s_q,\
> +          neon_fp_reduc_add_d, neon_fp_reduc_add_d_q,
> + neon_fp_reduc_minmax_s,\
>            neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d,\
>            neon_fp_reduc_minmax_d_q,\
>            neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\ diff --git
> a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> index
> 07d71a63414b1066ea431e287286ad048515711a..e6021c5a42748701e5326a5c3
> 87a39a0bbadc9e5 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> @@ -30,11 +30,9 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)
> 	\
>  TEST_ALL (VEC_PERM)
> 
>  /* We should use one DUP for each of the 8-, 16- and 32-bit types,
> -   although we currently use LD1RW for _Float16.  We should use two
> -   DUPs for each of the three 64-bit types.  */
> +   and two DUPs for each of the three 64-bit types.  */
>  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
> -/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } } */
> -/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
> +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
>  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
>  /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-
> 9]+\.d\n} 3 } } */
>  /* { dg-final { scan-assembler-not {\tzip2\t} } } */ @@ -53,7 +51,7 @@
> TEST_ALL (VEC_PERM)
>  /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
>  /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
>  /* { dg-final { scan-assembler-not {\tldr} } } */
> -/* { dg-final { scan-assembler-times {\tstr} 2 } } */
> -/* { dg-final { scan-assembler-times {\tstr\th[0-9]+} 2 } } */
> +/* { dg-final { scan-assembler-not {\tstr} } } */
> +/* { dg-final { scan-assembler-not {\tstr\th[0-9]+} } } */
> 
>  /* { dg-final { scan-assembler-not {\tuqdec} } } */
  
Tamar Christina Nov. 30, 2022, 4:26 a.m. UTC | #5
Ping x3

> -----Original Message-----
> From: Tamar Christina
> Sent: Tuesday, November 22, 2022 4:01 PM
> To: Tamar Christina <Tamar.Christina@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: RE: [PATCH 5/8]AArch64 aarch64: Make existing V2HF be usable.
> 
> Ping
> 
> > -----Original Message-----
> > From: Gcc-patches <gcc-patches-
> > bounces+tamar.christina=arm.com@gcc.gnu.org> On Behalf Of Tamar
> > Christina via Gcc-patches
> > Sent: Friday, November 11, 2022 2:40 PM
> > To: Richard Sandiford <Richard.Sandiford@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> > Subject: RE: [PATCH 5/8]AArch64 aarch64: Make existing V2HF be usable.
> >
> > Hi,
> >
> >
> > > This name might cause confusion with the SVE iterators, where FULL
> > > means "every bit of the register is used".  How about something like
> > > VMOVE instead?
> > >
> > > With this change, I guess VALL_F16 represents "The set of all modes
> > > for which the vld1 intrinsics are provided" and VMOVE or whatever is
> > > "All Advanced SIMD modes suitable for moving, loading, and storing".
> > > That is, VMOVE extends VALL_F16 with modes that are not manifested
> > > via intrinsics.
> > >
> >
> > Done.
> >
> > > Where is the 2h used, and is it valid syntax in that context?
> > >
> > > Same for later instances of 2h.
> >
> > They are, but they weren't meant to be in this patch.  They belong in
> > > a separate FP16 series that I won't get to finish for GCC 13 due to not
> > being able to finish writing all the tests.  I have moved them to that patch
> series though.
> >
> > While the addp patch series has been killed, this patch is still good
> > standalone and improves codegen as shown in the updated testcase.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
> > 	(mov<mode>, movmisalign<mode>, aarch64_dup_lane<mode>,
> > 	aarch64_store_lane0<mode>, aarch64_simd_vec_set<mode>,
> > 	@aarch64_simd_vec_copy_lane<mode>, vec_set<mode>,
> > 	reduc_<optab>_scal_<mode>, reduc_<fmaxmin>_scal_<mode>,
> > 	aarch64_reduc_<optab>_internal<mode>,
> > aarch64_get_lane<mode>,
> > 	vec_init<mode><Vel>, vec_extract<mode><Vel>): Support V2HF.
> > 	(aarch64_simd_dupv2hf): New.
> > 	* config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
> > 	Add E_V2HFmode.
> > 	* config/aarch64/iterators.md (VHSDF_P): New.
> > 	(V2F, VMOVE, nunits, Vtype, Vmtype, Vetype, stype, VEL,
> > 	Vel, q, vp): Add V2HF.
> > 	* config/arm/types.md (neon_fp_reduc_add_h): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/sve/slp_1.c: Update testcase.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index
> >
> f4152160084d6b6f34bd69f0ba6386c1ab50f77e..487a31010245accec28e779661
> > e6c2d578fca4b7 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -19,10 +19,10 @@
> >  ;; <http://www.gnu.org/licenses/>.
> >
> >  (define_expand "mov<mode>"
> > -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> > -	(match_operand:VALL_F16 1 "general_operand"))]
> > +  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
> > +	(match_operand:VMOVE 1 "general_operand"))]
> >    "TARGET_SIMD"
> > -  "
> > +{
> >    /* Force the operand into a register if it is not an
> >       immediate whose use can be replaced with xzr.
> >       If the mode is 16 bytes wide, then we will be doing @@ -46,12
> > +46,11 @@ (define_expand "mov<mode>"
> >        aarch64_expand_vector_init (operands[0], operands[1]);
> >        DONE;
> >      }
> > -  "
> > -)
> > +})
> >
> >  (define_expand "movmisalign<mode>"
> > -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> > -        (match_operand:VALL_F16 1 "general_operand"))]
> > +  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
> > +        (match_operand:VMOVE 1 "general_operand"))]
> >    "TARGET_SIMD && !STRICT_ALIGNMENT"
> >  {
> >    /* This pattern is not permitted to fail during expansion: if both
> > arguments @@ -73,6 +72,16 @@ (define_insn
> "aarch64_simd_dup<mode>"
> >    [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
> >  )
> >
> > +(define_insn "aarch64_simd_dupv2hf"
> > +  [(set (match_operand:V2HF 0 "register_operand" "=w")
> > +	(vec_duplicate:V2HF
> > +	  (match_operand:HF 1 "register_operand" "0")))]
> > +  "TARGET_SIMD"
> > +  "@
> > +   sli\\t%d0, %d1, 16"
> > +  [(set_attr "type" "neon_shift_imm")]
> > +)
> > +
> >  (define_insn "aarch64_simd_dup<mode>"
> >    [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
> >  	(vec_duplicate:VDQF_F16
> > @@ -85,10 +94,10 @@ (define_insn "aarch64_simd_dup<mode>"
> >  )
> >
> >  (define_insn "aarch64_dup_lane<mode>"
> > -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> > -	(vec_duplicate:VALL_F16
> > +  [(set (match_operand:VMOVE 0 "register_operand" "=w")
> > +	(vec_duplicate:VMOVE
> >  	  (vec_select:<VEL>
> > -	    (match_operand:VALL_F16 1 "register_operand" "w")
> > +	    (match_operand:VMOVE 1 "register_operand" "w")
> >  	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
> >            )))]
> >    "TARGET_SIMD"
> > @@ -142,6 +151,29 @@ (define_insn
> > "*aarch64_simd_mov<VDMOV:mode>"
> >  		     mov_reg, neon_move<q>")]
> >  )
> >
> > +(define_insn "*aarch64_simd_movv2hf"
> > +  [(set (match_operand:V2HF 0 "nonimmediate_operand"
> > +		"=w, m,  m,  w, ?r, ?w, ?r, w, w")
> > +	(match_operand:V2HF 1 "general_operand"
> > +		"m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
> > +  "TARGET_SIMD_F16INST
> > +   && (register_operand (operands[0], V2HFmode)
> > +       || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
> > +   "@
> > +    ldr\\t%s0, %1
> > +    str\\twzr, %0
> > +    str\\t%s1, %0
> > +    mov\\t%0.2s[0], %1.2s[0]
> > +    umov\\t%w0, %1.s[0]
> > +    fmov\\t%s0, %1
> > +    mov\\t%0, %1
> > +    movi\\t%d0, 0
> > +    * return aarch64_output_simd_mov_immediate (operands[1], 32);"
> > +  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
> > +		     neon_logic, neon_to_gp, f_mcr,\
> > +		     mov_reg, neon_move, neon_move")]
> > +)
> > +
> >  (define_insn "*aarch64_simd_mov<VQMOV:mode>"
> >    [(set (match_operand:VQMOV 0 "nonimmediate_operand"
> >  		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
> > @@ -182,7 +214,7 @@ (define_insn
> "*aarch64_simd_mov<VQMOV:mode>"
> >
> >  (define_insn "aarch64_store_lane0<mode>"
> >    [(set (match_operand:<VEL> 0 "memory_operand" "=m")
> > -	(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand"
> > "w")
> > +	(vec_select:<VEL> (match_operand:VMOVE 1 "register_operand"
> > "w")
> >  			(parallel [(match_operand 2 "const_int_operand"
> > "n")])))]
> >    "TARGET_SIMD
> >     && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
> > @@ -1035,11 +1067,11 @@ (define_insn "one_cmpl<mode>2"
> >  )
> >
> >  (define_insn "aarch64_simd_vec_set<mode>"
> > -  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
> > -	(vec_merge:VALL_F16
> > -	    (vec_duplicate:VALL_F16
> > +  [(set (match_operand:VMOVE 0 "register_operand" "=w,w,w")
> > +	(vec_merge:VMOVE
> > +	    (vec_duplicate:VMOVE
> >  		(match_operand:<VEL> 1
> > "aarch64_simd_nonimmediate_operand" "w,?r,Utv"))
> > -	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
> > +	    (match_operand:VMOVE 3 "register_operand" "0,0,0")
> >  	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
> >    "TARGET_SIMD"
> >    {
> > @@ -1061,14 +1093,14 @@ (define_insn "aarch64_simd_vec_set<mode>"
> >  )
> >
> >  (define_insn "@aarch64_simd_vec_copy_lane<mode>"
> > -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> > -	(vec_merge:VALL_F16
> > -	    (vec_duplicate:VALL_F16
> > +  [(set (match_operand:VMOVE 0 "register_operand" "=w")
> > +	(vec_merge:VMOVE
> > +	    (vec_duplicate:VMOVE
> >  	      (vec_select:<VEL>
> > -		(match_operand:VALL_F16 3 "register_operand" "w")
> > +		(match_operand:VMOVE 3 "register_operand" "w")
> >  		(parallel
> >  		  [(match_operand:SI 4 "immediate_operand" "i")])))
> > -	    (match_operand:VALL_F16 1 "register_operand" "0")
> > +	    (match_operand:VMOVE 1 "register_operand" "0")
> >  	    (match_operand:SI 2 "immediate_operand" "i")))]
> >    "TARGET_SIMD"
> >    {
> > @@ -1376,7 +1408,7 @@ (define_insn "vec_shr_<mode>"
> >  )
> >
> >  (define_expand "vec_set<mode>"
> > -  [(match_operand:VALL_F16 0 "register_operand")
> > +  [(match_operand:VMOVE 0 "register_operand")
> >     (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand")
> >     (match_operand:SI 2 "immediate_operand")]
> >    "TARGET_SIMD"
> > @@ -3495,7 +3527,7 @@ (define_insn "popcount<mode>2"
> >  ;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function.  (This is FP
> > smax/smin).
> >  (define_expand "reduc_<optab>_scal_<mode>"
> >    [(match_operand:<VEL> 0 "register_operand")
> > -   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
> > +   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
> >  		 FMAXMINV)]
> >    "TARGET_SIMD"
> >    {
> > @@ -3510,7 +3542,7 @@ (define_expand "reduc_<optab>_scal_<mode>"
> >
> >  (define_expand "reduc_<fmaxmin>_scal_<mode>"
> >    [(match_operand:<VEL> 0 "register_operand")
> > -   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
> > +   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
> >  		 FMAXMINNMV)]
> >    "TARGET_SIMD"
> >    {
> > @@ -3554,8 +3586,8 @@ (define_insn
> > "aarch64_reduc_<optab>_internalv2si"
> >  )
> >
> >  (define_insn "aarch64_reduc_<optab>_internal<mode>"
> > - [(set (match_operand:VHSDF 0 "register_operand" "=w")
> > -       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
> > + [(set (match_operand:VHSDF_P 0 "register_operand" "=w")
> > +       (unspec:VHSDF_P [(match_operand:VHSDF_P 1 "register_operand"
> > + "w")]
> >  		      FMAXMINV))]
> >   "TARGET_SIMD"
> >   "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
> > @@ -4200,7 +4232,7 @@ (define_insn
> > "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
> >  (define_insn_and_split "aarch64_get_lane<mode>"
> >    [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand"
> > "=?r, w, Utv")
> >  	(vec_select:<VEL>
> > -	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
> > +	  (match_operand:VMOVE 1 "register_operand" "w, w, w")
> >  	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
> >    "TARGET_SIMD"
> >    {
> > @@ -7981,7 +8013,7 @@ (define_expand "aarch64_st1<VALL_F16:mode>"
> >  ;; Standard pattern name vec_init<mode><Vel>.
> >
> >  (define_expand "vec_init<mode><Vel>"
> > -  [(match_operand:VALL_F16 0 "register_operand")
> > +  [(match_operand:VMOVE 0 "register_operand")
> >     (match_operand 1 "" "")]
> >    "TARGET_SIMD"
> >  {
> > @@ -8060,7 +8092,7 @@ (define_insn "aarch64_urecpe<mode>"
> >
> >  (define_expand "vec_extract<mode><Vel>"
> >    [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand")
> > -   (match_operand:VALL_F16 1 "register_operand")
> > +   (match_operand:VMOVE 1 "register_operand")
> >     (match_operand:SI 2 "immediate_operand")]
> >    "TARGET_SIMD"
> >  {
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc index
> >
> 84dbe2f4ea7d03b424602ed98a34e7824217dc91..35671cb86e374f9ded21d0e4
> > 944c63bc2cbc0901 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -3566,6 +3566,7 @@ aarch64_classify_vector_mode (machine_mode
> > mode)
> >      case E_V8BFmode:
> >      case E_V4SFmode:
> >      case E_V2DFmode:
> > +    case E_V2HFmode:
> >        return TARGET_SIMD ? VEC_ADVSIMD : 0;
> >
> >      default:
> > diff --git a/gcc/config/aarch64/iterators.md
> > b/gcc/config/aarch64/iterators.md index
> >
> 37d8161a33b1c399d80be82afa67613a087389d4..dfcf86a440e316c2abdbcc6463
> > 63d39e458d1a91 100644
> > --- a/gcc/config/aarch64/iterators.md
> > +++ b/gcc/config/aarch64/iterators.md
> > @@ -160,6 +160,10 @@ (define_mode_iterator VDQF [V2SF V4SF V2DF])
> > (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
> >  			     (V8HF "TARGET_SIMD_F16INST")
> >  			     V2SF V4SF V2DF])
> > +;; Advanced SIMD Float modes suitable for pairwise operations.
> > +(define_mode_iterator VHSDF_P [(V4HF "TARGET_SIMD_F16INST")
> > +			       (V8HF "TARGET_SIMD_F16INST")
> > +			       V2SF V4SF V2DF (V2HF
> > "TARGET_SIMD_F16INST")])
> >
> >  ;; Advanced SIMD Float modes, and DF.
> >  (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF]) @@ -188,15
> +192,23
> > @@ (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF
> V2DI])
> > (define_mode_iterator VALLF [V2SF V4SF V2DF SF DF])
> >
> >  ;; Advanced SIMD Float modes with 2 elements.
> > -(define_mode_iterator V2F [V2SF V2DF])
> > +(define_mode_iterator V2F [V2SF V2DF V2HF])
> >
> >  ;; All Advanced SIMD modes on which we support any arithmetic
> operations.
> >  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF
> > V4SF V2DF])
> >
> > -;; All Advanced SIMD modes suitable for moving, loading, and storing.
> > +;; All Advanced SIMD modes suitable for moving, loading, and storing
> > +;; except V2HF.
> >  (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
> >  				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
> >
> > +;; All Advanced SIMD modes suitable for moving, loading, and storing
> > > +;; including V2HF
> > > +(define_mode_iterator VMOVE [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
> > +			     V4HF V8HF V4BF V8BF V2SF V4SF V2DF
> > +			     (V2HF "TARGET_SIMD_F16INST")])
> > +
> > +
> >  ;; The VALL_F16 modes except the 128-bit 2-element ones.
> >  (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI
> V4SI
> >  				V4HF V8HF V2SF V4SF])
> > @@ -1076,7 +1088,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI
> "16")
> >  			  (V2SF "2") (V4SF "4")
> >  			  (V1DF "1") (V2DF "2")
> >  			  (DI "1") (DF "1")
> > -			  (V8DI "8")])
> > +			  (V8DI "8") (V2HF "2")])
> >
> >  ;; Map a mode to the number of bits in it, if the size of the mode
> > ;; is constant.
> > @@ -1090,6 +1102,7 @@ (define_mode_attr s [(HF "h") (SF "s") (DF "d")
> > (SI
> > "s") (DI "d")])
> >
> >  ;; Give the length suffix letter for a sign- or zero-extension.
> >  (define_mode_attr size [(QI "b") (HI "h") (SI "w")])
> > +(define_mode_attr sizel [(QI "b") (HI "h") (SI "")])
> >
> >  ;; Give the number of bits in the mode  (define_mode_attr sizen [(QI
> > "8") (HI "16") (SI "32") (DI "64")]) @@ -1193,7
> > +1206,7 @@ (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h")
> > (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
> >  			  (V4HI "h") (V8HI  "h")
> >  			  (V2SI "s") (V4SI  "s")
> > -			  (V2DI "d")
> > +			  (V2DI "d") (V2HF  "h")
> >  			  (V4HF "h") (V8HF  "h")
> >  			  (V2SF "s") (V4SF  "s")
> >  			  (V2DF "d")
> > @@ -1285,7 +1298,7 @@ (define_mode_attr Vcwtype [(VNx16QI "b")
> (VNx8QI
> > "h") (VNx4QI "w") (VNx2QI "d")  ;; more accurately.
> >  (define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
> >  			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
> > -			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
> > +			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d") (V2HF
> > "s")
> >  			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
> >  			 (SI "s") (DI "d")])
> >
> > @@ -1360,8 +1373,8 @@ (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
> >  		       (V4HF "HF") (V8HF  "HF")
> >  		       (V2SF "SF") (V4SF  "SF")
> >  		       (DF   "DF") (V2DF  "DF")
> > -		       (SI   "SI") (HI    "HI")
> > -		       (QI   "QI")
> > +		       (SI   "SI") (V2HF  "HF")
> > +		       (QI   "QI") (HI    "HI")
> >  		       (V4BF "BF") (V8BF "BF")
> >  		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI
> > "QI")
> >  		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI") @@ -1381,7
> > +1394,7 @@ (define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
> >  		       (V2SF "sf") (V4SF "sf")
> >  		       (V2DF "df") (DF   "df")
> >  		       (SI   "si") (HI   "hi")
> > -		       (QI   "qi")
> > +		       (QI   "qi") (V2HF "hf")
> >  		       (V4BF "bf") (V8BF "bf")
> >  		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
> >  		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi") @@ -1866,7
> > +1879,7 @@ (define_mode_attr q [(V8QI "") (V16QI "_q")
> >  		     (V4HF "") (V8HF "_q")
> >  		     (V4BF "") (V8BF "_q")
> >  		     (V2SF "") (V4SF  "_q")
> > -			       (V2DF  "_q")
> > +		     (V2HF "") (V2DF  "_q")
> >  		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")
> >  		     (V2x8QI "") (V2x16QI "_q")
> >  		     (V2x4HI "") (V2x8HI "_q")
> > @@ -1905,6 +1918,7 @@ (define_mode_attr vp [(V8QI "v") (V16QI "v")
> >  		      (V2SI "p") (V4SI  "v")
> >  		      (V2DI "p") (V2DF  "p")
> >  		      (V2SF "p") (V4SF  "v")
> > +		      (V2HF "p")
> >  		      (V4HF "v") (V8HF  "v")])
> >
> >  (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi") diff --git
> > a/gcc/config/arm/types.md b/gcc/config/arm/types.md index
> >
> 7d0504bdd944e9c0d1b545b0b66a9a1adc808714..3cfbc7a93cca1bea4925853e5
> > 1d0a147c5722247 100644
> > --- a/gcc/config/arm/types.md
> > +++ b/gcc/config/arm/types.md
> > @@ -483,6 +483,7 @@ (define_attr "autodetect_type"
> >  ; neon_fp_minmax_s_q
> >  ; neon_fp_minmax_d
> >  ; neon_fp_minmax_d_q
> > +; neon_fp_reduc_add_h
> >  ; neon_fp_reduc_add_s
> >  ; neon_fp_reduc_add_s_q
> >  ; neon_fp_reduc_add_d
> > @@ -1033,6 +1034,7 @@ (define_attr "type"
> >    neon_fp_minmax_d,\
> >    neon_fp_minmax_d_q,\
> >  \
> > +  neon_fp_reduc_add_h,\
> >    neon_fp_reduc_add_s,\
> >    neon_fp_reduc_add_s_q,\
> >    neon_fp_reduc_add_d,\
> > @@ -1257,8 +1259,8 @@ (define_attr "is_neon_type" "yes,no"
> >            neon_fp_compare_d, neon_fp_compare_d_q, neon_fp_minmax_s,\
> >            neon_fp_minmax_s_q, neon_fp_minmax_d,
> neon_fp_minmax_d_q,\
> >            neon_fp_neg_s, neon_fp_neg_s_q, neon_fp_neg_d,
> > neon_fp_neg_d_q,\
> > -          neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,
> > neon_fp_reduc_add_d,\
> > -          neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,
> > +          neon_fp_reduc_add_h, neon_fp_reduc_add_s,
> > neon_fp_reduc_add_s_q,\
> > +          neon_fp_reduc_add_d, neon_fp_reduc_add_d_q,
> > + neon_fp_reduc_minmax_s,\
> >            neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d,\
> >            neon_fp_reduc_minmax_d_q,\
> >            neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\ diff --git
> > a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> > index
> >
> 07d71a63414b1066ea431e287286ad048515711a..e6021c5a42748701e5326a5c3
> > 87a39a0bbadc9e5 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> > @@ -30,11 +30,9 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int
> n)
> > 	\
> >  TEST_ALL (VEC_PERM)
> >
> >  /* We should use one DUP for each of the 8-, 16- and 32-bit types,
> > -   although we currently use LD1RW for _Float16.  We should use two
> > -   DUPs for each of the three 64-bit types.  */
> > +   We should use two DUPs for each of the three 64-bit types.  */
> >  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } }
> > */
> > -/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } }
> > */
> > -/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
> > +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } }
> > +*/
> >  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } }
> > */
> >  /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d,
> > z[0- 9]+\.d\n} 3 } } */
> >  /* { dg-final { scan-assembler-not {\tzip2\t} } } */ @@ -53,7 +51,7
> > @@ TEST_ALL (VEC_PERM)
> >  /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
> >  /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
> >  /* { dg-final { scan-assembler-not {\tldr} } } */
> > -/* { dg-final { scan-assembler-times {\tstr} 2 } } */
> > -/* { dg-final { scan-assembler-times {\tstr\th[0-9]+} 2 } } */
> > +/* { dg-final { scan-assembler-not {\tstr} } } */
> > +/* { dg-final { scan-assembler-not {\tstr\th[0-9]+} } } */
> >
> >  /* { dg-final { scan-assembler-not {\tuqdec} } } */
  
Richard Sandiford Dec. 6, 2022, 10:28 a.m. UTC | #6
Tamar Christina <Tamar.Christina@arm.com> writes:
> Hi,
>
>
>> This name might cause confusion with the SVE iterators, where FULL means
>> "every bit of the register is used".  How about something like VMOVE
>> instead?
>> 
>> With this change, I guess VALL_F16 represents "The set of all modes for
>> which the vld1 intrinsics are provided" and VMOVE or whatever is "All
>> Advanced SIMD modes suitable for moving, loading, and storing".
>> That is, VMOVE extends VALL_F16 with modes that are not manifested via
>> intrinsics.
>> 
>
> Done.
>
>> Where is the 2h used, and is it valid syntax in that context?
>> 
>> Same for later instances of 2h.
>
> They are, but they weren't meant to be in this patch.  They belong in a separate FP16 series that
> I won't get to finish for GCC 13 due to not being able to finish writing all the tests.  I have moved them
> to that patch series though.
>
> While the addp patch series has been killed, this patch is still good standalone and improves codegen
> as shown in the updated testcase.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
> 	(mov<mode>, movmisalign<mode>, aarch64_dup_lane<mode>,
> 	aarch64_store_lane0<mode>, aarch64_simd_vec_set<mode>,
> 	@aarch64_simd_vec_copy_lane<mode>, vec_set<mode>,
> 	reduc_<optab>_scal_<mode>, reduc_<fmaxmin>_scal_<mode>,
> 	aarch64_reduc_<optab>_internal<mode>, aarch64_get_lane<mode>,
> 	vec_init<mode><Vel>, vec_extract<mode><Vel>): Support V2HF.
> 	(aarch64_simd_dupv2hf): New.
> 	* config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
> 	Add E_V2HFmode.
> 	* config/aarch64/iterators.md (VHSDF_P): New.
> 	(V2F, VMOVE, nunits, Vtype, Vmtype, Vetype, stype, VEL,
> 	Vel, q, vp): Add V2HF.
> 	* config/arm/types.md (neon_fp_reduc_add_h): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/sve/slp_1.c: Update testcase.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index f4152160084d6b6f34bd69f0ba6386c1ab50f77e..487a31010245accec28e779661e6c2d578fca4b7 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -19,10 +19,10 @@
>  ;; <http://www.gnu.org/licenses/>.
>  
>  (define_expand "mov<mode>"
> -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> -	(match_operand:VALL_F16 1 "general_operand"))]
> +  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
> +	(match_operand:VMOVE 1 "general_operand"))]
>    "TARGET_SIMD"
> -  "
> +{
>    /* Force the operand into a register if it is not an
>       immediate whose use can be replaced with xzr.
>       If the mode is 16 bytes wide, then we will be doing
> @@ -46,12 +46,11 @@ (define_expand "mov<mode>"
>        aarch64_expand_vector_init (operands[0], operands[1]);
>        DONE;
>      }
> -  "
> -)
> +})
>  
>  (define_expand "movmisalign<mode>"
> -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> -        (match_operand:VALL_F16 1 "general_operand"))]
> +  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
> +        (match_operand:VMOVE 1 "general_operand"))]
>    "TARGET_SIMD && !STRICT_ALIGNMENT"
>  {
>    /* This pattern is not permitted to fail during expansion: if both arguments
> @@ -73,6 +72,16 @@ (define_insn "aarch64_simd_dup<mode>"
>    [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
>  )
>  
> +(define_insn "aarch64_simd_dupv2hf"
> +  [(set (match_operand:V2HF 0 "register_operand" "=w")
> +	(vec_duplicate:V2HF
> +	  (match_operand:HF 1 "register_operand" "0")))]

Seems like this should be "w" rather than "0", since SLI is a
two-register instruction.

> +  "TARGET_SIMD"
> +  "@
> +   sli\\t%d0, %d1, 16"
> +  [(set_attr "type" "neon_shift_imm")]
> +)
> +
>  (define_insn "aarch64_simd_dup<mode>"
>    [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
>  	(vec_duplicate:VDQF_F16
> @@ -85,10 +94,10 @@ (define_insn "aarch64_simd_dup<mode>"
>  )
>  
>  (define_insn "aarch64_dup_lane<mode>"
> -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> -	(vec_duplicate:VALL_F16
> +  [(set (match_operand:VMOVE 0 "register_operand" "=w")
> +	(vec_duplicate:VMOVE
>  	  (vec_select:<VEL>
> -	    (match_operand:VALL_F16 1 "register_operand" "w")
> +	    (match_operand:VMOVE 1 "register_operand" "w")
>  	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
>            )))]
>    "TARGET_SIMD"
> @@ -142,6 +151,29 @@ (define_insn "*aarch64_simd_mov<VDMOV:mode>"
>  		     mov_reg, neon_move<q>")]
>  )
>  
> +(define_insn "*aarch64_simd_movv2hf"
> +  [(set (match_operand:V2HF 0 "nonimmediate_operand"
> +		"=w, m,  m,  w, ?r, ?w, ?r, w, w")
> +	(match_operand:V2HF 1 "general_operand"
> +		"m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
> +  "TARGET_SIMD_F16INST
> +   && (register_operand (operands[0], V2HFmode)
> +       || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
> +   "@
> +    ldr\\t%s0, %1
> +    str\\twzr, %0
> +    str\\t%s1, %0
> +    mov\\t%0.2s[0], %1.2s[0]
> +    umov\\t%w0, %1.s[0]
> +    fmov\\t%s0, %1

Should be %w1 instead.

> +    mov\\t%0, %1

I guess this one works with either % (X registers) or %w.  Might still
be better to use %w anyway, so that it looks less like an oversight.

> +    movi\\t%d0, 0
> +    * return aarch64_output_simd_mov_immediate (operands[1], 32);"
> +  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
> +		     neon_logic, neon_to_gp, f_mcr,\
> +		     mov_reg, neon_move, neon_move")]
> +)
> +
>  (define_insn "*aarch64_simd_mov<VQMOV:mode>"
>    [(set (match_operand:VQMOV 0 "nonimmediate_operand"
>  		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
> @@ -182,7 +214,7 @@ (define_insn "*aarch64_simd_mov<VQMOV:mode>"
>  
>  (define_insn "aarch64_store_lane0<mode>"
>    [(set (match_operand:<VEL> 0 "memory_operand" "=m")
> -	(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w")
> +	(vec_select:<VEL> (match_operand:VMOVE 1 "register_operand" "w")
>  			(parallel [(match_operand 2 "const_int_operand" "n")])))]
>    "TARGET_SIMD
>     && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
> @@ -1035,11 +1067,11 @@ (define_insn "one_cmpl<mode>2"
>  )
>  
>  (define_insn "aarch64_simd_vec_set<mode>"
> -  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
> -	(vec_merge:VALL_F16
> -	    (vec_duplicate:VALL_F16
> +  [(set (match_operand:VMOVE 0 "register_operand" "=w,w,w")
> +	(vec_merge:VMOVE
> +	    (vec_duplicate:VMOVE
>  		(match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "w,?r,Utv"))
> -	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
> +	    (match_operand:VMOVE 3 "register_operand" "0,0,0")
>  	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
>    "TARGET_SIMD"
>    {
> @@ -1061,14 +1093,14 @@ (define_insn "aarch64_simd_vec_set<mode>"
>  )
>  
>  (define_insn "@aarch64_simd_vec_copy_lane<mode>"
> -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> -	(vec_merge:VALL_F16
> -	    (vec_duplicate:VALL_F16
> +  [(set (match_operand:VMOVE 0 "register_operand" "=w")
> +	(vec_merge:VMOVE
> +	    (vec_duplicate:VMOVE
>  	      (vec_select:<VEL>
> -		(match_operand:VALL_F16 3 "register_operand" "w")
> +		(match_operand:VMOVE 3 "register_operand" "w")
>  		(parallel
>  		  [(match_operand:SI 4 "immediate_operand" "i")])))
> -	    (match_operand:VALL_F16 1 "register_operand" "0")
> +	    (match_operand:VMOVE 1 "register_operand" "0")
>  	    (match_operand:SI 2 "immediate_operand" "i")))]
>    "TARGET_SIMD"
>    {
> @@ -1376,7 +1408,7 @@ (define_insn "vec_shr_<mode>"
>  )
>  
>  (define_expand "vec_set<mode>"
> -  [(match_operand:VALL_F16 0 "register_operand")
> +  [(match_operand:VMOVE 0 "register_operand")
>     (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand")
>     (match_operand:SI 2 "immediate_operand")]
>    "TARGET_SIMD"
> @@ -3495,7 +3527,7 @@ (define_insn "popcount<mode>2"
>  ;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function.  (This is FP smax/smin).
>  (define_expand "reduc_<optab>_scal_<mode>"
>    [(match_operand:<VEL> 0 "register_operand")
> -   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
> +   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
>  		 FMAXMINV)]
>    "TARGET_SIMD"
>    {
> @@ -3510,7 +3542,7 @@ (define_expand "reduc_<optab>_scal_<mode>"
>  
>  (define_expand "reduc_<fmaxmin>_scal_<mode>"
>    [(match_operand:<VEL> 0 "register_operand")
> -   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
> +   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
>  		 FMAXMINNMV)]
>    "TARGET_SIMD"
>    {
> @@ -3554,8 +3586,8 @@ (define_insn "aarch64_reduc_<optab>_internalv2si"
>  )
>  
>  (define_insn "aarch64_reduc_<optab>_internal<mode>"
> - [(set (match_operand:VHSDF 0 "register_operand" "=w")
> -       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
> + [(set (match_operand:VHSDF_P 0 "register_operand" "=w")
> +       (unspec:VHSDF_P [(match_operand:VHSDF_P 1 "register_operand" "w")]
>  		      FMAXMINV))]
>   "TARGET_SIMD"
>   "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
> @@ -4200,7 +4232,7 @@ (define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
>  (define_insn_and_split "aarch64_get_lane<mode>"
>    [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv")
>  	(vec_select:<VEL>
> -	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
> +	  (match_operand:VMOVE 1 "register_operand" "w, w, w")
>  	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
>    "TARGET_SIMD"
>    {
> @@ -7981,7 +8013,7 @@ (define_expand "aarch64_st1<VALL_F16:mode>"
>  ;; Standard pattern name vec_init<mode><Vel>.
>  
>  (define_expand "vec_init<mode><Vel>"
> -  [(match_operand:VALL_F16 0 "register_operand")
> +  [(match_operand:VMOVE 0 "register_operand")
>     (match_operand 1 "" "")]
>    "TARGET_SIMD"
>  {
> @@ -8060,7 +8092,7 @@ (define_insn "aarch64_urecpe<mode>"
>  
>  (define_expand "vec_extract<mode><Vel>"
>    [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand")
> -   (match_operand:VALL_F16 1 "register_operand")
> +   (match_operand:VMOVE 1 "register_operand")
>     (match_operand:SI 2 "immediate_operand")]
>    "TARGET_SIMD"
>  {
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 84dbe2f4ea7d03b424602ed98a34e7824217dc91..35671cb86e374f9ded21d0e4944c63bc2cbc0901 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -3566,6 +3566,7 @@ aarch64_classify_vector_mode (machine_mode mode)
>      case E_V8BFmode:
>      case E_V4SFmode:
>      case E_V2DFmode:
> +    case E_V2HFmode:
>        return TARGET_SIMD ? VEC_ADVSIMD : 0;
>  
>      default:
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 37d8161a33b1c399d80be82afa67613a087389d4..dfcf86a440e316c2abdbcc646363d39e458d1a91 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -160,6 +160,10 @@ (define_mode_iterator VDQF [V2SF V4SF V2DF])
>  (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
>  			     (V8HF "TARGET_SIMD_F16INST")
>  			     V2SF V4SF V2DF])
> +;; Advanced SIMD Float modes suitable for pairwise operations.
> +(define_mode_iterator VHSDF_P [(V4HF "TARGET_SIMD_F16INST")
> +			       (V8HF "TARGET_SIMD_F16INST")
> +			       V2SF V4SF V2DF (V2HF "TARGET_SIMD_F16INST")])

Maybe "reduction or pairwise operations"?  Otherwise it isn't obvious
why V4HF, V8HF and V4SF are included.

>  
>  ;; Advanced SIMD Float modes, and DF.
>  (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF])
> @@ -188,15 +192,23 @@ (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])
>  (define_mode_iterator VALLF [V2SF V4SF V2DF SF DF])
>  
>  ;; Advanced SIMD Float modes with 2 elements.
> -(define_mode_iterator V2F [V2SF V2DF])
> +(define_mode_iterator V2F [V2SF V2DF V2HF])
>  
>  ;; All Advanced SIMD modes on which we support any arithmetic operations.
>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
>  
> -;; All Advanced SIMD modes suitable for moving, loading, and storing.
> +;; All Advanced SIMD modes suitable for moving, loading, and storing
> +;; except V2HF.

I'd prefer:

;; The set of all modes for which vld1 intrinsics are provided.

otherwise it isn't clear why V2HF is a special case.

>  (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
>  				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
>  
> +;; All Advanced SIMD modes suitable for moving, loading, and storing
> +;; including V2HF
> +(define_mode_iterator VMOVE [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
> +			     V4HF V8HF V4BF V8BF V2SF V4SF V2DF
> +			     (V2HF "TARGET_SIMD_F16INST")])
> +
> +
>  ;; The VALL_F16 modes except the 128-bit 2-element ones.
>  (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
>  				V4HF V8HF V2SF V4SF])
> @@ -1076,7 +1088,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI "16")
>  			  (V2SF "2") (V4SF "4")
>  			  (V1DF "1") (V2DF "2")
>  			  (DI "1") (DF "1")
> -			  (V8DI "8")])
> +			  (V8DI "8") (V2HF "2")])
>  
>  ;; Map a mode to the number of bits in it, if the size of the mode
>  ;; is constant.
> @@ -1090,6 +1102,7 @@ (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
>  
>  ;; Give the length suffix letter for a sign- or zero-extension.
>  (define_mode_attr size [(QI "b") (HI "h") (SI "w")])
> +(define_mode_attr sizel [(QI "b") (HI "h") (SI "")])
>  
>  ;; Give the number of bits in the mode
>  (define_mode_attr sizen [(QI "8") (HI "16") (SI "32") (DI "64")])

Looks like this isn't used in the patch, so could be dropped.

OK with those changes, thanks.

Richard

> @@ -1193,7 +1206,7 @@ (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h")
>  (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
>  			  (V4HI "h") (V8HI  "h")
>  			  (V2SI "s") (V4SI  "s")
> -			  (V2DI "d")
> +			  (V2DI "d") (V2HF  "h")
>  			  (V4HF "h") (V8HF  "h")
>  			  (V2SF "s") (V4SF  "s")
>  			  (V2DF "d")
> @@ -1285,7 +1298,7 @@ (define_mode_attr Vcwtype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "w") (VNx2QI "d")
>  ;; more accurately.
>  (define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
>  			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
> -			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
> +			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d") (V2HF "s")
>  			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
>  			 (SI "s") (DI "d")])
>  
> @@ -1360,8 +1373,8 @@ (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
>  		       (V4HF "HF") (V8HF  "HF")
>  		       (V2SF "SF") (V4SF  "SF")
>  		       (DF   "DF") (V2DF  "DF")
> -		       (SI   "SI") (HI    "HI")
> -		       (QI   "QI")
> +		       (SI   "SI") (V2HF  "HF")
> +		       (QI   "QI") (HI    "HI")
>  		       (V4BF "BF") (V8BF "BF")
>  		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
>  		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
> @@ -1381,7 +1394,7 @@ (define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
>  		       (V2SF "sf") (V4SF "sf")
>  		       (V2DF "df") (DF   "df")
>  		       (SI   "si") (HI   "hi")
> -		       (QI   "qi")
> +		       (QI   "qi") (V2HF "hf")
>  		       (V4BF "bf") (V8BF "bf")
>  		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
>  		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
> @@ -1866,7 +1879,7 @@ (define_mode_attr q [(V8QI "") (V16QI "_q")
>  		     (V4HF "") (V8HF "_q")
>  		     (V4BF "") (V8BF "_q")
>  		     (V2SF "") (V4SF  "_q")
> -			       (V2DF  "_q")
> +		     (V2HF "") (V2DF  "_q")
>  		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")
>  		     (V2x8QI "") (V2x16QI "_q")
>  		     (V2x4HI "") (V2x8HI "_q")
> @@ -1905,6 +1918,7 @@ (define_mode_attr vp [(V8QI "v") (V16QI "v")
>  		      (V2SI "p") (V4SI  "v")
>  		      (V2DI "p") (V2DF  "p")
>  		      (V2SF "p") (V4SF  "v")
> +		      (V2HF "p")
>  		      (V4HF "v") (V8HF  "v")])
>  
>  (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")
> diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
> index 7d0504bdd944e9c0d1b545b0b66a9a1adc808714..3cfbc7a93cca1bea4925853e51d0a147c5722247 100644
> --- a/gcc/config/arm/types.md
> +++ b/gcc/config/arm/types.md
> @@ -483,6 +483,7 @@ (define_attr "autodetect_type"
>  ; neon_fp_minmax_s_q
>  ; neon_fp_minmax_d
>  ; neon_fp_minmax_d_q
> +; neon_fp_reduc_add_h
>  ; neon_fp_reduc_add_s
>  ; neon_fp_reduc_add_s_q
>  ; neon_fp_reduc_add_d
> @@ -1033,6 +1034,7 @@ (define_attr "type"
>    neon_fp_minmax_d,\
>    neon_fp_minmax_d_q,\
>  \
> +  neon_fp_reduc_add_h,\
>    neon_fp_reduc_add_s,\
>    neon_fp_reduc_add_s_q,\
>    neon_fp_reduc_add_d,\
> @@ -1257,8 +1259,8 @@ (define_attr "is_neon_type" "yes,no"
>            neon_fp_compare_d, neon_fp_compare_d_q, neon_fp_minmax_s,\
>            neon_fp_minmax_s_q, neon_fp_minmax_d, neon_fp_minmax_d_q,\
>            neon_fp_neg_s, neon_fp_neg_s_q, neon_fp_neg_d, neon_fp_neg_d_q,\
> -          neon_fp_reduc_add_s, neon_fp_reduc_add_s_q, neon_fp_reduc_add_d,\
> -          neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,
> +          neon_fp_reduc_add_h, neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
> +          neon_fp_reduc_add_d, neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,\
>            neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d,\
>            neon_fp_reduc_minmax_d_q,\
>            neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> index 07d71a63414b1066ea431e287286ad048515711a..e6021c5a42748701e5326a5c387a39a0bbadc9e5 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> @@ -30,11 +30,9 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)	\
>  TEST_ALL (VEC_PERM)
>  
>  /* We should use one DUP for each of the 8-, 16- and 32-bit types,
> -   although we currently use LD1RW for _Float16.  We should use two
> -   DUPs for each of the three 64-bit types.  */
> +   We should use two DUPs for each of the three 64-bit types.  */
>  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
> -/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } } */
> -/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
> +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
>  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
>  /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
>  /* { dg-final { scan-assembler-not {\tzip2\t} } } */
> @@ -53,7 +51,7 @@ TEST_ALL (VEC_PERM)
>  /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
>  /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
>  /* { dg-final { scan-assembler-not {\tldr} } } */
> -/* { dg-final { scan-assembler-times {\tstr} 2 } } */
> -/* { dg-final { scan-assembler-times {\tstr\th[0-9]+} 2 } } */
> +/* { dg-final { scan-assembler-not {\tstr} } } */
> +/* { dg-final { scan-assembler-not {\tstr\th[0-9]+} } } */
>  
>  /* { dg-final { scan-assembler-not {\tuqdec} } } */
  
Tamar Christina Dec. 6, 2022, 10:58 a.m. UTC | #7
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Tuesday, December 6, 2022 10:28 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH 5/8]AArch64 aarch64: Make existing V2HF be usable.
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> > Hi,
> >
> >
> >> This name might cause confusion with the SVE iterators, where FULL
> >> means "every bit of the register is used".  How about something like
> >> VMOVE instead?
> >>
> >> With this change, I guess VALL_F16 represents "The set of all modes
> >> for which the vld1 intrinsics are provided" and VMOVE or whatever is
> >> "All Advanced SIMD modes suitable for moving, loading, and storing".
> >> That is, VMOVE extends VALL_F16 with modes that are not manifested
> >> via intrinsics.
> >>
> >
> > Done.
> >
> >> Where is the 2h used, and is it valid syntax in that context?
> >>
> >> Same for later instances of 2h.
> >
> > They are, but they weren't meant to be in this patch.  They belong in
> > > a separate FP16 series that I won't get to finish for GCC 13 due to not
> > being able to finish writing all the tests.  I have moved them to that patch
> series though.
> >
> > While the addp patch series has been killed, this patch is still good
> > standalone and improves codegen as shown in the updated testcase.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
> > 	(mov<mode>, movmisalign<mode>, aarch64_dup_lane<mode>,
> > 	aarch64_store_lane0<mode>, aarch64_simd_vec_set<mode>,
> > 	@aarch64_simd_vec_copy_lane<mode>, vec_set<mode>,
> > 	reduc_<optab>_scal_<mode>, reduc_<fmaxmin>_scal_<mode>,
> > 	aarch64_reduc_<optab>_internal<mode>,
> aarch64_get_lane<mode>,
> > 	vec_init<mode><Vel>, vec_extract<mode><Vel>): Support V2HF.
> > 	(aarch64_simd_dupv2hf): New.
> > 	* config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
> > 	Add E_V2HFmode.
> > 	* config/aarch64/iterators.md (VHSDF_P): New.
> > 	(V2F, VMOVE, nunits, Vtype, Vmtype, Vetype, stype, VEL,
> > 	Vel, q, vp): Add V2HF.
> > 	* config/arm/types.md (neon_fp_reduc_add_h): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/sve/slp_1.c: Update testcase.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index
> >
> f4152160084d6b6f34bd69f0ba6386c1ab50f77e..487a31010245accec28e779661
> e6
> > c2d578fca4b7 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -19,10 +19,10 @@
> >  ;; <http://www.gnu.org/licenses/>.
> >
> >  (define_expand "mov<mode>"
> > -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> > -	(match_operand:VALL_F16 1 "general_operand"))]
> > +  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
> > +	(match_operand:VMOVE 1 "general_operand"))]
> >    "TARGET_SIMD"
> > -  "
> > +{
> >    /* Force the operand into a register if it is not an
> >       immediate whose use can be replaced with xzr.
> >       If the mode is 16 bytes wide, then we will be doing @@ -46,12
> > +46,11 @@ (define_expand "mov<mode>"
> >        aarch64_expand_vector_init (operands[0], operands[1]);
> >        DONE;
> >      }
> > -  "
> > -)
> > +})
> >
> >  (define_expand "movmisalign<mode>"
> > -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> > -        (match_operand:VALL_F16 1 "general_operand"))]
> > +  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
> > +        (match_operand:VMOVE 1 "general_operand"))]
> >    "TARGET_SIMD && !STRICT_ALIGNMENT"
> >  {
> >    /* This pattern is not permitted to fail during expansion: if both
> > arguments @@ -73,6 +72,16 @@ (define_insn
> "aarch64_simd_dup<mode>"
> >    [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
> >  )
> >
> > +(define_insn "aarch64_simd_dupv2hf"
> > +  [(set (match_operand:V2HF 0 "register_operand" "=w")
> > +	(vec_duplicate:V2HF
> > +	  (match_operand:HF 1 "register_operand" "0")))]
> 
> Seems like this should be "w" rather than "0", since SLI is a two-register
> instruction.

Yes, but for a dup it's only valid when the same register is used, i.e. it has to
write into the original src register.

Thanks,
Tamar

> 
> > +  "TARGET_SIMD"
> > +  "@
> > +   sli\\t%d0, %d1, 16"
> > +  [(set_attr "type" "neon_shift_imm")]
> > +)
> > +
> >  (define_insn "aarch64_simd_dup<mode>"
> >    [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
> >  	(vec_duplicate:VDQF_F16
> > @@ -85,10 +94,10 @@ (define_insn "aarch64_simd_dup<mode>"
> >  )
> >
> >  (define_insn "aarch64_dup_lane<mode>"
> > -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> > -	(vec_duplicate:VALL_F16
> > +  [(set (match_operand:VMOVE 0 "register_operand" "=w")
> > +	(vec_duplicate:VMOVE
> >  	  (vec_select:<VEL>
> > -	    (match_operand:VALL_F16 1 "register_operand" "w")
> > +	    (match_operand:VMOVE 1 "register_operand" "w")
> >  	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
> >            )))]
> >    "TARGET_SIMD"
> > @@ -142,6 +151,29 @@ (define_insn
> "*aarch64_simd_mov<VDMOV:mode>"
> >  		     mov_reg, neon_move<q>")]
> >  )
> >
> > +(define_insn "*aarch64_simd_movv2hf"
> > +  [(set (match_operand:V2HF 0 "nonimmediate_operand"
> > +		"=w, m,  m,  w, ?r, ?w, ?r, w, w")
> > +	(match_operand:V2HF 1 "general_operand"
> > +		"m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
> > +  "TARGET_SIMD_F16INST
> > +   && (register_operand (operands[0], V2HFmode)
> > +       || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
> > +   "@
> > +    ldr\\t%s0, %1
> > +    str\\twzr, %0
> > +    str\\t%s1, %0
> > +    mov\\t%0.2s[0], %1.2s[0]
> > +    umov\\t%w0, %1.s[0]
> > +    fmov\\t%s0, %1
> 
> Should be %w1 instead.
> 
> > +    mov\\t%0, %1
> 
> I guess this one works with either % (X registers) or %w.  Might still be better
> to use %w anyway, so that it looks less like an oversight.
> 
> > +    movi\\t%d0, 0
> > +    * return aarch64_output_simd_mov_immediate (operands[1], 32);"
> > +  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
> > +		     neon_logic, neon_to_gp, f_mcr,\
> > +		     mov_reg, neon_move, neon_move")]
> > +)
> > +
> >  (define_insn "*aarch64_simd_mov<VQMOV:mode>"
> >    [(set (match_operand:VQMOV 0 "nonimmediate_operand"
> >  		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
> > @@ -182,7 +214,7 @@ (define_insn
> "*aarch64_simd_mov<VQMOV:mode>"
> >
> >  (define_insn "aarch64_store_lane0<mode>"
> >    [(set (match_operand:<VEL> 0 "memory_operand" "=m")
> > -	(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand"
> "w")
> > +	(vec_select:<VEL> (match_operand:VMOVE 1 "register_operand"
> "w")
> >  			(parallel [(match_operand 2 "const_int_operand"
> "n")])))]
> >    "TARGET_SIMD
> >     && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
> > @@ -1035,11 +1067,11 @@ (define_insn "one_cmpl<mode>2"
> >  )
> >
> >  (define_insn "aarch64_simd_vec_set<mode>"
> > -  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
> > -	(vec_merge:VALL_F16
> > -	    (vec_duplicate:VALL_F16
> > +  [(set (match_operand:VMOVE 0 "register_operand" "=w,w,w")
> > +	(vec_merge:VMOVE
> > +	    (vec_duplicate:VMOVE
> >  		(match_operand:<VEL> 1
> "aarch64_simd_nonimmediate_operand" "w,?r,Utv"))
> > -	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
> > +	    (match_operand:VMOVE 3 "register_operand" "0,0,0")
> >  	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
> >    "TARGET_SIMD"
> >    {
> > @@ -1061,14 +1093,14 @@ (define_insn "aarch64_simd_vec_set<mode>"
> >  )
> >
> >  (define_insn "@aarch64_simd_vec_copy_lane<mode>"
> > -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> > -	(vec_merge:VALL_F16
> > -	    (vec_duplicate:VALL_F16
> > +  [(set (match_operand:VMOVE 0 "register_operand" "=w")
> > +	(vec_merge:VMOVE
> > +	    (vec_duplicate:VMOVE
> >  	      (vec_select:<VEL>
> > -		(match_operand:VALL_F16 3 "register_operand" "w")
> > +		(match_operand:VMOVE 3 "register_operand" "w")
> >  		(parallel
> >  		  [(match_operand:SI 4 "immediate_operand" "i")])))
> > -	    (match_operand:VALL_F16 1 "register_operand" "0")
> > +	    (match_operand:VMOVE 1 "register_operand" "0")
> >  	    (match_operand:SI 2 "immediate_operand" "i")))]
> >    "TARGET_SIMD"
> >    {
> > @@ -1376,7 +1408,7 @@ (define_insn "vec_shr_<mode>"
> >  )
> >
> >  (define_expand "vec_set<mode>"
> > -  [(match_operand:VALL_F16 0 "register_operand")
> > +  [(match_operand:VMOVE 0 "register_operand")
> >     (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand")
> >     (match_operand:SI 2 "immediate_operand")]
> >    "TARGET_SIMD"
> > @@ -3495,7 +3527,7 @@ (define_insn "popcount<mode>2"
> >  ;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function.  (This is FP
> smax/smin).
> >  (define_expand "reduc_<optab>_scal_<mode>"
> >    [(match_operand:<VEL> 0 "register_operand")
> > -   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
> > +   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
> >  		 FMAXMINV)]
> >    "TARGET_SIMD"
> >    {
> > @@ -3510,7 +3542,7 @@ (define_expand "reduc_<optab>_scal_<mode>"
> >
> >  (define_expand "reduc_<fmaxmin>_scal_<mode>"
> >    [(match_operand:<VEL> 0 "register_operand")
> > -   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
> > +   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
> >  		 FMAXMINNMV)]
> >    "TARGET_SIMD"
> >    {
> > @@ -3554,8 +3586,8 @@ (define_insn
> "aarch64_reduc_<optab>_internalv2si"
> >  )
> >
> >  (define_insn "aarch64_reduc_<optab>_internal<mode>"
> > - [(set (match_operand:VHSDF 0 "register_operand" "=w")
> > -       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
> > + [(set (match_operand:VHSDF_P 0 "register_operand" "=w")
> > +       (unspec:VHSDF_P [(match_operand:VHSDF_P 1 "register_operand"
> > + "w")]
> >  		      FMAXMINV))]
> >   "TARGET_SIMD"
> >   "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
> > @@ -4200,7 +4232,7 @@ (define_insn
> "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
> >  (define_insn_and_split "aarch64_get_lane<mode>"
> >    [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand"
> "=?r, w, Utv")
> >  	(vec_select:<VEL>
> > -	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
> > +	  (match_operand:VMOVE 1 "register_operand" "w, w, w")
> >  	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
> >    "TARGET_SIMD"
> >    {
> > @@ -7981,7 +8013,7 @@ (define_expand "aarch64_st1<VALL_F16:mode>"
> >  ;; Standard pattern name vec_init<mode><Vel>.
> >
> >  (define_expand "vec_init<mode><Vel>"
> > -  [(match_operand:VALL_F16 0 "register_operand")
> > +  [(match_operand:VMOVE 0 "register_operand")
> >     (match_operand 1 "" "")]
> >    "TARGET_SIMD"
> >  {
> > @@ -8060,7 +8092,7 @@ (define_insn "aarch64_urecpe<mode>"
> >
> >  (define_expand "vec_extract<mode><Vel>"
> >    [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand")
> > -   (match_operand:VALL_F16 1 "register_operand")
> > +   (match_operand:VMOVE 1 "register_operand")
> >     (match_operand:SI 2 "immediate_operand")]
> >    "TARGET_SIMD"
> >  {
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc index
> >
> 84dbe2f4ea7d03b424602ed98a34e7824217dc91..35671cb86e374f9ded21d0e4
> 944c
> > 63bc2cbc0901 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -3566,6 +3566,7 @@ aarch64_classify_vector_mode (machine_mode
> mode)
> >      case E_V8BFmode:
> >      case E_V4SFmode:
> >      case E_V2DFmode:
> > +    case E_V2HFmode:
> >        return TARGET_SIMD ? VEC_ADVSIMD : 0;
> >
> >      default:
> > diff --git a/gcc/config/aarch64/iterators.md
> > b/gcc/config/aarch64/iterators.md index
> >
> 37d8161a33b1c399d80be82afa67613a087389d4..dfcf86a440e316c2abdbcc6463
> 63
> > d39e458d1a91 100644
> > --- a/gcc/config/aarch64/iterators.md
> > +++ b/gcc/config/aarch64/iterators.md
> > @@ -160,6 +160,10 @@ (define_mode_iterator VDQF [V2SF V4SF V2DF])
> > (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
> >  			     (V8HF "TARGET_SIMD_F16INST")
> >  			     V2SF V4SF V2DF])
> > +;; Advanced SIMD Float modes suitable for pairwise operations.
> > +(define_mode_iterator VHSDF_P [(V4HF "TARGET_SIMD_F16INST")
> > +			       (V8HF "TARGET_SIMD_F16INST")
> > +			       V2SF V4SF V2DF (V2HF
> "TARGET_SIMD_F16INST")])
> 
> Maybe "reduction or pairwise operations"?  Otherwise it isn't obvious why
> V4HF, V8HF and V4SF are included.
> 
> >
> >  ;; Advanced SIMD Float modes, and DF.
> >  (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF]) @@ -188,15
> +192,23
> > @@ (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF
> V2DI])
> > (define_mode_iterator VALLF [V2SF V4SF V2DF SF DF])
> >
> >  ;; Advanced SIMD Float modes with 2 elements.
> > -(define_mode_iterator V2F [V2SF V2DF])
> > +(define_mode_iterator V2F [V2SF V2DF V2HF])
> >
> >  ;; All Advanced SIMD modes on which we support any arithmetic
> operations.
> >  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF
> > V4SF V2DF])
> >
> > -;; All Advanced SIMD modes suitable for moving, loading, and storing.
> > +;; All Advanced SIMD modes suitable for moving, loading, and storing
> > +;; except V2HF.
> 
> I'd prefer:
> 
> ;; The set of all modes for which vld1 intrinsics are provided.
> 
> otherwise it isn't clear why V2HF is a special case.
> 
> >  (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
> >  				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
> >
> > +;; All Advanced SIMD modes suitable for moving, loading, and storing
> > +;; including V2HF (define_mode_iterator VMOVE [V8QI V16QI V4HI V8HI
> > +V2SI V4SI V2DI
> > +			     V4HF V8HF V4BF V8BF V2SF V4SF V2DF
> > +			     (V2HF "TARGET_SIMD_F16INST")])
> > +
> > +
> >  ;; The VALL_F16 modes except the 128-bit 2-element ones.
> >  (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI
> V4SI
> >  				V4HF V8HF V2SF V4SF])
> > @@ -1076,7 +1088,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI
> "16")
> >  			  (V2SF "2") (V4SF "4")
> >  			  (V1DF "1") (V2DF "2")
> >  			  (DI "1") (DF "1")
> > -			  (V8DI "8")])
> > +			  (V8DI "8") (V2HF "2")])
> >
> >  ;; Map a mode to the number of bits in it, if the size of the mode
> > ;; is constant.
> > @@ -1090,6 +1102,7 @@ (define_mode_attr s [(HF "h") (SF "s") (DF "d")
> > (SI "s") (DI "d")])
> >
> >  ;; Give the length suffix letter for a sign- or zero-extension.
> >  (define_mode_attr size [(QI "b") (HI "h") (SI "w")])
> > +(define_mode_attr sizel [(QI "b") (HI "h") (SI "")])
> >
> >  ;; Give the number of bits in the mode  (define_mode_attr sizen [(QI
> > "8") (HI "16") (SI "32") (DI "64")])
> 
> Looks like this isn't used in the patch, so could be dropped.
> 
> OK with those changes, thanks.
> 
> Richard
> 
> > @@ -1193,7 +1206,7 @@ (define_mode_attr Vmntype [(V8HI ".8b") (V4SI
> > ".4h")  (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
> >  			  (V4HI "h") (V8HI  "h")
> >  			  (V2SI "s") (V4SI  "s")
> > -			  (V2DI "d")
> > +			  (V2DI "d") (V2HF  "h")
> >  			  (V4HF "h") (V8HF  "h")
> >  			  (V2SF "s") (V4SF  "s")
> >  			  (V2DF "d")
> > @@ -1285,7 +1298,7 @@ (define_mode_attr Vcwtype [(VNx16QI "b")
> (VNx8QI
> > "h") (VNx4QI "w") (VNx2QI "d")  ;; more accurately.
> >  (define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
> >  			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
> > -			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
> > +			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d") (V2HF
> "s")
> >  			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
> >  			 (SI "s") (DI "d")])
> >
> > @@ -1360,8 +1373,8 @@ (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
> >  		       (V4HF "HF") (V8HF  "HF")
> >  		       (V2SF "SF") (V4SF  "SF")
> >  		       (DF   "DF") (V2DF  "DF")
> > -		       (SI   "SI") (HI    "HI")
> > -		       (QI   "QI")
> > +		       (SI   "SI") (V2HF  "HF")
> > +		       (QI   "QI") (HI    "HI")
> >  		       (V4BF "BF") (V8BF "BF")
> >  		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI
> "QI")
> >  		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI") @@ -1381,7
> +1394,7
> > @@ (define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
> >  		       (V2SF "sf") (V4SF "sf")
> >  		       (V2DF "df") (DF   "df")
> >  		       (SI   "si") (HI   "hi")
> > -		       (QI   "qi")
> > +		       (QI   "qi") (V2HF "hf")
> >  		       (V4BF "bf") (V8BF "bf")
> >  		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
> >  		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi") @@ -1866,7
> +1879,7
> > @@ (define_mode_attr q [(V8QI "") (V16QI "_q")
> >  		     (V4HF "") (V8HF "_q")
> >  		     (V4BF "") (V8BF "_q")
> >  		     (V2SF "") (V4SF  "_q")
> > -			       (V2DF  "_q")
> > +		     (V2HF "") (V2DF  "_q")
> >  		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")
> >  		     (V2x8QI "") (V2x16QI "_q")
> >  		     (V2x4HI "") (V2x8HI "_q")
> > @@ -1905,6 +1918,7 @@ (define_mode_attr vp [(V8QI "v") (V16QI "v")
> >  		      (V2SI "p") (V4SI  "v")
> >  		      (V2DI "p") (V2DF  "p")
> >  		      (V2SF "p") (V4SF  "v")
> > +		      (V2HF "p")
> >  		      (V4HF "v") (V8HF  "v")])
> >
> >  (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi") diff --git
> > a/gcc/config/arm/types.md b/gcc/config/arm/types.md index
> >
> 7d0504bdd944e9c0d1b545b0b66a9a1adc808714..3cfbc7a93cca1bea4925853e5
> 1d0
> > a147c5722247 100644
> > --- a/gcc/config/arm/types.md
> > +++ b/gcc/config/arm/types.md
> > @@ -483,6 +483,7 @@ (define_attr "autodetect_type"
> >  ; neon_fp_minmax_s_q
> >  ; neon_fp_minmax_d
> >  ; neon_fp_minmax_d_q
> > +; neon_fp_reduc_add_h
> >  ; neon_fp_reduc_add_s
> >  ; neon_fp_reduc_add_s_q
> >  ; neon_fp_reduc_add_d
> > @@ -1033,6 +1034,7 @@ (define_attr "type"
> >    neon_fp_minmax_d,\
> >    neon_fp_minmax_d_q,\
> >  \
> > +  neon_fp_reduc_add_h,\
> >    neon_fp_reduc_add_s,\
> >    neon_fp_reduc_add_s_q,\
> >    neon_fp_reduc_add_d,\
> > @@ -1257,8 +1259,8 @@ (define_attr "is_neon_type" "yes,no"
> >            neon_fp_compare_d, neon_fp_compare_d_q, neon_fp_minmax_s,\
> >            neon_fp_minmax_s_q, neon_fp_minmax_d,
> neon_fp_minmax_d_q,\
> >            neon_fp_neg_s, neon_fp_neg_s_q, neon_fp_neg_d,
> neon_fp_neg_d_q,\
> > -          neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,
> neon_fp_reduc_add_d,\
> > -          neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,
> > +          neon_fp_reduc_add_h, neon_fp_reduc_add_s,
> neon_fp_reduc_add_s_q,\
> > +          neon_fp_reduc_add_d, neon_fp_reduc_add_d_q,
> > + neon_fp_reduc_minmax_s,\
> >            neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d,\
> >            neon_fp_reduc_minmax_d_q,\
> >            neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\ diff --git
> > a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> > index
> >
> 07d71a63414b1066ea431e287286ad048515711a..e6021c5a42748701e5326a5c3
> 87a
> > 39a0bbadc9e5 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> > @@ -30,11 +30,9 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int
> n)	\
> >  TEST_ALL (VEC_PERM)
> >
> >  /* We should use one DUP for each of the 8-, 16- and 32-bit types,
> > -   although we currently use LD1RW for _Float16.  We should use two
> > -   DUPs for each of the three 64-bit types.  */
> > +   We should use two DUPs for each of the three 64-bit types.  */
> >  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } }
> > */
> > -/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } }
> > */
> > -/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
> > +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } }
> > +*/
> >  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } }
> > */
> >  /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d,
> > z[0-9]+\.d\n} 3 } } */
> >  /* { dg-final { scan-assembler-not {\tzip2\t} } } */ @@ -53,7 +51,7
> > @@ TEST_ALL (VEC_PERM)
> >  /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
> >  /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
> >  /* { dg-final { scan-assembler-not {\tldr} } } */
> > -/* { dg-final { scan-assembler-times {\tstr} 2 } } */
> > -/* { dg-final { scan-assembler-times {\tstr\th[0-9]+} 2 } } */
> > +/* { dg-final { scan-assembler-not {\tstr} } } */
> > +/* { dg-final { scan-assembler-not {\tstr\th[0-9]+} } } */
> >
> >  /* { dg-final { scan-assembler-not {\tuqdec} } } */
  
Richard Sandiford Dec. 6, 2022, 11:05 a.m. UTC | #8
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Tuesday, December 6, 2022 10:28 AM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> Subject: Re: [PATCH 5/8]AArch64 aarch64: Make existing V2HF be usable.
>> 
>> Tamar Christina <Tamar.Christina@arm.com> writes:
>> > Hi,
>> >
>> >
>> >> This name might cause confusion with the SVE iterators, where FULL
>> >> means "every bit of the register is used".  How about something like
>> >> VMOVE instead?
>> >>
>> >> With this change, I guess VALL_F16 represents "The set of all modes
>> >> for which the vld1 intrinsics are provided" and VMOVE or whatever is
>> >> "All Advanced SIMD modes suitable for moving, loading, and storing".
>> >> That is, VMOVE extends VALL_F16 with modes that are not manifested
>> >> via intrinsics.
>> >>
>> >
>> > Done.
>> >
>> >> Where is the 2h used, and is it valid syntax in that context?
>> >>
>> >> Same for later instances of 2h.
>> >
>> > They are, but they weren't meant to be in this patch.  They belong in
>> > a separate FP16 series that I won't get to finish for GCC 13 due to not
>> > being able to finish writing all the tests.  I have moved them to that patch
>> series though.
>> >
>> > While the addp patch series has been killed, this patch is still good
>> > standalone and improves codegen as shown in the updated testcase.
>> >
>> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>> >
>> > Ok for master?
>> >
>> > Thanks,
>> > Tamar
>> >
>> > gcc/ChangeLog:
>> >
>> > 	* config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
>> > 	(mov<mode>, movmisalign<mode>, aarch64_dup_lane<mode>,
>> > 	aarch64_store_lane0<mode>, aarch64_simd_vec_set<mode>,
>> > 	@aarch64_simd_vec_copy_lane<mode>, vec_set<mode>,
>> > 	reduc_<optab>_scal_<mode>, reduc_<fmaxmin>_scal_<mode>,
>> > 	aarch64_reduc_<optab>_internal<mode>,
>> aarch64_get_lane<mode>,
>> > 	vec_init<mode><Vel>, vec_extract<mode><Vel>): Support V2HF.
>> > 	(aarch64_simd_dupv2hf): New.
>> > 	* config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
>> > 	Add E_V2HFmode.
>> > 	* config/aarch64/iterators.md (VHSDF_P): New.
>> > 	(V2F, VMOVE, nunits, Vtype, Vmtype, Vetype, stype, VEL,
>> > 	Vel, q, vp): Add V2HF.
>> > 	* config/arm/types.md (neon_fp_reduc_add_h): New.
>> >
>> > gcc/testsuite/ChangeLog:
>> >
>> > 	* gcc.target/aarch64/sve/slp_1.c: Update testcase.
>> >
>> > --- inline copy of patch ---
>> >
>> > diff --git a/gcc/config/aarch64/aarch64-simd.md
>> > b/gcc/config/aarch64/aarch64-simd.md
>> > index
>> >
>> f4152160084d6b6f34bd69f0ba6386c1ab50f77e..487a31010245accec28e779661
>> e6
>> > c2d578fca4b7 100644
>> > --- a/gcc/config/aarch64/aarch64-simd.md
>> > +++ b/gcc/config/aarch64/aarch64-simd.md
>> > @@ -19,10 +19,10 @@
>> >  ;; <http://www.gnu.org/licenses/>.
>> >
>> >  (define_expand "mov<mode>"
>> > -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
>> > -	(match_operand:VALL_F16 1 "general_operand"))]
>> > +  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
>> > +	(match_operand:VMOVE 1 "general_operand"))]
>> >    "TARGET_SIMD"
>> > -  "
>> > +{
>> >    /* Force the operand into a register if it is not an
>> >       immediate whose use can be replaced with xzr.
>> >       If the mode is 16 bytes wide, then we will be doing @@ -46,12
>> > +46,11 @@ (define_expand "mov<mode>"
>> >        aarch64_expand_vector_init (operands[0], operands[1]);
>> >        DONE;
>> >      }
>> > -  "
>> > -)
>> > +})
>> >
>> >  (define_expand "movmisalign<mode>"
>> > -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
>> > -        (match_operand:VALL_F16 1 "general_operand"))]
>> > +  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
>> > +        (match_operand:VMOVE 1 "general_operand"))]
>> >    "TARGET_SIMD && !STRICT_ALIGNMENT"
>> >  {
>> >    /* This pattern is not permitted to fail during expansion: if both
>> > arguments @@ -73,6 +72,16 @@ (define_insn
>> "aarch64_simd_dup<mode>"
>> >    [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
>> >  )
>> >
>> > +(define_insn "aarch64_simd_dupv2hf"
>> > +  [(set (match_operand:V2HF 0 "register_operand" "=w")
>> > +	(vec_duplicate:V2HF
>> > +	  (match_operand:HF 1 "register_operand" "0")))]
>> 
>> Seems like this should be "w" rather than "0", since SLI is a two-register
>> instruction.
>
> Yes, but for a dup it's only valid when the same register is used, i.e. it has to
> write into the original src register.

Ah, right.  In that case it might be better to use %d0 for the source
operand:

  For operands to match in a particular case usually means that they
  are identical-looking RTL expressions.  But in a few special cases
  specific kinds of dissimilarity are allowed.  For example, @code{*x}
  as an input operand will match @code{*x++} as an output operand.
  For proper results in such cases, the output template should always
  use the output-operand's number when printing the operand.

Thanks,
Richard
  

Patch

--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,10 +19,10 @@ 
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
+	(match_operand:VALL_F16_FULL 1 "general_operand"))]
   "TARGET_SIMD"
-  "
+{
   /* Force the operand into a register if it is not an
      immediate whose use can be replaced with xzr.
      If the mode is 16 bytes wide, then we will be doing
@@ -46,12 +46,11 @@  (define_expand "mov<mode>"
       aarch64_expand_vector_init (operands[0], operands[1]);
       DONE;
     }
-  "
-)
+})
 
 (define_expand "movmisalign<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-        (match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
+        (match_operand:VALL_F16_FULL 1 "general_operand"))]
   "TARGET_SIMD && !STRICT_ALIGNMENT"
 {
   /* This pattern is not permitted to fail during expansion: if both arguments
@@ -85,10 +84,10 @@  (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_dup_lane<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_duplicate:VALL_F16
+  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w")
+	(vec_duplicate:VALL_F16_FULL
 	  (vec_select:<VEL>
-	    (match_operand:VALL_F16 1 "register_operand" "w")
+	    (match_operand:VALL_F16_FULL 1 "register_operand" "w")
 	    (parallel [(match_operand:SI 2 "immediate_operand" "i")])
           )))]
   "TARGET_SIMD"
@@ -142,6 +141,29 @@  (define_insn "*aarch64_simd_mov<VDMOV:mode>"
 		     mov_reg, neon_move<q>")]
 )
 
+(define_insn "*aarch64_simd_movv2hf"
+  [(set (match_operand:V2HF 0 "nonimmediate_operand"
+		"=w, m,  m,  w, ?r, ?w, ?r, w, w")
+	(match_operand:V2HF 1 "general_operand"
+		"m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
+  "TARGET_SIMD_F16INST
+   && (register_operand (operands[0], V2HFmode)
+       || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
+   "@
+    ldr\\t%s0, %1
+    str\\twzr, %0
+    str\\t%s1, %0
+    mov\\t%0.2s[0], %1.2s[0]
+    umov\\t%w0, %1.s[0]
+    fmov\\t%s0, %1
+    mov\\t%0, %1
+    movi\\t%d0, 0
+    * return aarch64_output_simd_mov_immediate (operands[1], 32);"
+  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
+		     neon_logic, neon_to_gp, f_mcr,\
+		     mov_reg, neon_move, neon_move")]
+)
+
 (define_insn "*aarch64_simd_mov<VQMOV:mode>"
   [(set (match_operand:VQMOV 0 "nonimmediate_operand"
 		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
@@ -182,7 +204,7 @@  (define_insn "*aarch64_simd_mov<VQMOV:mode>"
 
 (define_insn "aarch64_store_lane0<mode>"
   [(set (match_operand:<VEL> 0 "memory_operand" "=m")
-	(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w")
+	(vec_select:<VEL> (match_operand:VALL_F16_FULL 1 "register_operand" "w")
 			(parallel [(match_operand 2 "const_int_operand" "n")])))]
   "TARGET_SIMD
    && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
@@ -1035,11 +1057,11 @@  (define_insn "one_cmpl<mode>2"
 )
 
 (define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
-	(vec_merge:VALL_F16
-	    (vec_duplicate:VALL_F16
+  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w,w,w")
+	(vec_merge:VALL_F16_FULL
+	    (vec_duplicate:VALL_F16_FULL
 		(match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "w,?r,Utv"))
-	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
+	    (match_operand:VALL_F16_FULL 3 "register_operand" "0,0,0")
 	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
   "TARGET_SIMD"
   {
@@ -1061,14 +1083,14 @@  (define_insn "aarch64_simd_vec_set<mode>"
 )
 
 (define_insn "@aarch64_simd_vec_copy_lane<mode>"
-  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-	(vec_merge:VALL_F16
-	    (vec_duplicate:VALL_F16
+  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w")
+	(vec_merge:VALL_F16_FULL
+	    (vec_duplicate:VALL_F16_FULL
 	      (vec_select:<VEL>
-		(match_operand:VALL_F16 3 "register_operand" "w")
+		(match_operand:VALL_F16_FULL 3 "register_operand" "w")
 		(parallel
 		  [(match_operand:SI 4 "immediate_operand" "i")])))
-	    (match_operand:VALL_F16 1 "register_operand" "0")
+	    (match_operand:VALL_F16_FULL 1 "register_operand" "0")
 	    (match_operand:SI 2 "immediate_operand" "i")))]
   "TARGET_SIMD"
   {
@@ -1376,7 +1398,7 @@  (define_insn "vec_shr_<mode>"
 )
 
 (define_expand "vec_set<mode>"
-  [(match_operand:VALL_F16 0 "register_operand")
+  [(match_operand:VALL_F16_FULL 0 "register_operand")
    (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand")
    (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
@@ -3503,7 +3525,7 @@  (define_insn "popcount<mode>2"
 ;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function.  (This is FP smax/smin).
 (define_expand "reduc_<optab>_scal_<mode>"
   [(match_operand:<VEL> 0 "register_operand")
-   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
+   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
 		 FMAXMINV)]
   "TARGET_SIMD"
   {
@@ -3518,7 +3540,7 @@  (define_expand "reduc_<optab>_scal_<mode>"
 
 (define_expand "reduc_<fmaxmin>_scal_<mode>"
   [(match_operand:<VEL> 0 "register_operand")
-   (unspec:<VEL> [(match_operand:VHSDF 1 "register_operand")]
+   (unspec:<VEL> [(match_operand:VHSDF_P 1 "register_operand")]
 		 FMAXMINNMV)]
   "TARGET_SIMD"
   {
@@ -3562,8 +3584,8 @@  (define_insn "aarch64_reduc_<optab>_internalv2si"
 )
 
 (define_insn "aarch64_reduc_<optab>_internal<mode>"
- [(set (match_operand:VHSDF 0 "register_operand" "=w")
-       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
+ [(set (match_operand:VHSDF_P 0 "register_operand" "=w")
+       (unspec:VHSDF_P [(match_operand:VHSDF_P 1 "register_operand" "w")]
 		      FMAXMINV))]
  "TARGET_SIMD"
  "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
@@ -4208,7 +4230,7 @@  (define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
 (define_insn_and_split "aarch64_get_lane<mode>"
   [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv")
 	(vec_select:<VEL>
-	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
+	  (match_operand:VALL_F16_FULL 1 "register_operand" "w, w, w")
 	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
   "TARGET_SIMD"
   {
@@ -7989,7 +8011,7 @@  (define_expand "aarch64_st1<VALL_F16:mode>"
 ;; Standard pattern name vec_init<mode><Vel>.
 
 (define_expand "vec_init<mode><Vel>"
-  [(match_operand:VALL_F16 0 "register_operand")
+  [(match_operand:VALL_F16_FULL 0 "register_operand")
    (match_operand 1 "" "")]
   "TARGET_SIMD"
 {
@@ -8068,7 +8090,7 @@  (define_insn "aarch64_urecpe<mode>"
 
 (define_expand "vec_extract<mode><Vel>"
   [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand")
-   (match_operand:VALL_F16 1 "register_operand")
+   (match_operand:VALL_F16_FULL 1 "register_operand")
    (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
 {
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f05bac713e88ea8c7feaa2367d55bd523ca66f57..1e08f8453688210afe1566092b19b59c9bdd0c97 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3566,6 +3566,7 @@  aarch64_classify_vector_mode (machine_mode mode)
     case E_V8BFmode:
     case E_V4SFmode:
     case E_V2DFmode:
+    case E_V2HFmode:
       return TARGET_SIMD ? VEC_ADVSIMD : 0;
 
     default:
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 37d8161a33b1c399d80be82afa67613a087389d4..1df09f7fe2eb35aed96113476541e0faa5393551 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -160,6 +160,10 @@  (define_mode_iterator VDQF [V2SF V4SF V2DF])
 (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
 			     (V8HF "TARGET_SIMD_F16INST")
 			     V2SF V4SF V2DF])
+;; Advanced SIMD Float modes suitable for pairwise operations.
+(define_mode_iterator VHSDF_P [(V4HF "TARGET_SIMD_F16INST")
+			       (V8HF "TARGET_SIMD_F16INST")
+			       V2SF V4SF V2DF (V2HF "TARGET_SIMD_F16INST")])
 
 ;; Advanced SIMD Float modes, and DF.
 (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF])
@@ -188,15 +192,23 @@  (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])
 (define_mode_iterator VALLF [V2SF V4SF V2DF SF DF])
 
 ;; Advanced SIMD Float modes with 2 elements.
-(define_mode_iterator V2F [V2SF V2DF])
+(define_mode_iterator V2F [V2SF V2DF V2HF])
 
 ;; All Advanced SIMD modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
-;; All Advanced SIMD modes suitable for moving, loading, and storing.
+;; All Advanced SIMD modes suitable for moving, loading, and storing
+;; except V2HF.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing
+;; including V2HF.
+(define_mode_iterator VALL_F16_FULL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				     V4HF V8HF V4BF V8BF V2SF V4SF V2DF
+				     (V2HF "TARGET_SIMD_F16INST")])
+
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
 				V4HF V8HF V2SF V4SF])
@@ -1076,7 +1088,7 @@  (define_mode_attr nunits [(V8QI "8") (V16QI "16")
 			  (V2SF "2") (V4SF "4")
 			  (V1DF "1") (V2DF "2")
 			  (DI "1") (DF "1")
-			  (V8DI "8")])
+			  (V8DI "8") (V2HF "2")])
 
 ;; Map a mode to the number of bits in it, if the size of the mode
 ;; is constant.
@@ -1090,6 +1102,7 @@  (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
 
 ;; Give the length suffix letter for a sign- or zero-extension.
 (define_mode_attr size [(QI "b") (HI "h") (SI "w")])
+(define_mode_attr sizel [(QI "b") (HI "h") (SI "")])
 
 ;; Give the number of bits in the mode
 (define_mode_attr sizen [(QI "8") (HI "16") (SI "32") (DI "64")])
@@ -1134,8 +1147,9 @@  (define_mode_attr Vtype [(V8QI "8b") (V16QI "16b")
                          (V2SI "2s") (V4SI  "4s")
                          (DI   "1d") (DF    "1d")
                          (V2DI "2d") (V2SF "2s")
-			 (V4SF "4s") (V2DF "2d")
-			 (V4HF "4h") (V8HF "8h")
+			 (V2HF "2h") (V4SF "4s")
+			 (V2DF "2d") (V4HF "4h")
+			 (V8HF "8h")
 			 (V2x8QI "8b") (V2x4HI "4h")
 			 (V2x2SI "2s") (V2x1DI  "1d")
 			 (V2x4HF "4h") (V2x2SF "2s")
@@ -1175,9 +1189,10 @@  (define_mode_attr Vmtype [(V8QI ".8b") (V16QI ".16b")
 			 (V4HI ".4h") (V8HI  ".8h")
 			 (V2SI ".2s") (V4SI  ".4s")
 			 (V2DI ".2d") (V4HF ".4h")
-			 (V8HF ".8h") (V4BF ".4h")
-			 (V8BF ".8h") (V2SF ".2s")
-			 (V4SF ".4s") (V2DF ".2d")
+			 (V8HF ".8h") (V2HF ".2h")
+			 (V4BF ".4h") (V8BF ".8h")
+			 (V2SF ".2s") (V4SF ".4s")
+			 (V2DF ".2d")
 			 (DI   "")    (SI   "")
 			 (HI   "")    (QI   "")
 			 (TI   "")    (HF   "")
@@ -1193,7 +1208,7 @@  (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h")
 (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
 			  (V4HI "h") (V8HI  "h")
 			  (V2SI "s") (V4SI  "s")
-			  (V2DI "d")
+			  (V2DI "d") (V2HF  "h")
 			  (V4HF "h") (V8HF  "h")
 			  (V2SF "s") (V4SF  "s")
 			  (V2DF "d")
@@ -1285,7 +1300,7 @@  (define_mode_attr Vcwtype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "w") (VNx2QI "d")
 ;; more accurately.
 (define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
 			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
-			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
+			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d") (V2HF "s")
 			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
 			 (SI "s") (DI "d")])
 
@@ -1360,8 +1375,8 @@  (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
 		       (V4HF "HF") (V8HF  "HF")
 		       (V2SF "SF") (V4SF  "SF")
 		       (DF   "DF") (V2DF  "DF")
-		       (SI   "SI") (HI    "HI")
-		       (QI   "QI")
+		       (SI   "SI") (V2HF  "HF")
+		       (QI   "QI") (HI    "HI")
 		       (V4BF "BF") (V8BF "BF")
 		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
 		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
@@ -1381,7 +1396,7 @@  (define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
 		       (V2SF "sf") (V4SF "sf")
 		       (V2DF "df") (DF   "df")
 		       (SI   "si") (HI   "hi")
-		       (QI   "qi")
+		       (QI   "qi") (V2HF "hf")
 		       (V4BF "bf") (V8BF "bf")
 		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
 		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
@@ -1866,7 +1881,7 @@  (define_mode_attr q [(V8QI "") (V16QI "_q")
 		     (V4HF "") (V8HF "_q")
 		     (V4BF "") (V8BF "_q")
 		     (V2SF "") (V4SF  "_q")
-			       (V2DF  "_q")
+		     (V2HF "") (V2DF  "_q")
 		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")
 		     (V2x8QI "") (V2x16QI "_q")
 		     (V2x4HI "") (V2x8HI "_q")
@@ -1905,6 +1920,7 @@  (define_mode_attr vp [(V8QI "v") (V16QI "v")
 		      (V2SI "p") (V4SI  "v")
 		      (V2DI "p") (V2DF  "p")
 		      (V2SF "p") (V4SF  "v")
+		      (V2HF "p")
 		      (V4HF "v") (V8HF  "v")])
 
 (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index 7d0504bdd944e9c0d1b545b0b66a9a1adc808714..3cfbc7a93cca1bea4925853e51d0a147c5722247 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -483,6 +483,7 @@  (define_attr "autodetect_type"
 ; neon_fp_minmax_s_q
 ; neon_fp_minmax_d
 ; neon_fp_minmax_d_q
+; neon_fp_reduc_add_h
 ; neon_fp_reduc_add_s
 ; neon_fp_reduc_add_s_q
 ; neon_fp_reduc_add_d
@@ -1033,6 +1034,7 @@  (define_attr "type"
   neon_fp_minmax_d,\
   neon_fp_minmax_d_q,\
 \
+  neon_fp_reduc_add_h,\
   neon_fp_reduc_add_s,\
   neon_fp_reduc_add_s_q,\
   neon_fp_reduc_add_d,\
@@ -1257,8 +1259,8 @@  (define_attr "is_neon_type" "yes,no"
           neon_fp_compare_d, neon_fp_compare_d_q, neon_fp_minmax_s,\
           neon_fp_minmax_s_q, neon_fp_minmax_d, neon_fp_minmax_d_q,\
           neon_fp_neg_s, neon_fp_neg_s_q, neon_fp_neg_d, neon_fp_neg_d_q,\
-          neon_fp_reduc_add_s, neon_fp_reduc_add_s_q, neon_fp_reduc_add_d,\
-          neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,
+          neon_fp_reduc_add_h, neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
+          neon_fp_reduc_add_d, neon_fp_reduc_add_d_q, neon_fp_reduc_minmax_s,\
           neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d,\
           neon_fp_reduc_minmax_d_q,\
           neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
index 07d71a63414b1066ea431e287286ad048515711a..8e35e0b574d49913b43c7d8d4f4ba75f127f42e9 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
@@ -30,11 +30,9 @@  vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)	\
 TEST_ALL (VEC_PERM)
 
 /* We should use one DUP for each of the 8-, 16- and 32-bit types,
-   although we currently use LD1RW for _Float16.  We should use two
-   DUPs for each of the three 64-bit types.  */
+   and two DUPs for each of the three 64-bit types.  */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } } */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
 /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tzip2\t} } } */