[middle-end] - Lower store and load neon builtins to gimple

Message ID VE1PR08MB4896ED647FEF8DA13E8620748ADC9@VE1PR08MB4896.eurprd08.prod.outlook.com
State New
Series [middle-end] - Lower store and load neon builtins to gimple

Commit Message

Jirui Wu Sept. 16, 2021, 1:39 p.m. UTC
  Hi all,

This patch lowers the vld1 and vst1 variants of the
store and load neon builtin functions to gimple.

The changes in this patch cover:
* Replacing calls to the vld1 and vst1 variants of the builtins
* Using MEM_REF gimple assignments to generate better code (sketched below)
* Updating test cases to prevent over-optimization
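
For illustration, on little-endian targets the fold turns an intrinsic
load into a plain gimple load. A minimal sketch (the gimple shown in the
comment is illustrative, not an exact dump):

  #include <arm_neon.h>

  int32x4_t
  foo (int32_t *a)
  {
    /* Before this patch the intrinsic survives gimple as an opaque
       builtin call, which blocks CSE and store-to-load forwarding.
       After the fold it becomes a plain vector load, roughly:
         _1 = MEM[(int32x4_t *)a_2(D)];  */
    return vld1q_s32 (a);
  }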

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

OK for master? If so, can it be committed for me? I have no commit rights.

Thanks,
Jirui

gcc/ChangeLog:

        * config/aarch64/aarch64-builtins.c (aarch64_general_gimple_fold_builtin):
Lower vld1 and vst1 variants of the neon builtins to gimple.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/fmla_intrinsic_1.c:
Prevent over-optimization.
        * gcc.target/aarch64/fmls_intrinsic_1.c:
Prevent over-optimization.
        * gcc.target/aarch64/fmul_intrinsic_1.c:
Prevent over-optimization.
        * gcc.target/aarch64/mla_intrinsic_1.c:
Prevent over-optimization.
        * gcc.target/aarch64/mls_intrinsic_1.c:
Prevent over-optimization.
        * gcc.target/aarch64/mul_intrinsic_1.c:
Prevent over-optimization.
        * gcc.target/aarch64/simd/vmul_elem_1.c:
Prevent over-optimization.
        * gcc.target/aarch64/vclz.c:
Replace macro with function to prevent over-optimization.
        * gcc.target/aarch64/vneg_s.c:
Replace macro with function to prevent over-optimization.
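
The "prevent over-optimization" entries rely on two idioms: marking test
helpers noipa/noinline so the intrinsic sequence is not constant-folded
at the call site, and inserting an empty asm with a "memory" clobber
after a vst1* store so the stored values are re-read from memory rather
than forwarded into the checks. A minimal sketch (the function name is
illustrative, not from the testsuite):

  #include <arm_neon.h>
  extern void abort (void);

  void __attribute__ ((noipa, noinline))
  check (int32_t *res, int32x2_t v)
  {
    vst1_s32 (res, v);
    /* Without the clobber, the now-visible vector store could be
       value-forwarded into the comparison below.  */
    asm volatile ("" : : : "memory");
    for (int i = 0; i < 2; i++)
      if (res[i] != v[i])
        abort ();
  }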

Comments

Richard Biener Sept. 16, 2021, 1:58 p.m. UTC | #1
On Thu, 16 Sep 2021, Jirui Wu wrote:

> Hi all,
> 
> This patch lowers the vld1 and vst1 variants of the
> store and load neon builtin functions to gimple.
> 
> The changes in this patch cover:
> * Replacing calls to the vld1 and vst1 variants of the builtins
> * Using MEM_REF gimple assignments to generate better code
> * Updating test cases to prevent over-optimization
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> 
> OK for master? If so, can it be committed for me? I have no commit rights.

+           new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
+                                           fold_build2 (MEM_REF,
+                                           TREE_TYPE
+                                           (gimple_call_lhs (stmt)),
+                                           args[0], build_int_cst
+                                           (TREE_TYPE (args[0]), 0)));

you are using TBAA info based on the formal argument type that might
have pointer conversions stripped.  Instead you should use a type
based on the specification of the intrinsics (or the builtins).

Likewise for the type of the access (mind alignment info there!).

Richard.
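
A minimal sketch of the direction suggested above, assuming a
hypothetical helper intrinsic_elt_type that returns the element type the
intrinsic is specified to access (the real fix would derive this from
the builtin's signature rather than from the argument's possibly
stripped pointer type):

  /* Sketch only: take TBAA and alignment from the intrinsic's
     specified element type, not from TREE_TYPE (args[0]).  */
  tree elt_type = intrinsic_elt_type (fcode);  /* hypothetical */
  tree access_type
    = build_aligned_type (TREE_TYPE (gimple_call_lhs (stmt)),
                          TYPE_ALIGN (elt_type));
  new_stmt
    = gimple_build_assign (gimple_call_lhs (stmt),
                           fold_build2 (MEM_REF, access_type, args[0],
                                        build_int_cst
                                          (build_pointer_type (elt_type),
                                           0)));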


Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index eef9fc0f4440d7db359e53a7b4e21e48cf2a65f4..027491414da16b66a7fe922a1b979d97f553b724 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -2382,6 +2382,31 @@  aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt)
 					       1, args[0]);
 	gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
 	break;
+      /* Lower store and load neon builtins to gimple.  */
+      BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
+	if (!BYTES_BIG_ENDIAN)
+	  {
+	    new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
+					    fold_build2 (MEM_REF,
+					    TREE_TYPE
+					    (gimple_call_lhs (stmt)),
+					    args[0], build_int_cst
+					    (TREE_TYPE (args[0]), 0)));
+	  }
+	break;
+      BUILTIN_VALL_F16 (STORE1, st1, 0, STORE)
+	if (!BYTES_BIG_ENDIAN)
+	  {
+	  new_stmt = gimple_build_assign (fold_build2 (MEM_REF,
+						   TREE_TYPE (gimple_call_arg
+							     (stmt, 1)),
+						   gimple_call_arg (stmt, 0),
+						   build_int_cst
+						   (TREE_TYPE (gimple_call_arg
+							      (stmt, 0)), 0)),
+				       gimple_call_arg (stmt, 1));
+	  }
+	break;
       BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10, ALL)
       BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10, ALL)
 	new_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
index 59ad41ed0471b17418c395f31fbe666b60ec3623..bef31c45650dcd088b38a755083e6bd9fe530c52 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
@@ -11,6 +11,7 @@  extern void abort (void);
 
 #define TEST_VMLA(q1, q2, size, in1_lanes, in2_lanes)			\
 static void								\
+__attribute__((noipa,noinline))						\
 test_vfma##q1##_lane##q2##_f##size (float##size##_t * res,		\
 				   const float##size##_t *in1,		\
 				   const float##size##_t *in2)		\
@@ -104,12 +105,12 @@  main (int argc, char **argv)
    vfmaq_laneq_f32.  */
 /* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 2 } } */
 
-/* vfma_lane_f64.  */
-/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 1 } } */
+/* vfma_lane_f64.
+   vfma_laneq_f64.  */
+/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 2 } } */
 
 /* vfmaq_lane_f64.
-   vfma_laneq_f64.
    vfmaq_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 3 } } */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 2 } } */
 
 
diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
index 2d5a3d305360a08a9663cfd497cb1a5374b4b327..865def28c3f4d04042ab495d232bb865cabb2b50 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
@@ -11,6 +11,7 @@  extern void abort (void);
 
 #define TEST_VMLS(q1, q2, size, in1_lanes, in2_lanes)			\
 static void								\
+__attribute__((noipa,noinline))						\
 test_vfms##q1##_lane##q2##_f##size (float##size##_t * res,		\
 				   const float##size##_t *in1,		\
 				   const float##size##_t *in2)		\
@@ -105,12 +106,12 @@  main (int argc, char **argv)
    vfmsq_laneq_f32.  */
 /* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 2 } } */
 
-/* vfms_lane_f64.  */
-/* { dg-final { scan-assembler-times "fmsub\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 1 } } */
+/* vfms_lane_f64.
+   vfms_laneq_f64.  */
+/* { dg-final { scan-assembler-times "fmsub\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 2 } } */
 
 /* vfmsq_lane_f64.
-   vfms_laneq_f64.
    vfmsq_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 3 } } */
+/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 2 } } */
 
 
diff --git a/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c
index 8b0880d89b13596dea7db79c14cb7d124cf7079c..63dc56c70a2572c6a8789c5a75713a7952ab9746 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c
@@ -9,6 +9,7 @@  extern double fabs (double);
 
 #define TEST_VMUL(q1, q2, size, in1_lanes, in2_lanes)			\
 static void								\
+__attribute__((noipa,noinline))						\
 test_vmul##q1##_lane##q2##_f##size (float##size##_t * res,		\
 				   const float##size##_t *in1,		\
 				   const float##size##_t *in2)		\
diff --git a/gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c
index 46b3c78c131ea92eae208d399ef25c71cd8446f7..885bfb39b797e6d095aaecafa0271094c34fbea5 100644
--- a/gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c
@@ -11,6 +11,7 @@  extern void abort (void);
 
 #define TEST_VMLA(q, su, size, in1_lanes, in2_lanes)		\
 static void							\
+__attribute__((noipa,noinline))					\
 test_vmlaq_lane##q##_##su##size (MAP##su (size, ) * res,	\
 				 const MAP##su(size, ) *in1,	\
 				 const MAP##su(size, ) *in2)	\
diff --git a/gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c
index e01a4f6d0e1e83cac042a2cad4f02664b87e8c05..df046ce32c032bce70559a842d52001264ecbcbc 100644
--- a/gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c
@@ -11,6 +11,7 @@  extern void abort (void);
 
 #define TEST_VMLS(q, su, size, in1_lanes, in2_lanes)		\
 static void							\
+__attribute__((noipa,noinline))					\
 test_vmlsq_lane##q##_##su##size (MAP##su (size, ) * res,	\
 				 const MAP##su(size, ) *in1,	\
 				 const MAP##su(size, ) *in2)	\
diff --git a/gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c
index 00ef4f2de6c5510638b7e31990c0754f60d3e4d0..517b937f3e1b612d5a9c3c2f68a529a631d848e0 100644
--- a/gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c
@@ -11,6 +11,7 @@  extern void abort (void);
 
 #define TEST_VMUL(q, su, size, in1_lanes, in2_lanes)		\
 static void							\
+__attribute__((noipa,noinline))					\
 test_vmulq_lane##q##_##su##size (MAP##su (size, ) * res,	\
 				 const MAP##su(size, ) *in1,	\
 				 const MAP##su(size, ) *in2)	\
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
index a1faefd88bacabadf45bf5a22ca5481db13c41cb..ffa391aeae1fa0b52ef4ad7ae040a8bc40e160d2 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
@@ -146,12 +146,14 @@  check_v2sf (float32_t elemA, float32_t elemB)
 
   vst1_f32 (vec32x2_res, vmul_n_f32 (vec32x2_src, elemA));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 2; indx++)
     if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_1[indx])
       abort ();
 
   vst1_f32 (vec32x2_res, vmul_n_f32 (vec32x2_src, elemB));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 2; indx++)
     if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_2[indx])
       abort ();
@@ -169,24 +171,28 @@  check_v4sf (float32_t elemA, float32_t elemB, float32_t elemC, float32_t elemD)
 
   vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemA));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_1[indx])
       abort ();
 
   vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemB));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_2[indx])
       abort ();
 
   vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemC));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_3[indx])
       abort ();
 
   vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemD));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_4[indx])
       abort ();
@@ -204,12 +210,14 @@  check_v2df (float64_t elemdC, float64_t elemdD)
 
   vst1q_f64 (vec64x2_res, vmulq_n_f64 (vec64x2_src, elemdC));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 2; indx++)
     if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_1[indx])
       abort ();
 
   vst1q_f64 (vec64x2_res, vmulq_n_f64 (vec64x2_src, elemdD));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 2; indx++)
     if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_2[indx])
       abort ();
@@ -227,12 +235,14 @@  check_v2si (int32_t elemsA, int32_t elemsB)
 
   vst1_s32 (vecs32x2_res, vmul_n_s32 (vecs32x2_src, elemsA));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 2; indx++)
     if (vecs32x2_res[indx] != expecteds2_1[indx])
       abort ();
 
   vst1_s32 (vecs32x2_res, vmul_n_s32 (vecs32x2_src, elemsB));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 2; indx++)
     if (vecs32x2_res[indx] != expecteds2_2[indx])
       abort ();
@@ -248,12 +258,14 @@  check_v2si_unsigned (uint32_t elemusA, uint32_t elemusB)
 
   vst1_u32 (vecus32x2_res, vmul_n_u32 (vecus32x2_src, elemusA));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 2; indx++)
     if (vecus32x2_res[indx] != expectedus2_1[indx])
       abort ();
 
   vst1_u32 (vecus32x2_res, vmul_n_u32 (vecus32x2_src, elemusB));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 2; indx++)
     if (vecus32x2_res[indx] != expectedus2_2[indx])
       abort ();
@@ -271,24 +283,28 @@  check_v4si (int32_t elemsA, int32_t elemsB, int32_t elemsC, int32_t elemsD)
 
   vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsA));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vecs32x4_res[indx] != expecteds4_1[indx])
       abort ();
 
   vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsB));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vecs32x4_res[indx] != expecteds4_2[indx])
       abort ();
 
   vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsC));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vecs32x4_res[indx] != expecteds4_3[indx])
       abort ();
 
   vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsD));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vecs32x4_res[indx] != expecteds4_4[indx])
       abort ();
@@ -305,24 +321,28 @@  check_v4si_unsigned (uint32_t elemusA, uint32_t elemusB, uint32_t elemusC,
 
   vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusA));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vecus32x4_res[indx] != expectedus4_1[indx])
       abort ();
 
   vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusB));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vecus32x4_res[indx] != expectedus4_2[indx])
       abort ();
 
   vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusC));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vecus32x4_res[indx] != expectedus4_3[indx])
       abort ();
 
   vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusD));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vecus32x4_res[indx] != expectedus4_4[indx])
       abort ();
@@ -341,24 +361,28 @@  check_v4hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD)
 
   vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhA));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vech16x4_res[indx] != expectedh4_1[indx])
       abort ();
 
   vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhB));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vech16x4_res[indx] != expectedh4_2[indx])
       abort ();
 
   vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhC));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vech16x4_res[indx] != expectedh4_3[indx])
       abort ();
 
   vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhD));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vech16x4_res[indx] != expectedh4_4[indx])
       abort ();
@@ -375,24 +399,28 @@  check_v4hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC,
 
   vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhA));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vecuh16x4_res[indx] != expecteduh4_1[indx])
       abort ();
 
   vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhB));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vecuh16x4_res[indx] != expecteduh4_2[indx])
       abort ();
 
   vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhC));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vecuh16x4_res[indx] != expecteduh4_3[indx])
       abort ();
 
   vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhD));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 4; indx++)
     if (vecuh16x4_res[indx] != expecteduh4_4[indx])
       abort ();
@@ -411,48 +439,56 @@  check_v8hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD,
 
   vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhA));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_1[indx])
       abort ();
 
   vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhB));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_2[indx])
       abort ();
 
   vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhC));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_3[indx])
       abort ();
 
   vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhD));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_4[indx])
       abort ();
 
   vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhE));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_5[indx])
       abort ();
 
   vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhF));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_6[indx])
       abort ();
 
   vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhG));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_7[indx])
       abort ();
 
   vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhH));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_8[indx])
       abort ();
@@ -470,48 +506,56 @@  check_v8hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC,
 
   vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhA));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vecuh16x8_res[indx] != expecteduh8_1[indx])
       abort ();
 
   vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhB));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vecuh16x8_res[indx] != expecteduh8_2[indx])
       abort ();
 
   vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhC));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vecuh16x8_res[indx] != expecteduh8_3[indx])
       abort ();
 
   vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhD));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vecuh16x8_res[indx] != expecteduh8_4[indx])
       abort ();
 
   vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhE));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vecuh16x8_res[indx] != expecteduh8_5[indx])
       abort ();
 
   vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhF));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vecuh16x8_res[indx] != expecteduh8_6[indx])
       abort ();
 
   vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhG));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vecuh16x8_res[indx] != expecteduh8_7[indx])
       abort ();
 
   vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhH));
 
+  asm volatile ("" : : : "memory");
   for (indx = 0; indx < 8; indx++)
     if (vecuh16x8_res[indx] != expecteduh8_8[indx])
       abort ();
diff --git a/gcc/testsuite/gcc.target/aarch64/vclz.c b/gcc/testsuite/gcc.target/aarch64/vclz.c
index a36ee44fc1658886f04dff19b946b933f9668008..ca4d17426e645c0f8bbe3a4cdd962848b4e1cbed 100644
--- a/gcc/testsuite/gcc.target/aarch64/vclz.c
+++ b/gcc/testsuite/gcc.target/aarch64/vclz.c
@@ -66,22 +66,62 @@  extern void abort (void);
 #define CLZ_INST(reg_len, data_len, is_signed) \
   CONCAT1 (vclz, POSTFIX (reg_len, data_len, is_signed))
 
-#define RUN_TEST(test_set, answ_set, reg_len, data_len, is_signed, n)	\
-  INHIB_OPTIMIZATION;							\
-  a = LOAD_INST (reg_len, data_len, is_signed) (test_set);		\
-  b = LOAD_INST (reg_len, data_len, is_signed) (answ_set);	        \
-  a = CLZ_INST (reg_len, data_len, is_signed) (a);			\
-  for (i = 0; i < n; i++)						\
-    if (a [i] != b [i])							\
-      return 1;
+#define BUILD_TEST(type, size, lanes)			    \
+int __attribute__((noipa,noinline))			    \
+run_test##type##size##x##lanes (int##size##_t* test_set,    \
+				int##size##_t* answ_set,    \
+				int reg_len, int data_len,  \
+				int n)			    \
+{							    \
+  int i;						    \
+  INHIB_OPTIMIZATION;					    \
+  int##size##x##lanes##_t a = vld1##type##size (test_set);  \
+  int##size##x##lanes##_t b = vld1##type##size (answ_set);  \
+  a = vclz##type##size (a);				    \
+  for (i = 0; i < n; i++){				    \
+    if (a [i] != b [i])					    \
+      return 1;						    \
+  }							    \
+  return 0;						    \
+}
+
+/* Unsigned inputs.  */
+#define U_BUILD_TEST(type, size, lanes)			    \
+int __attribute__((noipa,noinline))			    \
+run_test##type##size##x##lanes (uint##size##_t* test_set,   \
+				uint##size##_t* answ_set,   \
+				int reg_len, int data_len,  \
+				int n)	                    \
+{							    \
+  int i;						    \
+  INHIB_OPTIMIZATION;					    \
+  uint##size##x##lanes##_t a = vld1##type##size (test_set); \
+  uint##size##x##lanes##_t b = vld1##type##size (answ_set); \
+  a = vclz##type##size (a);				    \
+  for (i = 0; i < n; i++){				    \
+    if (a [i] != b [i])					    \
+      return 1;						    \
+  }							    \
+  return 0;						    \
+}
+
+BUILD_TEST (_s, 8, 8)
+BUILD_TEST (_s, 16, 4)
+BUILD_TEST (_s, 32, 2)
+BUILD_TEST (q_s, 8, 16)
+BUILD_TEST (q_s, 16, 8)
+BUILD_TEST (q_s, 32, 4)
+
+U_BUILD_TEST (_u, 8, 8)
+U_BUILD_TEST (_u, 16, 4)
+U_BUILD_TEST (_u, 32, 2)
+U_BUILD_TEST (q_u, 8, 16)
+U_BUILD_TEST (q_u, 16, 8)
+U_BUILD_TEST (q_u, 32, 4)
 
 int __attribute__ ((noinline))
 test_vclz_s8 ()
 {
-  int i;
-  int8x8_t a;
-  int8x8_t b;
-
   int8_t test_set0[8] = {
     TEST0, TEST1, TEST2, TEST3,
     TEST4, TEST5, TEST6, TEST7
@@ -98,22 +138,18 @@  test_vclz_s8 ()
     0, 0, 0, 0,
     0, 0, 0, 0
   };
-  RUN_TEST (test_set0, answ_set0, 64, 8, 1, 8);
-  RUN_TEST (test_set1, answ_set1, 64, 8, 1, 1);
+  int o1 = run_test_s8x8 (test_set0, answ_set0, 64, 8, 8);
+  int o2 = run_test_s8x8 (test_set1, answ_set1, 64, 8, 1);
 
-  return 0;
+  return o1||o2;
 }
 
 /* Double scan-assembler-times to take account of unsigned functions.  */
-/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.8b, v\[0-9\]+\.8b" 4 } } */
+/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.8b, v\[0-9\]+\.8b" 2 } } */
 
 int __attribute__ ((noinline))
 test_vclz_s16 ()
 {
-  int i;
-  int16x4_t a;
-  int16x4_t b;
-
   int16_t test_set0[4] = { TEST0, TEST1, TEST2, TEST3 };
   int16_t test_set1[4] = { TEST4, TEST5, TEST6, TEST7 };
   int16_t test_set2[4] = { TEST8, TEST9, TEST10, TEST11 };
@@ -126,25 +162,21 @@  test_vclz_s16 ()
   int16_t answ_set3[4] = { 4, 3, 2, 1 };
   int16_t answ_set4[4] = { 0, 0, 0, 0 };
 
-  RUN_TEST (test_set0, answ_set0, 64, 16, 1, 4);
-  RUN_TEST (test_set1, answ_set1, 64, 16, 1, 4);
-  RUN_TEST (test_set2, answ_set2, 64, 16, 1, 4);
-  RUN_TEST (test_set3, answ_set3, 64, 16, 1, 4);
-  RUN_TEST (test_set4, answ_set4, 64, 16, 1, 1);
+  int o1 = run_test_s16x4 (test_set0, answ_set0, 64, 16, 4);
+  int o2 = run_test_s16x4 (test_set1, answ_set1, 64, 16, 4);
+  int o3 = run_test_s16x4 (test_set2, answ_set2, 64, 16, 4);
+  int o4 = run_test_s16x4 (test_set3, answ_set3, 64, 16, 4);
+  int o5 = run_test_s16x4 (test_set4, answ_set4, 64, 16, 1);
 
-  return 0;
+  return o1||o2||o3||o4||o5;
 }
 
 /* Double scan-assembler-times to take account of unsigned functions.  */
-/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.4h, v\[0-9\]+\.4h" 10} } */
+/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.4h, v\[0-9\]+\.4h" 2} } */
 
 int __attribute__ ((noinline))
 test_vclz_s32 ()
 {
-  int i;
-  int32x2_t a;
-  int32x2_t b;
-
   int32_t test_set0[2] = { TEST0, TEST1 };
   int32_t test_set1[2] = { TEST2, TEST3 };
   int32_t test_set2[2] = { TEST4, TEST5 };
@@ -181,37 +213,34 @@  test_vclz_s32 ()
   int32_t answ_set15[2] = { 2, 1 };
   int32_t answ_set16[2] = { 0, 0 };
 
-  RUN_TEST (test_set0, answ_set0, 64, 32, 1, 2);
-  RUN_TEST (test_set1, answ_set1, 64, 32, 1, 2);
-  RUN_TEST (test_set2, answ_set2, 64, 32, 1, 2);
-  RUN_TEST (test_set3, answ_set3, 64, 32, 1, 2);
-  RUN_TEST (test_set4, answ_set4, 64, 32, 1, 2);
-  RUN_TEST (test_set5, answ_set5, 64, 32, 1, 2);
-  RUN_TEST (test_set6, answ_set6, 64, 32, 1, 2);
-  RUN_TEST (test_set7, answ_set7, 64, 32, 1, 2);
-  RUN_TEST (test_set8, answ_set8, 64, 32, 1, 2);
-  RUN_TEST (test_set9, answ_set9, 64, 32, 1, 2);
-  RUN_TEST (test_set10, answ_set10, 64, 32, 1, 2);
-  RUN_TEST (test_set11, answ_set11, 64, 32, 1, 2);
-  RUN_TEST (test_set12, answ_set12, 64, 32, 1, 2);
-  RUN_TEST (test_set13, answ_set13, 64, 32, 1, 2);
-  RUN_TEST (test_set14, answ_set14, 64, 32, 1, 2);
-  RUN_TEST (test_set15, answ_set15, 64, 32, 1, 2);
-  RUN_TEST (test_set16, answ_set16, 64, 32, 1, 1);
-
-  return 0;
+  int o1 = run_test_s32x2 (test_set0, answ_set0, 64, 32, 2);
+  int o2 = run_test_s32x2 (test_set1, answ_set1, 64, 32, 2);
+  int o3 = run_test_s32x2 (test_set2, answ_set2, 64, 32, 2);
+  int o4 = run_test_s32x2 (test_set3, answ_set3, 64, 32, 2);
+  int o5 = run_test_s32x2 (test_set4, answ_set4, 64, 32, 2);
+  int o6 = run_test_s32x2 (test_set5, answ_set5, 64, 32, 2);
+  int o7 = run_test_s32x2 (test_set6, answ_set6, 64, 32, 2);
+  int o8 = run_test_s32x2 (test_set7, answ_set7, 64, 32, 2);
+  int o9 = run_test_s32x2 (test_set8, answ_set8, 64, 32, 2);
+  int o10 = run_test_s32x2 (test_set9, answ_set9, 64, 32, 2);
+  int o11 = run_test_s32x2 (test_set10, answ_set10, 64, 32, 2);
+  int o12 = run_test_s32x2 (test_set11, answ_set11, 64, 32, 2);
+  int o13 = run_test_s32x2 (test_set12, answ_set12, 64, 32, 2);
+  int o14 = run_test_s32x2 (test_set13, answ_set13, 64, 32, 2);
+  int o15 = run_test_s32x2 (test_set14, answ_set14, 64, 32, 2);
+  int o16 = run_test_s32x2 (test_set15, answ_set15, 64, 32, 2);
+  int o17 = run_test_s32x2 (test_set16, answ_set16, 64, 32, 1);
+
+  return o1||o2||o3||o4||o5||o6||o7||o8||o9||o10||o11||o12||o13||o14
+    ||o15||o16||o17;
 }
 
 /* Double scan-assembler-times to take account of unsigned functions.  */
-/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" 34 } } */
+/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s"  2 } } */
 
 int __attribute__ ((noinline))
 test_vclzq_s8 ()
 {
-  int i;
-  int8x16_t a;
-  int8x16_t b;
-
   int8_t test_set0[16] = {
     TEST0, TEST1, TEST2, TEST3, TEST4, TEST5, TEST6, TEST7,
     TEST8, TEST8, TEST8, TEST8, TEST8, TEST8, TEST8, TEST8
@@ -219,8 +248,8 @@  test_vclzq_s8 ()
   int8_t answ_set0[16] = {
     8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0
   };
-  RUN_TEST (test_set0, answ_set0, 128, 8, 1, 9);
-  return 0;
+  int o1 = run_testq_s8x16 (test_set0, answ_set0, 128, 8, 9);
+  return o1;
 }
 
 /* Double scan-assembler-times to take account of unsigned functions.  */
@@ -229,10 +258,6 @@  test_vclzq_s8 ()
 int __attribute__ ((noinline))
 test_vclzq_s16 ()
 {
-  int i;
-  int16x8_t a;
-  int16x8_t b;
-
   int16_t test_set0[8] = {
     TEST0, TEST1, TEST2, TEST3, TEST4, TEST5, TEST6, TEST7
   };
@@ -252,23 +277,19 @@  test_vclzq_s16 ()
   int16_t answ_set2[8] = {
     0, 0, 0, 0, 0, 0, 0, 0
   };
-  RUN_TEST (test_set0, answ_set0, 128, 16, 1, 8);
-  RUN_TEST (test_set1, answ_set1, 128, 16, 1, 8);
-  RUN_TEST (test_set2, answ_set2, 128, 16, 1, 1);
+  int o1 = run_testq_s16x8 (test_set0, answ_set0, 128, 16, 8);
+  int o2 = run_testq_s16x8 (test_set1, answ_set1, 128, 16, 8);
+  int o3 = run_testq_s16x8 (test_set2, answ_set2, 128, 16, 1);
 
-  return 0;
+  return o1||o2||o3;
 }
 
 /* Double scan-assembler-times to take account of unsigned functions.  */
-/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h" 6 } } */
+/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h" 2 } } */
 
 int __attribute__ ((noinline))
 test_vclzq_s32 ()
 {
-  int i;
-  int32x4_t a;
-  int32x4_t b;
-
   int32_t test_set0[4] = { TEST0, TEST1, TEST2, TEST3 };
   int32_t test_set1[4] = { TEST4, TEST5, TEST6, TEST7 };
   int32_t test_set2[4] = { TEST8, TEST9, TEST10, TEST11 };
@@ -289,27 +310,23 @@  test_vclzq_s32 ()
   int32_t answ_set7[4] = { 4, 3, 2, 1 };
   int32_t answ_set8[4] = { 0, 0, 0, 0 };
 
-  RUN_TEST (test_set0, answ_set0, 128, 32, 1, 4);
-  RUN_TEST (test_set1, answ_set1, 128, 32, 1, 4);
-  RUN_TEST (test_set2, answ_set2, 128, 32, 1, 4);
-  RUN_TEST (test_set3, answ_set3, 128, 32, 1, 4);
-  RUN_TEST (test_set4, answ_set4, 128, 32, 1, 1);
+  int o1 = run_testq_s32x4 (test_set0, answ_set0, 128, 32, 4);
+  int o2 = run_testq_s32x4 (test_set1, answ_set1, 128, 32, 4);
+  int o3 = run_testq_s32x4 (test_set2, answ_set2, 128, 32, 4);
+  int o4 = run_testq_s32x4 (test_set3, answ_set3, 128, 32, 4);
+  int o5 = run_testq_s32x4 (test_set4, answ_set4, 128, 32, 1);
 
-  return 0;
+  return o1||o2||o3||o4||o5;
 }
 
 /* Double scan-assembler-times to take account of unsigned functions.  */
-/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" 10 } } */
+/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" 2 } } */
 
 /* Unsigned versions.  */
 
 int __attribute__ ((noinline))
 test_vclz_u8 ()
 {
-  int i;
-  uint8x8_t a;
-  uint8x8_t b;
-
   uint8_t test_set0[8] = {
     TEST0, TEST1, TEST2, TEST3, TEST4, TEST5, TEST6, TEST7
   };
@@ -323,10 +340,10 @@  test_vclz_u8 ()
     0, 0, 0, 0, 0, 0, 0, 0
   };
 
-  RUN_TEST (test_set0, answ_set0, 64, 8, 0, 8);
-  RUN_TEST (test_set1, answ_set1, 64, 8, 0, 1);
+  int o1 = run_test_u8x8 (test_set0, answ_set0, 64, 8, 8);
+  int o2 = run_test_u8x8 (test_set1, answ_set1, 64, 8, 1);
 
-  return 0;
+  return o1||o2;
 }
 
 /* ASM scan near test for signed version.  */
@@ -334,10 +351,6 @@  test_vclz_u8 ()
 int __attribute__ ((noinline))
 test_vclz_u16 ()
 {
-  int i;
-  uint16x4_t a;
-  uint16x4_t b;
-
   uint16_t test_set0[4] = { TEST0, TEST1, TEST2, TEST3 };
   uint16_t test_set1[4] = { TEST4, TEST5, TEST6, TEST7 };
   uint16_t test_set2[4] = { TEST8, TEST9, TEST10, TEST11 };
@@ -350,13 +363,13 @@  test_vclz_u16 ()
   uint16_t answ_set3[4] = { 4, 3, 2, 1 };
   uint16_t answ_set4[4] = { 0, 0, 0, 0 };
 
-  RUN_TEST (test_set0, answ_set0, 64, 16, 0, 4);
-  RUN_TEST (test_set1, answ_set1, 64, 16, 0, 4);
-  RUN_TEST (test_set2, answ_set2, 64, 16, 0, 4);
-  RUN_TEST (test_set3, answ_set3, 64, 16, 0, 4);
-  RUN_TEST (test_set4, answ_set4, 64, 16, 0, 1);
+  int o1 = run_test_u16x4 (test_set0, answ_set0, 64, 16, 4);
+  int o2 = run_test_u16x4 (test_set1, answ_set1, 64, 16, 4);
+  int o3 = run_test_u16x4 (test_set2, answ_set2, 64, 16, 4);
+  int o4 = run_test_u16x4 (test_set3, answ_set3, 64, 16, 4);
+  int o5 = run_test_u16x4 (test_set4, answ_set4, 64, 16, 1);
 
-  return 0;
+  return o1||o2||o3||o4||o5;
 }
 
 /* ASM scan near test for signed version.  */
@@ -364,10 +377,6 @@  test_vclz_u16 ()
 int __attribute__ ((noinline))
 test_vclz_u32 ()
 {
-  int i;
-  uint32x2_t a;
-  uint32x2_t b;
-
   uint32_t test_set0[2] = { TEST0, TEST1 };
   uint32_t test_set1[2] = { TEST2, TEST3 };
   uint32_t test_set2[2] = { TEST4, TEST5 };
@@ -404,25 +413,26 @@  test_vclz_u32 ()
   uint32_t answ_set15[2] = { 2, 1 };
   uint32_t answ_set16[2] = { 0, 0 };
 
-  RUN_TEST (test_set0, answ_set0, 64, 32, 0, 2);
-  RUN_TEST (test_set1, answ_set1, 64, 32, 0, 2);
-  RUN_TEST (test_set2, answ_set2, 64, 32, 0, 2);
-  RUN_TEST (test_set3, answ_set3, 64, 32, 0, 2);
-  RUN_TEST (test_set4, answ_set4, 64, 32, 0, 2);
-  RUN_TEST (test_set5, answ_set5, 64, 32, 0, 2);
-  RUN_TEST (test_set6, answ_set6, 64, 32, 0, 2);
-  RUN_TEST (test_set7, answ_set7, 64, 32, 0, 2);
-  RUN_TEST (test_set8, answ_set8, 64, 32, 0, 2);
-  RUN_TEST (test_set9, answ_set9, 64, 32, 0, 2);
-  RUN_TEST (test_set10, answ_set10, 64, 32, 0, 2);
-  RUN_TEST (test_set11, answ_set11, 64, 32, 0, 2);
-  RUN_TEST (test_set12, answ_set12, 64, 32, 0, 2);
-  RUN_TEST (test_set13, answ_set13, 64, 32, 0, 2);
-  RUN_TEST (test_set14, answ_set14, 64, 32, 0, 2);
-  RUN_TEST (test_set15, answ_set15, 64, 32, 0, 2);
-  RUN_TEST (test_set16, answ_set16, 64, 32, 0, 1);
-
-  return 0;
+  int o1 = run_test_u32x2 (test_set0, answ_set0, 64, 32, 2);
+  int o2 = run_test_u32x2 (test_set1, answ_set1, 64, 32, 2);
+  int o3 = run_test_u32x2 (test_set2, answ_set2, 64, 32, 2);
+  int o4 = run_test_u32x2 (test_set3, answ_set3, 64, 32, 2);
+  int o5 = run_test_u32x2 (test_set4, answ_set4, 64, 32, 2);
+  int o6 = run_test_u32x2 (test_set5, answ_set5, 64, 32, 2);
+  int o7 = run_test_u32x2 (test_set6, answ_set6, 64, 32, 2);
+  int o8 = run_test_u32x2 (test_set7, answ_set7, 64, 32, 2);
+  int o9 = run_test_u32x2 (test_set8, answ_set8, 64, 32, 2);
+  int o10 = run_test_u32x2 (test_set9, answ_set9, 64, 32, 2);
+  int o11 = run_test_u32x2 (test_set10, answ_set10, 64, 32, 2);
+  int o12 = run_test_u32x2 (test_set11, answ_set11, 64, 32, 2);
+  int o13 = run_test_u32x2 (test_set12, answ_set12, 64, 32, 2);
+  int o14 = run_test_u32x2 (test_set13, answ_set13, 64, 32, 2);
+  int o15 = run_test_u32x2 (test_set14, answ_set14, 64, 32, 2);
+  int o16 = run_test_u32x2 (test_set15, answ_set15, 64, 32, 2);
+  int o17 = run_test_u32x2 (test_set16, answ_set16, 64, 32, 1);
+
+  return o1||o2||o3||o4||o5||o6||o7||o8||o9||o10||o11||o12||o13||o14
+        ||o15||o16||o17;
 }
 
 /* ASM scan near test for signed version.  */
@@ -441,9 +451,9 @@  test_vclzq_u8 ()
   uint8_t answ_set0[16] = {
     8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0
   };
-  RUN_TEST (test_set0, answ_set0, 128, 8, 0, 9);
+  int o1 = run_testq_u8x16 (test_set0, answ_set0, 128, 8, 9);
 
-  return 0;
+  return o1;
 }
 
 /* ASM scan near test for signed version.  */
@@ -476,11 +486,11 @@  test_vclzq_u16 ()
     0, 0, 0, 0, 0, 0, 0, 0
   };
 
-  RUN_TEST (test_set0, answ_set0, 128, 16, 0, 8);
-  RUN_TEST (test_set1, answ_set1, 128, 16, 0, 8);
-  RUN_TEST (test_set2, answ_set2, 128, 16, 0, 1);
+  int o1 = run_testq_u16x8 (test_set0, answ_set0, 128, 16, 8);
+  int o2 = run_testq_u16x8 (test_set1, answ_set1, 128, 16, 8);
+  int o3 = run_testq_u16x8 (test_set2, answ_set2, 128, 16, 1);
 
-  return 0;
+  return o1||o2||o3;
 }
 
 /* ASM scan near test for signed version.  */
@@ -488,10 +498,6 @@  test_vclzq_u16 ()
 int __attribute__ ((noinline))
 test_vclzq_u32 ()
 {
-  int i;
-  uint32x4_t a;
-  uint32x4_t b;
-
   uint32_t test_set0[4] = { TEST0, TEST1, TEST2, TEST3 };
   uint32_t test_set1[4] = { TEST4, TEST5, TEST6, TEST7 };
   uint32_t test_set2[4] = { TEST8, TEST9, TEST10, TEST11 };
@@ -512,13 +518,13 @@  test_vclzq_u32 ()
   uint32_t answ_set7[4] = { 4, 3, 2, 1 };
   uint32_t answ_set8[4] = { 0, 0, 0, 0 };
 
-  RUN_TEST (test_set0, answ_set0, 128, 32, 0, 4);
-  RUN_TEST (test_set1, answ_set1, 128, 32, 0, 4);
-  RUN_TEST (test_set2, answ_set2, 128, 32, 0, 4);
-  RUN_TEST (test_set3, answ_set3, 128, 32, 0, 4);
-  RUN_TEST (test_set4, answ_set4, 128, 32, 0, 1);
+  int o1 = run_testq_u32x4 (test_set0, answ_set0, 128, 32, 4);
+  int o2 = run_testq_u32x4 (test_set1, answ_set1, 128, 32, 4);
+  int o3 = run_testq_u32x4 (test_set2, answ_set2, 128, 32, 4);
+  int o4 = run_testq_u32x4 (test_set3, answ_set3, 128, 32, 4);
+  int o5 = run_testq_u32x4 (test_set4, answ_set4, 128, 32, 1);
 
-  return 0;
+  return o1||o2||o3||o4||o5;
 }
 
 /* ASM scan near test for signed version.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/vneg_s.c b/gcc/testsuite/gcc.target/aarch64/vneg_s.c
index 6947526abdd4f49cf560661531e96feb9b934eb5..8ddc4d21c1f89d6c66624a33ee0386cb3a28c512 100644
--- a/gcc/testsuite/gcc.target/aarch64/vneg_s.c
+++ b/gcc/testsuite/gcc.target/aarch64/vneg_s.c
@@ -31,49 +31,24 @@ 
 
 extern void abort (void);
 
-#define CONCAT(a, b) a##b
-#define CONCAT1(a, b) CONCAT (a, b)
-#define REG_INFEX64 _
-#define REG_INFEX128 q_
-#define REG_INFEX(reg_len) REG_INFEX##reg_len
-#define POSTFIX(reg_len, data_len) \
-  CONCAT1 (REG_INFEX (reg_len), s##data_len)
-#define DATA_TYPE_32 float
-#define DATA_TYPE_64 double
-#define DATA_TYPE(data_len) DATA_TYPE_##data_len
-
-#define FORCE_SIMD_INST64_8(data)
-#define FORCE_SIMD_INST64_16(data)
-#define FORCE_SIMD_INST64_32(data)
-#define FORCE_SIMD_INST64_64(data) force_simd (data)
-#define FORCE_SIMD_INST128_8(data)
-#define FORCE_SIMD_INST128_16(data)
-#define FORCE_SIMD_INST128_32(data)
-#define FORCE_SIMD_INST128_64(data)
-
-#define FORCE_SIMD_INST(reg_len, data_len, data) \
-  CONCAT1 (FORCE_SIMD_INST, reg_len##_##data_len) (data)
-#define LOAD_INST(reg_len, data_len) \
-  CONCAT1 (vld1, POSTFIX (reg_len, data_len))
-#define NEG_INST(reg_len, data_len) \
-  CONCAT1 (vneg, POSTFIX (reg_len, data_len))
-
-#define RUN_TEST(test_set, answ_set, reg_len, data_len, n, a, b)	\
-  {									\
-    int i;								\
-    INHIB_OPTIMIZATION;							\
-    (a) = LOAD_INST (reg_len, data_len) (test_set);			\
-    (b) = LOAD_INST (reg_len, data_len) (answ_set);			\
-    FORCE_SIMD_INST (reg_len, data_len, a)				\
-    a = NEG_INST (reg_len, data_len) (a);				\
-    FORCE_SIMD_INST (reg_len, data_len, a)				\
-    for (i = 0; i < n; i++)						\
-      {									\
-        INHIB_OPTIMIZATION;						\
-	if (a[i] != b[i])						\
-	  return 1;							\
-      }									\
-  }
+#define BUILD_TEST(type, size, lanes)			   \
+int __attribute__((noipa,noinline))			   \
+run_test##type##size##x##lanes (int##size##_t* test_set,   \
+		       int##size##_t* answ_set,		   \
+		       int reg_len, int data_len, int n)   \
+{							   \
+  int i;						   \
+  int##size##x##lanes##_t a = vld1##type##size (test_set); \
+  int##size##x##lanes##_t b = vld1##type##size (answ_set); \
+  a = vneg##type##size (a);				   \
+  for (i = 0; i < n; i++)				   \
+  {				    \
+    INHIB_OPTIMIZATION;		    \
+    if (a[i] != b[i])		    \
+    return 1;			    \
+  }				    \
+  return 0;			    \
+}
 
 #define RUN_TEST_SCALAR(test_val, answ_val, a, b)     \
   {                                                   \
@@ -87,12 +62,19 @@  extern void abort (void);
     force_simd (res);                                 \
   }
 
+BUILD_TEST (_s, 8, 8)
+BUILD_TEST (_s, 16, 4)
+BUILD_TEST (_s, 32, 2)
+BUILD_TEST (_s, 64, 1)
+
+BUILD_TEST (q_s, 8, 16)
+BUILD_TEST (q_s, 16, 8)
+BUILD_TEST (q_s, 32, 4)
+BUILD_TEST (q_s, 64, 2)
+
 int __attribute__ ((noinline))
 test_vneg_s8 ()
 {
-  int8x8_t a;
-  int8x8_t b;
-
   int8_t test_set0[8] = {
     TEST0, TEST1, TEST2, TEST3, TEST4, TEST5, SCHAR_MAX, SCHAR_MIN
   };
@@ -100,9 +82,9 @@  test_vneg_s8 ()
     ANSW0, ANSW1, ANSW2, ANSW3, ANSW4, ANSW5, SCHAR_MIN + 1, SCHAR_MIN
   };
 
-  RUN_TEST (test_set0, answ_set0, 64, 8, 8, a, b);
+  int o1 = run_test_s8x8 (test_set0, answ_set0, 64, 8, 8);
 
-  return 0;
+  return o1;
 }
 
 /* { dg-final { scan-assembler-times "neg\\tv\[0-9\]+\.8b, v\[0-9\]+\.8b" 1 } } */
@@ -110,29 +92,23 @@  test_vneg_s8 ()
 int __attribute__ ((noinline))
 test_vneg_s16 ()
 {
-  int16x4_t a;
-  int16x4_t b;
-
   int16_t test_set0[4] = { TEST0, TEST1, TEST2, TEST3 };
   int16_t test_set1[4] = { TEST4, TEST5, SHRT_MAX, SHRT_MIN };
 
   int16_t answ_set0[4] = { ANSW0, ANSW1, ANSW2, ANSW3 };
   int16_t answ_set1[4] = { ANSW4, ANSW5, SHRT_MIN + 1, SHRT_MIN };
 
-  RUN_TEST (test_set0, answ_set0, 64, 16, 4, a, b);
-  RUN_TEST (test_set1, answ_set1, 64, 16, 4, a, b);
+  int o1 = run_test_s16x4 (test_set0, answ_set0, 64, 16, 4);
+  int o2 = run_test_s16x4 (test_set1, answ_set1, 64, 16, 4);
 
-  return 0;
+  return o1||o2;
 }
 
-/* { dg-final { scan-assembler-times "neg\\tv\[0-9\]+\.4h, v\[0-9\]+\.4h" 2 } } */
+/* { dg-final { scan-assembler-times "neg\\tv\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
 
 int __attribute__ ((noinline))
 test_vneg_s32 ()
 {
-  int32x2_t a;
-  int32x2_t b;
-
   int32_t test_set0[2] = { TEST0, TEST1 };
   int32_t test_set1[2] = { TEST2, TEST3 };
   int32_t test_set2[2] = { TEST4, TEST5 };
@@ -143,22 +119,19 @@  test_vneg_s32 ()
   int32_t answ_set2[2] = { ANSW4, ANSW5 };
   int32_t answ_set3[2] = { INT_MIN + 1, INT_MIN };
 
-  RUN_TEST (test_set0, answ_set0, 64, 32, 2, a, b);
-  RUN_TEST (test_set1, answ_set1, 64, 32, 2, a, b);
-  RUN_TEST (test_set2, answ_set2, 64, 32, 2, a, b);
-  RUN_TEST (test_set3, answ_set3, 64, 32, 2, a, b);
+  int o1 = run_test_s32x2 (test_set0, answ_set0, 64, 32, 2);
+  int o2 = run_test_s32x2 (test_set1, answ_set1, 64, 32, 2);
+  int o3 = run_test_s32x2 (test_set2, answ_set2, 64, 32, 2);
+  int o4 = run_test_s32x2 (test_set3, answ_set3, 64, 32, 2);
 
-  return 0;
+  return o1||o2||o3||o4;
 }
 
-/* { dg-final { scan-assembler-times "neg\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" 4 } } */
+/* { dg-final { scan-assembler-times "neg\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" 1 } } */
 
 int __attribute__ ((noinline))
 test_vneg_s64 ()
 {
-  int64x1_t a;
-  int64x1_t b;
-
   int64_t test_set0[1] = { TEST0 };
   int64_t test_set1[1] = { TEST1 };
   int64_t test_set2[1] = { TEST2 };
@@ -177,16 +150,16 @@  test_vneg_s64 ()
   int64_t answ_set6[1] = { LLONG_MIN + 1 };
   int64_t answ_set7[1] = { LLONG_MIN };
 
-  RUN_TEST (test_set0, answ_set0, 64, 64, 1, a, b);
-  RUN_TEST (test_set1, answ_set1, 64, 64, 1, a, b);
-  RUN_TEST (test_set2, answ_set2, 64, 64, 1, a, b);
-  RUN_TEST (test_set3, answ_set3, 64, 64, 1, a, b);
-  RUN_TEST (test_set4, answ_set4, 64, 64, 1, a, b);
-  RUN_TEST (test_set5, answ_set5, 64, 64, 1, a, b);
-  RUN_TEST (test_set6, answ_set6, 64, 64, 1, a, b);
-  RUN_TEST (test_set7, answ_set7, 64, 64, 1, a, b);
+  int o1 = run_test_s64x1 (test_set0, answ_set0, 64, 64, 1);
+  int o2 = run_test_s64x1 (test_set1, answ_set1, 64, 64, 1);
+  int o3 = run_test_s64x1 (test_set2, answ_set2, 64, 64, 1);
+  int o4 = run_test_s64x1 (test_set3, answ_set3, 64, 64, 1);
+  int o5 = run_test_s64x1 (test_set4, answ_set4, 64, 64, 1);
+  int o6 = run_test_s64x1 (test_set5, answ_set5, 64, 64, 1);
+  int o7 = run_test_s64x1 (test_set6, answ_set6, 64, 64, 1);
+  int o8 = run_test_s64x1 (test_set7, answ_set7, 64, 64, 1);
 
-  return 0;
+  return o1||o2||o3||o4||o5||o6||o7||o8;
 }
 
 int __attribute__ ((noinline))
@@ -206,14 +179,11 @@  test_vnegd_s64 ()
   return 0;
 }
 
-/* { dg-final { scan-assembler-times "neg\\td\[0-9\]+, d\[0-9\]+" 16 } } */
+/* { dg-final { scan-assembler-times "neg\\td\[0-9\]+, d\[0-9\]+" 8 } } */
 
 int __attribute__ ((noinline))
 test_vnegq_s8 ()
 {
-  int8x16_t a;
-  int8x16_t b;
-
   int8_t test_set0[16] = {
     TEST0, TEST1, TEST2, TEST3, TEST4, TEST5, SCHAR_MAX, SCHAR_MIN,
     4, 8, 15, 16, 23, 42, -1, -2
@@ -224,9 +194,9 @@  test_vnegq_s8 ()
     -4, -8, -15, -16, -23, -42, 1, 2
   };
 
-  RUN_TEST (test_set0, answ_set0, 128, 8, 8, a, b);
+  int o1 = run_testq_s8x16 (test_set0, answ_set0, 128, 8, 8);
 
-  return 0;
+  return o1;
 }
 
 /* { dg-final { scan-assembler-times "neg\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b" 1 } } */
@@ -234,9 +204,6 @@  test_vnegq_s8 ()
 int __attribute__ ((noinline))
 test_vnegq_s16 ()
 {
-  int16x8_t a;
-  int16x8_t b;
-
   int16_t test_set0[8] = {
     TEST0, TEST1, TEST2, TEST3, TEST4, TEST5, SHRT_MAX, SHRT_MIN
   };
@@ -244,9 +211,9 @@  test_vnegq_s16 ()
     ANSW0, ANSW1, ANSW2, ANSW3, ANSW4, ANSW5, SHRT_MIN + 1, SHRT_MIN
   };
 
-  RUN_TEST (test_set0, answ_set0, 128, 16, 8, a, b);
+  int o1 = run_testq_s16x8 (test_set0, answ_set0, 128, 16, 8);
 
-  return 0;
+  return o1;
 }
 
 /* { dg-final { scan-assembler-times "neg\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h" 1 } } */
@@ -254,29 +221,23 @@  test_vnegq_s16 ()
 int __attribute__ ((noinline))
 test_vnegq_s32 ()
 {
-  int32x4_t a;
-  int32x4_t b;
-
   int32_t test_set0[4] = { TEST0, TEST1, TEST2, TEST3 };
   int32_t test_set1[4] = { TEST4, TEST5, INT_MAX, INT_MIN };
 
   int32_t answ_set0[4] = { ANSW0, ANSW1, ANSW2, ANSW3 };
   int32_t answ_set1[4] = { ANSW4, ANSW5, INT_MIN + 1, INT_MIN };
 
-  RUN_TEST (test_set0, answ_set0, 128, 32, 4, a, b);
-  RUN_TEST (test_set1, answ_set1, 128, 32, 4, a, b);
+  int o1 = run_testq_s32x4 (test_set0, answ_set0, 128, 32, 4);
+  int o2 = run_testq_s32x4 (test_set1, answ_set1, 128, 32, 4);
 
-  return 0;
+  return o1||o2;
 }
 
-/* { dg-final { scan-assembler-times "neg\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" 2 } } */
+/* { dg-final { scan-assembler-times "neg\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" 1 } } */
 
 int __attribute__ ((noinline))
 test_vnegq_s64 ()
 {
-  int64x2_t a;
-  int64x2_t b;
-
   int64_t test_set0[2] = { TEST0, TEST1 };
   int64_t test_set1[2] = { TEST2, TEST3 };
   int64_t test_set2[2] = { TEST4, TEST5 };
@@ -287,15 +248,15 @@  test_vnegq_s64 ()
   int64_t answ_set2[2] = { ANSW4, ANSW5 };
   int64_t answ_set3[2] = { LLONG_MIN + 1, LLONG_MIN };
 
-  RUN_TEST (test_set0, answ_set0, 128, 64, 2, a, b);
-  RUN_TEST (test_set1, answ_set1, 128, 64, 2, a, b);
-  RUN_TEST (test_set2, answ_set2, 128, 64, 2, a, b);
-  RUN_TEST (test_set3, answ_set3, 128, 64, 2, a, b);
+  int o1 = run_testq_s64x2 (test_set0, answ_set0, 128, 64, 2);
+  int o2 = run_testq_s64x2 (test_set1, answ_set1, 128, 64, 2);
+  int o3 = run_testq_s64x2 (test_set2, answ_set2, 128, 64, 2);
+  int o4 = run_testq_s64x2 (test_set3, answ_set3, 128, 64, 2);
 
-  return 0;
+  return o1||o2||o3||o4;
 }
 
-/* { dg-final { scan-assembler-times "neg\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" 4 } } */
+/* { dg-final { scan-assembler-times "neg\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" 1 } } */
 
 int
 main (int argc, char **argv)