[8/8] AArch64: Have reload not choose to do add on the scalar side if both values exist on the SIMD side.

Message ID Y1+41nrRB4ZMXZZA@arm.com
State Dropped
Headers
Series [1/8] middle-end: Recognize scalar reductions from bitfields and array_refs |

Commit Message

Tamar Christina Oct. 31, 2022, noon UTC
  Hi All,

Currently we often times generate an r -> r add even if it means we need two
reloads to perform it, i.e. in the case that the values are on the SIMD side.

The pairwise operations expose these more now and so we get suboptimal codegen.

Normally I would have liked to use ^ or $ here, but while this works for the
simple examples, reload inexplicably falls apart on examples that should have
been trivial. It forces a move to r -> w to use the w ADD, which is counter to
what ^ and $ should do.

However ! seems to fix all the regression and still maintains the good codegen.

I have tried looking into whether it's our costings that are off, but I can't
seem anything logical here.  So I'd like to push this change instead along with
test that augment the other testcases that guard the r -> r variants.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.md (*add<mode>3_aarch64): Add ! to the r -> r
	alternative.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/scalar_addp.c: New test.
	* gcc.target/aarch64/simd/scalar_faddp.c: New test.
	* gcc.target/aarch64/simd/scalar_faddp2.c: New test.
	* gcc.target/aarch64/simd/scalar_fmaxp.c: New test.
	* gcc.target/aarch64/simd/scalar_fminp.c: New test.
	* gcc.target/aarch64/simd/scalar_maxp.c: New test.
	* gcc.target/aarch64/simd/scalar_minp.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 09ae1118371f82ca63146fceb953eb9e820d05a4..c333fb1f72725992bb304c560f1245a242d5192d 100644




--
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 09ae1118371f82ca63146fceb953eb9e820d05a4..c333fb1f72725992bb304c560f1245a242d5192d 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -2043,7 +2043,7 @@ (define_expand "add<mode>3"
 
 (define_insn "*add<mode>3_aarch64"
   [(set
-    (match_operand:GPI 0 "register_operand" "=rk,rk,w,rk,r,r,rk")
+    (match_operand:GPI 0 "register_operand" "=rk,!rk,w,rk,r,r,rk")
     (plus:GPI
      (match_operand:GPI 1 "register_operand" "%rk,rk,w,rk,rk,0,rk")
      (match_operand:GPI 2 "aarch64_pluslong_operand" "I,r,w,J,Uaa,Uai,Uav")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
new file mode 100644
index 0000000000000000000000000000000000000000..5b8d40f19884fc7b4e7decd80758bc36fa76d058
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
@@ -0,0 +1,70 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+typedef unsigned long long v2udi __attribute__((vector_size (16)));
+typedef int v2si __attribute__((vector_size (16)));
+typedef unsigned int v2usi __attribute__((vector_size (16)));
+
+/*
+** foo:
+** 	addp	d0, v0.2d
+** 	fmov	x0, d0
+** 	ret
+*/
+long long
+foo (v2di x)
+{
+  return x[1] + x[0];
+}
+
+/*
+** foo1:
+** 	saddlp	v0.1d, v0.2s
+** 	fmov	x0, d0
+** 	ret
+*/
+long long
+foo1 (v2si x)
+{
+  return x[1] + x[0];
+}
+
+/*
+** foo2:
+** 	uaddlp	v0.1d, v0.2s
+** 	fmov	x0, d0
+** 	ret
+*/
+unsigned long long
+foo2 (v2usi x)
+{
+  return x[1] + x[0];
+}
+
+/*
+** foo3:
+** 	uaddlp	v0.1d, v0.2s
+** 	add	d0, d0, d1
+** 	fmov	x0, d0
+** 	ret
+*/
+unsigned long long
+foo3 (v2usi x, v2udi y)
+{
+  return (x[1] + x[0]) + y[0];
+}
+
+/*
+** foo4:
+** 	saddlp	v0.1d, v0.2s
+** 	add	d0, d0, d1
+** 	fmov	x0, d0
+** 	ret
+*/
+long long
+foo4 (v2si x, v2di y)
+{
+  return (x[1] + x[0]) + y[0];
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
new file mode 100644
index 0000000000000000000000000000000000000000..ff455e060fc833b2f63e89c467b91a76fbe31aff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
@@ -0,0 +1,66 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-additional-options "-save-temps -O1" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef double v2df __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+typedef __fp16 v8hf __attribute__((vector_size (16)));
+
+/*
+** foo:
+** 	faddp	d0, v0.2d
+** 	ret
+*/
+double
+foo (v2df x)
+{
+  return x[1] + x[0];
+}
+
+/*
+** foo1:
+** 	faddp	s0, v0.2s
+** 	ret
+*/
+float
+foo1 (v4sf x)
+{
+  return x[0] + x[1];
+}
+
+/*
+** foo2:
+** 	faddp	h0, v0.2h
+** 	ret
+*/
+__fp16
+foo2 (v8hf x)
+{
+  return x[0] + x[1];
+}
+
+/*
+** foo3:
+** 	ext	v0.16b, v0.16b, v0.16b, #4
+** 	faddp	s0, v0.2s
+** 	ret
+*/
+float
+foo3 (v4sf x)
+{
+  return x[1] + x[2];
+}
+
+/*
+** foo4:
+** 	dup	s0, v0.s\[3\]
+** 	faddp	h0, v0.2h
+** 	ret
+*/
+__fp16
+foo4 (v8hf x)
+{
+  return x[6] + x[7];
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
new file mode 100644
index 0000000000000000000000000000000000000000..04412c3b45c51648e46ff20f730b1213e940391a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
@@ -0,0 +1,14 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -w" } */
+
+typedef __m128i __attribute__((__vector_size__(2 * sizeof(long))));
+double a[];
+*b;
+fn1() {
+  __m128i c;
+  *(__m128i *)a = c;
+  *b = a[0] + a[1];
+}
+
+/* { dg-final { scan-assembler-times {faddp\td0, v0\.2d} 1 } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_fmaxp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fmaxp.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa1d2bf17cd707b74d8f7c574506610ab4fd7299
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fmaxp.c
@@ -0,0 +1,56 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-additional-options "-save-temps -O1" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef double v2df __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+typedef __fp16 v8hf __attribute__((vector_size (16)));
+
+/*
+** foo:
+** 	fmaxnmp	d0, v0.2d
+** 	ret
+*/
+double
+foo (v2df x)
+{
+  return x[0] > x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo1:
+** 	fmaxnmp	s0, v0.2s
+** 	ret
+*/
+float
+foo1 (v4sf x)
+{
+  return x[0] > x[1] ? x[0] : x[1];
+}
+
+/*
+** foo2:
+** 	fmaxnmp	h0, v0.2h
+** 	ret
+*/
+__fp16
+foo2 (v8hf x)
+{
+  return x[0] > x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo3:
+** 	fmaxnmp	s0, v0.2s
+** 	fcvt	d0, s0
+** 	fadd	d0, d0, d1
+** 	ret
+*/
+double
+foo3 (v4sf x, v2df y)
+{
+  return (x[0] > x[1] ? x[0] : x[1]) + y[0];
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_fminp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fminp.c
new file mode 100644
index 0000000000000000000000000000000000000000..6136c5272069c4d86f09951cdff25f1494e839f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fminp.c
@@ -0,0 +1,55 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-additional-options "-save-temps -O1" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef double v2df __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+typedef __fp16 v8hf __attribute__((vector_size (16)));
+
+/*
+** foo:
+** 	fminnmp	d0, v0.2d
+** 	ret
+*/
+double
+foo (v2df x)
+{
+  return x[0] < x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo1:
+** 	fminnmp	s0, v0.2s
+** 	ret
+*/
+float
+foo1 (v4sf x)
+{
+  return x[0] < x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo2:
+** 	fminnmp	h0, v0.2h
+** 	ret
+*/
+__fp16
+foo2 (v8hf x)
+{
+  return x[0] < x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo3:
+** 	fminnmp	s0, v0.2s
+** 	fcvt	d0, s0
+** 	fadd	d0, d0, d1
+** 	ret
+*/
+double
+foo3 (v4sf x, v2df y)
+{
+  return (x[0] < x[1] ? x[0] : x[1]) + y[0];
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_maxp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_maxp.c
new file mode 100644
index 0000000000000000000000000000000000000000..e219a13abc745b83dca58633fd2d812e276d6b2d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_maxp.c
@@ -0,0 +1,74 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+typedef unsigned long long v2udi __attribute__((vector_size (16)));
+typedef int v2si __attribute__((vector_size (16)));
+typedef unsigned int v2usi __attribute__((vector_size (16)));
+
+/*
+** foo:
+** 	umov	x0, v0.d\[1\]
+** 	fmov	x1, d0
+** 	cmp	x0, x1
+** 	csel	x0, x0, x1, ge
+** 	ret
+*/
+long long
+foo (v2di x)
+{
+  return x[0] > x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo1:
+** 	smaxp	v0.2s, v0.2s, v0.2s
+** 	smov	x0, v0.s\[0\]
+** 	ret
+*/
+long long
+foo1 (v2si x)
+{
+  return x[0] > x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo2:
+** 	umaxp	v0.2s, v0.2s, v0.2s
+** 	fmov	w0, s0
+** 	ret
+*/
+unsigned long long
+foo2 (v2usi x)
+{
+  return x[0] > x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo3:
+** 	umaxp	v0.2s, v0.2s, v0.2s
+** 	fmov	w0, s0
+** 	fmov	x1, d1
+** 	add	x0, x1, w0, uxtw
+** 	ret
+*/
+unsigned long long
+foo3 (v2usi x, v2udi y)
+{
+  return (x[0] > x[1] ? x[0] : x[1]) + y[0];
+}
+
+/* 
+** foo4:
+** 	smaxp	v0.2s, v0.2s, v0.2s
+** 	fmov	w0, s0
+** 	fmov	x1, d1
+** 	add	x0, x1, w0, sxtw
+** 	ret
+*/
+long long
+foo4 (v2si x, v2di y)
+{
+  return (x[0] > x[1] ? x[0] : x[1]) + y[0];
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_minp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_minp.c
new file mode 100644
index 0000000000000000000000000000000000000000..2a32fb4ea3edaa4c547a7a481c3ddca6b477430e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_minp.c
@@ -0,0 +1,74 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+typedef unsigned long long v2udi __attribute__((vector_size (16)));
+typedef int v2si __attribute__((vector_size (16)));
+typedef unsigned int v2usi __attribute__((vector_size (16)));
+
+/*
+** foo:
+** 	umov	x0, v0.d\[1\]
+** 	fmov	x1, d0
+** 	cmp	x0, x1
+** 	csel	x0, x0, x1, le
+** 	ret
+*/
+long long
+foo (v2di x)
+{
+  return x[0] < x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo1:
+** 	sminp	v0.2s, v0.2s, v0.2s
+** 	smov	x0, v0.s\[0\]
+** 	ret
+*/
+long long
+foo1 (v2si x)
+{
+  return x[0] < x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo2:
+** 	uminp	v0.2s, v0.2s, v0.2s
+** 	fmov	w0, s0
+** 	ret
+*/
+unsigned long long
+foo2 (v2usi x)
+{
+  return x[0] < x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo3:
+** 	uminp	v0.2s, v0.2s, v0.2s
+** 	fmov	w0, s0
+** 	fmov	x1, d1
+** 	add	x0, x1, w0, uxtw
+** 	ret
+*/
+unsigned long long
+foo3 (v2usi x, v2udi y)
+{
+  return (x[0] < x[1] ? x[0] : x[1]) + y[0];
+}
+
+/* 
+** foo4:
+** 	sminp	v0.2s, v0.2s, v0.2s
+** 	fmov	w0, s0
+** 	fmov	x1, d1
+** 	add	x0, x1, w0, sxtw
+** 	ret
+*/
+long long
+foo4 (v2si x, v2di y)
+{
+  return (x[0] < x[1] ? x[0] : x[1]) + y[0];
+}
  

Comments

Richard Sandiford Nov. 1, 2022, 3:04 p.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> Currently we often times generate an r -> r add even if it means we need two
> reloads to perform it, i.e. in the case that the values are on the SIMD side.
>
> The pairwise operations expose these more now and so we get suboptimal codegen.
>
> Normally I would have liked to use ^ or $ here, but while this works for the
> simple examples, reload inexplicably falls apart on examples that should have
> been trivial. It forces a move to r -> w to use the w ADD, which is counter to
> what ^ and $ should do.
>
> However ! seems to fix all the regression and still maintains the good codegen.
>
> I have tried looking into whether it's our costings that are off, but I can't
> seem anything logical here.  So I'd like to push this change instead along with
> test that augment the other testcases that guard the r -> r variants.

This feels like a hack though.  r<-r+r is one of the simplest thing
the processor can do, so I don't think it makes logical sense to mark
it with !, which means "prohibitively expensive".  It's likely to
push operations that require reloads onto the SIMD side.

Thanks,
Richard

> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (*add<mode>3_aarch64): Add ! to the r -> r
> 	alternative.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/simd/scalar_addp.c: New test.
> 	* gcc.target/aarch64/simd/scalar_faddp.c: New test.
> 	* gcc.target/aarch64/simd/scalar_faddp2.c: New test.
> 	* gcc.target/aarch64/simd/scalar_fmaxp.c: New test.
> 	* gcc.target/aarch64/simd/scalar_fminp.c: New test.
> 	* gcc.target/aarch64/simd/scalar_maxp.c: New test.
> 	* gcc.target/aarch64/simd/scalar_minp.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 09ae1118371f82ca63146fceb953eb9e820d05a4..c333fb1f72725992bb304c560f1245a242d5192d 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -2043,7 +2043,7 @@ (define_expand "add<mode>3"
>  
>  (define_insn "*add<mode>3_aarch64"
>    [(set
> -    (match_operand:GPI 0 "register_operand" "=rk,rk,w,rk,r,r,rk")
> +    (match_operand:GPI 0 "register_operand" "=rk,!rk,w,rk,r,r,rk")
>      (plus:GPI
>       (match_operand:GPI 1 "register_operand" "%rk,rk,w,rk,rk,0,rk")
>       (match_operand:GPI 2 "aarch64_pluslong_operand" "I,r,w,J,Uaa,Uai,Uav")))]
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..5b8d40f19884fc7b4e7decd80758bc36fa76d058
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> @@ -0,0 +1,70 @@
> +/* { dg-do assemble } */
> +/* { dg-additional-options "-save-temps -O1 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +typedef long long v2di __attribute__((vector_size (16)));
> +typedef unsigned long long v2udi __attribute__((vector_size (16)));
> +typedef int v2si __attribute__((vector_size (16)));
> +typedef unsigned int v2usi __attribute__((vector_size (16)));
> +
> +/*
> +** foo:
> +** 	addp	d0, v0.2d
> +** 	fmov	x0, d0
> +** 	ret
> +*/
> +long long
> +foo (v2di x)
> +{
> +  return x[1] + x[0];
> +}
> +
> +/*
> +** foo1:
> +** 	saddlp	v0.1d, v0.2s
> +** 	fmov	x0, d0
> +** 	ret
> +*/
> +long long
> +foo1 (v2si x)
> +{
> +  return x[1] + x[0];
> +}
> +
> +/*
> +** foo2:
> +** 	uaddlp	v0.1d, v0.2s
> +** 	fmov	x0, d0
> +** 	ret
> +*/
> +unsigned long long
> +foo2 (v2usi x)
> +{
> +  return x[1] + x[0];
> +}
> +
> +/*
> +** foo3:
> +** 	uaddlp	v0.1d, v0.2s
> +** 	add	d0, d0, d1
> +** 	fmov	x0, d0
> +** 	ret
> +*/
> +unsigned long long
> +foo3 (v2usi x, v2udi y)
> +{
> +  return (x[1] + x[0]) + y[0];
> +}
> +
> +/*
> +** foo4:
> +** 	saddlp	v0.1d, v0.2s
> +** 	add	d0, d0, d1
> +** 	fmov	x0, d0
> +** 	ret
> +*/
> +long long
> +foo4 (v2si x, v2di y)
> +{
> +  return (x[1] + x[0]) + y[0];
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ff455e060fc833b2f63e89c467b91a76fbe31aff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
> @@ -0,0 +1,66 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
> +/* { dg-add-options arm_v8_2a_fp16_scalar } */
> +/* { dg-additional-options "-save-temps -O1" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +typedef double v2df __attribute__((vector_size (16)));
> +typedef float v4sf __attribute__((vector_size (16)));
> +typedef __fp16 v8hf __attribute__((vector_size (16)));
> +
> +/*
> +** foo:
> +** 	faddp	d0, v0.2d
> +** 	ret
> +*/
> +double
> +foo (v2df x)
> +{
> +  return x[1] + x[0];
> +}
> +
> +/*
> +** foo1:
> +** 	faddp	s0, v0.2s
> +** 	ret
> +*/
> +float
> +foo1 (v4sf x)
> +{
> +  return x[0] + x[1];
> +}
> +
> +/*
> +** foo2:
> +** 	faddp	h0, v0.2h
> +** 	ret
> +*/
> +__fp16
> +foo2 (v8hf x)
> +{
> +  return x[0] + x[1];
> +}
> +
> +/*
> +** foo3:
> +** 	ext	v0.16b, v0.16b, v0.16b, #4
> +** 	faddp	s0, v0.2s
> +** 	ret
> +*/
> +float
> +foo3 (v4sf x)
> +{
> +  return x[1] + x[2];
> +}
> +
> +/*
> +** foo4:
> +** 	dup	s0, v0.s\[3\]
> +** 	faddp	h0, v0.2h
> +** 	ret
> +*/
> +__fp16
> +foo4 (v8hf x)
> +{
> +  return x[6] + x[7];
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..04412c3b45c51648e46ff20f730b1213e940391a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do assemble } */
> +/* { dg-additional-options "-save-temps -O1 -w" } */
> +
> +typedef __m128i __attribute__((__vector_size__(2 * sizeof(long))));
> +double a[];
> +*b;
> +fn1() {
> +  __m128i c;
> +  *(__m128i *)a = c;
> +  *b = a[0] + a[1];
> +}
> +
> +/* { dg-final { scan-assembler-times {faddp\td0, v0\.2d} 1 } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_fmaxp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fmaxp.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..aa1d2bf17cd707b74d8f7c574506610ab4fd7299
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fmaxp.c
> @@ -0,0 +1,56 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
> +/* { dg-add-options arm_v8_2a_fp16_scalar } */
> +/* { dg-additional-options "-save-temps -O1" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +typedef double v2df __attribute__((vector_size (16)));
> +typedef float v4sf __attribute__((vector_size (16)));
> +typedef __fp16 v8hf __attribute__((vector_size (16)));
> +
> +/*
> +** foo:
> +** 	fmaxnmp	d0, v0.2d
> +** 	ret
> +*/
> +double
> +foo (v2df x)
> +{
> +  return x[0] > x[1] ? x[0] : x[1];
> +}
> +
> +/* 
> +** foo1:
> +** 	fmaxnmp	s0, v0.2s
> +** 	ret
> +*/
> +float
> +foo1 (v4sf x)
> +{
> +  return x[0] > x[1] ? x[0] : x[1];
> +}
> +
> +/*
> +** foo2:
> +** 	fmaxnmp	h0, v0.2h
> +** 	ret
> +*/
> +__fp16
> +foo2 (v8hf x)
> +{
> +  return x[0] > x[1] ? x[0] : x[1];
> +}
> +
> +/* 
> +** foo3:
> +** 	fmaxnmp	s0, v0.2s
> +** 	fcvt	d0, s0
> +** 	fadd	d0, d0, d1
> +** 	ret
> +*/
> +double
> +foo3 (v4sf x, v2df y)
> +{
> +  return (x[0] > x[1] ? x[0] : x[1]) + y[0];
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_fminp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fminp.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..6136c5272069c4d86f09951cdff25f1494e839f0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fminp.c
> @@ -0,0 +1,55 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
> +/* { dg-add-options arm_v8_2a_fp16_scalar } */
> +/* { dg-additional-options "-save-temps -O1" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +typedef double v2df __attribute__((vector_size (16)));
> +typedef float v4sf __attribute__((vector_size (16)));
> +typedef __fp16 v8hf __attribute__((vector_size (16)));
> +
> +/*
> +** foo:
> +** 	fminnmp	d0, v0.2d
> +** 	ret
> +*/
> +double
> +foo (v2df x)
> +{
> +  return x[0] < x[1] ? x[0] : x[1];
> +}
> +
> +/* 
> +** foo1:
> +** 	fminnmp	s0, v0.2s
> +** 	ret
> +*/
> +float
> +foo1 (v4sf x)
> +{
> +  return x[0] < x[1] ? x[0] : x[1];
> +}
> +
> +/* 
> +** foo2:
> +** 	fminnmp	h0, v0.2h
> +** 	ret
> +*/
> +__fp16
> +foo2 (v8hf x)
> +{
> +  return x[0] < x[1] ? x[0] : x[1];
> +}
> +
> +/* 
> +** foo3:
> +** 	fminnmp	s0, v0.2s
> +** 	fcvt	d0, s0
> +** 	fadd	d0, d0, d1
> +** 	ret
> +*/
> +double
> +foo3 (v4sf x, v2df y)
> +{
> +  return (x[0] < x[1] ? x[0] : x[1]) + y[0];
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_maxp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_maxp.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e219a13abc745b83dca58633fd2d812e276d6b2d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_maxp.c
> @@ -0,0 +1,74 @@
> +/* { dg-do assemble } */
> +/* { dg-additional-options "-save-temps -O1 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +typedef long long v2di __attribute__((vector_size (16)));
> +typedef unsigned long long v2udi __attribute__((vector_size (16)));
> +typedef int v2si __attribute__((vector_size (16)));
> +typedef unsigned int v2usi __attribute__((vector_size (16)));
> +
> +/*
> +** foo:
> +** 	umov	x0, v0.d\[1\]
> +** 	fmov	x1, d0
> +** 	cmp	x0, x1
> +** 	csel	x0, x0, x1, ge
> +** 	ret
> +*/
> +long long
> +foo (v2di x)
> +{
> +  return x[0] > x[1] ? x[0] : x[1];
> +}
> +
> +/* 
> +** foo1:
> +** 	smaxp	v0.2s, v0.2s, v0.2s
> +** 	smov	x0, v0.s\[0\]
> +** 	ret
> +*/
> +long long
> +foo1 (v2si x)
> +{
> +  return x[0] > x[1] ? x[0] : x[1];
> +}
> +
> +/* 
> +** foo2:
> +** 	umaxp	v0.2s, v0.2s, v0.2s
> +** 	fmov	w0, s0
> +** 	ret
> +*/
> +unsigned long long
> +foo2 (v2usi x)
> +{
> +  return x[0] > x[1] ? x[0] : x[1];
> +}
> +
> +/* 
> +** foo3:
> +** 	umaxp	v0.2s, v0.2s, v0.2s
> +** 	fmov	w0, s0
> +** 	fmov	x1, d1
> +** 	add	x0, x1, w0, uxtw
> +** 	ret
> +*/
> +unsigned long long
> +foo3 (v2usi x, v2udi y)
> +{
> +  return (x[0] > x[1] ? x[0] : x[1]) + y[0];
> +}
> +
> +/* 
> +** foo4:
> +** 	smaxp	v0.2s, v0.2s, v0.2s
> +** 	fmov	w0, s0
> +** 	fmov	x1, d1
> +** 	add	x0, x1, w0, sxtw
> +** 	ret
> +*/
> +long long
> +foo4 (v2si x, v2di y)
> +{
> +  return (x[0] > x[1] ? x[0] : x[1]) + y[0];
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_minp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_minp.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..2a32fb4ea3edaa4c547a7a481c3ddca6b477430e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_minp.c
> @@ -0,0 +1,74 @@
> +/* { dg-do assemble } */
> +/* { dg-additional-options "-save-temps -O1 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +typedef long long v2di __attribute__((vector_size (16)));
> +typedef unsigned long long v2udi __attribute__((vector_size (16)));
> +typedef int v2si __attribute__((vector_size (16)));
> +typedef unsigned int v2usi __attribute__((vector_size (16)));
> +
> +/*
> +** foo:
> +** 	umov	x0, v0.d\[1\]
> +** 	fmov	x1, d0
> +** 	cmp	x0, x1
> +** 	csel	x0, x0, x1, le
> +** 	ret
> +*/
> +long long
> +foo (v2di x)
> +{
> +  return x[0] < x[1] ? x[0] : x[1];
> +}
> +
> +/* 
> +** foo1:
> +** 	sminp	v0.2s, v0.2s, v0.2s
> +** 	smov	x0, v0.s\[0\]
> +** 	ret
> +*/
> +long long
> +foo1 (v2si x)
> +{
> +  return x[0] < x[1] ? x[0] : x[1];
> +}
> +
> +/* 
> +** foo2:
> +** 	uminp	v0.2s, v0.2s, v0.2s
> +** 	fmov	w0, s0
> +** 	ret
> +*/
> +unsigned long long
> +foo2 (v2usi x)
> +{
> +  return x[0] < x[1] ? x[0] : x[1];
> +}
> +
> +/* 
> +** foo3:
> +** 	uminp	v0.2s, v0.2s, v0.2s
> +** 	fmov	w0, s0
> +** 	fmov	x1, d1
> +** 	add	x0, x1, w0, uxtw
> +** 	ret
> +*/
> +unsigned long long
> +foo3 (v2usi x, v2udi y)
> +{
> +  return (x[0] < x[1] ? x[0] : x[1]) + y[0];
> +}
> +
> +/* 
> +** foo4:
> +** 	sminp	v0.2s, v0.2s, v0.2s
> +** 	fmov	w0, s0
> +** 	fmov	x1, d1
> +** 	add	x0, x1, w0, sxtw
> +** 	ret
> +*/
> +long long
> +foo4 (v2si x, v2di y)
> +{
> +  return (x[0] < x[1] ? x[0] : x[1]) + y[0];
> +}
  
Tamar Christina Nov. 1, 2022, 3:20 p.m. UTC | #2
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Tuesday, November 1, 2022 3:05 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH 8/8]AArch64: Have reload not choose to do add on the
> scalar side if both values exist on the SIMD side.
> 
> Tamar Christina <tamar.christina@arm.com> writes:
> > Hi All,
> >
> > Currently we often times generate an r -> r add even if it means we
> > need two reloads to perform it, i.e. in the case that the values are on the
> SIMD side.
> >
> > The pairwise operations expose these more now and so we get suboptimal
> codegen.
> >
> > Normally I would have liked to use ^ or $ here, but while this works
> > for the simple examples, reload inexplicably falls apart on examples
> > that should have been trivial. It forces a move to r -> w to use the w
> > ADD, which is counter to what ^ and $ should do.
> >
> > However ! seems to fix all the regression and still maintains the good
> codegen.
> >
> > I have tried looking into whether it's our costings that are off, but
> > I can't seem anything logical here.  So I'd like to push this change
> > instead along with test that augment the other testcases that guard the r ->
> r variants.
> 
> This feels like a hack though.  r<-r+r is one of the simplest thing the processor
> can do, so I don't think it makes logical sense to mark it with !, which means
> "prohibitively expensive".  It's likely to push operations that require reloads
> onto the SIMD side.

I agree. Though at the moment, reload isn't behaving as it should. It's almost as if
the register transfer costs are not taken into account when deciding on an alternative.

It seems to think that an r->r and w->w are as cheap even when the value has been assigned
to w before.  For instance, some of the testcases below don't work correctly because of this.

I don't think I can influence this costing, and as I mentioned ^ works for the simple example
But then somehow makes w->w cheaper even though the value was assigned to r.

I'm not really sure where to look here, but the current version is also equally broken..
It basically always forces to r.

Thanks,
Tamar

> 
> Thanks,
> Richard
> 
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* config/aarch64/aarch64.md (*add<mode>3_aarch64): Add ! to the
> r -> r
> > 	alternative.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/simd/scalar_addp.c: New test.
> > 	* gcc.target/aarch64/simd/scalar_faddp.c: New test.
> > 	* gcc.target/aarch64/simd/scalar_faddp2.c: New test.
> > 	* gcc.target/aarch64/simd/scalar_fmaxp.c: New test.
> > 	* gcc.target/aarch64/simd/scalar_fminp.c: New test.
> > 	* gcc.target/aarch64/simd/scalar_maxp.c: New test.
> > 	* gcc.target/aarch64/simd/scalar_minp.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64.md
> > b/gcc/config/aarch64/aarch64.md index
> >
> 09ae1118371f82ca63146fceb953eb9e820d05a4..c333fb1f72725992bb304c560f
> 12
> > 45a242d5192d 100644
> > --- a/gcc/config/aarch64/aarch64.md
> > +++ b/gcc/config/aarch64/aarch64.md
> > @@ -2043,7 +2043,7 @@ (define_expand "add<mode>3"
> >
> >  (define_insn "*add<mode>3_aarch64"
> >    [(set
> > -    (match_operand:GPI 0 "register_operand" "=rk,rk,w,rk,r,r,rk")
> > +    (match_operand:GPI 0 "register_operand" "=rk,!rk,w,rk,r,r,rk")
> >      (plus:GPI
> >       (match_operand:GPI 1 "register_operand" "%rk,rk,w,rk,rk,0,rk")
> >       (match_operand:GPI 2 "aarch64_pluslong_operand"
> > "I,r,w,J,Uaa,Uai,Uav")))] diff --git
> > a/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> > b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..5b8d40f19884fc7b4e7decd80
> 758
> > bc36fa76d058
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> > @@ -0,0 +1,70 @@
> > +/* { dg-do assemble } */
> > +/* { dg-additional-options "-save-temps -O1 -std=c99" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +typedef long long v2di __attribute__((vector_size (16))); typedef
> > +unsigned long long v2udi __attribute__((vector_size (16))); typedef
> > +int v2si __attribute__((vector_size (16))); typedef unsigned int
> > +v2usi __attribute__((vector_size (16)));
> > +
> > +/*
> > +** foo:
> > +** 	addp	d0, v0.2d
> > +** 	fmov	x0, d0
> > +** 	ret
> > +*/
> > +long long
> > +foo (v2di x)
> > +{
> > +  return x[1] + x[0];
> > +}
> > +
> > +/*
> > +** foo1:
> > +** 	saddlp	v0.1d, v0.2s
> > +** 	fmov	x0, d0
> > +** 	ret
> > +*/
> > +long long
> > +foo1 (v2si x)
> > +{
> > +  return x[1] + x[0];
> > +}
> > +
> > +/*
> > +** foo2:
> > +** 	uaddlp	v0.1d, v0.2s
> > +** 	fmov	x0, d0
> > +** 	ret
> > +*/
> > +unsigned long long
> > +foo2 (v2usi x)
> > +{
> > +  return x[1] + x[0];
> > +}
> > +
> > +/*
> > +** foo3:
> > +** 	uaddlp	v0.1d, v0.2s
> > +** 	add	d0, d0, d1
> > +** 	fmov	x0, d0
> > +** 	ret
> > +*/
> > +unsigned long long
> > +foo3 (v2usi x, v2udi y)
> > +{
> > +  return (x[1] + x[0]) + y[0];
> > +}
> > +
> > +/*
> > +** foo4:
> > +** 	saddlp	v0.1d, v0.2s
> > +** 	add	d0, d0, d1
> > +** 	fmov	x0, d0
> > +** 	ret
> > +*/
> > +long long
> > +foo4 (v2si x, v2di y)
> > +{
> > +  return (x[1] + x[0]) + y[0];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
> > b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..ff455e060fc833b2f63e89c467
> b9
> > 1a76fbe31aff
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
> > @@ -0,0 +1,66 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
> > +/* { dg-add-options arm_v8_2a_fp16_scalar } */
> > +/* { dg-additional-options "-save-temps -O1" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +typedef double v2df __attribute__((vector_size (16))); typedef float
> > +v4sf __attribute__((vector_size (16))); typedef __fp16 v8hf
> > +__attribute__((vector_size (16)));
> > +
> > +/*
> > +** foo:
> > +** 	faddp	d0, v0.2d
> > +** 	ret
> > +*/
> > +double
> > +foo (v2df x)
> > +{
> > +  return x[1] + x[0];
> > +}
> > +
> > +/*
> > +** foo1:
> > +** 	faddp	s0, v0.2s
> > +** 	ret
> > +*/
> > +float
> > +foo1 (v4sf x)
> > +{
> > +  return x[0] + x[1];
> > +}
> > +
> > +/*
> > +** foo2:
> > +** 	faddp	h0, v0.2h
> > +** 	ret
> > +*/
> > +__fp16
> > +foo2 (v8hf x)
> > +{
> > +  return x[0] + x[1];
> > +}
> > +
> > +/*
> > +** foo3:
> > +** 	ext	v0.16b, v0.16b, v0.16b, #4
> > +** 	faddp	s0, v0.2s
> > +** 	ret
> > +*/
> > +float
> > +foo3 (v4sf x)
> > +{
> > +  return x[1] + x[2];
> > +}
> > +
> > +/*
> > +** foo4:
> > +** 	dup	s0, v0.s\[3\]
> > +** 	faddp	h0, v0.2h
> > +** 	ret
> > +*/
> > +__fp16
> > +foo4 (v8hf x)
> > +{
> > +  return x[6] + x[7];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
> > b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..04412c3b45c51648e46ff20f73
> 0b
> > 1213e940391a
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
> > @@ -0,0 +1,14 @@
> > +/* { dg-do assemble } */
> > +/* { dg-additional-options "-save-temps -O1 -w" } */
> > +
> > +typedef __m128i __attribute__((__vector_size__(2 * sizeof(long))));
> > +double a[]; *b;
> > +fn1() {
> > +  __m128i c;
> > +  *(__m128i *)a = c;
> > +  *b = a[0] + a[1];
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {faddp\td0, v0\.2d} 1 } } */
> > +
> > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_fmaxp.c
> > b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fmaxp.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..aa1d2bf17cd707b74d8f7c5745
> 06
> > 610ab4fd7299
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fmaxp.c
> > @@ -0,0 +1,56 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
> > +/* { dg-add-options arm_v8_2a_fp16_scalar } */
> > +/* { dg-additional-options "-save-temps -O1" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +typedef double v2df __attribute__((vector_size (16))); typedef float
> > +v4sf __attribute__((vector_size (16))); typedef __fp16 v8hf
> > +__attribute__((vector_size (16)));
> > +
> > +/*
> > +** foo:
> > +** 	fmaxnmp	d0, v0.2d
> > +** 	ret
> > +*/
> > +double
> > +foo (v2df x)
> > +{
> > +  return x[0] > x[1] ? x[0] : x[1];
> > +}
> > +
> > +/*
> > +** foo1:
> > +** 	fmaxnmp	s0, v0.2s
> > +** 	ret
> > +*/
> > +float
> > +foo1 (v4sf x)
> > +{
> > +  return x[0] > x[1] ? x[0] : x[1];
> > +}
> > +
> > +/*
> > +** foo2:
> > +** 	fmaxnmp	h0, v0.2h
> > +** 	ret
> > +*/
> > +__fp16
> > +foo2 (v8hf x)
> > +{
> > +  return x[0] > x[1] ? x[0] : x[1];
> > +}
> > +
> > +/*
> > +** foo3:
> > +** 	fmaxnmp	s0, v0.2s
> > +** 	fcvt	d0, s0
> > +** 	fadd	d0, d0, d1
> > +** 	ret
> > +*/
> > +double
> > +foo3 (v4sf x, v2df y)
> > +{
> > +  return (x[0] > x[1] ? x[0] : x[1]) + y[0]; }
> > +
> > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_fminp.c
> > b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fminp.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..6136c5272069c4d86f09951cdff
> 2
> > 5f1494e839f0
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fminp.c
> > @@ -0,0 +1,55 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
> > +/* { dg-add-options arm_v8_2a_fp16_scalar } */
> > +/* { dg-additional-options "-save-temps -O1" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +typedef double v2df __attribute__((vector_size (16))); typedef float
> > +v4sf __attribute__((vector_size (16))); typedef __fp16 v8hf
> > +__attribute__((vector_size (16)));
> > +
> > +/*
> > +** foo:
> > +** 	fminnmp	d0, v0.2d
> > +** 	ret
> > +*/
> > +double
> > +foo (v2df x)
> > +{
> > +  return x[0] < x[1] ? x[0] : x[1];
> > +}
> > +
> > +/*
> > +** foo1:
> > +** 	fminnmp	s0, v0.2s
> > +** 	ret
> > +*/
> > +float
> > +foo1 (v4sf x)
> > +{
> > +  return x[0] < x[1] ? x[0] : x[1];
> > +}
> > +
> > +/*
> > +** foo2:
> > +** 	fminnmp	h0, v0.2h
> > +** 	ret
> > +*/
> > +__fp16
> > +foo2 (v8hf x)
> > +{
> > +  return x[0] < x[1] ? x[0] : x[1];
> > +}
> > +
> > +/*
> > +** foo3:
> > +** 	fminnmp	s0, v0.2s
> > +** 	fcvt	d0, s0
> > +** 	fadd	d0, d0, d1
> > +** 	ret
> > +*/
> > +double
> > +foo3 (v4sf x, v2df y)
> > +{
> > +  return (x[0] < x[1] ? x[0] : x[1]) + y[0]; }
> > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_maxp.c
> > b/gcc/testsuite/gcc.target/aarch64/simd/scalar_maxp.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..e219a13abc745b83dca58633f
> d2d
> > 812e276d6b2d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_maxp.c
> > @@ -0,0 +1,74 @@
> > +/* { dg-do assemble } */
> > +/* { dg-additional-options "-save-temps -O1 -std=c99" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +typedef long long v2di __attribute__((vector_size (16))); typedef
> > +unsigned long long v2udi __attribute__((vector_size (16))); typedef
> > +int v2si __attribute__((vector_size (16))); typedef unsigned int
> > +v2usi __attribute__((vector_size (16)));
> > +
> > +/*
> > +** foo:
> > +** 	umov	x0, v0.d\[1\]
> > +** 	fmov	x1, d0
> > +** 	cmp	x0, x1
> > +** 	csel	x0, x0, x1, ge
> > +** 	ret
> > +*/
> > +long long
> > +foo (v2di x)
> > +{
> > +  return x[0] > x[1] ? x[0] : x[1];
> > +}
> > +
> > +/*
> > +** foo1:
> > +** 	smaxp	v0.2s, v0.2s, v0.2s
> > +** 	smov	x0, v0.s\[0\]
> > +** 	ret
> > +*/
> > +long long
> > +foo1 (v2si x)
> > +{
> > +  return x[0] > x[1] ? x[0] : x[1];
> > +}
> > +
> > +/*
> > +** foo2:
> > +** 	umaxp	v0.2s, v0.2s, v0.2s
> > +** 	fmov	w0, s0
> > +** 	ret
> > +*/
> > +unsigned long long
> > +foo2 (v2usi x)
> > +{
> > +  return x[0] > x[1] ? x[0] : x[1];
> > +}
> > +
> > +/*
> > +** foo3:
> > +** 	umaxp	v0.2s, v0.2s, v0.2s
> > +** 	fmov	w0, s0
> > +** 	fmov	x1, d1
> > +** 	add	x0, x1, w0, uxtw
> > +** 	ret
> > +*/
> > +unsigned long long
> > +foo3 (v2usi x, v2udi y)
> > +{
> > +  return (x[0] > x[1] ? x[0] : x[1]) + y[0]; }
> > +
> > +/*
> > +** foo4:
> > +** 	smaxp	v0.2s, v0.2s, v0.2s
> > +** 	fmov	w0, s0
> > +** 	fmov	x1, d1
> > +** 	add	x0, x1, w0, sxtw
> > +** 	ret
> > +*/
> > +long long
> > +foo4 (v2si x, v2di y)
> > +{
> > +  return (x[0] > x[1] ? x[0] : x[1]) + y[0]; }
> > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_minp.c
> > b/gcc/testsuite/gcc.target/aarch64/simd/scalar_minp.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..2a32fb4ea3edaa4c547a7a481c
> 3d
> > dca6b477430e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_minp.c
> > @@ -0,0 +1,74 @@
> > +/* { dg-do assemble } */
> > +/* { dg-additional-options "-save-temps -O1 -std=c99" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +typedef long long v2di __attribute__((vector_size (16))); typedef
> > +unsigned long long v2udi __attribute__((vector_size (16))); typedef
> > +int v2si __attribute__((vector_size (16))); typedef unsigned int
> > +v2usi __attribute__((vector_size (16)));
> > +
> > +/*
> > +** foo:
> > +** 	umov	x0, v0.d\[1\]
> > +** 	fmov	x1, d0
> > +** 	cmp	x0, x1
> > +** 	csel	x0, x0, x1, le
> > +** 	ret
> > +*/
> > +long long
> > +foo (v2di x)
> > +{
> > +  return x[0] < x[1] ? x[0] : x[1];
> > +}
> > +
> > +/*
> > +** foo1:
> > +** 	sminp	v0.2s, v0.2s, v0.2s
> > +** 	smov	x0, v0.s\[0\]
> > +** 	ret
> > +*/
> > +long long
> > +foo1 (v2si x)
> > +{
> > +  return x[0] < x[1] ? x[0] : x[1];
> > +}
> > +
> > +/*
> > +** foo2:
> > +** 	uminp	v0.2s, v0.2s, v0.2s
> > +** 	fmov	w0, s0
> > +** 	ret
> > +*/
> > +unsigned long long
> > +foo2 (v2usi x)
> > +{
> > +  return x[0] < x[1] ? x[0] : x[1];
> > +}
> > +
> > +/*
> > +** foo3:
> > +** 	uminp	v0.2s, v0.2s, v0.2s
> > +** 	fmov	w0, s0
> > +** 	fmov	x1, d1
> > +** 	add	x0, x1, w0, uxtw
> > +** 	ret
> > +*/
> > +unsigned long long
> > +foo3 (v2usi x, v2udi y)
> > +{
> > +  return (x[0] < x[1] ? x[0] : x[1]) + y[0]; }
> > +
> > +/*
> > +** foo4:
> > +** 	sminp	v0.2s, v0.2s, v0.2s
> > +** 	fmov	w0, s0
> > +** 	fmov	x1, d1
> > +** 	add	x0, x1, w0, sxtw
> > +** 	ret
> > +*/
> > +long long
> > +foo4 (v2si x, v2di y)
> > +{
> > +  return (x[0] < x[1] ? x[0] : x[1]) + y[0]; }
  

Patch

--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -2043,7 +2043,7 @@  (define_expand "add<mode>3"
 
 (define_insn "*add<mode>3_aarch64"
   [(set
-    (match_operand:GPI 0 "register_operand" "=rk,rk,w,rk,r,r,rk")
+    (match_operand:GPI 0 "register_operand" "=rk,!rk,w,rk,r,r,rk")
     (plus:GPI
      (match_operand:GPI 1 "register_operand" "%rk,rk,w,rk,rk,0,rk")
      (match_operand:GPI 2 "aarch64_pluslong_operand" "I,r,w,J,Uaa,Uai,Uav")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
new file mode 100644
index 0000000000000000000000000000000000000000..5b8d40f19884fc7b4e7decd80758bc36fa76d058
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
@@ -0,0 +1,70 @@ 
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+typedef unsigned long long v2udi __attribute__((vector_size (16)));
+typedef int v2si __attribute__((vector_size (16)));
+typedef unsigned int v2usi __attribute__((vector_size (16)));
+
+/*
+** foo:
+** 	addp	d0, v0.2d
+** 	fmov	x0, d0
+** 	ret
+*/
+long long
+foo (v2di x)
+{
+  return x[1] + x[0];
+}
+
+/*
+** foo1:
+** 	saddlp	v0.1d, v0.2s
+** 	fmov	x0, d0
+** 	ret
+*/
+long long
+foo1 (v2si x)
+{
+  return x[1] + x[0];
+}
+
+/*
+** foo2:
+** 	uaddlp	v0.1d, v0.2s
+** 	fmov	x0, d0
+** 	ret
+*/
+unsigned long long
+foo2 (v2usi x)
+{
+  return x[1] + x[0];
+}
+
+/*
+** foo3:
+** 	uaddlp	v0.1d, v0.2s
+** 	add	d0, d0, d1
+** 	fmov	x0, d0
+** 	ret
+*/
+unsigned long long
+foo3 (v2usi x, v2udi y)
+{
+  return (x[1] + x[0]) + y[0];
+}
+
+/*
+** foo4:
+** 	saddlp	v0.1d, v0.2s
+** 	add	d0, d0, d1
+** 	fmov	x0, d0
+** 	ret
+*/
+long long
+foo4 (v2si x, v2di y)
+{
+  return (x[1] + x[0]) + y[0];
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
new file mode 100644
index 0000000000000000000000000000000000000000..ff455e060fc833b2f63e89c467b91a76fbe31aff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
@@ -0,0 +1,66 @@ 
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-additional-options "-save-temps -O1" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef double v2df __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+typedef __fp16 v8hf __attribute__((vector_size (16)));
+
+/*
+** foo:
+** 	faddp	d0, v0.2d
+** 	ret
+*/
+double
+foo (v2df x)
+{
+  return x[1] + x[0];
+}
+
+/*
+** foo1:
+** 	faddp	s0, v0.2s
+** 	ret
+*/
+float
+foo1 (v4sf x)
+{
+  return x[0] + x[1];
+}
+
+/*
+** foo2:
+** 	faddp	h0, v0.2h
+** 	ret
+*/
+__fp16
+foo2 (v8hf x)
+{
+  return x[0] + x[1];
+}
+
+/*
+** foo3:
+** 	ext	v0.16b, v0.16b, v0.16b, #4
+** 	faddp	s0, v0.2s
+** 	ret
+*/
+float
+foo3 (v4sf x)
+{
+  return x[1] + x[2];
+}
+
+/*
+** foo4:
+** 	dup	s0, v0.s\[3\]
+** 	faddp	h0, v0.2h
+** 	ret
+*/
+__fp16
+foo4 (v8hf x)
+{
+  return x[6] + x[7];
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
new file mode 100644
index 0000000000000000000000000000000000000000..04412c3b45c51648e46ff20f730b1213e940391a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
@@ -0,0 +1,14 @@ 
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -w" } */
+
+typedef __m128i __attribute__((__vector_size__(2 * sizeof(long))));
+double a[];
+*b;
+fn1() {
+  __m128i c;
+  *(__m128i *)a = c;
+  *b = a[0] + a[1];
+}
+
+/* { dg-final { scan-assembler-times {faddp\td0, v0\.2d} 1 } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_fmaxp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fmaxp.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa1d2bf17cd707b74d8f7c574506610ab4fd7299
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fmaxp.c
@@ -0,0 +1,56 @@ 
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-additional-options "-save-temps -O1" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef double v2df __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+typedef __fp16 v8hf __attribute__((vector_size (16)));
+
+/*
+** foo:
+** 	fmaxnmp	d0, v0.2d
+** 	ret
+*/
+double
+foo (v2df x)
+{
+  return x[0] > x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo1:
+** 	fmaxnmp	s0, v0.2s
+** 	ret
+*/
+float
+foo1 (v4sf x)
+{
+  return x[0] > x[1] ? x[0] : x[1];
+}
+
+/*
+** foo2:
+** 	fmaxnmp	h0, v0.2h
+** 	ret
+*/
+__fp16
+foo2 (v8hf x)
+{
+  return x[0] > x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo3:
+** 	fmaxnmp	s0, v0.2s
+** 	fcvt	d0, s0
+** 	fadd	d0, d0, d1
+** 	ret
+*/
+double
+foo3 (v4sf x, v2df y)
+{
+  return (x[0] > x[1] ? x[0] : x[1]) + y[0];
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_fminp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fminp.c
new file mode 100644
index 0000000000000000000000000000000000000000..6136c5272069c4d86f09951cdff25f1494e839f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_fminp.c
@@ -0,0 +1,55 @@ 
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-additional-options "-save-temps -O1" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef double v2df __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+typedef __fp16 v8hf __attribute__((vector_size (16)));
+
+/*
+** foo:
+** 	fminnmp	d0, v0.2d
+** 	ret
+*/
+double
+foo (v2df x)
+{
+  return x[0] < x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo1:
+** 	fminnmp	s0, v0.2s
+** 	ret
+*/
+float
+foo1 (v4sf x)
+{
+  return x[0] < x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo2:
+** 	fminnmp	h0, v0.2h
+** 	ret
+*/
+__fp16
+foo2 (v8hf x)
+{
+  return x[0] < x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo3:
+** 	fminnmp	s0, v0.2s
+** 	fcvt	d0, s0
+** 	fadd	d0, d0, d1
+** 	ret
+*/
+double
+foo3 (v4sf x, v2df y)
+{
+  return (x[0] < x[1] ? x[0] : x[1]) + y[0];
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_maxp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_maxp.c
new file mode 100644
index 0000000000000000000000000000000000000000..e219a13abc745b83dca58633fd2d812e276d6b2d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_maxp.c
@@ -0,0 +1,74 @@ 
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+typedef unsigned long long v2udi __attribute__((vector_size (16)));
+typedef int v2si __attribute__((vector_size (16)));
+typedef unsigned int v2usi __attribute__((vector_size (16)));
+
+/*
+** foo:
+** 	umov	x0, v0.d\[1\]
+** 	fmov	x1, d0
+** 	cmp	x0, x1
+** 	csel	x0, x0, x1, ge
+** 	ret
+*/
+long long
+foo (v2di x)
+{
+  return x[0] > x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo1:
+** 	smaxp	v0.2s, v0.2s, v0.2s
+** 	smov	x0, v0.s\[0\]
+** 	ret
+*/
+long long
+foo1 (v2si x)
+{
+  return x[0] > x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo2:
+** 	umaxp	v0.2s, v0.2s, v0.2s
+** 	fmov	w0, s0
+** 	ret
+*/
+unsigned long long
+foo2 (v2usi x)
+{
+  return x[0] > x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo3:
+** 	umaxp	v0.2s, v0.2s, v0.2s
+** 	fmov	w0, s0
+** 	fmov	x1, d1
+** 	add	x0, x1, w0, uxtw
+** 	ret
+*/
+unsigned long long
+foo3 (v2usi x, v2udi y)
+{
+  return (x[0] > x[1] ? x[0] : x[1]) + y[0];
+}
+
+/* 
+** foo4:
+** 	smaxp	v0.2s, v0.2s, v0.2s
+** 	fmov	w0, s0
+** 	fmov	x1, d1
+** 	add	x0, x1, w0, sxtw
+** 	ret
+*/
+long long
+foo4 (v2si x, v2di y)
+{
+  return (x[0] > x[1] ? x[0] : x[1]) + y[0];
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_minp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_minp.c
new file mode 100644
index 0000000000000000000000000000000000000000..2a32fb4ea3edaa4c547a7a481c3ddca6b477430e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_minp.c
@@ -0,0 +1,74 @@ 
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+typedef unsigned long long v2udi __attribute__((vector_size (16)));
+typedef int v2si __attribute__((vector_size (16)));
+typedef unsigned int v2usi __attribute__((vector_size (16)));
+
+/*
+** foo:
+** 	umov	x0, v0.d\[1\]
+** 	fmov	x1, d0
+** 	cmp	x0, x1
+** 	csel	x0, x0, x1, le
+** 	ret
+*/
+long long
+foo (v2di x)
+{
+  return x[0] < x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo1:
+** 	sminp	v0.2s, v0.2s, v0.2s
+** 	smov	x0, v0.s\[0\]
+** 	ret
+*/
+long long
+foo1 (v2si x)
+{
+  return x[0] < x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo2:
+** 	uminp	v0.2s, v0.2s, v0.2s
+** 	fmov	w0, s0
+** 	ret
+*/
+unsigned long long
+foo2 (v2usi x)
+{
+  return x[0] < x[1] ? x[0] : x[1];
+}
+
+/* 
+** foo3:
+** 	uminp	v0.2s, v0.2s, v0.2s
+** 	fmov	w0, s0
+** 	fmov	x1, d1
+** 	add	x0, x1, w0, uxtw
+** 	ret
+*/
+unsigned long long
+foo3 (v2usi x, v2udi y)
+{
+  return (x[0] < x[1] ? x[0] : x[1]) + y[0];
+}
+
+/* 
+** foo4:
+** 	sminp	v0.2s, v0.2s, v0.2s
+** 	fmov	w0, s0
+** 	fmov	x1, d1
+** 	add	x0, x1, w0, sxtw
+** 	ret
+*/
+long long
+foo4 (v2si x, v2di y)
+{
+  return (x[0] < x[1] ? x[0] : x[1]) + y[0];
+}