middle-end Add optimized float addsub without needing VEC_PERM_EXPR.

Message ID patch-15819-tamar@arm.com
State Committed
Series middle-end Add optimized float addsub without needing VEC_PERM_EXPR.

Commit Message

Tamar Christina June 16, 2022, 10:58 a.m. UTC
  Hi All,

For IEEE 754 floating point formats we can replace a sequence of alternating
+/- with an fneg of a wider type followed by an fadd.  This eliminates the need
for using a permutation.  This patch adds a match.pd rule to recognize and
perform this rewriting.
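
The transform relies on IEEE 754 negation being a pure sign-bit flip: if two
adjacent 32-bit lanes are viewed as one 64-bit lane, negating that wider lane
flips only its top bit, i.e. the sign bit of exactly one float of the pair
(the high half on little-endian).  A minimal scalar sketch of this reasoning,
as an illustration only (it assumes little-endian byte order, IEEE 754 layout
and that unary minus is a plain sign-bit flip):

#include <stdio.h>
#include <string.h>

int main (void)
{
  float a[2] = { 1.0f, 2.0f };
  float b[2] = { 3.0f, 4.0f };

  /* View b as one double, negate it, and view it back as two floats.
     Only bit 63 changes, i.e. the sign of b[1].  */
  double db;
  memcpy (&db, b, sizeof db);
  db = -db;
  float nb[2];
  memcpy (nb, &db, sizeof nb);

  /* nb is now { 3.0f, -4.0f }, so a plain lane-wise add computes
     { a[0] + b[0], a[1] - b[1] }.  */
  printf ("%f %f\n", a[0] + nb[0], a[1] + nb[1]);   /* 4.0 -2.0 */
  return 0;
}

The match.pd rule below does the same on whole vectors, using VIEW_CONVERT_EXPR
to reinterpret the vector as one with elements of twice the width and a
NEGATE_EXPR on that wider type.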

For

void f (float *restrict a, float *restrict b, float *res, int n)
{
   for (int i = 0; i < (n & -4); i+=2)
    {
      res[i+0] = a[i+0] + b[i+0];
      res[i+1] = a[i+1] - b[i+1];
    }
}

we generate:

.L3:
        ldr     q1, [x1, x3]
        ldr     q0, [x0, x3]
        fneg    v1.2d, v1.2d
        fadd    v0.4s, v0.4s, v1.4s
        str     q0, [x2, x3]
        add     x3, x3, 16
        cmp     x3, x4
        bne     .L3

now instead of:

.L3:
        ldr     q1, [x0, x3]
        ldr     q2, [x1, x3]
        fadd    v0.4s, v1.4s, v2.4s
        fsub    v1.4s, v1.4s, v2.4s
        tbl     v0.16b, {v0.16b - v1.16b}, v3.16b
        str     q0, [x2, x3]
        add     x3, x3, 16
        cmp     x3, x4
        bne     .L3

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Thanks to George Steed for the idea.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* match.pd: Add fneg/fadd rule.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/addsub_1.c: New test.
	* gcc.target/aarch64/sve/addsub_1.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/match.pd b/gcc/match.pd
index 51b0a1b562409af535e53828a10c30b8a3e1ae2e..af1c98d4a2831f38258d6fc1bbe811c8ee6c7c6e 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7612,6 +7612,49 @@ and,
   (simplify (reduc (op @0 VECTOR_CST@1))
     (op (reduc:type @0) (reduc:type @1))))
 
+/* Simplify vector floating point operations of alternating sub/add pairs
+   into using an fneg of a wider element type followed by a normal add.
+   Under IEEE 754 the fneg of the wider type will negate every even entry
+   and when doing an add we get a sub of the even and an add of every odd
+   element.  */
+(simplify
+ (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
+ (if (!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
+  (with
+   {
+     /* Build a vector of integers from the tree mask.  */
+     vec_perm_builder builder;
+     if (!tree_to_vec_perm_builder (&builder, @2))
+       return NULL_TREE;
+
+     /* Create a vec_perm_indices for the integer vector.  */
+     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
+     vec_perm_indices sel (builder, 2, nelts);
+   }
+   (if (sel.series_p (0, 2, 0, 2))
+    (with
+     {
+       machine_mode vec_mode = TYPE_MODE (type);
+       auto elem_mode = GET_MODE_INNER (vec_mode);
+       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
+       tree stype;
+       switch (elem_mode)
+	 {
+	 case E_HFmode:
+	   stype = float_type_node;
+	   break;
+	 case E_SFmode:
+	   stype = double_type_node;
+	   break;
+	 default:
+	   return NULL_TREE;
+	 }
+       tree ntype = build_vector_type (stype, nunits);
+       if (!ntype)
+	 return NULL_TREE;
+     }
+     (plus (view_convert:type (negate (view_convert:ntype @1))) @0))))))
+
 (simplify
  (vec_perm @0 @1 VECTOR_CST@2)
  (with
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0dbbf1b47ad43310c4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
+/* { dg-options "-Ofast" } */
+/* { dg-add-options arm_v8_2a_fp16_neon } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#pragma GCC target "+nosve"
+
+/* 
+** f1:
+** ...
+**	fneg	v[0-9]+.2d, v[0-9]+.2d
+**	fadd	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/* 
+** d1:
+** ...
+** 	fneg	v[0-9]+.4s, v[0-9]+.4s
+** 	fadd	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/* 
+** e1:
+** ...
+** 	fadd	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** 	fsub	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** 	ins	v[0-9]+.d\[1\], v[0-9]+.d\[1\]
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a314a29b7a7a922
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+/*
+** f1:
+** ...
+** 	fneg	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+** 	fadd	z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/* 
+** d1:
+** ...
+** 	fneg	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
+** 	fadd	z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
+** ...
+*/ 
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/*
+** e1:
+** ...
+** 	fsub	z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
+** 	movprfx	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+** 	fadd	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
  

Comments

Andrew Pinski June 17, 2022, 8:33 p.m. UTC | #1
On Thu, Jun 16, 2022 at 3:59 AM Tamar Christina via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi All,
>
> For IEEE 754 floating point formats we can replace a sequence of alternative
> +/- with fneg of a wider type followed by an fadd.  This eliminated the need for
> using a permutation.  This patch adds a math.pd rule to recognize and do this
> rewriting.

I don't think this is correct. You don't check the format of the
floating point to make sure this is valid (e.g. REAL_MODE_FORMAT's
signbit_rw/signbit_ro fields).
Also, wouldn't it be better to do the xor in integer mode (using the
signbit_rw field for the correct bit), and then make sure the target
optimizes the xor to the neg instruction when needed?

Thanks,
Andrew Pinski
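
For concreteness, a standalone scalar stand-in for the integer-xor idea above,
as an illustration only (it assumes 32-bit IEEE floats with the sign in bit 31,
i.e. a signbit position of 31, and is not proposed compiler code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main (void)
{
  float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
  float b[4] = { 5.0f, 6.0f, 7.0f, 8.0f };
  float res[4];

  for (int i = 0; i < 4; i += 2)
    {
      /* Even lane: plain add.  */
      res[i] = a[i] + b[i];

      /* Odd lane: flip the sign bit of b with an integer xor, then add,
         which yields a - b without an FP negate.  */
      uint32_t bits;
      memcpy (&bits, &b[i + 1], sizeof bits);
      bits ^= UINT32_C (1) << 31;
      float nb;
      memcpy (&nb, &bits, sizeof nb);
      res[i + 1] = a[i + 1] + nb;
    }

  printf ("%f %f %f %f\n", res[0], res[1], res[2], res[3]);   /* 6 -4 10 -4 */
  return 0;
}

On a real target the xor would of course be a single vector operation with a
sign-bit mask rather than a scalar loop.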



  
Richard Biener June 18, 2022, 10:49 a.m. UTC | #2
> On 17.06.2022 at 22:34, Andrew Pinski via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> 
> On Thu, Jun 16, 2022 at 3:59 AM Tamar Christina via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>> 
>> Hi All,
>> 
>> For IEEE 754 floating point formats we can replace a sequence of alternative
>> +/- with fneg of a wider type followed by an fadd.  This eliminated the need for
>> using a permutation.  This patch adds a math.pd rule to recognize and do this
>> rewriting.
> 
> I don't think this is correct. You don't check the format of the
> floating point to make sure this is valid (e.g. REAL_MODE_FORMAT's
> signbit_rw/signbit_ro field).
> Also would just be better if you do the xor in integer mode (using
> signbit_rw field for the correct bit)?
> And then making sure the target optimizes the xor to the neg
> instruction when needed?

I’m also worried about using FP operations for the negate here.  When @1 is constant do we still constant fold this correctly?

For costing purposes it would be nice to make this visible to the vectorizer.

Also is this really good for all targets?  Can there be issues with reformatting when using FP ops as in your patch or with using integer XOR as suggested making this more expensive than the blend?

Richard.

  
Tamar Christina June 20, 2022, 10 a.m. UTC | #3
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Saturday, June 18, 2022 11:49 AM
> To: Andrew Pinski via Gcc-patches <gcc-patches@gcc.gnu.org>
> Cc: Tamar Christina <Tamar.Christina@arm.com>; nd <nd@arm.com>
> Subject: Re: [PATCH]middle-end Add optimized float addsub without
> needing VEC_PERM_EXPR.
> 
> 
> 
> > Am 17.06.2022 um 22:34 schrieb Andrew Pinski via Gcc-patches <gcc-
> patches@gcc.gnu.org>:
> >
> > On Thu, Jun 16, 2022 at 3:59 AM Tamar Christina via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> Hi All,
> >>
> >> For IEEE 754 floating point formats we can replace a sequence of
> >> alternative
> >> +/- with fneg of a wider type followed by an fadd.  This eliminated
> >> +the need for
> >> using a permutation.  This patch adds a math.pd rule to recognize and
> >> do this rewriting.
> >
> > I don't think this is correct. You don't check the format of the
> > floating point to make sure this is valid (e.g. REAL_MODE_FORMAT's
> > signbit_rw/signbit_ro field).

Yes, I originally had this check, but I wondered whether it would be needed.
I'm not aware of any vector ISA where the 32-bit and 16-bit floats don't follow
the IEEE data layout and semantics here.

My preference would be to ask the target about the data format of its vector
floating-point types, because I don't think there strictly needs to be a direct
correlation between the scalar and vector formats.  But I know Richi won't like
that, so adding the check is probably the most likely outcome.

> > Also would just be better if you do the xor in integer mode (using
> > signbit_rw field for the correct bit)?
> > And then making sure the target optimizes the xor to the neg
> > instruction when needed?

I don't really see the advantage of this one. It's not removing an instruction
and it's assuming the vector ISA can do integer ops on a floating point vector
cheaply.  Since match.pd doesn't have the ability to do costing I'd rather not
do this.

> I’m also worried about using FP operations for the negate here.  When @1 is
> constant do we still constant fold this correctly?

We never did constant folding for this case; the folding infrastructure doesn't
know how to fold the VEC_PERM_EXPR.  So even with @0 and @1 constant, no
folding takes place today if we vectorize.

> 
> For costing purposes it would be nice to make this visible to the vectorizer.
> 

I initially wanted to use VEC_ADDSUB for this, but noticed it didn't trigger in a number of
places I had expected it to. While looking into it I noticed that's because it follows the x86
instruction semantics, so I left it alone.

It felt like adding a third pattern here might be confusing. However I can also use the SLP
pattern matcher to rewrite it without an optab if you prefer that?

The operations will then be costed normally.

> Also is this really good for all targets?  Can there be issues with reformatting
> when using FP ops as in your patch or with using integer XOR as suggested
> making this more expensive than the blend?

I don't think so with the fp ops alone, since it's using two fp ops already and after the change
still two fp ops, and I can't imagine that a target would have a slow -a.

The XOR one I wouldn't do, as the vector int and vector float could for instance be in different register
files, or FP could be a co-processor, etc.  Mixing FP and integer ops in this case I can imagine leading to something
suboptimal.  Also, for targets with masking/predication the VEC_PERM_EXPR could potentially be lowered to
a mask/predicate in the backend, whereas that is far less likely with the XOR approach.

Thanks,
Tamar

  
Richard Biener June 20, 2022, 11:56 a.m. UTC | #4
On Mon, 20 Jun 2022, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Saturday, June 18, 2022 11:49 AM
> > To: Andrew Pinski via Gcc-patches <gcc-patches@gcc.gnu.org>
> > Cc: Tamar Christina <Tamar.Christina@arm.com>; nd <nd@arm.com>
> > Subject: Re: [PATCH]middle-end Add optimized float addsub without
> > needing VEC_PERM_EXPR.
> > 
> > 
> > 
> > > Am 17.06.2022 um 22:34 schrieb Andrew Pinski via Gcc-patches <gcc-
> > patches@gcc.gnu.org>:
> > >
> > > On Thu, Jun 16, 2022 at 3:59 AM Tamar Christina via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > >>
> > >> Hi All,
> > >>
> > >> For IEEE 754 floating point formats we can replace a sequence of
> > >> alternative
> > >> +/- with fneg of a wider type followed by an fadd.  This eliminated
> > >> +the need for
> > >> using a permutation.  This patch adds a math.pd rule to recognize and
> > >> do this rewriting.
> > >
> > > I don't think this is correct. You don't check the format of the
> > > floating point to make sure this is valid (e.g. REAL_MODE_FORMAT's
> > > signbit_rw/signbit_ro field).
> 
> Yes I originally had this check, but I wondered whether it would be needed.
> I'm not aware of any vector ISA where the 32-bit and 16-bit floats don't follow
> the IEEE data layout and semantics here.
> 
> My preference would be to ask the target about the data format of its vector
> Floating points because I don't think there needs to be a direct correlation between
> The scalar and vector formats strictly speaking.   But I know Richi won't like that so
> the check is probably most likely.
> 
> > > Also would just be better if you do the xor in integer mode (using
> > > signbit_rw field for the correct bit)?
> > > And then making sure the target optimizes the xor to the neg
> > > instruction when needed?
> 
> I don't really see the advantage of this one. It's not removing an instruction
> and it's assuming the vector ISA can do integer ops on a floating point vector
> cheaply.  Since match.pd doesn't have the ability to do costing I'd rather not
> do this.
> 
> > I’m also worried about using FP operations for the negate here.  When @1 is
> > constant do we still constant fold this correctly?
> 
> We never did constant folding for this case, the folding infrastructure doesn't
> know how to fold the VEC_PERM_EXPR.  So even with @0 and @1 constant no
> folding takes place even today if we vectorize.
> 
> > 
> > For costing purposes it would be nice to make this visible to the vectorizer.
> > 
> 
> I initially wanted to use VEC_ADDSUB for this, but noticed it didn't trigger in a number of
> place I had expected it to. While looking into it I noticed it's because this follows the x86
> instruction semantics so left it alone.
> 
> It felt like adding a third pattern here might be confusing. However I can also use the SLP
> pattern matcher to rewrite it without an optab if you prefer that?
> 
> The operations will then be costed normally.
> 
> > Also is this really good for all targets?  Can there be issues with reformatting
> > when using FP ops as in your patch or with using integer XOR as suggested
> > making this more expensive than the blend?
> 
> I don't think with the fp ops alone,  since it's using two fp ops already and after the change 2 fp ops.
> and I can't image that a target would have a slow -a.

Wouldn't a target need to re-check if lanes are NaN or denormal if after
a SFmode lane operation a DFmode lane operation follows?  IIRC that is
what usually makes punning "integer" vectors as FP vectors costly.

Note, one option would be to emit a multiply with { 1, -1, 1, -1 } on
GIMPLE, where targets could then opt in to handling this via a DFmode
negate through a combine pattern?  Not sure if this can even be done
starting from the vec-perm RTL IL.
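
For illustration, a plain-C stand-in for that multiply form (not GIMPLE and not
part of any patch) could look like the sketch below; the question is then
whether a target can recover the wider negate from the { 1, -1, ... } operand:

#include <stdio.h>

void addsub (const float *a, const float *b, float *res, int n)
{
  static const float sign[2] = { 1.0f, -1.0f };   /* { 1, -1, 1, -1, ... } */
  for (int i = 0; i < n; i++)
    res[i] = a[i] + sign[i & 1] * b[i];
}

int main (void)
{
  float a[4] = { 1, 2, 3, 4 }, b[4] = { 5, 6, 7, 8 }, r[4];
  addsub (a, b, r, 4);
  printf ("%f %f %f %f\n", r[0], r[1], r[2], r[3]);   /* 6 -4 10 -4 */
  return 0;
}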

I fear whether (neg:V2DF (subreg:V2DF (reg:V4SF))) is a good idea
will heavily depend on the target CPU (not only the ISA).  For RISC-V
for example I think the DF lanes do not overlap with two SF lanes
(so same with gcn I think).

Richard.

> The XOR one I wouldn't do, as the vector int and vector float could for instance be in different register
> files or FP be a co-processor etc.  Mixing FP and Integer ops in this case I can image can lead to something
> suboptimal.  Also for targets with masking/predication the VEC_PERM_EXP could potentially be lowered to
> a mask/predicate in the backend. Whereas the XOR approach is far less likely.
> 
> Thanks,
> Tamar
> 
> > 
> > Richard.
> > 
> > > Thanks,
> > > Andrew Pinski
> > >
> > >
> > >
> > >>
> > >> For
> > >>
> > >> void f (float *restrict a, float *restrict b, float *res, int n) {
> > >>   for (int i = 0; i < (n & -4); i+=2)
> > >>    {
> > >>      res[i+0] = a[i+0] + b[i+0];
> > >>      res[i+1] = a[i+1] - b[i+1];
> > >>    }
> > >> }
> > >>
> > >> we generate:
> > >>
> > >> .L3:
> > >>        ldr     q1, [x1, x3]
> > >>        ldr     q0, [x0, x3]
> > >>        fneg    v1.2d, v1.2d
> > >>        fadd    v0.4s, v0.4s, v1.4s
> > >>        str     q0, [x2, x3]
> > >>        add     x3, x3, 16
> > >>        cmp     x3, x4
> > >>        bne     .L3
> > >>
> > >> now instead of:
> > >>
> > >> .L3:
> > >>        ldr     q1, [x0, x3]
> > >>        ldr     q2, [x1, x3]
> > >>        fadd    v0.4s, v1.4s, v2.4s
> > >>        fsub    v1.4s, v1.4s, v2.4s
> > >>        tbl     v0.16b, {v0.16b - v1.16b}, v3.16b
> > >>        str     q0, [x2, x3]
> > >>        add     x3, x3, 16
> > >>        cmp     x3, x4
> > >>        bne     .L3
> > >>
> > >> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >>
> > >> Thanks to George Steed for the idea.
> > >>
> > >> Ok for master?
> > >>
> > >> Thanks,
> > >> Tamar
> > >>
> > >> gcc/ChangeLog:
> > >>
> > >>        * match.pd: Add fneg/fadd rule.
> > >>
> > >> gcc/testsuite/ChangeLog:
> > >>
> > >>        * gcc.target/aarch64/simd/addsub_1.c: New test.
> > >>        * gcc.target/aarch64/sve/addsub_1.c: New test.
> > >>
> > >> --- inline copy of patch --
> > >> diff --git a/gcc/match.pd b/gcc/match.pd index
> > >>
> > 51b0a1b562409af535e53828a10c30b8a3e1ae2e..af1c98d4a2831f38258d6fc1bb
> > e
> > >> 811c8ee6c7c6e 100644
> > >> --- a/gcc/match.pd
> > >> +++ b/gcc/match.pd
> > >> @@ -7612,6 +7612,49 @@ and,
> > >>   (simplify (reduc (op @0 VECTOR_CST@1))
> > >>     (op (reduc:type @0) (reduc:type @1))))
> > >>
> > >> +/* Simplify vector floating point operations of alternating sub/add pairs
> > >> +   into using an fneg of a wider element type followed by a normal add.
> > >> +   under IEEE 754 the fneg of the wider type will negate every even entry
> > >> +   and when doing an add we get a sub of the even and add of every odd
> > >> +   elements.  */
> > >> +(simplify
> > >> + (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)  (if
> > >> +(!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
> > >> +  (with
> > >> +   {
> > >> +     /* Build a vector of integers from the tree mask.  */
> > >> +     vec_perm_builder builder;
> > >> +     if (!tree_to_vec_perm_builder (&builder, @2))
> > >> +       return NULL_TREE;
> > >> +
> > >> +     /* Create a vec_perm_indices for the integer vector.  */
> > >> +     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
> > >> +     vec_perm_indices sel (builder, 2, nelts);
> > >> +   }
> > >> +   (if (sel.series_p (0, 2, 0, 2))
> > >> +    (with
> > >> +     {
> > >> +       machine_mode vec_mode = TYPE_MODE (type);
> > >> +       auto elem_mode = GET_MODE_INNER (vec_mode);
> > >> +       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
> > >> +       tree stype;
> > >> +       switch (elem_mode)
> > >> +        {
> > >> +        case E_HFmode:
> > >> +          stype = float_type_node;
> > >> +          break;
> > >> +        case E_SFmode:
> > >> +          stype = double_type_node;
> > >> +          break;
> > >> +        default:
> > >> +          return NULL_TREE;
> > >> +        }
> > >> +       tree ntype = build_vector_type (stype, nunits);
> > >> +       if (!ntype)
> > >> +        return NULL_TREE;
> > >> +     }
> > >> +     (plus (view_convert:type (negate (view_convert:ntype @1)))
> > >> + @0))))))
> > >> +
> > >> (simplify
> > >>  (vec_perm @0 @1 VECTOR_CST@2)
> > >>  (with
> > >> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > >> b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > >> new file mode 100644
> > >> index
> > >>
> > 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0db
> > bf
> > >> 1b47ad43310c4
> > >> --- /dev/null
> > >> +++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > >> @@ -0,0 +1,56 @@
> > >> +/* { dg-do compile } */
> > >> +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
> > >> +/* { dg-options "-Ofast" } */
> > >> +/* { dg-add-options arm_v8_2a_fp16_neon } */
> > >> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > >> +} */
> > >> +
> > >> +#pragma GCC target "+nosve"
> > >> +
> > >> +/*
> > >> +** f1:
> > >> +** ...
> > >> +**     fneg    v[0-9]+.2d, v[0-9]+.2d
> > >> +**     fadd    v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > >> +** ...
> > >> +*/
> > >> +void f1 (float *restrict a, float *restrict b, float *res, int n) {
> > >> +   for (int i = 0; i < (n & -4); i+=2)
> > >> +    {
> > >> +      res[i+0] = a[i+0] + b[i+0];
> > >> +      res[i+1] = a[i+1] - b[i+1];
> > >> +    }
> > >> +}
> > >> +
> > >> +/*
> > >> +** d1:
> > >> +** ...
> > >> +**     fneg    v[0-9]+.4s, v[0-9]+.4s
> > >> +**     fadd    v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> > >> +** ...
> > >> +*/
> > >> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res,
> > >> +int n) {
> > >> +   for (int i = 0; i < (n & -8); i+=2)
> > >> +    {
> > >> +      res[i+0] = a[i+0] + b[i+0];
> > >> +      res[i+1] = a[i+1] - b[i+1];
> > >> +    }
> > >> +}
> > >> +
> > >> +/*
> > >> +** e1:
> > >> +** ...
> > >> +**     fadd    v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> > >> +**     fsub    v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> > >> +**     ins     v[0-9]+.d\[1\], v[0-9]+.d\[1\]
> > >> +** ...
> > >> +*/
> > >> +void e1 (double *restrict a, double *restrict b, double *res, int n)
> > >> +{
> > >> +   for (int i = 0; i < (n & -4); i+=2)
> > >> +    {
> > >> +      res[i+0] = a[i+0] + b[i+0];
> > >> +      res[i+1] = a[i+1] - b[i+1];
> > >> +    }
> > >> +}
> > >> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > >> b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > >> new file mode 100644
> > >> index
> > >>
> > 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a
> > 3
> > >> 14a29b7a7a922
> > >> --- /dev/null
> > >> +++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > >> @@ -0,0 +1,52 @@
> > >> +/* { dg-do compile } */
> > >> +/* { dg-options "-Ofast" } */
> > >> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > >> +} */
> > >> +
> > >> +/*
> > >> +** f1:
> > >> +** ...
> > >> +**     fneg    z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> > >> +**     fadd    z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
> > >> +** ...
> > >> +*/
> > >> +void f1 (float *restrict a, float *restrict b, float *res, int n) {
> > >> +   for (int i = 0; i < (n & -4); i+=2)
> > >> +    {
> > >> +      res[i+0] = a[i+0] + b[i+0];
> > >> +      res[i+1] = a[i+1] - b[i+1];
> > >> +    }
> > >> +}
> > >> +
> > >> +/*
> > >> +** d1:
> > >> +** ...
> > >> +**     fneg    z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
> > >> +**     fadd    z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
> > >> +** ...
> > >> +*/
> > >> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res,
> > >> +int n) {
> > >> +   for (int i = 0; i < (n & -8); i+=2)
> > >> +    {
> > >> +      res[i+0] = a[i+0] + b[i+0];
> > >> +      res[i+1] = a[i+1] - b[i+1];
> > >> +    }
> > >> +}
> > >> +
> > >> +/*
> > >> +** e1:
> > >> +** ...
> > >> +**     fsub    z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
> > >> +**     movprfx z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> > >> +**     fadd    z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> > >> +** ...
> > >> +*/
> > >> +void e1 (double *restrict a, double *restrict b, double *res, int n)
> > >> +{
> > >> +   for (int i = 0; i < (n & -4); i+=2)
> > >> +    {
> > >> +      res[i+0] = a[i+0] + b[i+0];
> > >> +      res[i+1] = a[i+1] - b[i+1];
> > >> +    }
> > >> +}
> > >>
> > >>
> > >>
> > >>
> > >> --
>
  
Tamar Christina June 20, 2022, 12:05 p.m. UTC | #5
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Monday, June 20, 2022 12:56 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Andrew Pinski via Gcc-patches <gcc-patches@gcc.gnu.org>; nd
> <nd@arm.com>
> Subject: RE: [PATCH]middle-end Add optimized float addsub without
> needing VEC_PERM_EXPR.
> 
> On Mon, 20 Jun 2022, Tamar Christina wrote:
> 
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Saturday, June 18, 2022 11:49 AM
> > > To: Andrew Pinski via Gcc-patches <gcc-patches@gcc.gnu.org>
> > > Cc: Tamar Christina <Tamar.Christina@arm.com>; nd <nd@arm.com>
> > > Subject: Re: [PATCH]middle-end Add optimized float addsub without
> > > needing VEC_PERM_EXPR.
> > >
> > >
> > >
> > > > Am 17.06.2022 um 22:34 schrieb Andrew Pinski via Gcc-patches <gcc-
> > > patches@gcc.gnu.org>:
> > > >
> > > > On Thu, Jun 16, 2022 at 3:59 AM Tamar Christina via Gcc-patches
> > > > <gcc-patches@gcc.gnu.org> wrote:
> > > >>
> > > >> Hi All,
> > > >>
> > > >> For IEEE 754 floating point formats we can replace a sequence of
> > > >> alternative
> > > >> +/- with fneg of a wider type followed by an fadd.  This
> > > >> +eliminated the need for
> > > >> using a permutation.  This patch adds a math.pd rule to recognize
> > > >> and do this rewriting.
> > > >
> > > > I don't think this is correct. You don't check the format of the
> > > > floating point to make sure this is valid (e.g. REAL_MODE_FORMAT's
> > > > signbit_rw/signbit_ro field).
> >
> > Yes I originally had this check, but I wondered whether it would be needed.
> > I'm not aware of any vector ISA where the 32-bit and 16-bit floats
> > don't follow the IEEE data layout and semantics here.
> >
> > My preference would be to ask the target about the data format of its
> > vector Floating points because I don't think there needs to be a direct
> correlation between
> > The scalar and vector formats strictly speaking.   But I know Richi won't like
> that so
> > the check is probably most likely.
> >
> > > > Also would just be better if you do the xor in integer mode (using
> > > > signbit_rw field for the correct bit)?
> > > > And then making sure the target optimizes the xor to the neg
> > > > instruction when needed?
> >
> > I don't really see the advantage of this one. It's not removing an
> > instruction and it's assuming the vector ISA can do integer ops on a
> > floating point vector cheaply.  Since match.pd doesn't have the
> > ability to do costing I'd rather not do this.
> >
> > > I’m also worried about using FP operations for the negate here.
> > > When @1 is constant do we still constant fold this correctly?
> >
> > We never did constant folding for this case, the folding
> > infrastructure doesn't know how to fold the VEC_PERM_EXPR.  So even
> > with @0 and @1 constant no folding takes place even today if we vectorize.
> >
> > >
> > > For costing purposes it would be nice to make this visible to the
> vectorizer.
> > >
> >
> > I initially wanted to use VEC_ADDSUB for this, but noticed it didn't
> > trigger in a number of place I had expected it to. While looking into
> > it I noticed it's because this follows the x86 instruction semantics so left it
> alone.
> >
> > It felt like adding a third pattern here might be confusing. However I
> > can also use the SLP pattern matcher to rewrite it without an optab if you
> prefer that?
> >
> > The operations will then be costed normally.
> >
> > > Also is this really good for all targets?  Can there be issues with
> > > reformatting when using FP ops as in your patch or with using
> > > integer XOR as suggested making this more expensive than the blend?
> >
> > I don't think with the fp ops alone,  since it's using two fp ops already and
> after the change 2 fp ops.
> > and I can't image that a target would have a slow -a.
> 
> Wouldn't a target need to re-check if lanes are NaN or denormal if after a
> SFmode lane operation a DFmode lane operation follows?  IIRC that is what
> usually makes punning "integer" vectors as FP vectors costly.

I guess this really depends on the target.

> 
> Note one option would be to emit a multiply with { 1, -1, 1, -1 } on GIMPLE
> where then targets could opt-in to handle this via a DFmode negate via a
> combine pattern?  Not sure if this can be even done starting from the vec-
> perm RTL IL.
>

But multiplies can be so expensive that the VEC_PERM_EXPR would still be
better.  At least there, as you say, the target has costed it.

> I fear whether (neg:V2DF (subreg:V2DF (reg:V4SF))) is a good idea will
> heavily depend on the target CPU (not only the ISA).  For RISC-V for example
> I think the DF lanes do not overlap with two SF lanes (so same with gcn I
> think).

Right, so I think the conclusion is I need to move this to the backend.

Thanks,
Tamar

> 
> Richard.
> 
> > The XOR one I wouldn't do, as the vector int and vector float could
> > for instance be in different register files or FP be a co-processor
> > etc.  Mixing FP and Integer ops in this case I can image can lead to
> > something suboptimal.  Also for targets with masking/predication the
> VEC_PERM_EXP could potentially be lowered to a mask/predicate in the
> backend. Whereas the XOR approach is far less likely.
> >
> > Thanks,
> > Tamar
> >
> > >
> > > Richard.
> > >
> > > > Thanks,
> > > > Andrew Pinski
> > > >
> > > >
> > > >
> > > >>
> > > >> For
> > > >>
> > > >> void f (float *restrict a, float *restrict b, float *res, int n) {
> > > >>   for (int i = 0; i < (n & -4); i+=2)
> > > >>    {
> > > >>      res[i+0] = a[i+0] + b[i+0];
> > > >>      res[i+1] = a[i+1] - b[i+1];
> > > >>    }
> > > >> }
> > > >>
> > > >> we generate:
> > > >>
> > > >> .L3:
> > > >>        ldr     q1, [x1, x3]
> > > >>        ldr     q0, [x0, x3]
> > > >>        fneg    v1.2d, v1.2d
> > > >>        fadd    v0.4s, v0.4s, v1.4s
> > > >>        str     q0, [x2, x3]
> > > >>        add     x3, x3, 16
> > > >>        cmp     x3, x4
> > > >>        bne     .L3
> > > >>
> > > >> now instead of:
> > > >>
> > > >> .L3:
> > > >>        ldr     q1, [x0, x3]
> > > >>        ldr     q2, [x1, x3]
> > > >>        fadd    v0.4s, v1.4s, v2.4s
> > > >>        fsub    v1.4s, v1.4s, v2.4s
> > > >>        tbl     v0.16b, {v0.16b - v1.16b}, v3.16b
> > > >>        str     q0, [x2, x3]
> > > >>        add     x3, x3, 16
> > > >>        cmp     x3, x4
> > > >>        bne     .L3
> > > >>
> > > >> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > >>
> > > >> Thanks to George Steed for the idea.
> > > >>
> > > >> Ok for master?
> > > >>
> > > >> Thanks,
> > > >> Tamar
> > > >>
> > > >> gcc/ChangeLog:
> > > >>
> > > >>        * match.pd: Add fneg/fadd rule.
> > > >>
> > > >> gcc/testsuite/ChangeLog:
> > > >>
> > > >>        * gcc.target/aarch64/simd/addsub_1.c: New test.
> > > >>        * gcc.target/aarch64/sve/addsub_1.c: New test.
> > > >>
> > > >> --- inline copy of patch --
> > > >> diff --git a/gcc/match.pd b/gcc/match.pd index
> > > >>
> > >
> 51b0a1b562409af535e53828a10c30b8a3e1ae2e..af1c98d4a2831f38258d6fc1bb
> > > e
> > > >> 811c8ee6c7c6e 100644
> > > >> --- a/gcc/match.pd
> > > >> +++ b/gcc/match.pd
> > > >> @@ -7612,6 +7612,49 @@ and,
> > > >>   (simplify (reduc (op @0 VECTOR_CST@1))
> > > >>     (op (reduc:type @0) (reduc:type @1))))
> > > >>
> > > >> +/* Simplify vector floating point operations of alternating sub/add
> pairs
> > > >> +   into using an fneg of a wider element type followed by a normal
> add.
> > > >> +   under IEEE 754 the fneg of the wider type will negate every even
> entry
> > > >> +   and when doing an add we get a sub of the even and add of every
> odd
> > > >> +   elements.  */
> > > >> +(simplify
> > > >> + (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)  (if
> > > >> +(!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
> > > >> +  (with
> > > >> +   {
> > > >> +     /* Build a vector of integers from the tree mask.  */
> > > >> +     vec_perm_builder builder;
> > > >> +     if (!tree_to_vec_perm_builder (&builder, @2))
> > > >> +       return NULL_TREE;
> > > >> +
> > > >> +     /* Create a vec_perm_indices for the integer vector.  */
> > > >> +     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
> > > >> +     vec_perm_indices sel (builder, 2, nelts);
> > > >> +   }
> > > >> +   (if (sel.series_p (0, 2, 0, 2))
> > > >> +    (with
> > > >> +     {
> > > >> +       machine_mode vec_mode = TYPE_MODE (type);
> > > >> +       auto elem_mode = GET_MODE_INNER (vec_mode);
> > > >> +       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
> > > >> +       tree stype;
> > > >> +       switch (elem_mode)
> > > >> +        {
> > > >> +        case E_HFmode:
> > > >> +          stype = float_type_node;
> > > >> +          break;
> > > >> +        case E_SFmode:
> > > >> +          stype = double_type_node;
> > > >> +          break;
> > > >> +        default:
> > > >> +          return NULL_TREE;
> > > >> +        }
> > > >> +       tree ntype = build_vector_type (stype, nunits);
> > > >> +       if (!ntype)
> > > >> +        return NULL_TREE;
> > > >> +     }
> > > >> +     (plus (view_convert:type (negate (view_convert:ntype @1)))
> > > >> + @0))))))
> > > >> +
> > > >> (simplify
> > > >>  (vec_perm @0 @1 VECTOR_CST@2)
> > > >>  (with
> > > >> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > > >> b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > > >> new file mode 100644
> > > >> index
> > > >>
> > >
> 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0db
> > > bf
> > > >> 1b47ad43310c4
> > > >> --- /dev/null
> > > >> +++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > > >> @@ -0,0 +1,56 @@
> > > >> +/* { dg-do compile } */
> > > >> +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
> > > >> +/* { dg-options "-Ofast" } */
> > > >> +/* { dg-add-options arm_v8_2a_fp16_neon } */
> > > >> +/* { dg-final { check-function-bodies "**" "" "" { target { le }
> > > >> +} } } */
> > > >> +
> > > >> +#pragma GCC target "+nosve"
> > > >> +
> > > >> +/*
> > > >> +** f1:
> > > >> +** ...
> > > >> +**     fneg    v[0-9]+.2d, v[0-9]+.2d
> > > >> +**     fadd    v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > > >> +** ...
> > > >> +*/
> > > >> +void f1 (float *restrict a, float *restrict b, float *res, int n) {
> > > >> +   for (int i = 0; i < (n & -4); i+=2)
> > > >> +    {
> > > >> +      res[i+0] = a[i+0] + b[i+0];
> > > >> +      res[i+1] = a[i+1] - b[i+1];
> > > >> +    }
> > > >> +}
> > > >> +
> > > >> +/*
> > > >> +** d1:
> > > >> +** ...
> > > >> +**     fneg    v[0-9]+.4s, v[0-9]+.4s
> > > >> +**     fadd    v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> > > >> +** ...
> > > >> +*/
> > > >> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16
> > > >> +*res, int n) {
> > > >> +   for (int i = 0; i < (n & -8); i+=2)
> > > >> +    {
> > > >> +      res[i+0] = a[i+0] + b[i+0];
> > > >> +      res[i+1] = a[i+1] - b[i+1];
> > > >> +    }
> > > >> +}
> > > >> +
> > > >> +/*
> > > >> +** e1:
> > > >> +** ...
> > > >> +**     fadd    v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> > > >> +**     fsub    v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> > > >> +**     ins     v[0-9]+.d\[1\], v[0-9]+.d\[1\]
> > > >> +** ...
> > > >> +*/
> > > >> +void e1 (double *restrict a, double *restrict b, double *res,
> > > >> +int n) {
> > > >> +   for (int i = 0; i < (n & -4); i+=2)
> > > >> +    {
> > > >> +      res[i+0] = a[i+0] + b[i+0];
> > > >> +      res[i+1] = a[i+1] - b[i+1];
> > > >> +    }
> > > >> +}
> > > >> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > > >> b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > > >> new file mode 100644
> > > >> index
> > > >>
> > >
> 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a
> > > 3
> > > >> 14a29b7a7a922
> > > >> --- /dev/null
> > > >> +++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > > >> @@ -0,0 +1,52 @@
> > > >> +/* { dg-do compile } */
> > > >> +/* { dg-options "-Ofast" } */
> > > >> +/* { dg-final { check-function-bodies "**" "" "" { target { le }
> > > >> +} } } */
> > > >> +
> > > >> +/*
> > > >> +** f1:
> > > >> +** ...
> > > >> +**     fneg    z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> > > >> +**     fadd    z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
> > > >> +** ...
> > > >> +*/
> > > >> +void f1 (float *restrict a, float *restrict b, float *res, int n) {
> > > >> +   for (int i = 0; i < (n & -4); i+=2)
> > > >> +    {
> > > >> +      res[i+0] = a[i+0] + b[i+0];
> > > >> +      res[i+1] = a[i+1] - b[i+1];
> > > >> +    }
> > > >> +}
> > > >> +
> > > >> +/*
> > > >> +** d1:
> > > >> +** ...
> > > >> +**     fneg    z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
> > > >> +**     fadd    z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
> > > >> +** ...
> > > >> +*/
> > > >> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16
> > > >> +*res, int n) {
> > > >> +   for (int i = 0; i < (n & -8); i+=2)
> > > >> +    {
> > > >> +      res[i+0] = a[i+0] + b[i+0];
> > > >> +      res[i+1] = a[i+1] - b[i+1];
> > > >> +    }
> > > >> +}
> > > >> +
> > > >> +/*
> > > >> +** e1:
> > > >> +** ...
> > > >> +**     fsub    z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
> > > >> +**     movprfx z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> > > >> +**     fadd    z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> > > >> +** ...
> > > >> +*/
> > > >> +void e1 (double *restrict a, double *restrict b, double *res,
> > > >> +int n) {
> > > >> +   for (int i = 0; i < (n & -4); i+=2)
> > > >> +    {
> > > >> +      res[i+0] = a[i+0] + b[i+0];
> > > >> +      res[i+1] = a[i+1] - b[i+1];
> > > >> +    }
> > > >> +}
> > > >>
> > > >>
> > > >>
> > > >>
> > > >> --
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461
> Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
> Boudien Moerman; HRB 36809 (AG Nuernberg)
  
Richard Sandiford June 20, 2022, 1:10 p.m. UTC | #6
Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> -----Original Message-----
>> From: Richard Biener <rguenther@suse.de>
>> Sent: Monday, June 20, 2022 12:56 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: Andrew Pinski via Gcc-patches <gcc-patches@gcc.gnu.org>; nd
>> <nd@arm.com>
>> Subject: RE: [PATCH]middle-end Add optimized float addsub without
>> needing VEC_PERM_EXPR.
>> 
>> On Mon, 20 Jun 2022, Tamar Christina wrote:
>> 
>> > > -----Original Message-----
>> > > From: Richard Biener <rguenther@suse.de>
>> > > Sent: Saturday, June 18, 2022 11:49 AM
>> > > To: Andrew Pinski via Gcc-patches <gcc-patches@gcc.gnu.org>
>> > > Cc: Tamar Christina <Tamar.Christina@arm.com>; nd <nd@arm.com>
>> > > Subject: Re: [PATCH]middle-end Add optimized float addsub without
>> > > needing VEC_PERM_EXPR.
>> > >
>> > >
>> > >
>> > > > Am 17.06.2022 um 22:34 schrieb Andrew Pinski via Gcc-patches <gcc-
>> > > patches@gcc.gnu.org>:
>> > > >
>> > > > On Thu, Jun 16, 2022 at 3:59 AM Tamar Christina via Gcc-patches
>> > > > <gcc-patches@gcc.gnu.org> wrote:
>> > > >>
>> > > >> Hi All,
>> > > >>
>> > > >> For IEEE 754 floating point formats we can replace a sequence of
>> > > >> alternative
>> > > >> +/- with fneg of a wider type followed by an fadd.  This
>> > > >> +eliminated the need for
>> > > >> using a permutation.  This patch adds a math.pd rule to recognize
>> > > >> and do this rewriting.
>> > > >
>> > > > I don't think this is correct. You don't check the format of the
>> > > > floating point to make sure this is valid (e.g. REAL_MODE_FORMAT's
>> > > > signbit_rw/signbit_ro field).
>> >
>> > Yes I originally had this check, but I wondered whether it would be needed.
>> > I'm not aware of any vector ISA where the 32-bit and 16-bit floats
>> > don't follow the IEEE data layout and semantics here.
>> >
>> > My preference would be to ask the target about the data format of its
>> > vector Floating points because I don't think there needs to be a direct
>> correlation between
>> > The scalar and vector formats strictly speaking.   But I know Richi won't like
>> that so
>> > the check is probably most likely.
>> >
>> > > > Also would just be better if you do the xor in integer mode (using
>> > > > signbit_rw field for the correct bit)?
>> > > > And then making sure the target optimizes the xor to the neg
>> > > > instruction when needed?
>> >
>> > I don't really see the advantage of this one. It's not removing an
>> > instruction and it's assuming the vector ISA can do integer ops on a
>> > floating point vector cheaply.  Since match.pd doesn't have the
>> > ability to do costing I'd rather not do this.
>> >
>> > > I’m also worried about using FP operations for the negate here.
>> > > When @1 is constant do we still constant fold this correctly?
>> >
>> > We never did constant folding for this case, the folding
>> > infrastructure doesn't know how to fold the VEC_PERM_EXPR.  So even
>> > with @0 and @1 constant no folding takes place even today if we vectorize.
>> >
>> > >
>> > > For costing purposes it would be nice to make this visible to the
>> vectorizer.
>> > >
>> >
>> > I initially wanted to use VEC_ADDSUB for this, but noticed it didn't
>> > trigger in a number of place I had expected it to. While looking into
>> > it I noticed it's because this follows the x86 instruction semantics so left it
>> alone.
>> >
>> > It felt like adding a third pattern here might be confusing. However I
>> > can also use the SLP pattern matcher to rewrite it without an optab if you
>> prefer that?
>> >
>> > The operations will then be costed normally.
>> >
>> > > Also is this really good for all targets?  Can there be issues with
>> > > reformatting when using FP ops as in your patch or with using
>> > > integer XOR as suggested making this more expensive than the blend?
>> >
>> > I don't think with the fp ops alone,  since it's using two fp ops already and
>> after the change 2 fp ops.
>> > and I can't image that a target would have a slow -a.
>> 
>> Wouldn't a target need to re-check if lanes are NaN or denormal if after a
>> SFmode lane operation a DFmode lane operation follows?  IIRC that is what
>> usually makes punning "integer" vectors as FP vectors costly.
>
> I guess this really depends on the target.
>
>> 
>> Note one option would be to emit a multiply with { 1, -1, 1, -1 } on GIMPLE
>> where then targets could opt-in to handle this via a DFmode negate via a
>> combine pattern?  Not sure if this can be even done starting from the vec-
>> perm RTL IL.
>>
>
> But multiplies can be so expensive that the VEC_PERM_EXPR would still be
> better.  At least as you say, the target costed for that. 
>
>> I fear whether (neg:V2DF (subreg:V2DF (reg:V4SF))) is a good idea will
>> heavily depend on the target CPU (not only the ISA).  For RISC-V for example
>> I think the DF lanes do not overlap with two SF lanes (so same with gcn I
>> think).
>
> Right, so I think the conclusion is I need to move this to the backend.

I wouldn't go that far :)  It just means that it needs more conditions.

E.g. whether the subreg is cheap should be testable via MODES_TIEABLE_P.
I don't think that macro should be true for modes that cause
reinterpretation (and it isn't for the RISC-V example).  There's also
targetm.can_change_mode_class (…, ALL_REGS).
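
A rough sketch of the kind of extra gating that would amount to (spelling
hypothetical and untested, using the hook form of MODES_TIEABLE_P; type is
the original vector type, e.g. V4SF, and ntype the punned wider-element
type, e.g. V2DF, as in the patch):

  machine_mode omode = TYPE_MODE (type);   /* e.g. V4SFmode  */
  machine_mode nmode = TYPE_MODE (ntype);  /* e.g. V2DFmode  */
  bool punning_cheap_p
    = (targetm.modes_tieable_p (omode, nmode)
       && targetm.can_change_mode_class (nmode, omode, ALL_REGS));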

Thanks,
Richard

>
> Thanks,
> Tamar
>
>> 
>> Richard.
>> 
>> > The XOR one I wouldn't do, as the vector int and vector float could
>> > for instance be in different register files or FP be a co-processor
>> > etc.  Mixing FP and Integer ops in this case I can image can lead to
>> > something suboptimal.  Also for targets with masking/predication the
>> VEC_PERM_EXP could potentially be lowered to a mask/predicate in the
>> backend. Whereas the XOR approach is far less likely.
>> >
>> > Thanks,
>> > Tamar
>> >
>> > >
>> > > Richard.
>> > >
>> > > > Thanks,
>> > > > Andrew Pinski
>> > > >
>> > > >
>> > > >
>> > > >>
>> > > >> For
>> > > >>
>> > > >> void f (float *restrict a, float *restrict b, float *res, int n) {
>> > > >>   for (int i = 0; i < (n & -4); i+=2)
>> > > >>    {
>> > > >>      res[i+0] = a[i+0] + b[i+0];
>> > > >>      res[i+1] = a[i+1] - b[i+1];
>> > > >>    }
>> > > >> }
>> > > >>
>> > > >> we generate:
>> > > >>
>> > > >> .L3:
>> > > >>        ldr     q1, [x1, x3]
>> > > >>        ldr     q0, [x0, x3]
>> > > >>        fneg    v1.2d, v1.2d
>> > > >>        fadd    v0.4s, v0.4s, v1.4s
>> > > >>        str     q0, [x2, x3]
>> > > >>        add     x3, x3, 16
>> > > >>        cmp     x3, x4
>> > > >>        bne     .L3
>> > > >>
>> > > >> now instead of:
>> > > >>
>> > > >> .L3:
>> > > >>        ldr     q1, [x0, x3]
>> > > >>        ldr     q2, [x1, x3]
>> > > >>        fadd    v0.4s, v1.4s, v2.4s
>> > > >>        fsub    v1.4s, v1.4s, v2.4s
>> > > >>        tbl     v0.16b, {v0.16b - v1.16b}, v3.16b
>> > > >>        str     q0, [x2, x3]
>> > > >>        add     x3, x3, 16
>> > > >>        cmp     x3, x4
>> > > >>        bne     .L3
>> > > >>
>> > > >> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>> > > >>
>> > > >> Thanks to George Steed for the idea.
>> > > >>
>> > > >> Ok for master?
>> > > >>
>> > > >> Thanks,
>> > > >> Tamar
>> > > >>
>> > > >> gcc/ChangeLog:
>> > > >>
>> > > >>        * match.pd: Add fneg/fadd rule.
>> > > >>
>> > > >> gcc/testsuite/ChangeLog:
>> > > >>
>> > > >>        * gcc.target/aarch64/simd/addsub_1.c: New test.
>> > > >>        * gcc.target/aarch64/sve/addsub_1.c: New test.
>> > > >>
>> > > >> --- inline copy of patch --
>> > > >> diff --git a/gcc/match.pd b/gcc/match.pd index
>> > > >> 51b0a1b562409af535e53828a10c30b8a3e1ae2e..af1c98d4a2831f38258d6fc1bbe811c8ee6c7c6e 100644
>> > > >> --- a/gcc/match.pd
>> > > >> +++ b/gcc/match.pd
>> > > >> @@ -7612,6 +7612,49 @@ and,
>> > > >>   (simplify (reduc (op @0 VECTOR_CST@1))
>> > > >>     (op (reduc:type @0) (reduc:type @1))))
>> > > >>
>> > > >> +/* Simplify vector floating point operations of alternating sub/add
>> pairs
>> > > >> +   into using an fneg of a wider element type followed by a normal
>> add.
>> > > >> +   under IEEE 754 the fneg of the wider type will negate every even
>> entry
>> > > >> +   and when doing an add we get a sub of the even and add of every
>> odd
>> > > >> +   elements.  */
>> > > >> +(simplify
>> > > >> + (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)  (if
>> > > >> +(!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
>> > > >> +  (with
>> > > >> +   {
>> > > >> +     /* Build a vector of integers from the tree mask.  */
>> > > >> +     vec_perm_builder builder;
>> > > >> +     if (!tree_to_vec_perm_builder (&builder, @2))
>> > > >> +       return NULL_TREE;
>> > > >> +
>> > > >> +     /* Create a vec_perm_indices for the integer vector.  */
>> > > >> +     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
>> > > >> +     vec_perm_indices sel (builder, 2, nelts);
>> > > >> +   }
>> > > >> +   (if (sel.series_p (0, 2, 0, 2))
>> > > >> +    (with
>> > > >> +     {
>> > > >> +       machine_mode vec_mode = TYPE_MODE (type);
>> > > >> +       auto elem_mode = GET_MODE_INNER (vec_mode);
>> > > >> +       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
>> > > >> +       tree stype;
>> > > >> +       switch (elem_mode)
>> > > >> +        {
>> > > >> +        case E_HFmode:
>> > > >> +          stype = float_type_node;
>> > > >> +          break;
>> > > >> +        case E_SFmode:
>> > > >> +          stype = double_type_node;
>> > > >> +          break;
>> > > >> +        default:
>> > > >> +          return NULL_TREE;
>> > > >> +        }
>> > > >> +       tree ntype = build_vector_type (stype, nunits);
>> > > >> +       if (!ntype)
>> > > >> +        return NULL_TREE;
>> > > >> +     }
>> > > >> +     (plus (view_convert:type (negate (view_convert:ntype @1)))
>> > > >> + @0))))))
>> > > >> +
>> > > >> (simplify
>> > > >>  (vec_perm @0 @1 VECTOR_CST@2)
>> > > >>  (with
>> > > >> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
>> > > >> b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
>> > > >> new file mode 100644
>> > > >> index
>> > > >> 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0dbbf1b47ad43310c4
>> > > >> --- /dev/null
>> > > >> +++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
>> > > >> @@ -0,0 +1,56 @@
>> > > >> +/* { dg-do compile } */
>> > > >> +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
>> > > >> +/* { dg-options "-Ofast" } */
>> > > >> +/* { dg-add-options arm_v8_2a_fp16_neon } */
>> > > >> +/* { dg-final { check-function-bodies "**" "" "" { target { le }
>> > > >> +} } } */
>> > > >> +
>> > > >> +#pragma GCC target "+nosve"
>> > > >> +
>> > > >> +/*
>> > > >> +** f1:
>> > > >> +** ...
>> > > >> +**     fneg    v[0-9]+.2d, v[0-9]+.2d
>> > > >> +**     fadd    v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
>> > > >> +** ...
>> > > >> +*/
>> > > >> +void f1 (float *restrict a, float *restrict b, float *res, int n) {
>> > > >> +   for (int i = 0; i < (n & -4); i+=2)
>> > > >> +    {
>> > > >> +      res[i+0] = a[i+0] + b[i+0];
>> > > >> +      res[i+1] = a[i+1] - b[i+1];
>> > > >> +    }
>> > > >> +}
>> > > >> +
>> > > >> +/*
>> > > >> +** d1:
>> > > >> +** ...
>> > > >> +**     fneg    v[0-9]+.4s, v[0-9]+.4s
>> > > >> +**     fadd    v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
>> > > >> +** ...
>> > > >> +*/
>> > > >> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16
>> > > >> +*res, int n) {
>> > > >> +   for (int i = 0; i < (n & -8); i+=2)
>> > > >> +    {
>> > > >> +      res[i+0] = a[i+0] + b[i+0];
>> > > >> +      res[i+1] = a[i+1] - b[i+1];
>> > > >> +    }
>> > > >> +}
>> > > >> +
>> > > >> +/*
>> > > >> +** e1:
>> > > >> +** ...
>> > > >> +**     fadd    v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
>> > > >> +**     fsub    v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
>> > > >> +**     ins     v[0-9]+.d\[1\], v[0-9]+.d\[1\]
>> > > >> +** ...
>> > > >> +*/
>> > > >> +void e1 (double *restrict a, double *restrict b, double *res,
>> > > >> +int n) {
>> > > >> +   for (int i = 0; i < (n & -4); i+=2)
>> > > >> +    {
>> > > >> +      res[i+0] = a[i+0] + b[i+0];
>> > > >> +      res[i+1] = a[i+1] - b[i+1];
>> > > >> +    }
>> > > >> +}
>> > > >> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
>> > > >> b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
>> > > >> new file mode 100644
>> > > >> index
>> > > >> 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a314a29b7a7a922
>> > > >> --- /dev/null
>> > > >> +++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
>> > > >> @@ -0,0 +1,52 @@
>> > > >> +/* { dg-do compile } */
>> > > >> +/* { dg-options "-Ofast" } */
>> > > >> +/* { dg-final { check-function-bodies "**" "" "" { target { le }
>> > > >> +} } } */
>> > > >> +
>> > > >> +/*
>> > > >> +** f1:
>> > > >> +** ...
>> > > >> +**     fneg    z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
>> > > >> +**     fadd    z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
>> > > >> +** ...
>> > > >> +*/
>> > > >> +void f1 (float *restrict a, float *restrict b, float *res, int n) {
>> > > >> +   for (int i = 0; i < (n & -4); i+=2)
>> > > >> +    {
>> > > >> +      res[i+0] = a[i+0] + b[i+0];
>> > > >> +      res[i+1] = a[i+1] - b[i+1];
>> > > >> +    }
>> > > >> +}
>> > > >> +
>> > > >> +/*
>> > > >> +** d1:
>> > > >> +** ...
>> > > >> +**     fneg    z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
>> > > >> +**     fadd    z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
>> > > >> +** ...
>> > > >> +*/
>> > > >> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16
>> > > >> +*res, int n) {
>> > > >> +   for (int i = 0; i < (n & -8); i+=2)
>> > > >> +    {
>> > > >> +      res[i+0] = a[i+0] + b[i+0];
>> > > >> +      res[i+1] = a[i+1] - b[i+1];
>> > > >> +    }
>> > > >> +}
>> > > >> +
>> > > >> +/*
>> > > >> +** e1:
>> > > >> +** ...
>> > > >> +**     fsub    z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
>> > > >> +**     movprfx z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
>> > > >> +**     fadd    z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
>> > > >> +** ...
>> > > >> +*/
>> > > >> +void e1 (double *restrict a, double *restrict b, double *res,
>> > > >> +int n) {
>> > > >> +   for (int i = 0; i < (n & -4); i+=2)
>> > > >> +    {
>> > > >> +      res[i+0] = a[i+0] + b[i+0];
>> > > >> +      res[i+1] = a[i+1] - b[i+1];
>> > > >> +    }
>> > > >> +}
>> > > >>
>> > > >>
>> > > >>
>> > > >>
>> > > >> --
>> >
>> 
>> --
>> Richard Biener <rguenther@suse.de>
>> SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461
>> Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
>> Boudien Moerman; HRB 36809 (AG Nuernberg)
  
Jeff Law July 1, 2022, 10:57 p.m. UTC | #7
On 6/17/2022 2:33 PM, Andrew Pinski via Gcc-patches wrote:
> On Thu, Jun 16, 2022 at 3:59 AM Tamar Christina via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>> Hi All,
>>
>> For IEEE 754 floating point formats we can replace a sequence of alternative
>> +/- with fneg of a wider type followed by an fadd.  This eliminated the need for
>> using a permutation.  This patch adds a math.pd rule to recognize and do this
>> rewriting.
> I don't think this is correct. You don't check the format of the
> floating point to make sure this is valid (e.g. REAL_MODE_FORMAT's
> signbit_rw/signbit_ro field).
> Also would just be better if you do the xor in integer mode (using
> signbit_rw field for the correct bit)?
> And then making sure the target optimizes the xor to the neg
> instruction when needed?
Whether the xor trick is better would be highly target dependent.  That 
seems like it's better left for expansion to figure out, since we have 
target costing information at that point.
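
For concreteness, here is the xor variant written out as scalar C++ (a
hedged illustration of the idea being discussed, not GCC code; the function
name and the little-endian IEEE binary32/binary64 layout are assumptions):

#include <cstdint>
#include <cstring>

void addsub_xor (const float *a, const float *b, float *res, int n)
{
  for (int i = 0; i < (n & -4); i += 2)
    {
      float pair[2] = { b[i], b[i + 1] };
      uint64_t bits;
      std::memcpy (&bits, pair, sizeof bits);
      /* Bit 63 of the 64-bit view is the sign bit of the odd float lane;
         it is also the bit a double-width fneg would flip.  */
      bits ^= UINT64_C (1) << 63;
      std::memcpy (pair, &bits, sizeof bits);
      res[i]     = a[i]     + pair[0];   /* even lane: plain add  */
      res[i + 1] = a[i + 1] + pair[1];   /* pair[1] == -b[i+1], i.e. a sub  */
    }
}

Whether a vectorized form of that xor beats the fneg or the blend is
exactly the target-cost question.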

Jeff
  
Jeff Law July 1, 2022, 11:07 p.m. UTC | #8
On 6/20/2022 5:56 AM, Richard Biener via Gcc-patches wrote:
>
>
> Note one option would be to emit a multiply with { 1, -1, 1, -1 } on
> GIMPLE where then targets could opt-in to handle this via a DFmode
> negate via a combine pattern?  Not sure if this can be even done
> starting from the vec-perm RTL IL.
FWIW, FP multiply is the same cost as FP add/sub on our target.
>
> I fear whether (neg:V2DF (subreg:V2DF (reg:V4SF))) is a good idea
> will heavily depend on the target CPU (not only the ISA).  For RISC-V
> for example I think the DF lanes do not overlap with two SF lanes
> (so same with gcn I think).
Absolutely.  I've regularly seen the introduction of subregs like that 
ultimately result in the SUBREG_REG object getting dumped into memory 
rather than being allocated to a register.  It could well be a problem 
with our port; I haven't started chasing it down yet.

One such case where that came up recently was the addition of something 
like this to simplify-rtx.  Basically in some cases we can turn a 
VEC_SELECT into a SUBREG, so I had this little hack in simplify-rtx that 
I was playing with:
> +      /* If we have a VEC_SELECT of a SUBREG try to change the SUBREG so
> +        that we eliminate the VEC_SELECT.  */
> +      if (GET_CODE (op0) == SUBREG
> +         && subreg_lowpart_p (op0)
> +         && VECTOR_MODE_P (GET_MODE (op0))
> +         && GET_MODE_INNER (GET_MODE (op0)) == mode
> +         && XVECLEN (trueop1, 0) == 1
> +         && CONST_INT_P (XVECEXP (trueop1, 0, 0)))
> +       {
> +         return simplify_gen_subreg (mode, SUBREG_REG (op0), GET_MODE (SUBREG_REG (op0)), INTVAL (XVECEXP (trueop1, 0, 0)) * 8);
> +       }

Seemed like a no-brainer win, but in reality it made things worse pretty 
consistently.

jeff
  
Tamar Christina Sept. 23, 2022, 9:11 a.m. UTC | #9
Hi,

Attached is the respun version of the patch,

> >>
> >> Wouldn't a target need to re-check if lanes are NaN or denormal if
> >> after a SFmode lane operation a DFmode lane operation follows?  IIRC
> >> that is what usually makes punning "integer" vectors as FP vectors costly.

I don't believe this is a problem: NaNs are not a single value, and
according to the standard the sign bit doesn't change the meaning of a NaN.

That's why, specifically for negates, generally no check is performed; it's
assumed that if a value is a NaN going in, it's a NaN coming out, and this
optimization doesn't change that.  Also, under fast-math we don't guarantee
a stable representation for NaNs (or zeros, etc.) afaik.

So if that is still a concern I could add && !HONOR_NANS () to the constraints.
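
As a tiny standalone illustration of the sign-bit point (nothing to do with
the patch itself, just the IEEE semantics relied on above):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>

int main ()
{
  float f = std::numeric_limits<float>::quiet_NaN ();
  uint32_t bits;
  std::memcpy (&bits, &f, sizeof bits);
  bits ^= UINT32_C (1) << 31;   /* flip only the sign bit, as the negate does  */
  std::memcpy (&f, &bits, sizeof bits);
  assert (std::isnan (f));      /* still a NaN; only its sign changed  */
  return 0;
}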

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* match.pd: Add fneg/fadd rule.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/addsub_1.c: New test.
	* gcc.target/aarch64/sve/addsub_1.c: New test.

--- inline version of patch ---

diff --git a/gcc/match.pd b/gcc/match.pd
index 1bb936fc4010f98f24bb97671350e8432c55b347..2617d56091dfbd41ae49f980ee0af3757f5ec1cf 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7916,6 +7916,59 @@ and,
   (simplify (reduc (op @0 VECTOR_CST@1))
     (op (reduc:type @0) (reduc:type @1))))
 
+/* Simplify vector floating point operations of alternating sub/add pairs
+   into using an fneg of a wider element type followed by a normal add.
+   under IEEE 754 the fneg of the wider type will negate every even entry
+   and when doing an add we get a sub of the even and add of every odd
+   elements.  */
+(simplify
+ (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
+ (if (!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
+  (with
+   {
+     /* Build a vector of integers from the tree mask.  */
+     vec_perm_builder builder;
+     if (!tree_to_vec_perm_builder (&builder, @2))
+       return NULL_TREE;
+
+     /* Create a vec_perm_indices for the integer vector.  */
+     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
+     vec_perm_indices sel (builder, 2, nelts);
+   }
+   (if (sel.series_p (0, 2, 0, 2))
+    (with
+     {
+       machine_mode vec_mode = TYPE_MODE (type);
+       auto elem_mode = GET_MODE_INNER (vec_mode);
+       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
+       tree stype;
+       switch (elem_mode)
+	 {
+	 case E_HFmode:
+	   stype = float_type_node;
+	   break;
+	 case E_SFmode:
+	   stype = double_type_node;
+	   break;
+	 default:
+	   return NULL_TREE;
+	 }
+       tree ntype = build_vector_type (stype, nunits);
+       if (!ntype)
+	 return NULL_TREE;
+
+       /* The format has to have a simple sign bit.  */
+       const struct real_format *fmt = FLOAT_MODE_FORMAT (vec_mode);
+       if (fmt == NULL)
+	 return NULL_TREE;
+     }
+     (if (fmt->signbit_rw == GET_MODE_UNIT_BITSIZE (vec_mode) - 1
+	  && fmt->signbit_rw == fmt->signbit_ro
+	  && targetm.can_change_mode_class (TYPE_MODE (ntype), TYPE_MODE (type), ALL_REGS)
+	  && (optimize_vectors_before_lowering_p ()
+	      || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
+      (plus (view_convert:type (negate (view_convert:ntype @1))) @0)))))))
+
 (simplify
  (vec_perm @0 @1 VECTOR_CST@2)
  (with
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0dbbf1b47ad43310c4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
+/* { dg-options "-Ofast" } */
+/* { dg-add-options arm_v8_2a_fp16_neon } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#pragma GCC target "+nosve"
+
+/* 
+** f1:
+** ...
+**	fneg	v[0-9]+.2d, v[0-9]+.2d
+**	fadd	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/* 
+** d1:
+** ...
+** 	fneg	v[0-9]+.4s, v[0-9]+.4s
+** 	fadd	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/* 
+** e1:
+** ...
+** 	fadd	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** 	fsub	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** 	ins	v[0-9]+.d\[1\], v[0-9]+.d\[1\]
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a314a29b7a7a922
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+/*
+** f1:
+** ...
+** 	fneg	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+** 	fadd	z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/* 
+** d1:
+** ...
+** 	fneg	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
+** 	fadd	z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
+** ...
+*/ 
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/*
+** e1:
+** ...
+** 	fsub	z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
+** 	movprfx	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+** 	fadd	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
  
Richard Biener Sept. 23, 2022, 12:54 p.m. UTC | #10
On Fri, 23 Sep 2022, Tamar Christina wrote:

> Hi,
> 
> Attached is the respun version of the patch,
> 
> > >>
> > >> Wouldn't a target need to re-check if lanes are NaN or denormal if
> > >> after a SFmode lane operation a DFmode lane operation follows?  IIRC
> > >> that is what usually makes punning "integer" vectors as FP vectors costly.
> 
> I don't believe this is a problem, due to NANs not being a single value and
> according to the standard the sign bit doesn't change the meaning of a NAN.
> 
> That's why specifically for negates generally no check is performed and it's
> Assumed that if a value is a NaN going in, it's a NaN coming out, and this
> Optimization doesn't change that.  Also under fast-math we don't guarantee
> a stable representation for NaN (or zeros, etc) afaik.
> 
> So if that is still a concern I could add && !HONORS_NAN () to the constraints.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* match.pd: Add fneg/fadd rule.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/simd/addsub_1.c: New test.
> 	* gcc.target/aarch64/sve/addsub_1.c: New test.
> 
> --- inline version of patch ---
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 1bb936fc4010f98f24bb97671350e8432c55b347..2617d56091dfbd41ae49f980ee0af3757f5ec1cf 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -7916,6 +7916,59 @@ and,
>    (simplify (reduc (op @0 VECTOR_CST@1))
>      (op (reduc:type @0) (reduc:type @1))))
>  
> +/* Simplify vector floating point operations of alternating sub/add pairs
> +   into using an fneg of a wider element type followed by a normal add.
> +   under IEEE 754 the fneg of the wider type will negate every even entry
> +   and when doing an add we get a sub of the even and add of every odd
> +   elements.  */
> +(simplify
> + (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
> + (if (!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)

shouldn't this be FLOAT_WORDS_BIG_ENDIAN instead?

I'm still concerned what

 (neg:V2DF (subreg:V2DF (reg:V4SF) 0))

means for architectures like RISC-V.  Can one "reformat" FP values
in vector registers so that two floats overlap a double
(and then back)?

I suppose you rely on target_can_change_mode_class to tell you that.


> +  (with
> +   {
> +     /* Build a vector of integers from the tree mask.  */
> +     vec_perm_builder builder;
> +     if (!tree_to_vec_perm_builder (&builder, @2))
> +       return NULL_TREE;
> +
> +     /* Create a vec_perm_indices for the integer vector.  */
> +     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
> +     vec_perm_indices sel (builder, 2, nelts);
> +   }
> +   (if (sel.series_p (0, 2, 0, 2))
> +    (with
> +     {
> +       machine_mode vec_mode = TYPE_MODE (type);
> +       auto elem_mode = GET_MODE_INNER (vec_mode);
> +       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
> +       tree stype;
> +       switch (elem_mode)
> +	 {
> +	 case E_HFmode:
> +	   stype = float_type_node;
> +	   break;
> +	 case E_SFmode:
> +	   stype = double_type_node;
> +	   break;
> +	 default:
> +	   return NULL_TREE;
> +	 }

Can't you use GET_MODE_WIDER_MODE and double-check the
mode-size doubles?  I mean you obviously miss DFmode -> TFmode.
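
A rough, untested sketch of that shape (reusing elem_mode from the hunk
above; the single wider-mode step and the langhook for getting back to a
type are assumptions on my part):

       scalar_float_mode elem = as_a <scalar_float_mode> (elem_mode);
       scalar_float_mode wider;
       tree stype = NULL_TREE;
       if (GET_MODE_WIDER_MODE (elem).exists (&wider)
	   && GET_MODE_BITSIZE (wider) == 2 * GET_MODE_BITSIZE (elem))
	 /* Assumed way of mapping the wider mode back to a type.  */
	 stype = lang_hooks.types.type_for_mode (wider, false);
       if (!stype)
	 return NULL_TREE;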

> +       tree ntype = build_vector_type (stype, nunits);
> +       if (!ntype)

You want to check that the above results in a vector mode.
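
For concreteness (hypothetical spelling of that check), something like

       if (!VECTOR_MODE_P (TYPE_MODE (ntype)))
	 return NULL_TREE;

right after the build_vector_type call.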

> +	 return NULL_TREE;
> +
> +       /* The format has to have a simple sign bit.  */
> +       const struct real_format *fmt = FLOAT_MODE_FORMAT (vec_mode);
> +       if (fmt == NULL)
> +	 return NULL_TREE;
> +     }
> +     (if (fmt->signbit_rw == GET_MODE_UNIT_BITSIZE (vec_mode) - 1

shouldn't this be a check on the component mode?  I think you'd
want to check that the bigger format signbit_rw is equal to
the smaller format mode size plus its signbit_rw or so?
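
In code terms that would be roughly (a sketch using the real.h field names
and the variables from the hunk above; for SF->DF the numbers are
63 == 32 + 31, for HF->SF they are 31 == 16 + 15):

       const real_format *wide_fmt = REAL_MODE_FORMAT (TYPE_MODE (stype));
       const real_format *elem_fmt = REAL_MODE_FORMAT (elem_mode);
       bool signbit_ok
	 = (wide_fmt->signbit_rw
	    == GET_MODE_UNIT_BITSIZE (vec_mode) + elem_fmt->signbit_rw);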

> +	  && fmt->signbit_rw == fmt->signbit_ro
> +	  && targetm.can_change_mode_class (TYPE_MODE (ntype), TYPE_MODE (type), ALL_REGS)
> +	  && (optimize_vectors_before_lowering_p ()
> +	      || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
> +      (plus (view_convert:type (negate (view_convert:ntype @1))) @0)))))))
> +
>  (simplify
>   (vec_perm @0 @1 VECTOR_CST@2)
>   (with
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0dbbf1b47ad43310c4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
> +/* { dg-options "-Ofast" } */
> +/* { dg-add-options arm_v8_2a_fp16_neon } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +/* 
> +** f1:
> +** ...
> +**	fneg	v[0-9]+.2d, v[0-9]+.2d
> +**	fadd	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +** ...
> +*/
> +void f1 (float *restrict a, float *restrict b, float *res, int n)
> +{
> +   for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] + b[i+0];
> +      res[i+1] = a[i+1] - b[i+1];
> +    }
> +}
> +
> +/* 
> +** d1:
> +** ...
> +** 	fneg	v[0-9]+.4s, v[0-9]+.4s
> +** 	fadd	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> +** ...
> +*/
> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
> +{
> +   for (int i = 0; i < (n & -8); i+=2)
> +    {
> +      res[i+0] = a[i+0] + b[i+0];
> +      res[i+1] = a[i+1] - b[i+1];
> +    }
> +}
> +
> +/* 
> +** e1:
> +** ...
> +** 	fadd	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +** 	fsub	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +** 	ins	v[0-9]+.d\[1\], v[0-9]+.d\[1\]
> +** ...
> +*/
> +void e1 (double *restrict a, double *restrict b, double *res, int n)
> +{
> +   for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] + b[i+0];
> +      res[i+1] = a[i+1] - b[i+1];
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a314a29b7a7a922
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> @@ -0,0 +1,52 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +/*
> +** f1:
> +** ...
> +** 	fneg	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> +** 	fadd	z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
> +** ...
> +*/
> +void f1 (float *restrict a, float *restrict b, float *res, int n)
> +{
> +   for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] + b[i+0];
> +      res[i+1] = a[i+1] - b[i+1];
> +    }
> +}
> +
> +/* 
> +** d1:
> +** ...
> +** 	fneg	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
> +** 	fadd	z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
> +** ...
> +*/ 
> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
> +{
> +   for (int i = 0; i < (n & -8); i+=2)
> +    {
> +      res[i+0] = a[i+0] + b[i+0];
> +      res[i+1] = a[i+1] - b[i+1];
> +    }
> +}
> +
> +/*
> +** e1:
> +** ...
> +** 	fsub	z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
> +** 	movprfx	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> +** 	fadd	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> +** ...
> +*/
> +void e1 (double *restrict a, double *restrict b, double *res, int n)
> +{
> +   for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] + b[i+0];
> +      res[i+1] = a[i+1] - b[i+1];
> +    }
> +}
> 
>
  
钟居哲 Sept. 23, 2022, 1:07 p.m. UTC | #11
So far I haven't seen the V2DF <-> V4SF case come up for RISC-V.



juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2022-09-23 20:54
To: Tamar Christina
CC: Richard Sandiford; Tamar Christina via Gcc-patches; nd; juzhe.zhong
Subject: RE: [PATCH]middle-end Add optimized float addsub without needing VEC_PERM_EXPR.
On Fri, 23 Sep 2022, Tamar Christina wrote:
 
> Hi,
> 
> Attached is the respun version of the patch,
> 
> > >>
> > >> Wouldn't a target need to re-check if lanes are NaN or denormal if
> > >> after a SFmode lane operation a DFmode lane operation follows?  IIRC
> > >> that is what usually makes punning "integer" vectors as FP vectors costly.
> 
> I don't believe this is a problem, due to NANs not being a single value and
> according to the standard the sign bit doesn't change the meaning of a NAN.
> 
> That's why specifically for negates generally no check is performed and it's
> Assumed that if a value is a NaN going in, it's a NaN coming out, and this
> Optimization doesn't change that.  Also under fast-math we don't guarantee
> a stable representation for NaN (or zeros, etc) afaik.
> 
> So if that is still a concern I could add && !HONORS_NAN () to the constraints.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> * match.pd: Add fneg/fadd rule.
> 
> gcc/testsuite/ChangeLog:
> 
> * gcc.target/aarch64/simd/addsub_1.c: New test.
> * gcc.target/aarch64/sve/addsub_1.c: New test.
> 
> --- inline version of patch ---
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 1bb936fc4010f98f24bb97671350e8432c55b347..2617d56091dfbd41ae49f980ee0af3757f5ec1cf 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -7916,6 +7916,59 @@ and,
>    (simplify (reduc (op @0 VECTOR_CST@1))
>      (op (reduc:type @0) (reduc:type @1))))
>  
> +/* Simplify vector floating point operations of alternating sub/add pairs
> +   into using an fneg of a wider element type followed by a normal add.
> +   under IEEE 754 the fneg of the wider type will negate every even entry
> +   and when doing an add we get a sub of the even and add of every odd
> +   elements.  */
> +(simplify
> + (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
> + (if (!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
 
shouldn't this be FLOAT_WORDS_BIG_ENDIAN instead?
 
I'm still concerned what
 
(neg:V2DF (subreg:V2DF (reg:V4SF) 0))
 
means for architectures like RISC-V.  Can one "reformat" FP values
in vector registers so that two floats overlap a double
(and then back)?
 
I suppose you rely on target_can_change_mode_class to tell you that.
 
 
> +  (with
> +   {
> +     /* Build a vector of integers from the tree mask.  */
> +     vec_perm_builder builder;
> +     if (!tree_to_vec_perm_builder (&builder, @2))
> +       return NULL_TREE;
> +
> +     /* Create a vec_perm_indices for the integer vector.  */
> +     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
> +     vec_perm_indices sel (builder, 2, nelts);
> +   }
> +   (if (sel.series_p (0, 2, 0, 2))
> +    (with
> +     {
> +       machine_mode vec_mode = TYPE_MODE (type);
> +       auto elem_mode = GET_MODE_INNER (vec_mode);
> +       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
> +       tree stype;
> +       switch (elem_mode)
> + {
> + case E_HFmode:
> +    stype = float_type_node;
> +    break;
> + case E_SFmode:
> +    stype = double_type_node;
> +    break;
> + default:
> +    return NULL_TREE;
> + }
 
Can't you use GET_MODE_WIDER_MODE and double-check the
mode-size doubles?  I mean you obviously miss DFmode -> TFmode.
 
> +       tree ntype = build_vector_type (stype, nunits);
> +       if (!ntype)
 
You want to check that the above results in a vector mode.
 
> + return NULL_TREE;
> +
> +       /* The format has to have a simple sign bit.  */
> +       const struct real_format *fmt = FLOAT_MODE_FORMAT (vec_mode);
> +       if (fmt == NULL)
> + return NULL_TREE;
> +     }
> +     (if (fmt->signbit_rw == GET_MODE_UNIT_BITSIZE (vec_mode) - 1
 
shouldn't this be a check on the component mode?  I think you'd
want to check that the bigger format signbit_rw is equal to
the smaller format mode size plus its signbit_rw or so?
 
> +   && fmt->signbit_rw == fmt->signbit_ro
> +   && targetm.can_change_mode_class (TYPE_MODE (ntype), TYPE_MODE (type), ALL_REGS)
> +   && (optimize_vectors_before_lowering_p ()
> +       || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
> +      (plus (view_convert:type (negate (view_convert:ntype @1))) @0)))))))
> +
>  (simplify
>   (vec_perm @0 @1 VECTOR_CST@2)
>   (with
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0dbbf1b47ad43310c4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
> +/* { dg-options "-Ofast" } */
> +/* { dg-add-options arm_v8_2a_fp16_neon } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +/* 
> +** f1:
> +** ...
> +** fneg v[0-9]+.2d, v[0-9]+.2d
> +** fadd v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +** ...
> +*/
> +void f1 (float *restrict a, float *restrict b, float *res, int n)
> +{
> +   for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] + b[i+0];
> +      res[i+1] = a[i+1] - b[i+1];
> +    }
> +}
> +
> +/* 
> +** d1:
> +** ...
> +** fneg v[0-9]+.4s, v[0-9]+.4s
> +** fadd v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> +** ...
> +*/
> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
> +{
> +   for (int i = 0; i < (n & -8); i+=2)
> +    {
> +      res[i+0] = a[i+0] + b[i+0];
> +      res[i+1] = a[i+1] - b[i+1];
> +    }
> +}
> +
> +/* 
> +** e1:
> +** ...
> +** fadd v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +** fsub v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +** ins v[0-9]+.d\[1\], v[0-9]+.d\[1\]
> +** ...
> +*/
> +void e1 (double *restrict a, double *restrict b, double *res, int n)
> +{
> +   for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] + b[i+0];
> +      res[i+1] = a[i+1] - b[i+1];
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a314a29b7a7a922
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> @@ -0,0 +1,52 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +/*
> +** f1:
> +** ...
> +** fneg z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> +** fadd z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
> +** ...
> +*/
> +void f1 (float *restrict a, float *restrict b, float *res, int n)
> +{
> +   for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] + b[i+0];
> +      res[i+1] = a[i+1] - b[i+1];
> +    }
> +}
> +
> +/* 
> +** d1:
> +** ...
> +** fneg z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
> +** fadd z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
> +** ...
> +*/ 
> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
> +{
> +   for (int i = 0; i < (n & -8); i+=2)
> +    {
> +      res[i+0] = a[i+0] + b[i+0];
> +      res[i+1] = a[i+1] - b[i+1];
> +    }
> +}
> +
> +/*
> +** e1:
> +** ...
> +** fsub z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
> +** movprfx z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> +** fadd z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> +** ...
> +*/
> +void e1 (double *restrict a, double *restrict b, double *res, int n)
> +{
> +   for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] + b[i+0];
> +      res[i+1] = a[i+1] - b[i+1];
> +    }
> +}
> 
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
  
Tamar Christina Sept. 23, 2022, 1:13 p.m. UTC | #12
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Friday, September 23, 2022 8:54 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
> Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> juzhe.zhong@rivai.ai
> Subject: RE: [PATCH]middle-end Add optimized float addsub without
> needing VEC_PERM_EXPR.
> 
> On Fri, 23 Sep 2022, Tamar Christina wrote:
> 
> > Hi,
> >
> > Attached is the respun version of the patch,
> >
> > > >>
> > > >> Wouldn't a target need to re-check if lanes are NaN or denormal
> > > >> if after a SFmode lane operation a DFmode lane operation follows?
> > > >> IIRC that is what usually makes punning "integer" vectors as FP vectors
> costly.
> >
> > I don't believe this is a problem, due to NANs not being a single
> > value and according to the standard the sign bit doesn't change the
> meaning of a NAN.
> >
> > That's why specifically for negates generally no check is performed
> > and it's Assumed that if a value is a NaN going in, it's a NaN coming
> > out, and this Optimization doesn't change that.  Also under fast-math
> > we don't guarantee a stable representation for NaN (or zeros, etc) afaik.
> >
> > So if that is still a concern I could add && !HONORS_NAN () to the
> constraints.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* match.pd: Add fneg/fadd rule.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/simd/addsub_1.c: New test.
> > 	* gcc.target/aarch64/sve/addsub_1.c: New test.
> >
> > --- inline version of patch ---
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd index
> > 1bb936fc4010f98f24bb97671350e8432c55b347..2617d56091dfbd41ae49f980ee0af3757f5ec1cf 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -7916,6 +7916,59 @@ and,
> >    (simplify (reduc (op @0 VECTOR_CST@1))
> >      (op (reduc:type @0) (reduc:type @1))))
> >
> > +/* Simplify vector floating point operations of alternating sub/add pairs
> > +   into using an fneg of a wider element type followed by a normal add.
> > +   under IEEE 754 the fneg of the wider type will negate every even entry
> > +   and when doing an add we get a sub of the even and add of every odd
> > +   elements.  */
> > +(simplify
> > + (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)  (if
> > +(!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
> 
> shouldn't this be FLOAT_WORDS_BIG_ENDIAN instead?
> 
> I'm still concerned what
> 
>  (neg:V2DF (subreg:V2DF (reg:V4SF) 0))
> 
> means for architectures like RISC-V.  Can one "reformat" FP values in vector
> registers so that two floats overlap a double (and then back)?
> 
> I suppose you rely on target_can_change_mode_class to tell you that.

Indeed, the documentation says:

"This hook returns true if it is possible to bitcast values held in registers of class rclass
from mode from to mode to and if doing so preserves the low-order bits that are
common to both modes. The result is only meaningful if rclass has registers that can
hold both from and to."

This implies to me that if the bitcast isn't possible, the hook should reject it.
Of course there are always cases where something is possible, but perhaps not
cheap to do.

The specific implementation for RISC-V seems to imply to me that they disallow any FP
conversions.  So that seems to be ok.

> 
> 
> > +  (with
> > +   {
> > +     /* Build a vector of integers from the tree mask.  */
> > +     vec_perm_builder builder;
> > +     if (!tree_to_vec_perm_builder (&builder, @2))
> > +       return NULL_TREE;
> > +
> > +     /* Create a vec_perm_indices for the integer vector.  */
> > +     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
> > +     vec_perm_indices sel (builder, 2, nelts);
> > +   }
> > +   (if (sel.series_p (0, 2, 0, 2))
> > +    (with
> > +     {
> > +       machine_mode vec_mode = TYPE_MODE (type);
> > +       auto elem_mode = GET_MODE_INNER (vec_mode);
> > +       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
> > +       tree stype;
> > +       switch (elem_mode)
> > +	 {
> > +	 case E_HFmode:
> > +	   stype = float_type_node;
> > +	   break;
> > +	 case E_SFmode:
> > +	   stype = double_type_node;
> > +	   break;
> > +	 default:
> > +	   return NULL_TREE;
> > +	 }
> 
> Can't you use GET_MODE_WIDER_MODE and double-check the mode-size
> doubles?  I mean you obviously miss DFmode -> TFmode.

The problem is I need the type, not the mode, but even build_pointer_type_for_mode
requires the new scalar type.  So I couldn't find anything to help here, given that there's
no inverse relationship between modes and types.

> 
> > +       tree ntype = build_vector_type (stype, nunits);
> > +       if (!ntype)
> 
> You want to check that the above results in a vector mode.

Does it? Technically you can cast a V2SF to either a V1DF or a DF, can't you?
Both seem equally valid here.

> > +	 return NULL_TREE;
> > +
> > +       /* The format has to have a simple sign bit.  */
> > +       const struct real_format *fmt = FLOAT_MODE_FORMAT (vec_mode);
> > +       if (fmt == NULL)
> > +	 return NULL_TREE;
> > +     }
> > +     (if (fmt->signbit_rw == GET_MODE_UNIT_BITSIZE (vec_mode) - 1
> 
> shouldn't this be a check on the component mode?  I think you'd want to
> check that the bigger format signbit_rw is equal to the smaller format mode
> size plus its signbit_rw or so?

Tbh, both are somewhat weak guarantees.  In a previous patch of mine I'd added a new field "is_ieee"
to the real formats to denote that they are an IEEE type.  Maybe I should revive that instead?

Regards,
Tamar

> 
> > +	  && fmt->signbit_rw == fmt->signbit_ro
> > +	  && targetm.can_change_mode_class (TYPE_MODE (ntype),
> TYPE_MODE (type), ALL_REGS)
> > +	  && (optimize_vectors_before_lowering_p ()
> > +	      || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
> > +      (plus (view_convert:type (negate (view_convert:ntype @1)))
> > +@0)))))))
> > +
> >  (simplify
> >   (vec_perm @0 @1 VECTOR_CST@2)
> >   (with
> > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0dbbf1b47ad43310c4
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > @@ -0,0 +1,56 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
> > +/* { dg-options "-Ofast" } */
> > +/* { dg-add-options arm_v8_2a_fp16_neon } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +#pragma GCC target "+nosve"
> > +
> > +/*
> > +** f1:
> > +** ...
> > +**	fneg	v[0-9]+.2d, v[0-9]+.2d
> > +**	fadd	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > +** ...
> > +*/
> > +void f1 (float *restrict a, float *restrict b, float *res, int n) {
> > +   for (int i = 0; i < (n & -4); i+=2)
> > +    {
> > +      res[i+0] = a[i+0] + b[i+0];
> > +      res[i+1] = a[i+1] - b[i+1];
> > +    }
> > +}
> > +
> > +/*
> > +** d1:
> > +** ...
> > +** 	fneg	v[0-9]+.4s, v[0-9]+.4s
> > +** 	fadd	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> > +** ...
> > +*/
> > +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res,
> > +int n) {
> > +   for (int i = 0; i < (n & -8); i+=2)
> > +    {
> > +      res[i+0] = a[i+0] + b[i+0];
> > +      res[i+1] = a[i+1] - b[i+1];
> > +    }
> > +}
> > +
> > +/*
> > +** e1:
> > +** ...
> > +** 	fadd	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> > +** 	fsub	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> > +** 	ins	v[0-9]+.d\[1\], v[0-9]+.d\[1\]
> > +** ...
> > +*/
> > +void e1 (double *restrict a, double *restrict b, double *res, int n)
> > +{
> > +   for (int i = 0; i < (n & -4); i+=2)
> > +    {
> > +      res[i+0] = a[i+0] + b[i+0];
> > +      res[i+1] = a[i+1] - b[i+1];
> > +    }
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a314a29b7a7a922
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > @@ -0,0 +1,52 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-Ofast" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +/*
> > +** f1:
> > +** ...
> > +** 	fneg	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> > +** 	fadd	z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
> > +** ...
> > +*/
> > +void f1 (float *restrict a, float *restrict b, float *res, int n) {
> > +   for (int i = 0; i < (n & -4); i+=2)
> > +    {
> > +      res[i+0] = a[i+0] + b[i+0];
> > +      res[i+1] = a[i+1] - b[i+1];
> > +    }
> > +}
> > +
> > +/*
> > +** d1:
> > +** ...
> > +** 	fneg	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
> > +** 	fadd	z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
> > +** ...
> > +*/
> > +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res,
> > +int n) {
> > +   for (int i = 0; i < (n & -8); i+=2)
> > +    {
> > +      res[i+0] = a[i+0] + b[i+0];
> > +      res[i+1] = a[i+1] - b[i+1];
> > +    }
> > +}
> > +
> > +/*
> > +** e1:
> > +** ...
> > +** 	fsub	z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
> > +** 	movprfx	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> > +** 	fadd	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> > +** ...
> > +*/
> > +void e1 (double *restrict a, double *restrict b, double *res, int n)
> > +{
> > +   for (int i = 0; i < (n & -4); i+=2)
> > +    {
> > +      res[i+0] = a[i+0] + b[i+0];
> > +      res[i+1] = a[i+1] - b[i+1];
> > +    }
> > +}
> >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
> Boudien Moerman; HRB 36809 (AG Nuernberg)
  
Tamar Christina Sept. 23, 2022, 1:54 p.m. UTC | #13
> -----Original Message-----
> From: Gcc-patches <gcc-patches-
> bounces+tamar.christina=arm.com@gcc.gnu.org> On Behalf Of Tamar
> Christina via Gcc-patches
> Sent: Friday, September 23, 2022 9:14 AM
> To: Richard Biener <rguenther@suse.de>
> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; nd <nd@arm.com>;
> Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>;
> juzhe.zhong@rivai.ai
> Subject: RE: [PATCH]middle-end Add optimized float addsub without
> needing VEC_PERM_EXPR.
> 
> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Friday, September 23, 2022 8:54 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
> > Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> > juzhe.zhong@rivai.ai
> > Subject: RE: [PATCH]middle-end Add optimized float addsub without
> > needing VEC_PERM_EXPR.
> >
> > On Fri, 23 Sep 2022, Tamar Christina wrote:
> >
> > > Hi,
> > >
> > > Attached is the respun version of the patch,
> > >
> > > > >>
> > > > >> Wouldn't a target need to re-check if lanes are NaN or denormal
> > > > >> if after a SFmode lane operation a DFmode lane operation follows?
> > > > >> IIRC that is what usually makes punning "integer" vectors as FP
> > > > >> vectors
> > costly.
> > >
> > > I don't believe this is a problem, due to NANs not being a single
> > > value and according to the standard the sign bit doesn't change the
> > meaning of a NAN.
> > >
> > > That's why specifically for negates generally no check is performed
> > > and it's Assumed that if a value is a NaN going in, it's a NaN
> > > coming out, and this Optimization doesn't change that.  Also under
> > > fast-math we don't guarantee a stable representation for NaN (or zeros,
> etc) afaik.
> > >
> > > So if that is still a concern I could add && !HONORS_NAN () to the
> > constraints.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* match.pd: Add fneg/fadd rule.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > 	* gcc.target/aarch64/simd/addsub_1.c: New test.
> > > 	* gcc.target/aarch64/sve/addsub_1.c: New test.
> > >
> > > --- inline version of patch ---
> > >
> > > 1bb936fc4010f98f24bb97671350e8432c55b347..2617d56091dfbd41ae49f980ee0af3757f5ec1cf 100644
> > > f3757f5ec1cf 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -7916,6 +7916,59 @@ and,
> > >    (simplify (reduc (op @0 VECTOR_CST@1))
> > >      (op (reduc:type @0) (reduc:type @1))))
> > >
> > > +/* Simplify vector floating point operations of alternating sub/add pairs
> > > +   into using an fneg of a wider element type followed by a normal add.
> > > +   under IEEE 754 the fneg of the wider type will negate every even entry
> > > +   and when doing an add we get a sub of the even and add of every odd
> > > +   elements.  */
> > > +(simplify
> > > + (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)  (if
> > > +(!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
> >
> > shouldn't this be FLOAT_WORDS_BIG_ENDIAN instead?
> >
> > I'm still concerned what
> >
> >  (neg:V2DF (subreg:V2DF (reg:V4SF) 0))
> >
> > means for architectures like RISC-V.  Can one "reformat" FP values in
> > vector registers so that two floats overlap a double (and then back)?
> >
> > I suppose you rely on target_can_change_mode_class to tell you that.
> 
> Indeed, the documentation says:
> 
> "This hook returns true if it is possible to bitcast values held in registers of
> class rclass from mode from to mode to and if doing so preserves the low-
> order bits that are common to both modes. The result is only meaningful if
> rclass has registers that can hold both from and to."
> 
> This implies to me that if the bitcast shouldn't be possible the hook should
> reject it.
> Of course you always where something is possible, but perhaps not cheap to
> do.
> 
> The specific implementation for RISC-V seem to imply to me that they
> disallow any FP conversions. So seems to be ok.
> 
> >
> >
> > > +  (with
> > > +   {
> > > +     /* Build a vector of integers from the tree mask.  */
> > > +     vec_perm_builder builder;
> > > +     if (!tree_to_vec_perm_builder (&builder, @2))
> > > +       return NULL_TREE;
> > > +
> > > +     /* Create a vec_perm_indices for the integer vector.  */
> > > +     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
> > > +     vec_perm_indices sel (builder, 2, nelts);
> > > +   }
> > > +   (if (sel.series_p (0, 2, 0, 2))
> > > +    (with
> > > +     {
> > > +       machine_mode vec_mode = TYPE_MODE (type);
> > > +       auto elem_mode = GET_MODE_INNER (vec_mode);
> > > +       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
> > > +       tree stype;
> > > +       switch (elem_mode)
> > > +	 {
> > > +	 case E_HFmode:
> > > +	   stype = float_type_node;
> > > +	   break;
> > > +	 case E_SFmode:
> > > +	   stype = double_type_node;
> > > +	   break;
> > > +	 default:
> > > +	   return NULL_TREE;
> > > +	 }
> >
> > Can't you use GET_MODE_WIDER_MODE and double-check the mode-size
> > doubles?  I mean you obviously miss DFmode -> TFmode.
> 
> The problem is I need the type, not the mode, but even
> build_pointer_type_for_mode requires the new scalar type.  So I couldn't
> find anything to help here, given that there's no inverse relationship
> between modes and types.
> 

I meant build_vector_type_for_mode here.

> >
> > > +       tree ntype = build_vector_type (stype, nunits);
> > > +       if (!ntype)
> >
> > You want to check that the above results in a vector mode.
> 
> Does it? Technically you can cast a V2SF to either a V1DF or a DF, can't
> you?  Both seem equally valid here.
> 
> > > +	 return NULL_TREE;
> > > +
> > > +       /* The format has to have a simple sign bit.  */
> > > +       const struct real_format *fmt = FLOAT_MODE_FORMAT
> (vec_mode);
> > > +       if (fmt == NULL)
> > > +	 return NULL_TREE;
> > > +     }
> > > +     (if (fmt->signbit_rw == GET_MODE_UNIT_BITSIZE (vec_mode) - 1
> >
> > shouldn't this be a check on the component mode?  I think you'd want
> > to check that the bigger format signbit_rw is equal to the smaller
> > format mode size plus its signbit_rw or so?
> 
> Tbh, both are somewhat weak guarantees.  In a previous patch of mine I'd
> added a new field "is_ieee"
> to the real formats to denote that they are an IEEE type.  Maybe I should
> revive that instead?
> 
> Regards,
> Tamar
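
(For reference, the position check Richard describes above is roughly what
the respin further down ends up using; fmt_old/fmt_new are the real_format
of the narrow and wide element modes, and signbit_ok is just an illustrative
name.)

  /* The wide sign bit has to sit exactly one narrow element above the
     narrow sign bit, i.e. on the sign bit of the upper narrow lane, and
     has to be the same bit for read and write.  */
  bool signbit_ok
    = (fmt_new->signbit_rw
         == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode)
       && fmt_new->signbit_rw == fmt_new->signbit_ro);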
> 
> >
> > > +	  && fmt->signbit_rw == fmt->signbit_ro
> > > +	  && targetm.can_change_mode_class (TYPE_MODE (ntype),
> > TYPE_MODE (type), ALL_REGS)
> > > +	  && (optimize_vectors_before_lowering_p ()
> > > +	      || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
> > > +      (plus (view_convert:type (negate (view_convert:ntype @1)))
> > > +@0)))))))
> > > +
> > >  (simplify
> > >   (vec_perm @0 @1 VECTOR_CST@2)
> > >   (with
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > > b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > > new file mode 100644
> > > index
> > >
> >
> 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0db
> > bf1
> > > b47ad43310c4
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > > @@ -0,0 +1,56 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
> > > +/* { dg-options "-Ofast" } */
> > > +/* { dg-add-options arm_v8_2a_fp16_neon } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target { le } }
> > > +} } */
> > > +
> > > +#pragma GCC target "+nosve"
> > > +
> > > +/*
> > > +** f1:
> > > +** ...
> > > +**	fneg	v[0-9]+.2d, v[0-9]+.2d
> > > +**	fadd	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > > +** ...
> > > +*/
> > > +void f1 (float *restrict a, float *restrict b, float *res, int n) {
> > > +   for (int i = 0; i < (n & -4); i+=2)
> > > +    {
> > > +      res[i+0] = a[i+0] + b[i+0];
> > > +      res[i+1] = a[i+1] - b[i+1];
> > > +    }
> > > +}
> > > +
> > > +/*
> > > +** d1:
> > > +** ...
> > > +** 	fneg	v[0-9]+.4s, v[0-9]+.4s
> > > +** 	fadd	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> > > +** ...
> > > +*/
> > > +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res,
> > > +int n) {
> > > +   for (int i = 0; i < (n & -8); i+=2)
> > > +    {
> > > +      res[i+0] = a[i+0] + b[i+0];
> > > +      res[i+1] = a[i+1] - b[i+1];
> > > +    }
> > > +}
> > > +
> > > +/*
> > > +** e1:
> > > +** ...
> > > +** 	fadd	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> > > +** 	fsub	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> > > +** 	ins	v[0-9]+.d\[1\], v[0-9]+.d\[1\]
> > > +** ...
> > > +*/
> > > +void e1 (double *restrict a, double *restrict b, double *res, int
> > > +n) {
> > > +   for (int i = 0; i < (n & -4); i+=2)
> > > +    {
> > > +      res[i+0] = a[i+0] + b[i+0];
> > > +      res[i+1] = a[i+1] - b[i+1];
> > > +    }
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > > b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > > new file mode 100644
> > > index
> > >
> >
> 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a
> > 31
> > > 4a29b7a7a922
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > > @@ -0,0 +1,52 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-Ofast" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target { le } }
> > > +} } */
> > > +
> > > +/*
> > > +** f1:
> > > +** ...
> > > +** 	fneg	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> > > +** 	fadd	z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
> > > +** ...
> > > +*/
> > > +void f1 (float *restrict a, float *restrict b, float *res, int n) {
> > > +   for (int i = 0; i < (n & -4); i+=2)
> > > +    {
> > > +      res[i+0] = a[i+0] + b[i+0];
> > > +      res[i+1] = a[i+1] - b[i+1];
> > > +    }
> > > +}
> > > +
> > > +/*
> > > +** d1:
> > > +** ...
> > > +** 	fneg	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
> > > +** 	fadd	z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
> > > +** ...
> > > +*/
> > > +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res,
> > > +int n) {
> > > +   for (int i = 0; i < (n & -8); i+=2)
> > > +    {
> > > +      res[i+0] = a[i+0] + b[i+0];
> > > +      res[i+1] = a[i+1] - b[i+1];
> > > +    }
> > > +}
> > > +
> > > +/*
> > > +** e1:
> > > +** ...
> > > +** 	fsub	z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
> > > +** 	movprfx	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> > > +** 	fadd	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> > > +** ...
> > > +*/
> > > +void e1 (double *restrict a, double *restrict b, double *res, int
> > > +n) {
> > > +   for (int i = 0; i < (n & -4); i+=2)
> > > +    {
> > > +      res[i+0] = a[i+0] + b[i+0];
> > > +      res[i+1] = a[i+1] - b[i+1];
> > > +    }
> > > +}
> > >
> > >
> >
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
> > Boudien Moerman; HRB 36809 (AG Nuernberg)
  
Richard Biener Sept. 26, 2022, 11:10 a.m. UTC | #14
On Fri, 23 Sep 2022, Tamar Christina wrote:

> > -----Original Message-----
> > From: Gcc-patches <gcc-patches-
> > bounces+tamar.christina=arm.com@gcc.gnu.org> On Behalf Of Tamar
> > Christina via Gcc-patches
> > Sent: Friday, September 23, 2022 9:14 AM
> > To: Richard Biener <rguenther@suse.de>
> > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; nd <nd@arm.com>;
> > Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>;
> > juzhe.zhong@rivai.ai
> > Subject: RE: [PATCH]middle-end Add optimized float addsub without
> > needing VEC_PERM_EXPR.
> > 
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Friday, September 23, 2022 8:54 AM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Tamar Christina via
> > > Gcc-patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>;
> > > juzhe.zhong@rivai.ai
> > > Subject: RE: [PATCH]middle-end Add optimized float addsub without
> > > needing VEC_PERM_EXPR.
> > >
> > > On Fri, 23 Sep 2022, Tamar Christina wrote:
> > >
> > > > Hi,
> > > >
> > > > Attached is the respun version of the patch,
> > > >
> > > > > >>
> > > > > >> Wouldn't a target need to re-check if lanes are NaN or denormal
> > > > > >> if after a SFmode lane operation a DFmode lane operation follows?
> > > > > >> IIRC that is what usually makes punning "integer" vectors as FP
> > > > > >> vectors
> > > costly.
> > > >
> > > > I don't believe this is a problem, due to NaNs not being a single
> > > > value, and according to the standard the sign bit doesn't change the
> > > > meaning of a NaN.
> > > >
> > > > That's why specifically for negates generally no check is performed
> > > > and it's assumed that if a value is a NaN going in, it's a NaN
> > > > coming out, and this optimization doesn't change that.  Also under
> > > > fast-math we don't guarantee a stable representation for NaN (or zeros,
> > > > etc.) afaik.
> > > >
> > > > So if that is still a concern I could add && !HONORS_NAN () to the
> > > constraints.
> > > >
> > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > >
> > > > Ok for master?
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > 	* match.pd: Add fneg/fadd rule.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > 	* gcc.target/aarch64/simd/addsub_1.c: New test.
> > > > 	* gcc.target/aarch64/sve/addsub_1.c: New test.
> > > >
> > > > --- inline version of patch ---
> > > >
> > > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > > >
> > >
> > 1bb936fc4010f98f24bb97671350e8432c55b347..2617d56091dfbd41ae49f980e
> > > e0a
> > > > f3757f5ec1cf 100644
> > > > --- a/gcc/match.pd
> > > > +++ b/gcc/match.pd
> > > > @@ -7916,6 +7916,59 @@ and,
> > > >    (simplify (reduc (op @0 VECTOR_CST@1))
> > > >      (op (reduc:type @0) (reduc:type @1))))
> > > >
> > > > +/* Simplify vector floating point operations of alternating sub/add pairs
> > > > +   into using an fneg of a wider element type followed by a normal add.
> > > > +   under IEEE 754 the fneg of the wider type will negate every even entry
> > > > +   and when doing an add we get a sub of the even and add of every odd
> > > > +   elements.  */
> > > > +(simplify
> > > > + (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)  (if
> > > > +(!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
> > >
> > > shouldn't this be FLOAT_WORDS_BIG_ENDIAN instead?
> > >
> > > I'm still concerned what
> > >
> > >  (neg:V2DF (subreg:V2DF (reg:V4SF) 0))
> > >
> > > means for architectures like RISC-V.  Can one "reformat" FP values in
> > > vector registers so that two floats overlap a double (and then back)?
> > >
> > > I suppose you rely on target_can_change_mode_class to tell you that.
> > 
> > Indeed, the documentation says:
> > 
> > "This hook returns true if it is possible to bitcast values held in registers of
> > class rclass from mode from to mode to and if doing so preserves the low-
> > order bits that are common to both modes. The result is only meaningful if
> > rclass has registers that can hold both from and to."
> > 
> > This implies to me that if the bitcast shouldn't be possible the hook should
> > reject it.
> > Of course you always know whether something is possible, but perhaps not
> > whether it's cheap to do.
> > 
> > The specific implementation for RISC-V seems to imply to me that they
> > disallow any FP conversions, so this seems to be ok.

Ok, I see.

> > >
> > >
> > > > +  (with
> > > > +   {
> > > > +     /* Build a vector of integers from the tree mask.  */
> > > > +     vec_perm_builder builder;
> > > > +     if (!tree_to_vec_perm_builder (&builder, @2))
> > > > +       return NULL_TREE;
> > > > +
> > > > +     /* Create a vec_perm_indices for the integer vector.  */
> > > > +     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
> > > > +     vec_perm_indices sel (builder, 2, nelts);
> > > > +   }
> > > > +   (if (sel.series_p (0, 2, 0, 2))
> > > > +    (with
> > > > +     {
> > > > +       machine_mode vec_mode = TYPE_MODE (type);
> > > > +       auto elem_mode = GET_MODE_INNER (vec_mode);
> > > > +       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
> > > > +       tree stype;
> > > > +       switch (elem_mode)
> > > > +	 {
> > > > +	 case E_HFmode:
> > > > +	   stype = float_type_node;
> > > > +	   break;
> > > > +	 case E_SFmode:
> > > > +	   stype = double_type_node;
> > > > +	   break;
> > > > +	 default:
> > > > +	   return NULL_TREE;
> > > > +	 }
> > >
> > > Can't you use GET_MODE_WIDER_MODE and double-check the mode-size
> > > doubles?  I mean you obviously miss DFmode -> TFmode.
> > 
> > The problem is I need the type, not the mode, but even
> > build_pointer_type_for_mode requires the new scalar type.  So I couldn't
> > find anything to help here, given that there's no inverse relationship
> > between modes and types.
> > 
> 
> I meant build_vector_type_for_mode here.

I think you should be able to use type_for_mode on the scalar wider
mode and build the vector type with the wider elements from that
though?
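
(Concretely, something along these lines, which is indeed what the respin
below does; wide_mode is assumed to be the already-validated wider vector
mode.)

  /* Recover a scalar type from the wider element mode, then build the
     wider vector type directly for wide_mode.  */
  tree stype = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode),
                                               TYPE_UNSIGNED (type));
  tree ntype = build_vector_type_for_mode (stype, wide_mode);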

> > >
> > > > +       tree ntype = build_vector_type (stype, nunits);
> > > > +       if (!ntype)
> > >
> > > You want to check that the above results in a vector mode.
> > 
> > Does it? Technically you can cast a V2SF to either a V1DF or a DF, can't
> > you?  Both seem equally valid here.

If the target doesn't support, say, V2DF but only V4SF, then you
could get BLKmode as a result, so you can also check != BLKmode.
You could eventually check
mode_for_vector (scalar_mode, nunits).exists ().  build_vector_type
will never return NULL_TREE.
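
(In code, the first suggestion looks like the sketch below, which is the form
the respin further down uses; the mode_for_vector ().exists () variant named
above would be an alternative way to do the same validation.)

  /* Bail out if the wider scalar type has no register mode, or if the
     wider vector type did not come out as a real vector type.  */
  if (TYPE_MODE (stype) == BLKmode)
    return NULL_TREE;
  tree ntype = build_vector_type_for_mode (stype, wide_mode);
  if (!VECTOR_TYPE_P (ntype))
    return NULL_TREE;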

> > > > +	 return NULL_TREE;
> > > > +
> > > > +       /* The format has to have a simple sign bit.  */
> > > > +       const struct real_format *fmt = FLOAT_MODE_FORMAT
> > (vec_mode);
> > > > +       if (fmt == NULL)
> > > > +	 return NULL_TREE;
> > > > +     }
> > > > +     (if (fmt->signbit_rw == GET_MODE_UNIT_BITSIZE (vec_mode) - 1
> > >
> > > shouldn't this be a check on the component mode?  I think you'd want
> > > to check that the bigger format signbit_rw is equal to the smaller
> > > format mode size plus its signbit_rw or so?
> > 
> > Tbh, both are somewhat weak guarantees.  In a previous patch of mine I'd
> > added a new field "is_ieee"
> > to the real formats to denote that they are an IEEE type.  Maybe I should
> > revive that instead?

Not sure, isn't the only important part whether the larger mode signbit
is in the same position as the correct smaller mode element signbit?
And yes, that (negate ..) will actually just flip that bit and not
do anything else.

Richard.
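
(A standalone little-endian IEEE 754 sanity check of that last point: the
wide negate does nothing but flip the sign bit of the upper narrow lane.)

#include <stdio.h>
#include <string.h>

int main (void)
{
  /* Two adjacent float lanes viewed as one double lane, as the
     V4SF -> V2DF view_convert does.  */
  float lane[2] = { 3.0f, 5.0f };
  double wide;
  memcpy (&wide, lane, sizeof wide);

  /* IEEE 754 negation flips only bit 63 of the double, which on a
     little-endian target is the sign bit of lane[1].  */
  wide = -wide;

  memcpy (lane, &wide, sizeof lane);
  printf ("%g %g\n", lane[0], lane[1]);  /* prints: 3 -5 */
  return 0;
}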

> > 
> > Regards,
> > Tamar
> > 
> > >
> > > > +	  && fmt->signbit_rw == fmt->signbit_ro
> > > > +	  && targetm.can_change_mode_class (TYPE_MODE (ntype),
> > > TYPE_MODE (type), ALL_REGS)
> > > > +	  && (optimize_vectors_before_lowering_p ()
> > > > +	      || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
> > > > +      (plus (view_convert:type (negate (view_convert:ntype @1)))
> > > > +@0)))))))
> > > > +
> > > >  (simplify
> > > >   (vec_perm @0 @1 VECTOR_CST@2)
> > > >   (with
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > > > b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> > 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0db
> > > bf1
> > > > b47ad43310c4
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
> > > > @@ -0,0 +1,56 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
> > > > +/* { dg-options "-Ofast" } */
> > > > +/* { dg-add-options arm_v8_2a_fp16_neon } */
> > > > +/* { dg-final { check-function-bodies "**" "" "" { target { le } }
> > > > +} } */
> > > > +
> > > > +#pragma GCC target "+nosve"
> > > > +
> > > > +/*
> > > > +** f1:
> > > > +** ...
> > > > +**	fneg	v[0-9]+.2d, v[0-9]+.2d
> > > > +**	fadd	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > > > +** ...
> > > > +*/
> > > > +void f1 (float *restrict a, float *restrict b, float *res, int n) {
> > > > +   for (int i = 0; i < (n & -4); i+=2)
> > > > +    {
> > > > +      res[i+0] = a[i+0] + b[i+0];
> > > > +      res[i+1] = a[i+1] - b[i+1];
> > > > +    }
> > > > +}
> > > > +
> > > > +/*
> > > > +** d1:
> > > > +** ...
> > > > +** 	fneg	v[0-9]+.4s, v[0-9]+.4s
> > > > +** 	fadd	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> > > > +** ...
> > > > +*/
> > > > +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res,
> > > > +int n) {
> > > > +   for (int i = 0; i < (n & -8); i+=2)
> > > > +    {
> > > > +      res[i+0] = a[i+0] + b[i+0];
> > > > +      res[i+1] = a[i+1] - b[i+1];
> > > > +    }
> > > > +}
> > > > +
> > > > +/*
> > > > +** e1:
> > > > +** ...
> > > > +** 	fadd	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> > > > +** 	fsub	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> > > > +** 	ins	v[0-9]+.d\[1\], v[0-9]+.d\[1\]
> > > > +** ...
> > > > +*/
> > > > +void e1 (double *restrict a, double *restrict b, double *res, int
> > > > +n) {
> > > > +   for (int i = 0; i < (n & -4); i+=2)
> > > > +    {
> > > > +      res[i+0] = a[i+0] + b[i+0];
> > > > +      res[i+1] = a[i+1] - b[i+1];
> > > > +    }
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > > > b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> > 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a
> > > 31
> > > > 4a29b7a7a922
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
> > > > @@ -0,0 +1,52 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-Ofast" } */
> > > > +/* { dg-final { check-function-bodies "**" "" "" { target { le } }
> > > > +} } */
> > > > +
> > > > +/*
> > > > +** f1:
> > > > +** ...
> > > > +** 	fneg	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> > > > +** 	fadd	z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
> > > > +** ...
> > > > +*/
> > > > +void f1 (float *restrict a, float *restrict b, float *res, int n) {
> > > > +   for (int i = 0; i < (n & -4); i+=2)
> > > > +    {
> > > > +      res[i+0] = a[i+0] + b[i+0];
> > > > +      res[i+1] = a[i+1] - b[i+1];
> > > > +    }
> > > > +}
> > > > +
> > > > +/*
> > > > +** d1:
> > > > +** ...
> > > > +** 	fneg	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
> > > > +** 	fadd	z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
> > > > +** ...
> > > > +*/
> > > > +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res,
> > > > +int n) {
> > > > +   for (int i = 0; i < (n & -8); i+=2)
> > > > +    {
> > > > +      res[i+0] = a[i+0] + b[i+0];
> > > > +      res[i+1] = a[i+1] - b[i+1];
> > > > +    }
> > > > +}
> > > > +
> > > > +/*
> > > > +** e1:
> > > > +** ...
> > > > +** 	fsub	z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
> > > > +** 	movprfx	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> > > > +** 	fadd	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> > > > +** ...
> > > > +*/
> > > > +void e1 (double *restrict a, double *restrict b, double *res, int
> > > > +n) {
> > > > +   for (int i = 0; i < (n & -4); i+=2)
> > > > +    {
> > > > +      res[i+0] = a[i+0] + b[i+0];
> > > > +      res[i+1] = a[i+1] - b[i+1];
> > > > +    }
> > > > +}
> > > >
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
> > > Boudien Moerman; HRB 36809 (AG Nuernberg)
>
  
Tamar Christina Oct. 31, 2022, 11:38 a.m. UTC | #15
Hi All,

This is a respin with all feedback addressed.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* match.pd: Add fneg/fadd rule.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/addsub_1.c: New test.
	* gcc.target/aarch64/sve/addsub_1.c: New test.

--- inline copy of patch ---

diff --git a/gcc/generic-match-head.cc b/gcc/generic-match-head.cc
index cb0fbd32fa6fbea7b3c96462bf54abe891396fd6..aed4dcc8c3d718dcd4cb2da05cf9709a65cdde70 100644
--- a/gcc/generic-match-head.cc
+++ b/gcc/generic-match-head.cc
@@ -39,6 +39,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "dbgcnt.h"
 #include "tm.h"
 #include "tree-eh.h"
+#include "langhooks.h"
 
 /* Routine to determine if the types T1 and T2 are effectively
    the same for GENERIC.  If T1 or T2 is not a type, the test
diff --git a/gcc/gimple-match-head.cc b/gcc/gimple-match-head.cc
index 4c80d77f8ba23b9ce8b67913c2238ff65b291906..9986e3479f903d26446c795113a14c9cc3d4359e 100644
--- a/gcc/gimple-match-head.cc
+++ b/gcc/gimple-match-head.cc
@@ -46,6 +46,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "dbgcnt.h"
 #include "tm.h"
 #include "gimple-range.h"
+#include "langhooks.h"
 
 /* Forward declarations of the private auto-generated matchers.
    They expect valueized operands in canonical order and do not
diff --git a/gcc/match.pd b/gcc/match.pd
index 1bb936fc4010f98f24bb97671350e8432c55b347..5e747352b413e6595ccbd20d781614d55abc372a 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7916,6 +7916,65 @@ and,
   (simplify (reduc (op @0 VECTOR_CST@1))
     (op (reduc:type @0) (reduc:type @1))))
 
+/* Simplify vector floating point operations of alternating sub/add pairs
+   into using an fneg of a wider element type followed by a normal add.
+   under IEEE 754 the fneg of the wider type will negate every even entry
+   and when doing an add we get a sub of the even and add of every odd
+   elements.  */
+(simplify
+ (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
+ (if (!VECTOR_INTEGER_TYPE_P (type)
+      && !FLOAT_WORDS_BIG_ENDIAN)
+  (with
+   {
+     /* Build a vector of integers from the tree mask.  */
+     vec_perm_builder builder;
+     if (!tree_to_vec_perm_builder (&builder, @2))
+       return NULL_TREE;
+
+     /* Create a vec_perm_indices for the integer vector.  */
+     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
+     vec_perm_indices sel (builder, 2, nelts);
+   }
+   (if (sel.series_p (0, 2, 0, 2))
+    (with
+     {
+       machine_mode vec_mode = TYPE_MODE (type);
+       machine_mode wide_mode;
+       if (!GET_MODE_WIDER_MODE (vec_mode).exists (&wide_mode)
+	   || !VECTOR_MODE_P (wide_mode)
+	   || (GET_MODE_UNIT_BITSIZE (vec_mode) * 2
+		!= GET_MODE_UNIT_BITSIZE (wide_mode)))
+	 return NULL_TREE;
+
+       tree stype = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode),
+						    TYPE_UNSIGNED (type));
+       if (TYPE_MODE (stype) == BLKmode)
+	 return NULL_TREE;
+       tree ntype = build_vector_type_for_mode (stype, wide_mode);
+       if (!VECTOR_TYPE_P (ntype))
+	 return NULL_TREE;
+
+       /* The format has to be a non-extended ieee format.  */
+       const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode);
+       const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode);
+       if (fmt_old == NULL || fmt_new == NULL)
+	 return NULL_TREE;
+
+       /* If the target doesn't support v1xx vectors, try using scalar mode xx
+	  instead.  */
+       if (known_eq (GET_MODE_NUNITS (wide_mode), 1)
+	   && !target_supports_op_p (ntype, NEGATE_EXPR, optab_vector))
+	 ntype = stype;
+     }
+     (if (fmt_new->signbit_rw
+	     == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode)
+	  && fmt_new->signbit_rw == fmt_new->signbit_ro
+	  && targetm.can_change_mode_class (TYPE_MODE (ntype), TYPE_MODE (type), ALL_REGS)
+	  && ((optimize_vectors_before_lowering_p () && VECTOR_TYPE_P (ntype))
+	      || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
+      (plus (view_convert:type (negate (view_convert:ntype @1))) @0)))))))
+
 (simplify
  (vec_perm @0 @1 VECTOR_CST@2)
  (with
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0dbbf1b47ad43310c4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
+/* { dg-options "-Ofast" } */
+/* { dg-add-options arm_v8_2a_fp16_neon } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#pragma GCC target "+nosve"
+
+/* 
+** f1:
+** ...
+**	fneg	v[0-9]+.2d, v[0-9]+.2d
+**	fadd	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/* 
+** d1:
+** ...
+** 	fneg	v[0-9]+.4s, v[0-9]+.4s
+** 	fadd	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/* 
+** e1:
+** ...
+** 	fadd	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** 	fsub	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** 	ins	v[0-9]+.d\[1\], v[0-9]+.d\[1\]
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a314a29b7a7a922
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+/*
+** f1:
+** ...
+** 	fneg	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+** 	fadd	z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/* 
+** d1:
+** ...
+** 	fneg	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
+** 	fadd	z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
+** ...
+*/ 
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/*
+** e1:
+** ...
+** 	fsub	z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
+** 	movprfx	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+** 	fadd	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
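
(For concreteness, a small GNU C sketch of the V4SF case of this transform;
little endian assumed, and the typedefs, function names and the
{ 0, 5, 2, 7 } blend mask are illustrative rather than taken from the patch.)

#include <stdio.h>

typedef float  v4sf __attribute__ ((vector_size (16)));
typedef double v2df __attribute__ ((vector_size (16)));
typedef int    v4si __attribute__ ((vector_size (16)));

/* The shape the vectorizer emits: a blend of a plus and a minus.  */
static v4sf addsub_perm (v4sf a, v4sf b)
{
  v4sf p = a + b, m = a - b;
  return __builtin_shuffle (p, m, (v4si) { 0, 5, 2, 7 });
}

/* The shape the new rule produces: negate b as V2DF, then one add.  */
static v4sf addsub_fneg (v4sf a, v4sf b)
{
  v2df nb = -(v2df) b;   /* flips only the sign bits of b[1] and b[3] */
  return a + (v4sf) nb;
}

int main (void)
{
  v4sf a = { 1, 2, 3, 4 }, b = { 10, 20, 30, 40 };
  v4sf x = addsub_perm (a, b), y = addsub_fneg (a, b);
  for (int i = 0; i < 4; i++)
    printf ("%g %g\n", x[i], y[i]);  /* 11 11, -18 -18, 33 33, -36 -36 */
  return 0;
}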
  
Jeff Law Oct. 31, 2022, 3:49 p.m. UTC | #16
On 10/31/22 05:38, Tamar Christina via Gcc-patches wrote:
> Hi All,
>
> This is a respin with all feedback addressed.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* match.pd: Add fneg/fadd rule.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/simd/addsub_1.c: New test.
> 	* gcc.target/aarch64/sve/addsub_1.c: New test.

It's a pretty neat optimization.  I'd been watching it progress. Glad to 
see you were able to address the existing feedback before stage1 closed.


OK for the trunk.


Jeff
  

Patch

--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7612,6 +7612,49 @@  and,
   (simplify (reduc (op @0 VECTOR_CST@1))
     (op (reduc:type @0) (reduc:type @1))))
 
+/* Simplify vector floating point operations of alternating sub/add pairs
+   into using an fneg of a wider element type followed by a normal add.
+   under IEEE 754 the fneg of the wider type will negate every even entry
+   and when doing an add we get a sub of the even and add of every odd
+   elements.  */
+(simplify
+ (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
+ (if (!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
+  (with
+   {
+     /* Build a vector of integers from the tree mask.  */
+     vec_perm_builder builder;
+     if (!tree_to_vec_perm_builder (&builder, @2))
+       return NULL_TREE;
+
+     /* Create a vec_perm_indices for the integer vector.  */
+     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
+     vec_perm_indices sel (builder, 2, nelts);
+   }
+   (if (sel.series_p (0, 2, 0, 2))
+    (with
+     {
+       machine_mode vec_mode = TYPE_MODE (type);
+       auto elem_mode = GET_MODE_INNER (vec_mode);
+       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
+       tree stype;
+       switch (elem_mode)
+	 {
+	 case E_HFmode:
+	   stype = float_type_node;
+	   break;
+	 case E_SFmode:
+	   stype = double_type_node;
+	   break;
+	 default:
+	   return NULL_TREE;
+	 }
+       tree ntype = build_vector_type (stype, nunits);
+       if (!ntype)
+	 return NULL_TREE;
+     }
+     (plus (view_convert:type (negate (view_convert:ntype @1))) @0))))))
+
 (simplify
  (vec_perm @0 @1 VECTOR_CST@2)
  (with
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0dbbf1b47ad43310c4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
@@ -0,0 +1,56 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
+/* { dg-options "-Ofast" } */
+/* { dg-add-options arm_v8_2a_fp16_neon } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#pragma GCC target "+nosve"
+
+/* 
+** f1:
+** ...
+**	fneg	v[0-9]+.2d, v[0-9]+.2d
+**	fadd	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/* 
+** d1:
+** ...
+** 	fneg	v[0-9]+.4s, v[0-9]+.4s
+** 	fadd	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/* 
+** e1:
+** ...
+** 	fadd	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** 	fsub	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** 	ins	v[0-9]+.d\[1\], v[0-9]+.d\[1\]
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a314a29b7a7a922
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
@@ -0,0 +1,52 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Ofast" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+/*
+** f1:
+** ...
+** 	fneg	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+** 	fadd	z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/* 
+** d1:
+** ...
+** 	fneg	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
+** 	fadd	z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
+** ...
+*/ 
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}
+
+/*
+** e1:
+** ...
+** 	fsub	z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
+** 	movprfx	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+** 	fadd	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] + b[i+0];
+      res[i+1] = a[i+1] - b[i+1];
+    }
+}