middle-end convert negate + right shift into compare greater.

Message ID: patch-14918-tamar@arm.com
State: Committed
Series: middle-end convert negate + right shift into compare greater.

Commit Message

Tamar Christina Oct. 5, 2021, 12:50 p.m. UTC
  Hi All,

This turns an inversion of the sign bit + arithmetic right shift into a
comparison with 0.

i.e.

void fun1(int32_t *x, int n)
{
    for (int i = 0; i < (n & -16); i++)
      x[i] = (-x[i]) >> 31;
}

now generates:

.L3:
        ldr     q0, [x0]
        cmgt    v0.4s, v0.4s, #0
        str     q0, [x0], 16
        cmp     x0, x1
        bne     .L3

instead of:

.L3:
        ldr     q0, [x0]
        neg     v0.4s, v0.4s
        sshr    v0.4s, v0.4s, 31
        str     q0, [x0], 16
        cmp     x0, x1
        bne     .L3
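
The scalar identity being exploited is worth spelling out: assuming the
usual implementation-defined arithmetic right shift that replicates the
sign bit, and setting aside x == INT_MIN (discussed below), (-x) >> 31 is
the all-ones value -1 exactly when x > 0 and is 0 otherwise, which is
precisely the mask the vector cmgt comparison produces.  A minimal
standalone check of that identity (not part of the patch):

#include <assert.h>
#include <limits.h>

int main (void)
{
  int samples[] = { INT_MIN + 1, -42, -1, 0, 1, 42, INT_MAX };
  for (unsigned i = 0; i < sizeof samples / sizeof samples[0]; i++)
    {
      int x = samples[i];
      /* All ones (-1) when x > 0, else 0.  */
      assert (((-x) >> 31) == -(x > 0));
    }
  return 0;
}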

Bootstrapped Regtested on aarch64-none-linux-gnu,
x86_64-pc-linux-gnu and no regressions.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* match.pd: New negate+shift pattern.

gcc/testsuite/ChangeLog:

	* gcc.dg/signbit-2.c: New test.
	* gcc.dg/signbit-3.c: New test.
	* gcc.target/aarch64/signbit-1.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/match.pd b/gcc/match.pd
index 7d2a24dbc5e9644a09968f877e12a824d8ba1caa..581436fe36dbacdcb0c2720b7190c96d14398143 100644


--
  

Comments

Richard Earnshaw Oct. 5, 2021, 12:56 p.m. UTC | #1
On 05/10/2021 13:50, Tamar Christina via Gcc-patches wrote:
> Hi All,
> 
> This turns an inversion of the sign bit + arithmetic right shift into a
> comparison with 0.
> 
> i.e.
> 
> void fun1(int32_t *x, int n)
> {
>      for (int i = 0; i < (n & -16); i++)
>        x[i] = (-x[i]) >> 31;
> }
> 
Notwithstanding that I think shifting a negative value right is 
unspecified behaviour, I don't think this generates the same result when 
x[i] is INT_MIN either, although negating that is also unspecified since 
it can't be represented in an int.

R.
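
For background on that point (a standalone sketch, not part of the patch):
C leaves right-shifting a negative value to the implementation (C11 6.5.7),
as Richard clarifies further down the thread.  On sign-replicating
(arithmetic shift) targets such as AArch64 and x86 the snippet below
prints -1; a purely logical-shift implementation would print 1.

#include <stdio.h>

int main (void)
{
  int x = -1;
  printf ("%d >> 31 = %d\n", x, x >> 31);
  return 0;
}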

> now generates:
> 
> .L3:
>          ldr     q0, [x0]
>          cmgt    v0.4s, v0.4s, #0
>          str     q0, [x0], 16
>          cmp     x0, x1
>          bne     .L3
> 
> instead of:
> 
> .L3:
>          ldr     q0, [x0]
>          neg     v0.4s, v0.4s
>          sshr    v0.4s, v0.4s, 31
>          str     q0, [x0], 16
>          cmp     x0, x1
>          bne     .L3
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> x86_64-pc-linux-gnu and no regressions.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* match.pd: New negate+shift pattern.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/signbit-2.c: New test.
> 	* gcc.dg/signbit-3.c: New test.
> 	* gcc.target/aarch64/signbit-1.c: New test.
> 
> --- inline copy of patch --
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 7d2a24dbc5e9644a09968f877e12a824d8ba1caa..581436fe36dbacdcb0c2720b7190c96d14398143 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -826,6 +826,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>       { tree utype = unsigned_type_for (type); }
>       (convert (rshift (lshift (convert:utype @0) @2) @3))))))
>   
> +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
> +(for cst (INTEGER_CST VECTOR_CST)
> + (simplify
> +  (rshift (negate:s @0) cst@1)
> +   (with { tree ctype = TREE_TYPE (@0);
> +	   tree stype = TREE_TYPE (@1);
> +	   tree bt = truth_type_for (ctype); }
> +    (switch
> +     /* Handle scalar case.  */
> +     (if (INTEGRAL_TYPE_P (ctype)
> +	  && !VECTOR_TYPE_P (ctype)
> +	  && !TYPE_UNSIGNED (ctype)
> +	  && canonicalize_math_after_vectorization_p ()
> +	  && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
> +      (convert:bt (gt:bt @0 { build_zero_cst (stype); })))
> +     /* Handle vector case with a scalar immediate.  */
> +     (if (VECTOR_INTEGER_TYPE_P (ctype)
> +	  && !VECTOR_TYPE_P (stype)
> +	  && !TYPE_UNSIGNED (ctype)
> +          && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
> +      (convert:bt (gt:bt @0 { build_zero_cst (ctype); })))
> +     /* Handle vector case with a vector immediate.   */
> +     (if (VECTOR_INTEGER_TYPE_P (ctype)
> +	  && VECTOR_TYPE_P (stype)
> +	  && !TYPE_UNSIGNED (ctype)
> +	  && uniform_vector_p (@1))
> +      (with { tree cst = vector_cst_elt (@1, 0);
> +	      tree t = TREE_TYPE (cst); }
> +       (if (wi::eq_p (wi::to_wide (cst), TYPE_PRECISION (t) - 1))
> +        (convert:bt (gt:bt @0 { build_zero_cst (ctype); })))))))))
> +
>   /* Fold (C1/X)*C2 into (C1*C2)/X.  */
>   (simplify
>    (mult (rdiv@3 REAL_CST@0 @1) REAL_CST@2)
> diff --git a/gcc/testsuite/gcc.dg/signbit-2.c b/gcc/testsuite/gcc.dg/signbit-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..fc0157cbc5c7996b481f2998bc30176c96a669bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/signbit-2.c
> @@ -0,0 +1,19 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O3 --save-temps -fdump-tree-optimized" } */
> +
> +#include <stdint.h>
> +
> +void fun1(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 31;
> +}
> +
> +void fun2(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 30;
> +}
> +
> +/* { dg-final { scan-tree-dump-times {\s+>\s+\{ 0, 0, 0, 0 \}} 1 optimized } } */
> +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
> diff --git a/gcc/testsuite/gcc.dg/signbit-3.c b/gcc/testsuite/gcc.dg/signbit-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..19e9c06c349b3287610f817628f00938ece60bf7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/signbit-3.c
> @@ -0,0 +1,13 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O1 --save-temps -fdump-tree-optimized" } */
> +
> +#include <stdint.h>
> +
> +void fun1(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 31;
> +}
> +
> +/* { dg-final { scan-tree-dump-times {\s+>\s+0;} 1 optimized } } */
> +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/signbit-1.c b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..3ebfb0586f37de29cf58635b27fe48503714447e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
> @@ -0,0 +1,18 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O3 --save-temps" } */
> +
> +#include <stdint.h>
> +
> +void fun1(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 31;
> +}
> +
> +void fun2(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 30;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tcmgt\t} 1 } } */
> 
>
  
Tamar Christina Oct. 5, 2021, 1:30 p.m. UTC | #2
> -----Original Message-----
> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
> Sent: Tuesday, October 5, 2021 1:56 PM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; rguenther@suse.de
> Subject: Re: [PATCH]middle-end convert negate + right shift into compare
> greater.
> 
> 
> 
> On 05/10/2021 13:50, Tamar Christina via Gcc-patches wrote:
> > Hi All,
> >
> > This turns an inversion of the sign bit + arithmetic right shift into
> > a comparison with 0.
> >
> > i.e.
> >
> > void fun1(int32_t *x, int n)
> > {
> >      for (int i = 0; i < (n & -16); i++)
> >        x[i] = (-x[i]) >> 31;
> > }
> >
> Notwithstanding that I think shifting a negative value right is unspecified
> behaviour, I don't think this generates the same result when x[i] is INT_MIN
> either, although negating that is also unspecified since it can't be
> represented in an int.
> 

You're right that they are implementation defined, but I think most ISAs do have a sane
implementation of these two cases. At least both x86 and AArch64 just replicate the sign bit,
and for negate do two's complement negation. So INT_MIN works as expected and results in 0.

But I'm happy to guard this behind some sort of target guard.

Regards,
Tamar

  
Richard Earnshaw Oct. 5, 2021, 1:34 p.m. UTC | #3
On 05/10/2021 14:30, Tamar Christina wrote:
> 
> 
>> -----Original Message-----
>> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
>> Sent: Tuesday, October 5, 2021 1:56 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
>> Cc: nd <nd@arm.com>; rguenther@suse.de
>> Subject: Re: [PATCH]middle-end convert negate + right shift into compare
>> greater.
>>
>>
>>
>> On 05/10/2021 13:50, Tamar Christina via Gcc-patches wrote:
>>> Hi All,
>>>
>>> This turns an inversion of the sign bit + arithmetic right shift into
>>> a comparison with 0.
>>>
>>> i.e.
>>>
>>> void fun1(int32_t *x, int n)
>>> {
>>>       for (int i = 0; i < (n & -16); i++)
>>>         x[i] = (-x[i]) >> 31;
>>> }
>>>
>> Notwithstanding that I think shifting a negative value right is unspecified
>> behaviour, I don't think this generates the same result when x[i] is INT_MIN
>> either, although negating that is also unspecified since it can't be
>> represented in an int.
>>
> 
> You're right that they are implementation defined, but I think most ISAs do have a sane
> Implementation of these two cases. At least both x86 and AArch64 just replicate the signbit
> and for negate do two complement negation. So INT_MIN works as expected and results in 0.

Which is not what the original code produces if you have wrapping ints, 
because -INT_MIN is INT_MIN, and thus still negative.

R.
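
To make that concrete (a standalone sketch, not part of the patch): with
wrapping negation, emulated below through unsigned arithmetic and assuming
the usual two's complement conversion back to int plus a sign-replicating
right shift, x == INT_MIN makes the two forms disagree.

#include <limits.h>
#include <stdio.h>

int main (void)
{
  int x = INT_MIN;
  int neg = (int) (0u - (unsigned) x);  /* wrapping -x stays INT_MIN */
  printf ("(-x) >> 31 : %d\n", neg >> 31);  /* prints -1 */
  printf ("x > 0      : %d\n", x > 0);      /* prints 0  */
  return 0;
}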

  
Tamar Christina Oct. 5, 2021, 1:49 p.m. UTC | #4
> -----Original Message-----
> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
> Sent: Tuesday, October 5, 2021 2:34 PM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; rguenther@suse.de
> Subject: Re: [PATCH]middle-end convert negate + right shift into compare
> greater.
> 
> 
> 
> On 05/10/2021 14:30, Tamar Christina wrote:
> >
> >
> >> -----Original Message-----
> >> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
> >> Sent: Tuesday, October 5, 2021 1:56 PM
> >> To: Tamar Christina <Tamar.Christina@arm.com>;
> >> gcc-patches@gcc.gnu.org
> >> Cc: nd <nd@arm.com>; rguenther@suse.de
> >> Subject: Re: [PATCH]middle-end convert negate + right shift into
> >> compare greater.
> >>
> >>
> >>
> >> On 05/10/2021 13:50, Tamar Christina via Gcc-patches wrote:
> >>> Hi All,
> >>>
> >>> This turns an inversion of the sign bit + arithmetic right shift
> >>> into a comparison with 0.
> >>>
> >>> i.e.
> >>>
> >>> void fun1(int32_t *x, int n)
> >>> {
> >>>       for (int i = 0; i < (n & -16); i++)
> >>>         x[i] = (-x[i]) >> 31;
> >>> }
> >>>
> >> Notwithstanding that I think shifting a negative value right is
> >> unspecified behaviour, I don't think this generates the same result
> >> when x[i] is INT_MIN either, although negating that is also
> >> unspecified since it can't be represented in an int.
> >>
> >
> > You're right that they are implementation defined, but I think most
> > ISAs do have a sane implementation of these two cases. At least both
> > x86 and AArch64 just replicate the sign bit, and for negate do two's
> > complement negation. So INT_MIN works as expected and results in 0.
> 
> Which is not what the original code produces if you have wrapping ints,
> because -INT_MIN is INT_MIN, and thus still negative.
> 

True, but then you have a signed overflow, which is undefined behaviour and not implementation-defined:

" If an exceptional condition occurs during the evaluation of an expression (that is, if the result is not mathematically defined or not in the range of representable values for its type), the behavior is undefined."

So it should still be acceptable to do in this case.

  
Richard Earnshaw Oct. 5, 2021, 1:51 p.m. UTC | #5
On 05/10/2021 14:49, Tamar Christina wrote:
>> -----Original Message-----
>> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
>> Sent: Tuesday, October 5, 2021 2:34 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
>> Cc: nd <nd@arm.com>; rguenther@suse.de
>> Subject: Re: [PATCH]middle-end convert negate + right shift into compare
>> greater.
>>
>>
>>
>> On 05/10/2021 14:30, Tamar Christina wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
>>>> Sent: Tuesday, October 5, 2021 1:56 PM
>>>> To: Tamar Christina <Tamar.Christina@arm.com>;
>>>> gcc-patches@gcc.gnu.org
>>>> Cc: nd <nd@arm.com>; rguenther@suse.de
>>>> Subject: Re: [PATCH]middle-end convert negate + right shift into
>>>> compare greater.
>>>>
>>>>
>>>>
>>>> On 05/10/2021 13:50, Tamar Christina via Gcc-patches wrote:
>>>>> Hi All,
>>>>>
>>>>> This turns an inversion of the sign bit + arithmetic right shift
>>>>> into a comparison with 0.
>>>>>
>>>>> i.e.
>>>>>
>>>>> void fun1(int32_t *x, int n)
>>>>> {
>>>>>        for (int i = 0; i < (n & -16); i++)
>>>>>          x[i] = (-x[i]) >> 31;
>>>>> }
>>>>>
>>>> Notwithstanding that I think shifting a negative value right is
>>>> unspecified behaviour, I don't think this generates the same result
>>>> when x[i] is INT_MIN either, although negating that is also
>>>> unspecified since it can't be represented in an int.
>>>>
>>>
>>> You're right that they are implementation defined, but I think most
>>> ISAs do have a sane implementation of these two cases. At least both
>>> x86 and AArch64 just replicate the sign bit, and for negate do two's
>>> complement negation. So INT_MIN works as expected and results in 0.
>>
>> Which is not what the original code produces if you have wrapping ints,
>> because -INT_MIN is INT_MIN, and thus still negative.
>>
> 
> True, but then you have a signed overflow, which is undefined behaviour and not implementation-defined:
> 
> " If an exceptional condition occurs during the evaluation of an expression (that is, if the result is not mathematically defined or not in the range of representable values for its type), the behavior is undefined."
> 
> So it should still be acceptable to do in this case.

-fwrapv

R.

  
Tamar Christina Oct. 5, 2021, 1:56 p.m. UTC | #6
> -----Original Message-----
> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
> Sent: Tuesday, October 5, 2021 2:52 PM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; rguenther@suse.de
> Subject: Re: [PATCH]middle-end convert negate + right shift into compare
> greater.
> 
> 
> 
> On 05/10/2021 14:49, Tamar Christina wrote:
> >> -----Original Message-----
> >> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
> >> Sent: Tuesday, October 5, 2021 2:34 PM
> >> To: Tamar Christina <Tamar.Christina@arm.com>;
> >> gcc-patches@gcc.gnu.org
> >> Cc: nd <nd@arm.com>; rguenther@suse.de
> >> Subject: Re: [PATCH]middle-end convert negate + right shift into
> >> compare greater.
> >>
> >>
> >>
> >> On 05/10/2021 14:30, Tamar Christina wrote:
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
> >>>> Sent: Tuesday, October 5, 2021 1:56 PM
> >>>> To: Tamar Christina <Tamar.Christina@arm.com>;
> >>>> gcc-patches@gcc.gnu.org
> >>>> Cc: nd <nd@arm.com>; rguenther@suse.de
> >>>> Subject: Re: [PATCH]middle-end convert negate + right shift into
> >>>> compare greater.
> >>>>
> >>>>
> >>>>
> >>>> On 05/10/2021 13:50, Tamar Christina via Gcc-patches wrote:
> >>>>> Hi All,
> >>>>>
> >>>>> This turns an inversion of the sign bit + arithmetic right shift
> >>>>> into a comparison with 0.
> >>>>>
> >>>>> i.e.
> >>>>>
> >>>>> void fun1(int32_t *x, int n)
> >>>>> {
> >>>>>        for (int i = 0; i < (n & -16); i++)
> >>>>>          x[i] = (-x[i]) >> 31;
> >>>>> }
> >>>>>
> >>>> Notwithstanding that I think shifting a negative value right is
> >>>> unspecified behaviour, I don't think this generates the same result
> >>>> when x[i] is INT_MIN either, although negating that is also
> >>>> unspecified since it can't be represented in an int.
> >>>>
> >>>
> >>> You're right that they are implementation defined, but I think most
> >>> ISAs do have a sane implementation of these two cases. At least both
> >>> x86 and AArch64 just replicate the sign bit, and for negate do two's
> >>> complement negation. So INT_MIN works as expected and results in 0.
> >>
> >> Which is not what the original code produces if you have wrapping
> >> ints, because -INT_MIN is INT_MIN, and thus still negative.
> >>
> >
> > True, but then you have a signed overflow, which is undefined behaviour
> > and not implementation-defined:
> >
> > " If an exceptional condition occurs during the evaluation of an expression
> (that is, if the result is not mathematically defined or not in the range of
> representable values for its type), the behavior is undefined."
> >
> > So it should still be acceptable to do in this case.
> 
> -fwrapv

If I understand correctly, you're happy with this if I guard it on !flag_wrapv?

Regards,
Tamar
  
Richard Earnshaw Oct. 7, 2021, 12:46 p.m. UTC | #7
On 05/10/2021 14:56, Tamar Christina via Gcc-patches wrote:
> 
> 
>> -----Original Message-----
>> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
>> Sent: Tuesday, October 5, 2021 2:52 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
>> Cc: nd <nd@arm.com>; rguenther@suse.de
>> Subject: Re: [PATCH]middle-end convert negate + right shift into compare
>> greater.
>>
>>
>>
>> On 05/10/2021 14:49, Tamar Christina wrote:
>>>> -----Original Message-----
>>>> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
>>>> Sent: Tuesday, October 5, 2021 2:34 PM
>>>> To: Tamar Christina <Tamar.Christina@arm.com>;
>>>> gcc-patches@gcc.gnu.org
>>>> Cc: nd <nd@arm.com>; rguenther@suse.de
>>>> Subject: Re: [PATCH]middle-end convert negate + right shift into
>>>> compare greater.
>>>>
>>>>
>>>>
>>>> On 05/10/2021 14:30, Tamar Christina wrote:
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
>>>>>> Sent: Tuesday, October 5, 2021 1:56 PM
>>>>>> To: Tamar Christina <Tamar.Christina@arm.com>;
>>>>>> gcc-patches@gcc.gnu.org
>>>>>> Cc: nd <nd@arm.com>; rguenther@suse.de
>>>>>> Subject: Re: [PATCH]middle-end convert negate + right shift into
>>>>>> compare greater.
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 05/10/2021 13:50, Tamar Christina via Gcc-patches wrote:
>>>>>>> Hi All,
>>>>>>>
>>>>>>> This turns an inversion of the sign bit + arithmetic right shift
>>>>>>> into a comparison with 0.
>>>>>>>
>>>>>>> i.e.
>>>>>>>
>>>>>>> void fun1(int32_t *x, int n)
>>>>>>> {
>>>>>>>         for (int i = 0; i < (n & -16); i++)
>>>>>>>           x[i] = (-x[i]) >> 31;
>>>>>>> }
>>>>>>>
>>>>>> Notwithstanding that I think shifting a negative value right is
>>>>>> unspecified behaviour, I don't think this generates the same result
>>>>>> when x[i] is INT_MIN either, although negating that is also
>>>>>> unspecified since it can't be represented in an int.
>>>>>>
>>>>>
>>>>> You're right that they are implementation defined, but I think most
>>>>> ISAs do have a sane implementation of these two cases. At least both
>>>>> x86 and AArch64 just replicate the sign bit, and for negate do two's
>>>>> complement negation. So INT_MIN works as expected and results in 0.
>>>>
>>>> Which is not what the original code produces if you have wrapping
>>>> ints, because -INT_MIN is INT_MIN, and thus still negative.
>>>>
>>>
>>> True, but then you have a signed overflow, which is undefined behaviour
>>> and not implementation-defined:
>>>
>>> " If an exceptional condition occurs during the evaluation of an expression
>> (that is, if the result is not mathematically defined or not in the range of
>> representable values for its type), the behavior is undefined."
>>>
>>> So it should still be acceptable to do in this case.
>>
>> -fwrapv
> 
> If I understand correctly, you're happy with this if I guard it on !flag_wrapv?

I did some more digging.  Right shift of a negative value is IMP_DEF 
(not UNDEF - this keeps catching me out).  So yes, wrapping this with 
!wrapv would address my concern.

I've not reviewed the patch itself, though.  I've never even written a 
patch for match.pd, so don't feel qualified to do that.

R.

  
Tamar Christina Oct. 11, 2021, 11:36 a.m. UTC | #8
Hi all,

Here's a new version of the patch.

> >>> " If an exceptional condition occurs during the evaluation of an
> >>> expression
> >> (that is, if the result is not mathematically defined or not in the
> >> range of representable values for its type), the behavior is undefined."
> >>>
> >>> So it should still be acceptable to do in this case.
> >>
> >> -fwrapv
> >
> > If I understand correctly, you're happy with this if I guard it on !flag_wrapv?
> 
> I did some more digging.  Right shift of a negative value is IMP_DEF (not
> UNDEF - this keeps catching me out).  So yes, wrapping this with !wrapv
> would address my concern.
> 
> I've not reviewed the patch itself, though.  I've never even written a patch
> for match.pd, so don't feel qualified to do that.

No problem, thanks for catching this! I'm sure one of the Richards will review it when
they have a chance.

Bootstrapped Regtested on aarch64-none-linux-gnu,
x86_64-pc-linux-gnu and no regressions.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* match.pd: New negate+shift pattern.

gcc/testsuite/ChangeLog:

	* gcc.dg/signbit-2.c: New test.
	* gcc.dg/signbit-3.c: New test.
	* gcc.target/aarch64/signbit-1.c: New test.

--- inline copy of patch ---

diff --git a/gcc/match.pd b/gcc/match.pd
index 7d2a24dbc5e9644a09968f877e12a824d8ba1caa..3d48eda826f889483a83267409c3f278ee907b57 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -826,6 +826,38 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
     { tree utype = unsigned_type_for (type); }
     (convert (rshift (lshift (convert:utype @0) @2) @3))))))
 
+/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
+(for cst (INTEGER_CST VECTOR_CST)
+ (simplify
+  (rshift (negate:s @0) cst@1)
+   (if (!flag_wrapv)
+    (with { tree ctype = TREE_TYPE (@0);
+	    tree stype = TREE_TYPE (@1);
+	    tree bt = truth_type_for (ctype); }
+     (switch
+      /* Handle scalar case.  */
+      (if (INTEGRAL_TYPE_P (ctype)
+	   && !VECTOR_TYPE_P (ctype)
+	   && !TYPE_UNSIGNED (ctype)
+	   && canonicalize_math_after_vectorization_p ()
+	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
+       (convert:bt (gt:bt @0 { build_zero_cst (stype); })))
+      /* Handle vector case with a scalar immediate.  */
+      (if (VECTOR_INTEGER_TYPE_P (ctype)
+	   && !VECTOR_TYPE_P (stype)
+	   && !TYPE_UNSIGNED (ctype)
+           && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
+       (convert:bt (gt:bt @0 { build_zero_cst (ctype); })))
+      /* Handle vector case with a vector immediate.   */
+      (if (VECTOR_INTEGER_TYPE_P (ctype)
+	   && VECTOR_TYPE_P (stype)
+	   && !TYPE_UNSIGNED (ctype)
+	   && uniform_vector_p (@1))
+       (with { tree cst = vector_cst_elt (@1, 0);
+	       tree t = TREE_TYPE (cst); }
+        (if (wi::eq_p (wi::to_wide (cst), TYPE_PRECISION (t) - 1))
+         (convert:bt (gt:bt @0 { build_zero_cst (ctype); }))))))))))
+
 /* Fold (C1/X)*C2 into (C1*C2)/X.  */
 (simplify
  (mult (rdiv@3 REAL_CST@0 @1) REAL_CST@2)
diff --git a/gcc/testsuite/gcc.dg/signbit-2.c b/gcc/testsuite/gcc.dg/signbit-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..fc0157cbc5c7996b481f2998bc30176c96a669bb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/signbit-2.c
@@ -0,0 +1,19 @@
+/* { dg-do assemble } */
+/* { dg-options "-O3 --save-temps -fdump-tree-optimized" } */
+
+#include <stdint.h>
+
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 30;
+}
+
+/* { dg-final { scan-tree-dump-times {\s+>\s+\{ 0, 0, 0, 0 \}} 1 optimized } } */
+/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
diff --git a/gcc/testsuite/gcc.dg/signbit-3.c b/gcc/testsuite/gcc.dg/signbit-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..19e9c06c349b3287610f817628f00938ece60bf7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/signbit-3.c
@@ -0,0 +1,13 @@
+/* { dg-do assemble } */
+/* { dg-options "-O1 --save-temps -fdump-tree-optimized" } */
+
+#include <stdint.h>
+
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+/* { dg-final { scan-tree-dump-times {\s+>\s+0;} 1 optimized } } */
+/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/signbit-1.c b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..3ebfb0586f37de29cf58635b27fe48503714447e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
@@ -0,0 +1,18 @@
+/* { dg-do assemble } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <stdint.h>
+
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 30;
+}
+
+/* { dg-final { scan-assembler-times {\tcmgt\t} 1 } } */
  
Richard Biener Oct. 13, 2021, 12:11 p.m. UTC | #9
On Mon, 11 Oct 2021, Tamar Christina wrote:

> Hi all,
> 
> Here's a new version of the patch.
> 
> > >>> " If an exceptional condition occurs during the evaluation of an
> > >>> expression
> > >> (that is, if the result is not mathematically defined or not in the
> > >> range of representable values for its type), the behavior is undefined."
> > >>>
> > >>> So it should still be acceptable to do in this case.
> > >>
> > >> -fwrapv
> > >
> > > If I understand correctly, you're happy with this if I guard it on !flag_wrapv?
> > 
> > I did some more digging.  Right shift of a negative value is IMP_DEF (not
> > UNDEF - this keeps catching me out).  So yes, wrapping this with !wrapv
> > would address my concern.
> > 
> > I've not reviewed the patch itself, though.  I've never even written a patch
> > for match.pd, so don't feel qualified to do that.
> 
> No problem, thanks for catching this! I'm sure one of the Richards will review it when
> they have a chance.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> x86_64-pc-linux-gnu and no regressions.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* match.pd: New negate+shift pattern.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/signbit-2.c: New test.
> 	* gcc.dg/signbit-3.c: New test.
> 	* gcc.target/aarch64/signbit-1.c: New test.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 7d2a24dbc5e9644a09968f877e12a824d8ba1caa..3d48eda826f889483a83267409c3f278ee907b57 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -826,6 +826,38 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>      { tree utype = unsigned_type_for (type); }
>      (convert (rshift (lshift (convert:utype @0) @2) @3))))))
>  
> +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
> +(for cst (INTEGER_CST VECTOR_CST)
> + (simplify
> +  (rshift (negate:s @0) cst@1)
> +   (if (!flag_wrapv)

Don't test flag_wrapv directly, instead use the appropriate
TYPE_OVERFLOW_{UNDEFINED,WRAPS} predicates.  But I'm not sure
what we are protecting against?  Right-shift of signed integers
is implementation-defined and GCC treats it as you'd expect,
sign-extending the result.
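
For instance (a small illustration of that documented behavior, not
part of the patch):

  #include <stdio.h>

  int main (void)
  {
    int x = -4;
    /* ISO C leaves right shift of negative values implementation-defined;
       GCC defines it as an arithmetic (sign-extending) shift.  */
    printf ("%d\n", x >> 1);  /* prints -2 */
    return 0;
  }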

> +    (with { tree ctype = TREE_TYPE (@0);
> +	    tree stype = TREE_TYPE (@1);
> +	    tree bt = truth_type_for (ctype); }
> +     (switch
> +      /* Handle scalar case.  */
> +      (if (INTEGRAL_TYPE_P (ctype)
> +	   && !VECTOR_TYPE_P (ctype)
> +	   && !TYPE_UNSIGNED (ctype)
> +	   && canonicalize_math_after_vectorization_p ()
> +	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
> +       (convert:bt (gt:bt @0 { build_zero_cst (stype); })))

I'm not sure why the result is of type 'bt' rather than the
original type of the expression?

In that regard for non-vectors we'd have to add the sign
extension from unsigned bool, in the vector case we'd
hope the type of the comparison is correct.  I think in
both cases it might be convenient to use

  (cond (gt:bt @0 { build_zero_cst (ctype); }) { build_all_ones_cst (ctype); } { build_zero_cst (ctype); })

to compute the correct result and rely on (cond ..) simplifications
to simplify that if possible.
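
In C terms, per element that cond computes the following (a sketch to
spell out the intent, not part of the patch):

  #include <stdint.h>

  /* x > 0 ? -1 : 0, which is also what (-x) >> 31 evaluates to for
     every 32-bit x except INT32_MIN, where the negation overflows.  */
  int32_t sign_gt (int32_t x) { return x > 0 ? -1 : 0; }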

Btw, 'stype' should be irrelevant - you need to look at
the precision of 'ctype', no?

Richard.

> +      /* Handle vector case with a scalar immediate.  */
> +      (if (VECTOR_INTEGER_TYPE_P (ctype)
> +	   && !VECTOR_TYPE_P (stype)
> +	   && !TYPE_UNSIGNED (ctype)
> +           && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
> +       (convert:bt (gt:bt @0 { build_zero_cst (ctype); })))
> +      /* Handle vector case with a vector immediate.   */
> +      (if (VECTOR_INTEGER_TYPE_P (ctype)
> +	   && VECTOR_TYPE_P (stype)
> +	   && !TYPE_UNSIGNED (ctype)
> +	   && uniform_vector_p (@1))
> +       (with { tree cst = vector_cst_elt (@1, 0);
> +	       tree t = TREE_TYPE (cst); }
> +        (if (wi::eq_p (wi::to_wide (cst), TYPE_PRECISION (t) - 1))
> +         (convert:bt (gt:bt @0 { build_zero_cst (ctype); }))))))))))
> +
>  /* Fold (C1/X)*C2 into (C1*C2)/X.  */
>  (simplify
>   (mult (rdiv@3 REAL_CST@0 @1) REAL_CST@2)
> diff --git a/gcc/testsuite/gcc.dg/signbit-2.c b/gcc/testsuite/gcc.dg/signbit-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..fc0157cbc5c7996b481f2998bc30176c96a669bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/signbit-2.c
> @@ -0,0 +1,19 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O3 --save-temps -fdump-tree-optimized" } */
> +
> +#include <stdint.h>
> +
> +void fun1(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 31;
> +}
> +
> +void fun2(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 30;
> +}
> +
> +/* { dg-final { scan-tree-dump-times {\s+>\s+\{ 0, 0, 0, 0 \}} 1 optimized } } */
> +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
> diff --git a/gcc/testsuite/gcc.dg/signbit-3.c b/gcc/testsuite/gcc.dg/signbit-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..19e9c06c349b3287610f817628f00938ece60bf7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/signbit-3.c
> @@ -0,0 +1,13 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O1 --save-temps -fdump-tree-optimized" } */
> +
> +#include <stdint.h>
> +
> +void fun1(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 31;
> +}
> +
> +/* { dg-final { scan-tree-dump-times {\s+>\s+0;} 1 optimized } } */
> +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/signbit-1.c b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..3ebfb0586f37de29cf58635b27fe48503714447e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
> @@ -0,0 +1,18 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O3 --save-temps" } */
> +
> +#include <stdint.h>
> +
> +void fun1(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 31;
> +}
> +
> +void fun2(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 30;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tcmgt\t} 1 } } */
>
  
Tamar Christina Oct. 15, 2021, 7:48 a.m. UTC | #10
> >
> > +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
> > +(for cst (INTEGER_CST VECTOR_CST)
> > + (simplify
> > +  (rshift (negate:s @0) cst@1)
> > +   (if (!flag_wrapv)
> 
> Don't test flag_wrapv directly, instead use the appropriate
> TYPE_OVERFLOW_{UNDEFINED,WRAPS} predicates.  But I'm not sure what
> we are protecting against?  Right-shift of signed integers is implementation-
> defined and GCC treats it as you'd expect, sign-extending the result.
> 

It's protecting against the overflow of the negate on INT_MIN. When wrapping
overflows are enabled the results would be wrong.

> > +    (with { tree ctype = TREE_TYPE (@0);
> > +	    tree stype = TREE_TYPE (@1);
> > +	    tree bt = truth_type_for (ctype); }
> > +     (switch
> > +      /* Handle scalar case.  */
> > +      (if (INTEGRAL_TYPE_P (ctype)
> > +	   && !VECTOR_TYPE_P (ctype)
> > +	   && !TYPE_UNSIGNED (ctype)
> > +	   && canonicalize_math_after_vectorization_p ()
> > +	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
> > +       (convert:bt (gt:bt @0 { build_zero_cst (stype); })))
> 
> I'm not sure why the result is of type 'bt' rather than the original type of the
> expression?

That was to satisfy some RTL check that expected results of comparisons to
always be a Boolean; for scalars that is logically always the case, so I just
added it for consistency.

> 
> In that regard for non-vectors we'd have to add the sign extension from
> unsigned bool, in the vector case we'd hope the type of the comparison is
> correct.  I think in both cases it might be convenient to use
> 
>   (cond (gt:bt @0 { build_zero_cst (ctype); }) { build_all_ones_cst (ctype); } { build_zero_cst (ctype); })
> 
> to compute the correct result and rely on (cond ..) simplifications to simplify
> that if possible.
> 
> Btw, 'stype' should be irrelevant - you need to look at the precision of 'ctype',
> no?

I was working under the assumption that both input types must have the same
precision, but it turns out that assumption need not hold.
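
For example (a sketch of the case that breaks the assumption, not
taken from the patch):

  #include <stdint.h>

  /* The shift amount 63 is a plain 'int' constant (precision 32)
     while the shifted value has precision 64, so checking the shift
     amount against TYPE_PRECISION (stype) - 1 == 31 keys on the
     wrong constant; the sign-bit shift for this type is 63.  */
  int64_t f (int64_t x) { return (-x) >> 63; }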

New version attached.

Bootstrapped Regtested on aarch64-none-linux-gnu,
x86_64-pc-linux-gnu and no regressions.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* match.pd: New negate+shift pattern.

gcc/testsuite/ChangeLog:

	* gcc.dg/signbit-2.c: New test.
	* gcc.dg/signbit-3.c: New test.
	* gcc.target/aarch64/signbit-1.c: New test.

--- inline copy of patch ---

diff --git a/gcc/match.pd b/gcc/match.pd
index 7d2a24dbc5e9644a09968f877e12a824d8ba1caa..9532cae582e152cae6e22fcce95a9744a844e3c2 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -38,7 +38,8 @@ along with GCC; see the file COPYING3.  If not see
    uniform_integer_cst_p
    HONOR_NANS
    uniform_vector_p
-   bitmask_inv_cst_vector_p)
+   bitmask_inv_cst_vector_p
+   expand_vec_cmp_expr_p)
 
 /* Operator lists.  */
 (define_operator_list tcc_comparison
@@ -826,6 +827,42 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
     { tree utype = unsigned_type_for (type); }
     (convert (rshift (lshift (convert:utype @0) @2) @3))))))
 
+/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
+(for cst (INTEGER_CST VECTOR_CST)
+ (simplify
+  (rshift (negate:s @0) cst@1)
+   (if (!TYPE_OVERFLOW_WRAPS (type))
+    (with { tree ctype = TREE_TYPE (@0);
+	    tree stype = TREE_TYPE (@1);
+	    tree bt = truth_type_for (ctype);
+	    tree zeros = build_zero_cst (ctype); }
+     (switch
+      /* Handle scalar case.  */
+      (if (INTEGRAL_TYPE_P (ctype)
+	   && !VECTOR_TYPE_P (ctype)
+	   && !TYPE_UNSIGNED (ctype)
+	   && canonicalize_math_after_vectorization_p ()
+	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (ctype) - 1))
+       (cond (gt:bt @0 { zeros; }) { build_all_ones_cst (ctype); } { zeros; }))
+      /* Handle vector case with a scalar immediate.  */
+      (if (VECTOR_INTEGER_TYPE_P (ctype)
+	   && !VECTOR_TYPE_P (stype)
+	   && !TYPE_UNSIGNED (ctype)
+	   && expand_vec_cmp_expr_p (ctype, ctype, { GT_EXPR }))
+       (with { HOST_WIDE_INT bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (ctype)); }
+	(if (wi::eq_p (wi::to_wide (@1), bits - 1))
+	 (convert:bt (gt:bt @0 { zeros; })))))
+      /* Handle vector case with a vector immediate.   */
+      (if (VECTOR_INTEGER_TYPE_P (ctype)
+	   && VECTOR_TYPE_P (stype)
+	   && !TYPE_UNSIGNED (ctype)
+	   && uniform_vector_p (@1)
+	   && expand_vec_cmp_expr_p (ctype, ctype, { GT_EXPR }))
+       (with { tree cst = vector_cst_elt (@1, 0);
+	       HOST_WIDE_INT bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (ctype)); }
+        (if (wi::eq_p (wi::to_wide (cst), bits - 1))
+	 (convert:bt (gt:bt @0 { zeros; }))))))))))
+
 /* Fold (C1/X)*C2 into (C1*C2)/X.  */
 (simplify
  (mult (rdiv@3 REAL_CST@0 @1) REAL_CST@2)
diff --git a/gcc/testsuite/gcc.dg/signbit-2.c b/gcc/testsuite/gcc.dg/signbit-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..fc0157cbc5c7996b481f2998bc30176c96a669bb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/signbit-2.c
@@ -0,0 +1,19 @@
+/* { dg-do assemble } */
+/* { dg-options "-O3 --save-temps -fdump-tree-optimized" } */
+
+#include <stdint.h>
+
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 30;
+}
+
+/* { dg-final { scan-tree-dump-times {\s+>\s+\{ 0, 0, 0, 0 \}} 1 optimized } } */
+/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
diff --git a/gcc/testsuite/gcc.dg/signbit-3.c b/gcc/testsuite/gcc.dg/signbit-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..19e9c06c349b3287610f817628f00938ece60bf7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/signbit-3.c
@@ -0,0 +1,13 @@
+/* { dg-do assemble } */
+/* { dg-options "-O1 --save-temps -fdump-tree-optimized" } */
+
+#include <stdint.h>
+
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+/* { dg-final { scan-tree-dump-times {\s+>\s+0;} 1 optimized } } */
+/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/signbit-1.c b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..3ebfb0586f37de29cf58635b27fe48503714447e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
@@ -0,0 +1,18 @@
+/* { dg-do assemble } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <stdint.h>
+
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 30;
+}
+
+/* { dg-final { scan-assembler-times {\tcmgt\t} 1 } } */
  
Richard Biener Oct. 15, 2021, 9:06 a.m. UTC | #11
On Fri, 15 Oct 2021, Tamar Christina wrote:

> > >
> > > +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */ (for
> > > +cst (INTEGER_CST VECTOR_CST)  (simplify
> > > +  (rshift (negate:s @0) cst@1)
> > > +   (if (!flag_wrapv)
> > 
> > Don't test flag_wrapv directly, instead use the appropriate
> > TYPE_OVERFLOW_{UNDEFINED,WRAPS} predicates.  But I'm not sure what
> > we are protecting against?  Right-shift of signed integers is implementation-
> > defined and GCC treats it as you'd expect, sign-extending the result.
> > 
> 
> It's protecting against the overflow of the negate on INT_MIN. When wrapping
> overflows are enabled the results would be wrong.

But -INT_MIN == INT_MIN in twos-complement so I fail to see the wrong
result?  That is, both -INT_MIN >> 31 and INT_MIN >> 31 are -1.

> > > +    (with { tree ctype = TREE_TYPE (@0);
> > > +	    tree stype = TREE_TYPE (@1);
> > > +	    tree bt = truth_type_for (ctype); }
> > > +     (switch
> > > +      /* Handle scalar case.  */
> > > +      (if (INTEGRAL_TYPE_P (ctype)
> > > +	   && !VECTOR_TYPE_P (ctype)
> > > +	   && !TYPE_UNSIGNED (ctype)
> > > +	   && canonicalize_math_after_vectorization_p ()
> > > +	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
> > > +       (convert:bt (gt:bt @0 { build_zero_cst (stype); })))
> > 
> > I'm not sure why the result is of type 'bt' rather than the original type of the
> > expression?
> 
> That was to satisfy some RTL check that expected results of comparisons to always
> be a Boolean, though for scalar that logically always is the case, I just added it
> for consistency.
> 
> > 
> > In that regard for non-vectors we'd have to add the sign extension from
> > unsigned bool, in the vector case we'd hope the type of the comparison is
> > correct.  I think in both cases it might be convenient to use
> > 
> >   (cond (gt:bt @0 { build_zero_cst (ctype); }) { build_all_ones_cst (ctype); }
> > { build_zero_cost (ctype); })
> > 
> > to compute the correct result and rely on (cond ..) simplifications to simplify
> > that if possible.
> > 
> > Btw, 'stype' should be irrelevant - you need to look at the precision of 'ctype',
> > no?
> 
> I was working under the assumption that both input types must have the same
> precision, but turns out that assumption doesn't need to hold.
> 
> New version attached.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> x86_64-pc-linux-gnu and no regressions.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* match.pd: New negate+shift pattern.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/signbit-2.c: New test.
> 	* gcc.dg/signbit-3.c: New test.
> 	* gcc.target/aarch64/signbit-1.c: New test.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 7d2a24dbc5e9644a09968f877e12a824d8ba1caa..9532cae582e152cae6e22fcce95a9744a844e3c2 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -38,7 +38,8 @@ along with GCC; see the file COPYING3.  If not see
>     uniform_integer_cst_p
>     HONOR_NANS
>     uniform_vector_p
> -   bitmask_inv_cst_vector_p)
> +   bitmask_inv_cst_vector_p
> +   expand_vec_cmp_expr_p)
>  
>  /* Operator lists.  */
>  (define_operator_list tcc_comparison
> @@ -826,6 +827,42 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>      { tree utype = unsigned_type_for (type); }
>      (convert (rshift (lshift (convert:utype @0) @2) @3))))))
>  
> +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
> +(for cst (INTEGER_CST VECTOR_CST)
> + (simplify
> +  (rshift (negate:s @0) cst@1)
> +   (if (!TYPE_OVERFLOW_WRAPS (type))

as said, I don't think that's necessary but at least it's now
written correctly ;)

> +    (with { tree ctype = TREE_TYPE (@0);

Instead of 'ctype' you can use 'type' since the type of the expression
is the same as that of @0

> +	    tree stype = TREE_TYPE (@1);
> +	    tree bt = truth_type_for (ctype);
> +	    tree zeros = build_zero_cst (ctype); }
> +     (switch
> +      /* Handle scalar case.  */
> +      (if (INTEGRAL_TYPE_P (ctype)
> +	   && !VECTOR_TYPE_P (ctype)

INTEGRAL_TYPE_P does not include VECTOR_TYPE_P.

> +	   && !TYPE_UNSIGNED (ctype)
> +	   && canonicalize_math_after_vectorization_p ()
> +	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (ctype) - 1))
> +       (cond (gt:bt @0 { zeros; }) { build_all_ones_cst (ctype); } { zeros; }))
> +      /* Handle vector case with a scalar immediate.  */
> +      (if (VECTOR_INTEGER_TYPE_P (ctype)
> +	   && !VECTOR_TYPE_P (stype)
> +	   && !TYPE_UNSIGNED (ctype)
> +	   && expand_vec_cmp_expr_p (ctype, ctype, { GT_EXPR }))
> +       (with { HOST_WIDE_INT bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (ctype)); }
> +	(if (wi::eq_p (wi::to_wide (@1), bits - 1))

You can use element_precision (@0) - 1 in both the scalar and vector case.

> +	 (convert:bt (gt:bt @0 { zeros; })))))
> +      /* Handle vector case with a vector immediate.   */
> +      (if (VECTOR_INTEGER_TYPE_P (ctype)
> +	   && VECTOR_TYPE_P (stype)
> +	   && !TYPE_UNSIGNED (ctype)
> +	   && uniform_vector_p (@1)
> +	   && expand_vec_cmp_expr_p (ctype, ctype, { GT_EXPR }))
> +       (with { tree cst = vector_cst_elt (@1, 0);
> +	       HOST_WIDE_INT bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (ctype)); }
> +        (if (wi::eq_p (wi::to_wide (cst), bits - 1))
> +	 (convert:bt (gt:bt @0 { zeros; }))))))))))

The (convert:bt ...) wrappers are not necessary.  Note that 'ctype' is
not a valid mask type, but we will eventually be forgiving enough here.
You are computing truth_type_for, asking for gt:bt, but checking
expand_vec_cmp_expr_p (ctype, ctype, ...); that's inconsistent and
will lead to problems on targets with AVX512VL + AVX2, which will
have an AVX2 compare of, say, V4SImode producing a V4SImode mask
while truth_type_for will give you a QImode vector type.

As said the most general transform would be to

 (cond (gt:bt @0 { zeros; }) { build_zero_cst (ctype); } { build_all_ones_cst (ctype); })

If you think that's not better than the negate + shift, and you only
want to do this when the compare will directly produce the desired
result, then I think you have to either drop the truth_type_for and
manually build the truth type, or check that the mode of the truth
type and the value type match.  And you need to produce

  (view_convert:ctype (gt:bt @0 { zeros; }))

then.
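
I.e. roughly (an untested sketch of the shape I mean):

      (if (VECTOR_INTEGER_TYPE_P (ctype)
	   /* Only when the mask mode matches the data mode can the
	      compare result stand in for the shifted value.  */
	   && TYPE_MODE (bt) == TYPE_MODE (ctype)
	   && expand_vec_cmp_expr_p (ctype, bt, { GT_EXPR }))
       (view_convert:ctype (gt:bt @0 { zeros; })))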

> +
>  /* Fold (C1/X)*C2 into (C1*C2)/X.  */
>  (simplify
>   (mult (rdiv@3 REAL_CST@0 @1) REAL_CST@2)
> diff --git a/gcc/testsuite/gcc.dg/signbit-2.c b/gcc/testsuite/gcc.dg/signbit-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..fc0157cbc5c7996b481f2998bc30176c96a669bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/signbit-2.c
> @@ -0,0 +1,19 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O3 --save-temps -fdump-tree-optimized" } */
> +
> +#include <stdint.h>
> +
> +void fun1(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 31;
> +}
> +
> +void fun2(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 30;
> +}
> +
> +/* { dg-final { scan-tree-dump-times {\s+>\s+\{ 0, 0, 0, 0 \}} 1 optimized } } */
> +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
> diff --git a/gcc/testsuite/gcc.dg/signbit-3.c b/gcc/testsuite/gcc.dg/signbit-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..19e9c06c349b3287610f817628f00938ece60bf7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/signbit-3.c
> @@ -0,0 +1,13 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O1 --save-temps -fdump-tree-optimized" } */
> +
> +#include <stdint.h>
> +
> +void fun1(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 31;
> +}
> +
> +/* { dg-final { scan-tree-dump-times {\s+>\s+0;} 1 optimized } } */
> +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/signbit-1.c b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..3ebfb0586f37de29cf58635b27fe48503714447e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
> @@ -0,0 +1,18 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O3 --save-temps" } */
> +
> +#include <stdint.h>
> +
> +void fun1(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 31;
> +}
> +
> +void fun2(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (-x[i]) >> 30;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tcmgt\t} 1 } } */
>
  
Richard Earnshaw (lists) Oct. 15, 2021, 10:36 a.m. UTC | #12
On 15/10/2021 10:06, Richard Biener via Gcc-patches wrote:
> On Fri, 15 Oct 2021, Tamar Christina wrote:
> 
>>>>
>>>> +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
>>>> +(for cst (INTEGER_CST VECTOR_CST)
>>>> + (simplify
>>>> +  (rshift (negate:s @0) cst@1)
>>>> +   (if (!flag_wrapv)
>>>
>>> Don't test flag_wrapv directly, instead use the appropriate
>>> TYPE_OVERFLOW_{UNDEFINED,WRAPS} predicates.  But I'm not sure what
>>> we are protecting against?  Right-shift of signed integers is implementation-
>>> defined and GCC treats it as you'd expect, sign-extending the result.
>>>
>>
>> It's protecting against the overflow of the negate on INT_MIN. When wrapping
>> overflows are enabled the results would be wrong.
> 
> But -INT_MIN == INT_MIN in twos-complement so I fail to see the wrong
> result?  That is, both -INT_MIN >> 31 and INT_MIN >> 31 are -1.

Exactly, so transforming the original testcase from (x = -a >> 31) into 
(x = -(a > 0)) is not valid in that case.
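
A concrete sketch of the mismatch (assumes 32-bit int; compile with
-fwrapv so the negation is well defined):

  #include <limits.h>
  #include <stdio.h>

  int main (void)
  {
    int a = INT_MIN;
    /* With -fwrapv, -a wraps back to INT_MIN, so the original
       expression yields -1 while the transformed one yields 0.  */
    printf ("%d\n", (-a) >> 31);  /* prints -1 */
    printf ("%d\n", -(a > 0));    /* prints 0 */
    return 0;
  }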

R.

> 
>>>> +    (with { tree ctype = TREE_TYPE (@0);
>>>> +	    tree stype = TREE_TYPE (@1);
>>>> +	    tree bt = truth_type_for (ctype); }
>>>> +     (switch
>>>> +      /* Handle scalar case.  */
>>>> +      (if (INTEGRAL_TYPE_P (ctype)
>>>> +	   && !VECTOR_TYPE_P (ctype)
>>>> +	   && !TYPE_UNSIGNED (ctype)
>>>> +	   && canonicalize_math_after_vectorization_p ()
>>>> +	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
>>>> +       (convert:bt (gt:bt @0 { build_zero_cst (stype); })))
>>>
>>> I'm not sure why the result is of type 'bt' rather than the original type of the
>>> expression?
>>
>> That was to satisfy some RTL check that expected results of comparisons to always
>> be a Boolean, though for scalar that logically always is the case, I just added it
>> for consistency.
>>
>>>
>>> In that regard for non-vectors we'd have to add the sign extension from
>>> unsigned bool, in the vector case we'd hope the type of the comparison is
>>> correct.  I think in both cases it might be convenient to use
>>>
>>>    (cond (gt:bt @0 { build_zero_cst (ctype); }) { build_all_ones_cst (ctype); } { build_zero_cst (ctype); })
>>> to compute the correct result and rely on (cond ..) simplifications to simplify
>>> that if possible.
>>>
>>> Btw, 'stype' should be irrelevant - you need to look at the precision of 'ctype',
>>> no?
>>
>> I was working under the assumption that both input types must have the same
>> precision, but turns out that assumption doesn't need to hold.
>>
>> New version attached.
>>
>> Bootstrapped Regtested on aarch64-none-linux-gnu,
>> x86_64-pc-linux-gnu and no regressions.
>>
>> Ok for master?
>>
>> Thanks,
>> Tamar
>>
>> gcc/ChangeLog:
>>
>> 	* match.pd: New negate+shift pattern.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	* gcc.dg/signbit-2.c: New test.
>> 	* gcc.dg/signbit-3.c: New test.
>> 	* gcc.target/aarch64/signbit-1.c: New test.
>>
>> --- inline copy of patch ---
>>
>> diff --git a/gcc/match.pd b/gcc/match.pd
>> index 7d2a24dbc5e9644a09968f877e12a824d8ba1caa..9532cae582e152cae6e22fcce95a9744a844e3c2 100644
>> --- a/gcc/match.pd
>> +++ b/gcc/match.pd
>> @@ -38,7 +38,8 @@ along with GCC; see the file COPYING3.  If not see
>>      uniform_integer_cst_p
>>      HONOR_NANS
>>      uniform_vector_p
>> -   bitmask_inv_cst_vector_p)
>> +   bitmask_inv_cst_vector_p
>> +   expand_vec_cmp_expr_p)
>>   
>>   /* Operator lists.  */
>>   (define_operator_list tcc_comparison
>> @@ -826,6 +827,42 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>       { tree utype = unsigned_type_for (type); }
>>       (convert (rshift (lshift (convert:utype @0) @2) @3))))))
>>   
>> +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
>> +(for cst (INTEGER_CST VECTOR_CST)
>> + (simplify
>> +  (rshift (negate:s @0) cst@1)
>> +   (if (!TYPE_OVERFLOW_WRAPS (type))
> 
> as said, I don't think that's necessary but at least it's now
> written correctly ;)
> 
>> +    (with { tree ctype = TREE_TYPE (@0);
> 
> Instead of 'ctype' you can use 'type' since the type of the expression
> is the same as that of @0
> 
>> +	    tree stype = TREE_TYPE (@1);
>> +	    tree bt = truth_type_for (ctype);
>> +	    tree zeros = build_zero_cst (ctype); }
>> +     (switch
>> +      /* Handle scalar case.  */
>> +      (if (INTEGRAL_TYPE_P (ctype)
>> +	   && !VECTOR_TYPE_P (ctype)
> 
> INTEGRAL_TYPE_P does not include VECTOR_TYPE_P.
> 
>> +	   && !TYPE_UNSIGNED (ctype)
>> +	   && canonicalize_math_after_vectorization_p ()
>> +	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (ctype) - 1))
>> +       (cond (gt:bt @0 { zeros; }) { build_all_ones_cst (ctype); } { zeros; }))
>> +      /* Handle vector case with a scalar immediate.  */
>> +      (if (VECTOR_INTEGER_TYPE_P (ctype)
>> +	   && !VECTOR_TYPE_P (stype)
>> +	   && !TYPE_UNSIGNED (ctype)
>> +	   && expand_vec_cmp_expr_p (ctype, ctype, { GT_EXPR }))
>> +       (with { HOST_WIDE_INT bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (ctype)); }
>> +	(if (wi::eq_p (wi::to_wide (@1), bits - 1))
> 
> You can use element_precision (@0) - 1 in both the scalar and vector case.
> 
>> +	 (convert:bt (gt:bt @0 { zeros; })))))
>> +      /* Handle vector case with a vector immediate.   */
>> +      (if (VECTOR_INTEGER_TYPE_P (ctype)
>> +	   && VECTOR_TYPE_P (stype)
>> +	   && !TYPE_UNSIGNED (ctype)
>> +	   && uniform_vector_p (@1)
>> +	   && expand_vec_cmp_expr_p (ctype, ctype, { GT_EXPR }))
>> +       (with { tree cst = vector_cst_elt (@1, 0);
>> +	       HOST_WIDE_INT bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (ctype)); }
>> +        (if (wi::eq_p (wi::to_wide (cst), bits - 1))
>> +	 (convert:bt (gt:bt @0 { zeros; }))))))))))
> 
> The (convert:bt are not necessary.  Note that 'ctype' is not a valid
> mask type but we eventually will be forgiving enough here.  You are
> > computing truth_type_for, asking for gt:bt but checking
> > expand_vec_cmp_expr_p (ctype, ctype, ...), that's inconsistent and
> > will lead to problems on targets with AVX512VL + AVX2 which will
> > have an AVX2 compare of say V4SImode producing a V4SImode mask
> > but truth_type_for will give you a QImode vector type.
> 
> As said the most general transform would be to
> 
>   (cond (gt:bt @0 { zeros; }) { build_zero_cst (ctype); } { build_all_ones_cst (ctype); })
> 
> if you think that's not better than the negate + shift but only
> when the compare will directly produce the desired result then
> I think you have to either drop the truth_type_for and manually
> build the truthtype or check that the mode of the truth type
> and the value type match.  And you need to produce
> 
>    (view_convert:ctype (gt:bt @0 { zeros; }))
> 
> then.
> 
>> +
>>   /* Fold (C1/X)*C2 into (C1*C2)/X.  */
>>   (simplify
>>    (mult (rdiv@3 REAL_CST@0 @1) REAL_CST@2)
>> diff --git a/gcc/testsuite/gcc.dg/signbit-2.c b/gcc/testsuite/gcc.dg/signbit-2.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..fc0157cbc5c7996b481f2998bc30176c96a669bb
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/signbit-2.c
>> @@ -0,0 +1,19 @@
>> +/* { dg-do assemble } */
>> +/* { dg-options "-O3 --save-temps -fdump-tree-optimized" } */
>> +
>> +#include <stdint.h>
>> +
>> +void fun1(int32_t *x, int n)
>> +{
>> +    for (int i = 0; i < (n & -16); i++)
>> +      x[i] = (-x[i]) >> 31;
>> +}
>> +
>> +void fun2(int32_t *x, int n)
>> +{
>> +    for (int i = 0; i < (n & -16); i++)
>> +      x[i] = (-x[i]) >> 30;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times {\s+>\s+\{ 0, 0, 0, 0 \}} 1 optimized } } */
>> +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
>> diff --git a/gcc/testsuite/gcc.dg/signbit-3.c b/gcc/testsuite/gcc.dg/signbit-3.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..19e9c06c349b3287610f817628f00938ece60bf7
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/signbit-3.c
>> @@ -0,0 +1,13 @@
>> +/* { dg-do assemble } */
>> +/* { dg-options "-O1 --save-temps -fdump-tree-optimized" } */
>> +
>> +#include <stdint.h>
>> +
>> +void fun1(int32_t *x, int n)
>> +{
>> +    for (int i = 0; i < (n & -16); i++)
>> +      x[i] = (-x[i]) >> 31;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times {\s+>\s+0;} 1 optimized } } */
>> +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
>> diff --git a/gcc/testsuite/gcc.target/aarch64/signbit-1.c b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..3ebfb0586f37de29cf58635b27fe48503714447e
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do assemble } */
>> +/* { dg-options "-O3 --save-temps" } */
>> +
>> +#include <stdint.h>
>> +
>> +void fun1(int32_t *x, int n)
>> +{
>> +    for (int i = 0; i < (n & -16); i++)
>> +      x[i] = (-x[i]) >> 31;
>> +}
>> +
>> +void fun2(int32_t *x, int n)
>> +{
>> +    for (int i = 0; i < (n & -16); i++)
>> +      x[i] = (-x[i]) >> 30;
>> +}
>> +
>> +/* { dg-final { scan-assembler-times {\tcmgt\t} 1 } } */
>>
>
  
Richard Biener Oct. 15, 2021, 10:57 a.m. UTC | #13
On Fri, 15 Oct 2021, Richard Earnshaw wrote:

> On 15/10/2021 10:06, Richard Biener via Gcc-patches wrote:
> > On Fri, 15 Oct 2021, Tamar Christina wrote:
> > 
> >>>>
> >>>> +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
> >>>> +(for cst (INTEGER_CST VECTOR_CST)
> >>>> + (simplify
> >>>> +  (rshift (negate:s @0) cst@1)
> >>>> +   (if (!flag_wrapv)
> >>>
> >>> Don't test flag_wrapv directly, instead use the appropriate
> >>> TYPE_OVERFLOW_{UNDEFINED,WRAPS} predicates.  But I'm not sure what
> >>> we are protecting against?  Right-shift of signed integers is
> >>> implementation-
> >>> defined and GCC treats it as you'd expect, sign-extending the result.
> >>>
> >>
> >> It's protecting against the overflow of the negate on INT_MIN. When
> >> wrapping
> >> overflows are enabled the results would be wrong.
> > 
> > But -INT_MIN == INT_MIN in twos-complement so I fail to see the wrong
> > result?  That is, both -INT_MIN >> 31 and INT_MIN >> 31 are -1.
> 
> Exactly, so transforming the original testcase from (x = -a >> 31) into (x =
> -(a > 0)) is not valid in that case.

Hmm, but we're not doing that.  Actually, we inconsistently handle
the scalar and the vector variant here - maybe the (negate ..)
is missing around the (gt @0 { ...}) of the scalar case.
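
I.e. for the scalar case something like (a sketch):

  (negate (convert (gt @0 { zeros; })))

so the 0/1 comparison result is widened back to the 0/-1 the shift
produces.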

Btw, I would appreciate testcases for the cases that would go wrong,
indeed INT_MIN would be handled wrong.

Richard.



> R.
> 
> > 
> >>>> +    (with { tree ctype = TREE_TYPE (@0);
> >>>> +	    tree stype = TREE_TYPE (@1);
> >>>> +	    tree bt = truth_type_for (ctype); }
> >>>> +     (switch
> >>>> +      /* Handle scalar case.  */
> >>>> +      (if (INTEGRAL_TYPE_P (ctype)
> >>>> +	   && !VECTOR_TYPE_P (ctype)
> >>>> +	   && !TYPE_UNSIGNED (ctype)
> >>>> +	   && canonicalize_math_after_vectorization_p ()
> >>>> +	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
> >>>> +       (convert:bt (gt:bt @0 { build_zero_cst (stype); })))
> >>>
> >>> I'm not sure why the result is of type 'bt' rather than the original type
> >>> of the
> >>> expression?
> >>
> >> That was to satisfy some RTL check that expected results of comparisons to
> >> always
> >> be a Boolean, though for scalar that logically always is the case, I just
> >> added it
> >> for consistency.
> >>
> >>>
> >>> In that regard for non-vectors we'd have to add the sign extension from
> >>> unsigned bool, in the vector case we'd hope the type of the comparison is
> >>> correct.  I think in both cases it might be convenient to use
> >>>
> >>>    (cond (gt:bt @0 { build_zero_cst (ctype); }) { build_all_ones_cst (ctype); } { build_zero_cst (ctype); })
> >>>
> >>> to compute the correct result and rely on (cond ..) simplifications to
> >>> simplify
> >>> that if possible.
> >>>
> >>> Btw, 'stype' should be irrelevant - you need to look at the precision of
> >>> 'ctype',
> >>> no?
> >>
> >> I was working under the assumption that both input types must have the same
> >> precision, but turns out that assumption doesn't need to hold.
> >>
> >> New version attached.
> >>
> >> Bootstrapped Regtested on aarch64-none-linux-gnu,
> >> x86_64-pc-linux-gnu and no regressions.
> >>
> >> Ok for master?
> >>
> >> Thanks,
> >> Tamar
> >>
> >> gcc/ChangeLog:
> >>
> >>  * match.pd: New negate+shift pattern.
> >>
> >> gcc/testsuite/ChangeLog:
> >>
> >>  * gcc.dg/signbit-2.c: New test.
> >>  * gcc.dg/signbit-3.c: New test.
> >>  * gcc.target/aarch64/signbit-1.c: New test.
> >>
> >> --- inline copy of patch ---
> >>
> >> diff --git a/gcc/match.pd b/gcc/match.pd
> >> index
> >> 7d2a24dbc5e9644a09968f877e12a824d8ba1caa..9532cae582e152cae6e22fcce95a9744a844e3c2
> >> 100644
> >> --- a/gcc/match.pd
> >> +++ b/gcc/match.pd
> >> @@ -38,7 +38,8 @@ along with GCC; see the file COPYING3.  If not see
> >>      uniform_integer_cst_p
> >>      HONOR_NANS
> >>      uniform_vector_p
> >> -   bitmask_inv_cst_vector_p)
> >> +   bitmask_inv_cst_vector_p
> >> +   expand_vec_cmp_expr_p)
> >>   
> >>   /* Operator lists.  */
> >>   (define_operator_list tcc_comparison
> >> @@ -826,6 +827,42 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >>       { tree utype = unsigned_type_for (type); }
> >>       (convert (rshift (lshift (convert:utype @0) @2) @3))))))
> >>   +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
> >> +(for cst (INTEGER_CST VECTOR_CST)
> >> + (simplify
> >> +  (rshift (negate:s @0) cst@1)
> >> +   (if (!TYPE_OVERFLOW_WRAPS (type))
> > 
> > as said, I don't think that's necessary but at least it's now
> > written correctly ;)
> > 
> >> +    (with { tree ctype = TREE_TYPE (@0);
> > 
> > Instead of 'ctype' you can use 'type' since the type of the expression
> > is the same as that of @0
> > 
> >> +	    tree stype = TREE_TYPE (@1);
> >> +	    tree bt = truth_type_for (ctype);
> >> +	    tree zeros = build_zero_cst (ctype); }
> >> +     (switch
> >> +      /* Handle scalar case.  */
> >> +      (if (INTEGRAL_TYPE_P (ctype)
> >> +	   && !VECTOR_TYPE_P (ctype)
> > 
> > INTEGRAL_TYPE_P does not include VECTOR_TYPE_P.
> > 
> >> +	   && !TYPE_UNSIGNED (ctype)
> >> +	   && canonicalize_math_after_vectorization_p ()
> >> +	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (ctype) - 1))
> >> +       (cond (gt:bt @0 { zeros; }) { build_all_ones_cst (ctype); } {
> >> zeros; }))
> >> +      /* Handle vector case with a scalar immediate.  */
> >> +      (if (VECTOR_INTEGER_TYPE_P (ctype)
> >> +	   && !VECTOR_TYPE_P (stype)
> >> +	   && !TYPE_UNSIGNED (ctype)
> >> +	   && expand_vec_cmp_expr_p (ctype, ctype, { GT_EXPR }))
> >> +       (with { HOST_WIDE_INT bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE
> >> (ctype)); }
> >> +	(if (wi::eq_p (wi::to_wide (@1), bits - 1))
> > 
> > You can use element_precision (@0) - 1 in both the scalar and vector case.
> > 
> >> +	 (convert:bt (gt:bt @0 { zeros; })))))
> >> +      /* Handle vector case with a vector immediate.   */
> >> +      (if (VECTOR_INTEGER_TYPE_P (ctype)
> >> +	   && VECTOR_TYPE_P (stype)
> >> +	   && !TYPE_UNSIGNED (ctype)
> >> +	   && uniform_vector_p (@1)
> >> +	   && expand_vec_cmp_expr_p (ctype, ctype, { GT_EXPR }))
> >> +       (with { tree cst = vector_cst_elt (@1, 0);
> >> +	       HOST_WIDE_INT bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (ctype));
> >> }
> >> +        (if (wi::eq_p (wi::to_wide (cst), bits - 1))
> >> +	 (convert:bt (gt:bt @0 { zeros; }))))))))))
> > 
> > The (convert:bt are not necessary.  Note that 'ctype' is not a valid
> > mask type but we eventually will be forgiving enough here.  You are
> > computing truth_type_for, asking for gt:bt but checking
> > expand_vec_cmp_expr_p (ctype, ctype, ...), that's inconsistent and
> > will lead to problems on targets with AVX512VL + AVX2 which will
> > have an AVX2 compare of say V4SImode producing a V4SImode mask
> > but truth_type_for will give you a QImode vector type.
> > 
> > As said the most general transform would be to
> > 
> >   (cond (gt:bt @0 { zeros; }) { build_zero_cst (ctype); } { build_all_ones_cst (ctype); })
> > 
> > if you think that's not better than the negate + shift but only
> > when the compare will directly produce the desired result then
> > I think you have to either drop the truth_type_for and manually
> > build the truthtype or check that the mode of the truth type
> > and the value type match.  And you need to produce
> > 
> >    (view_convert:ctype (gt:bt @0 { zeros; }))
> > 
> > then.
> > 
> >> +
> >>   /* Fold (C1/X)*C2 into (C1*C2)/X.  */
> >>   (simplify
> >>    (mult (rdiv@3 REAL_CST@0 @1) REAL_CST@2)
> >> diff --git a/gcc/testsuite/gcc.dg/signbit-2.c
> >> b/gcc/testsuite/gcc.dg/signbit-2.c
> >> new file mode 100644
> >> index
> >> 0000000000000000000000000000000000000000..fc0157cbc5c7996b481f2998bc30176c96a669bb
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.dg/signbit-2.c
> >> @@ -0,0 +1,19 @@
> >> +/* { dg-do assemble } */
> >> +/* { dg-options "-O3 --save-temps -fdump-tree-optimized" } */
> >> +
> >> +#include <stdint.h>
> >> +
> >> +void fun1(int32_t *x, int n)
> >> +{
> >> +    for (int i = 0; i < (n & -16); i++)
> >> +      x[i] = (-x[i]) >> 31;
> >> +}
> >> +
> >> +void fun2(int32_t *x, int n)
> >> +{
> >> +    for (int i = 0; i < (n & -16); i++)
> >> +      x[i] = (-x[i]) >> 30;
> >> +}
> >> +
> >> +/* { dg-final { scan-tree-dump-times {\s+>\s+\{ 0, 0, 0, 0 \}} 1 optimized
> >> } } */
> >> +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
> >> diff --git a/gcc/testsuite/gcc.dg/signbit-3.c
> >> b/gcc/testsuite/gcc.dg/signbit-3.c
> >> new file mode 100644
> >> index
> >> 0000000000000000000000000000000000000000..19e9c06c349b3287610f817628f00938ece60bf7
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.dg/signbit-3.c
> >> @@ -0,0 +1,13 @@
> >> +/* { dg-do assemble } */
> >> +/* { dg-options "-O1 --save-temps -fdump-tree-optimized" } */
> >> +
> >> +#include <stdint.h>
> >> +
> >> +void fun1(int32_t *x, int n)
> >> +{
> >> +    for (int i = 0; i < (n & -16); i++)
> >> +      x[i] = (-x[i]) >> 31;
> >> +}
> >> +
> >> +/* { dg-final { scan-tree-dump-times {\s+>\s+0;} 1 optimized } } */
> >> +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
> >> diff --git a/gcc/testsuite/gcc.target/aarch64/signbit-1.c
> >> b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
> >> new file mode 100644
> >> index
> >> 0000000000000000000000000000000000000000..3ebfb0586f37de29cf58635b27fe48503714447e
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
> >> @@ -0,0 +1,18 @@
> >> +/* { dg-do assemble } */
> >> +/* { dg-options "-O3 --save-temps" } */
> >> +
> >> +#include <stdint.h>
> >> +
> >> +void fun1(int32_t *x, int n)
> >> +{
> >> +    for (int i = 0; i < (n & -16); i++)
> >> +      x[i] = (-x[i]) >> 31;
> >> +}
> >> +
> >> +void fun2(int32_t *x, int n)
> >> +{
> >> +    for (int i = 0; i < (n & -16); i++)
> >> +      x[i] = (-x[i]) >> 30;
> >> +}
> >> +
> >> +/* { dg-final { scan-assembler-times {\tcmgt\t} 1 } } */
> >>
> > 
> 
>
  
Tamar Christina Oct. 15, 2021, 11:55 a.m. UTC | #14
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Friday, October 15, 2021 10:07 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>; gcc-
> patches@gcc.gnu.org; nd <nd@arm.com>
> Subject: RE: [PATCH]middle-end convert negate + right shift into compare
> greater.
> 
> On Fri, 15 Oct 2021, Tamar Christina wrote:
> 
> > > >
> > > > +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
> > > > +(for cst (INTEGER_CST VECTOR_CST)
> > > > + (simplify
> > > > +  (rshift (negate:s @0) cst@1)
> > > > +   (if (!flag_wrapv)
> > >
> > > Don't test flag_wrapv directly, instead use the appropriate
> > > TYPE_OVERFLOW_{UNDEFINED,WRAPS} predicates.  But I'm not sure
> what
> > > we are protecting against?  Right-shift of signed integers is
> > > implementation- defined and GCC treats it as you'd expect, sign-
> extending the result.
> > >
> >
> > It's protecting against the overflow of the negate on INT_MIN. When
> > wrapping overflows are enabled the results would be wrong.
> 
> But -INT_MIN == INT_MIN in twos-complement so I fail to see the wrong
> result?  That is, both -INT_MIN >> 31 and INT_MIN >> 31 are -1.
> 
> > > > +    (with { tree ctype = TREE_TYPE (@0);
> > > > +	    tree stype = TREE_TYPE (@1);
> > > > +	    tree bt = truth_type_for (ctype); }
> > > > +     (switch
> > > > +      /* Handle scalar case.  */
> > > > +      (if (INTEGRAL_TYPE_P (ctype)
> > > > +	   && !VECTOR_TYPE_P (ctype)
> > > > +	   && !TYPE_UNSIGNED (ctype)
> > > > +	   && canonicalize_math_after_vectorization_p ()
> > > > +	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
> > > > +       (convert:bt (gt:bt @0 { build_zero_cst (stype); })))
> > >
> > > I'm not sure why the result is of type 'bt' rather than the original
> > > type of the expression?
> >
> > That was to satisfy some RTL check that expected results of
> > comparisons to always be a Boolean, though for scalar that logically
> > always is the case, I just added it for consistency.
> >
> > >
> > > In that regard for non-vectors we'd have to add the sign extension
> > > from unsigned bool, in the vector case we'd hope the type of the
> > > comparison is correct.  I think in both cases it might be convenient
> > > to use
> > >
> > >   (cond (gt:bt @0 { build_zero_cst (ctype); }) { build_all_ones_cst (ctype); } { build_zero_cst (ctype); })
> > >
> > > to compute the correct result and rely on (cond ..) simplifications
> > > to simplify that if possible.
> > >
> > > Btw, 'stype' should be irrelevant - you need to look at the
> > > precision of 'ctype', no?
> >
> > I was working under the assumption that both input types must have the
> > same precision, but turns out that assumption doesn't need to hold.
> >
> > New version attached.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no regressions.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* match.pd: New negate+shift pattern.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.dg/signbit-2.c: New test.
> > 	* gcc.dg/signbit-3.c: New test.
> > 	* gcc.target/aarch64/signbit-1.c: New test.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 7d2a24dbc5e9644a09968f877e12a824d8ba1caa..9532cae582e152cae6e22fcce95a9744a844e3c2 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -38,7 +38,8 @@ along with GCC; see the file COPYING3.  If not see
> >     uniform_integer_cst_p
> >     HONOR_NANS
> >     uniform_vector_p
> > -   bitmask_inv_cst_vector_p)
> > +   bitmask_inv_cst_vector_p
> > +   expand_vec_cmp_expr_p)
> >
> >  /* Operator lists.  */
> >  (define_operator_list tcc_comparison
> > @@ -826,6 +827,42 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >      { tree utype = unsigned_type_for (type); }
> >      (convert (rshift (lshift (convert:utype @0) @2) @3))))))
> >
> > +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
> > +(for cst (INTEGER_CST VECTOR_CST)
> > + (simplify
> > +  (rshift (negate:s @0) cst@1)
> > +   (if (!TYPE_OVERFLOW_WRAPS (type))
> 
> as said, I don't think that's necessary but at least it's now written correctly ;)
> 
> > +    (with { tree ctype = TREE_TYPE (@0);
> 
> Instead of 'ctype' you can use 'type' since the type of the expression is the
> same as that of @0
> 
> > +	    tree stype = TREE_TYPE (@1);
> > +	    tree bt = truth_type_for (ctype);
> > +	    tree zeros = build_zero_cst (ctype); }
> > +     (switch
> > +      /* Handle scalar case.  */
> > +      (if (INTEGRAL_TYPE_P (ctype)
> > +	   && !VECTOR_TYPE_P (ctype)
> 
> INTEGRAL_TYPE_P does not include VECTOR_TYPE_P.
> 
> > +	   && !TYPE_UNSIGNED (ctype)
> > +	   && canonicalize_math_after_vectorization_p ()
> > +	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (ctype) - 1))
> > +       (cond (gt:bt @0 { zeros; }) { build_all_ones_cst (ctype); } { zeros; }))
> > +      /* Handle vector case with a scalar immediate.  */
> > +      (if (VECTOR_INTEGER_TYPE_P (ctype)
> > +	   && !VECTOR_TYPE_P (stype)
> > +	   && !TYPE_UNSIGNED (ctype)
> > +	   && expand_vec_cmp_expr_p (ctype, ctype, { GT_EXPR }))
> > +       (with { HOST_WIDE_INT bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (ctype)); }
> > +	(if (wi::eq_p (wi::to_wide (@1), bits - 1))
> 
> You can use element_precision (@0) - 1 in both the scalar and vector case.
> 
> > +	 (convert:bt (gt:bt @0 { zeros; })))))
> > +      /* Handle vector case with a vector immediate.   */
> > +      (if (VECTOR_INTEGER_TYPE_P (ctype)
> > +	   && VECTOR_TYPE_P (stype)
> > +	   && !TYPE_UNSIGNED (ctype)
> > +	   && uniform_vector_p (@1)
> > +	   && expand_vec_cmp_expr_p (ctype, ctype, { GT_EXPR }))
> > +       (with { tree cst = vector_cst_elt (@1, 0);
> > +	       HOST_WIDE_INT bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE
> (ctype)); }
> > +        (if (wi::eq_p (wi::to_wide (cst), bits - 1))
> > +	 (convert:bt (gt:bt @0 { zeros; }))))))))))
> 
> The (convert:bt are not necessary.  Note that 'ctype' is not a valid mask type
> but we eventually will be forgiving enough here.  You are computing
> truth_type_for, asking for gt:bt but checking expand_vec_cmp_expr_p
> (ctype, ctype, ...), that's inconsistent and will lead to problems on targets
> with AVX512VL + AVX2 which will have an AVX2 compare of say V4SImode
> producing a V4SImode mask but truth_type_for will give you a QImode
> vector type.
> 

Right, what I wanted to check here was that the target has the same mode for
the mask as for the result, which, looking at expand_vec_cmp_expr_p, I thought
it would give me.  The problem is that checking expand_vec_cmp_expr_p with bt
as the mask crashes with SVE, because the comparison becomes a predicate
that is applied to an operation.  So I guess there are three different
possibilities here.

> As said the most general transform would be to
> 
>  (cond (gt:bt @0 { zeros; }) { build_zero_cst (ctype); } { build_all_ones_cst (ctype); })
> 

I tried this before but it doesn't optimize the cond away. So with this it produces:

  vect__6.10_7 = vect__4.8_9 > { 0, 0, 0, 0 } ? { 0, 0, 0, 0 } : { -1, -1, -1, -1 };
  MEM <vector(4) int> [(int32_t *)_2] = vect__6.10_7;

Which I suppose is technically correct, but it'll crash at expand time because this
form doesn't seem to be expected:

0x1262ac1 prepare_cmp_insn
        /src/gcc/gcc/optabs.c:4532
0x1262f80 emit_cmp_and_jump_insns(rtx_def*, rtx_def*, rtx_code, rtx_def*, machine_mode, int, rtx_def*, profile_probability)
        /src/gcc/gcc/optabs.c:4677
0xdad85d do_compare_rtx_and_jump(rtx_def*, rtx_def*, rtx_code, int, machine_mode, rtx_def*, rtx_code_label*, rtx_code_label*, profile_probability)
        /src/gcc/gcc/dojump.c:1220
0xdadf0c do_compare_and_jump
        /src/gcc/gcc/dojump.c:1294
0xdab0d3 do_jump_1
        /src/gcc/gcc/dojump.c:253
0xdabaa2 do_jump
        /src/gcc/gcc/dojump.c:500
0xdacdd7 jumpifnot(tree_node*, rtx_code_label*, profile_probability)
        /src/gcc/gcc/dojump.c:942
0xeba6f1 expand_expr_real_2(separate_ops*, rtx_def*, machine_mode, expand_modifier)
        /src/gcc/gcc/expr.c:10219
0xebb99d expand_expr_real_1(tree_node*, rtx_def*, machine_mode, expand_modifier, rtx_def**, bool)
        /src/gcc/gcc/expr.c:10480
0xeb363a expand_expr_real(tree_node*, rtx_def*, machine_mode, expand_modifier, rtx_def**, bool)
        /src/gcc/gcc/expr.c:8713
0xe9116d expand_expr
        /src/gcc/gcc/expr.h:301
0xea3ce5 expand_assignment(tree_node*, tree_node*, bool)
        /src/gcc/gcc/expr.c:5345
0xd01860 expand_gimple_stmt_1
        /src/gcc/gcc/cfgexpand.c:3942
0xd01c4f expand_gimple_stmt
        /src/gcc/gcc/cfgexpand.c:4040
0xd0a9b7 expand_gimple_basic_block
        /src/gcc/gcc/cfgexpand.c:6082
0xd0ce45 execute
        /src/gcc/gcc/cfgexpand.c:6808

> if you think that's not better than the negate + shift but only when the
> compare will directly produce the desired result then I think you have to
> either drop the truth_type_for and manually build the truthtype or check
> that the mode of the truth type and the value type match.  And you need to
> produce
> 

But how do I figure out what the target's format is? Is there a hook/property
I can use for this?

Thanks,
Tamar

>   (view_convert:ctype (gt:bt @0 { zeros; }))
> 
> then.
> 
> > +
> >  /* Fold (C1/X)*C2 into (C1*C2)/X.  */
> >  (simplify
> >   (mult (rdiv@3 REAL_CST@0 @1) REAL_CST@2)
> > diff --git a/gcc/testsuite/gcc.dg/signbit-2.c b/gcc/testsuite/gcc.dg/signbit-2.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..fc0157cbc5c7996b481f2998bc30176c96a669bb
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/signbit-2.c
> > @@ -0,0 +1,19 @@
> > +/* { dg-do assemble } */
> > +/* { dg-options "-O3 --save-temps -fdump-tree-optimized" } */
> > +
> > +#include <stdint.h>
> > +
> > +void fun1(int32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (-x[i]) >> 31;
> > +}
> > +
> > +void fun2(int32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (-x[i]) >> 30;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times {\s+>\s+\{ 0, 0, 0, 0 \}} 1 optimized } } */
> > +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
> > diff --git a/gcc/testsuite/gcc.dg/signbit-3.c b/gcc/testsuite/gcc.dg/signbit-3.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..19e9c06c349b3287610f817628f00938ece60bf7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/signbit-3.c
> > @@ -0,0 +1,13 @@
> > +/* { dg-do assemble } */
> > +/* { dg-options "-O1 --save-temps -fdump-tree-optimized" } */
> > +
> > +#include <stdint.h>
> > +
> > +void fun1(int32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (-x[i]) >> 31;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times {\s+>\s+0;} 1 optimized } } */
> > +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/signbit-1.c b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..3ebfb0586f37de29cf58635b27fe48503714447e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
> > @@ -0,0 +1,18 @@
> > +/* { dg-do assemble } */
> > +/* { dg-options "-O3 --save-temps" } */
> > +
> > +#include <stdint.h>
> > +
> > +void fun1(int32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (-x[i]) >> 31;
> > +}
> > +
> > +void fun2(int32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (-x[i]) >> 30;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {\tcmgt\t} 1 } } */
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
  
Tamar Christina Nov. 3, 2021, 1:21 p.m. UTC | #15
Hi,

I have addressed all the feedback; the updated patch is attached:

Bootstrapped Regtested on aarch64-none-linux-gnu,
x86_64-pc-linux-gnu and no regressions.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* match.pd: New negate+shift pattern.

gcc/testsuite/ChangeLog:

	* gcc.dg/signbit-2.c: New test.
	* gcc.dg/signbit-3.c: New test.
	* gcc.dg/signbit-4.c: New test.
	* gcc.dg/signbit-5.c: New test.
	* gcc.dg/signbit-6.c: New test.
	* gcc.target/aarch64/signbit-1.c: New test.

--- inline copy of patch ---

diff --git a/gcc/match.pd b/gcc/match.pd
index 65a6591f75c03333602147bbdf6d59f9ccd4b1e5..fe93500d22e2388889c8c9faf4c58cee95dec7f9 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -38,7 +38,8 @@ along with GCC; see the file COPYING3.  If not see
    uniform_integer_cst_p
    HONOR_NANS
    uniform_vector_p
-   bitmask_inv_cst_vector_p)
+   bitmask_inv_cst_vector_p
+   expand_vec_cmp_expr_p)
 
 /* Operator lists.  */
 (define_operator_list tcc_comparison
@@ -832,6 +833,38 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
     { tree utype = unsigned_type_for (type); }
     (convert (rshift (lshift (convert:utype @0) @2) @3))))))
 
+/* Fold (-x >> C) into -(x > 0) where C = precision(type) - 1.  */
+(for cst (INTEGER_CST VECTOR_CST)
+ (simplify
+  (rshift (negate:s @0) cst@1)
+   (if (!TYPE_UNSIGNED (type)
+        && TYPE_OVERFLOW_UNDEFINED (type))
+    (with { tree stype = TREE_TYPE (@1);
+	    tree bt = truth_type_for (type);
+	    tree zeros = build_zero_cst (type); }
+     (switch
+      /* Handle scalar case.  */
+      (if (INTEGRAL_TYPE_P (type)
+	   /* If we apply the rule to the scalar type before vectorization
+	      we will enforce the result of the comparison being a bool
+	      which will require an extra AND on the result that will be
+	      indistinguishable from when the user did actually want 0
+	      or 1 as the result so it can't be removed.  */
+	   && canonicalize_math_after_vectorization_p ()
+	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (type) - 1))
+       (negate (convert (gt @0 { zeros; }))))
+      /* Handle vector case.  */
+      (if (VECTOR_INTEGER_TYPE_P (type)
+	   /* First check whether the target has the same mode for vector
+	      comparison results as its operands do.  */
+	   && TYPE_MODE (bt) == TYPE_MODE (type)
+	   /* Then check to see if the target is able to expand the comparison
+	      with the given type later on, otherwise we may ICE.  */
+	   && expand_vec_cmp_expr_p (type, bt, { GT_EXPR }))
+       (with { tree cst = uniform_integer_cst_p (@1); }
+	(if (cst && wi::eq_p (wi::to_wide (cst), element_precision (type) - 1))
+	 (view_convert (gt:bt @0 { zeros; }))))))))))
+
 /* Fold (C1/X)*C2 into (C1*C2)/X.  */
 (simplify
  (mult (rdiv@3 REAL_CST@0 @1) REAL_CST@2)
diff --git a/gcc/testsuite/gcc.dg/signbit-2.c b/gcc/testsuite/gcc.dg/signbit-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..fc0157cbc5c7996b481f2998bc30176c96a669bb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/signbit-2.c
@@ -0,0 +1,19 @@
+/* { dg-do assemble } */
+/* { dg-options "-O3 --save-temps -fdump-tree-optimized" } */
+
+#include <stdint.h>
+
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 30;
+}
+
+/* { dg-final { scan-tree-dump-times {\s+>\s+\{ 0, 0, 0, 0 \}} 1 optimized } } */
+/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
diff --git a/gcc/testsuite/gcc.dg/signbit-3.c b/gcc/testsuite/gcc.dg/signbit-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..19e9c06c349b3287610f817628f00938ece60bf7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/signbit-3.c
@@ -0,0 +1,13 @@
+/* { dg-do assemble } */
+/* { dg-options "-O1 --save-temps -fdump-tree-optimized" } */
+
+#include <stdint.h>
+
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+/* { dg-final { scan-tree-dump-times {\s+>\s+0;} 1 optimized } } */
+/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
diff --git a/gcc/testsuite/gcc.dg/signbit-4.c b/gcc/testsuite/gcc.dg/signbit-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..bc459ba60a760bdf49e94dbec762f378c24fe9b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/signbit-4.c
@@ -0,0 +1,65 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -fwrapv" } */
+
+#include <stdint.h>
+#include <limits.h>
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE int32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 1
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+__attribute__ ((noinline, noipa))
+void fun1(TYPE *x, int n)
+{
+    for (int i = 0; i < n; i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+__attribute__ ((noinline, noipa, optimize("O0")))
+void fun2(TYPE *x, int n)
+{
+    for (int i = 0; i < n; i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  a[0] = INT_MIN;
+  b[0] = INT_MIN;
+
+  for (int i = 1; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N);
+  fun2 (b, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.dg/signbit-5.c b/gcc/testsuite/gcc.dg/signbit-5.c
new file mode 100644
index 0000000000000000000000000000000000000000..22a92704773e3282759524b74d35196a477d43dd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/signbit-5.c
@@ -0,0 +1,65 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+#include <limits.h>
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE int32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 1
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+__attribute__ ((noinline, noipa))
+void fun1(TYPE *x, int n)
+{
+    for (int i = 0; i < n; i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+__attribute__ ((noinline, noipa, optimize("O1")))
+void fun2(TYPE *x, int n)
+{
+    for (int i = 0; i < n; i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  a[0] = INT_MIN;
+  b[0] = INT_MIN;
+
+  for (int i = 1; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N);
+  fun2 (b, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.dg/signbit-6.c b/gcc/testsuite/gcc.dg/signbit-6.c
new file mode 100644
index 0000000000000000000000000000000000000000..da186624cfa057dfc3780c8af4f6b1335ba07e7e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/signbit-6.c
@@ -0,0 +1,72 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include <stdint.h>
+#include <limits.h>
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE int32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 1
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+__attribute__ ((noinline, noipa))
+void fun1(TYPE *x, int n)
+{
+    for (int i = 0; i < n; i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+__attribute__ ((noinline, noipa, optimize("O0")))
+void fun2(TYPE *x, int n)
+{
+    for (int i = 0; i < n; i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  a[0] = INT_MIN;
+  b[0] = INT_MIN;
+
+  for (int i = 1; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N);
+  fun2 (b, N);
+
+  if (DEBUG)
+    printf ("%d = 0x%x == 0x%x\n", 0, a[0], b[0]);
+
+  if (a[0] != 0x0 || b[0] != -1)
+    __builtin_abort ();
+
+
+  for (int i = 1; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/signbit-1.c b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..3ebfb0586f37de29cf58635b27fe48503714447e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
@@ -0,0 +1,18 @@
+/* { dg-do assemble } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <stdint.h>
+
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 30;
+}
+
+/* { dg-final { scan-assembler-times {\tcmgt\t} 1 } } */
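
As a quick sanity check of the scalar identity (an illustrative
standalone test, not part of the patch: it relies on GCC defining
signed right shift as arithmetic, and it skips INT_MIN, whose
negation is exactly the undefined case the TYPE_OVERFLOW_UNDEFINED
guard assumes away):

#include <stdint.h>
#include <assert.h>

int main (void)
{
  /* INT32_MIN is excluded: -INT32_MIN overflows, which is the
     undefined behaviour the transform is allowed to ignore.  */
  int32_t vals[] = { -12345, -1, 0, 1, 7, 12345 };
  for (unsigned i = 0; i < sizeof vals / sizeof vals[0]; i++)
    assert (((-vals[i]) >> 31) == -(vals[i] > 0));
  return 0;
}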
  
Richard Biener Nov. 4, 2021, 1:06 p.m. UTC | #16
On Wed, 3 Nov 2021, Tamar Christina wrote:

> Hi,
> 
> I have addressed all the feedback; the updated patch is below:
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> x86_64-pc-linux-gnu and no regressions.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* match.pd: New negate+shift pattern.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/signbit-2.c: New test.
> 	* gcc.dg/signbit-3.c: New test.
> 	* gcc.dg/signbit-4.c: New test.
> 	* gcc.dg/signbit-5.c: New test.
> 	* gcc.dg/signbit-6.c: New test.
> 	* gcc.target/aarch64/signbit-1.c: New test.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 65a6591f75c03333602147bbdf6d59f9ccd4b1e5..fe93500d22e2388889c8c9faf4c58cee95dec7f9 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -38,7 +38,8 @@ along with GCC; see the file COPYING3.  If not see
>     uniform_integer_cst_p
>     HONOR_NANS
>     uniform_vector_p
> -   bitmask_inv_cst_vector_p)
> +   bitmask_inv_cst_vector_p
> +   expand_vec_cmp_expr_p)
>  
>  /* Operator lists.  */
>  (define_operator_list tcc_comparison
> @@ -832,6 +833,38 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>      { tree utype = unsigned_type_for (type); }
>      (convert (rshift (lshift (convert:utype @0) @2) @3))))))
>  
> +/* Fold (-x >> C) into -(x > 0) where C = precision(type) - 1.  */
> +(for cst (INTEGER_CST VECTOR_CST)
> + (simplify
> +  (rshift (negate:s @0) cst@1)
> +   (if (!TYPE_UNSIGNED (type)
> +        && TYPE_OVERFLOW_UNDEFINED (type))
> +    (with { tree stype = TREE_TYPE (@1);
> +	    tree bt = truth_type_for (type);
> +	    tree zeros = build_zero_cst (type); }
> +     (switch
> +      /* Handle scalar case.  */
> +      (if (INTEGRAL_TYPE_P (type)
> +	   /* If we apply the rule to the scalar type before vectorization
> +	      we will force the result of the comparison to be a bool,
> +	      which requires an extra AND on the result.  That AND is
> +	      indistinguishable from the case where the user actually
> +	      wanted 0 or 1 as the result, so it can't be removed.  */
> +	   && canonicalize_math_after_vectorization_p ()
> +	   && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (type) - 1))
> +       (negate (convert (gt @0 { zeros; }))))
> +      /* Handle vector case.  */
> +      (if (VECTOR_INTEGER_TYPE_P (type)
> +	   /* First check whether the target has the same mode for vector
> +	      comparison results as it's operands do.  */
> +	      comparison results as its operands do.  */
> +	   /* Then check to see if the target is able to expand the comparison
> +	      with the given type later on, otherwise we may ICE.  */
> +	   && expand_vec_cmp_expr_p (type, bt, { GT_EXPR }))

No need to wrap GT_EXPR in { }

> +       (with { tree cst = uniform_integer_cst_p (@1); }

if you declare 'cst' above where you declare 'bt' you can do

          && (cst = uniform_integer_cst_p (@1)))

combining it with the if above, and the one below, simplifying indents
and flow.
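
Roughly, with GT_EXPR unwrapped as well, the whole vector arm would
then read (untested sketch):

      /* Handle vector case.  */
      (if (VECTOR_INTEGER_TYPE_P (type)
	   && TYPE_MODE (bt) == TYPE_MODE (type)
	   && expand_vec_cmp_expr_p (type, bt, GT_EXPR)
	   && (cst = uniform_integer_cst_p (@1))
	   && wi::eq_p (wi::to_wide (cst), element_precision (type) - 1))
       (view_convert (gt:bt @0 { zeros; })))

with a 'tree cst = NULL_TREE;' declaration added next to 'bt' in the
enclosing (with { ... }) block.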

OK with that change.

I guess the scalar transform might expand badly on some targets.
Please keep an eye on any problems that come up.

Thanks,
Richard.

  

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index 7d2a24dbc5e9644a09968f877e12a824d8ba1caa..581436fe36dbacdcb0c2720b7190c96d14398143 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -826,6 +826,37 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
     { tree utype = unsigned_type_for (type); }
     (convert (rshift (lshift (convert:utype @0) @2) @3))))))
 
+/* Fold (-x >> C) into x > 0 where C = precision(type) - 1.  */
+(for cst (INTEGER_CST VECTOR_CST)
+ (simplify
+  (rshift (negate:s @0) cst@1)
+   (with { tree ctype = TREE_TYPE (@0);
+	   tree stype = TREE_TYPE (@1);
+	   tree bt = truth_type_for (ctype); }
+    (switch
+     /* Handle scalar case.  */
+     (if (INTEGRAL_TYPE_P (ctype)
+	  && !VECTOR_TYPE_P (ctype)
+	  && !TYPE_UNSIGNED (ctype)
+	  && canonicalize_math_after_vectorization_p ()
+	  && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
+      (convert:bt (gt:bt @0 { build_zero_cst (stype); })))
+     /* Handle vector case with a scalar immediate.  */
+     (if (VECTOR_INTEGER_TYPE_P (ctype)
+	  && !VECTOR_TYPE_P (stype)
+	  && !TYPE_UNSIGNED (ctype)
+          && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1))
+      (convert:bt (gt:bt @0 { build_zero_cst (ctype); })))
+     /* Handle vector case with a vector immediate.   */
+     (if (VECTOR_INTEGER_TYPE_P (ctype)
+	  && VECTOR_TYPE_P (stype)
+	  && !TYPE_UNSIGNED (ctype)
+	  && uniform_vector_p (@1))
+      (with { tree cst = vector_cst_elt (@1, 0);
+	      tree t = TREE_TYPE (cst); }
+       (if (wi::eq_p (wi::to_wide (cst), TYPE_PRECISION (t) - 1))
+        (convert:bt (gt:bt @0 { build_zero_cst (ctype); })))))))))
+
 /* Fold (C1/X)*C2 into (C1*C2)/X.  */
 (simplify
  (mult (rdiv@3 REAL_CST@0 @1) REAL_CST@2)
diff --git a/gcc/testsuite/gcc.dg/signbit-2.c b/gcc/testsuite/gcc.dg/signbit-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..fc0157cbc5c7996b481f2998bc30176c96a669bb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/signbit-2.c
@@ -0,0 +1,19 @@ 
+/* { dg-do assemble } */
+/* { dg-options "-O3 --save-temps -fdump-tree-optimized" } */
+
+#include <stdint.h>
+
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 30;
+}
+
+/* { dg-final { scan-tree-dump-times {\s+>\s+\{ 0, 0, 0, 0 \}} 1 optimized } } */
+/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
diff --git a/gcc/testsuite/gcc.dg/signbit-3.c b/gcc/testsuite/gcc.dg/signbit-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..19e9c06c349b3287610f817628f00938ece60bf7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/signbit-3.c
@@ -0,0 +1,13 @@ 
+/* { dg-do assemble } */
+/* { dg-options "-O1 --save-temps -fdump-tree-optimized" } */
+
+#include <stdint.h>
+
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+/* { dg-final { scan-tree-dump-times {\s+>\s+0;} 1 optimized } } */
+/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/signbit-1.c b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..3ebfb0586f37de29cf58635b27fe48503714447e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/signbit-1.c
@@ -0,0 +1,18 @@ 
+/* { dg-do assemble } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <stdint.h>
+
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 31;
+}
+
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (-x[i]) >> 30;
+}
+
+/* { dg-final { scan-assembler-times {\tcmgt\t} 1 } } */
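
As a closing aside on why a single cmgt suffices: with GNU C vector
extensions (an illustrative sketch, not taken from the patch), a
lane-wise comparison already produces an all-ones mask for true
lanes, which is bit-identical to what (-x) >> 31 computes for each
non-INT_MIN lane:

#include <stdint.h>

typedef int32_t v4si __attribute__ ((vector_size (16)));

/* Each lane of x > zero is -1 (all bits set) when the comparison
   holds and 0 otherwise, so no extra negate is needed in the
   vector form of the transform.  */
v4si fun (v4si x)
{
  v4si zero = { 0, 0, 0, 0 };
  return x > zero;
}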