# [5/7] middle-end Convert bitclear <imm> + cmp<cc> #0 into cm<cc2> <imm2>

Message ID 20210929162106.GA5336@arm.com New show AArch64 Optimize truncation, shifts and bitmask comparisons | expand

## Commit Message

Tamar Christina Sept. 29, 2021, 4:21 p.m. UTC
```Hi All,

This optimizes the case where a mask Y which fulfills ~Y + 1 == pow2 is used to
clear some bits and the result is then compared against 0.  Such a sequence is
turned into a single compare, without the masking, against a different immediate.

We can do this for all unsigned compares and for signed we can do it for
comparisons of EQ and NE:

(x & (~255)) == 0 becomes x <= 255, which leaves it to the target to
optimally deal with the comparison.

This transformation has to be done in the mid-end because in RTL you don't have
the signs of the comparison operands and if the target needs an immediate this
should be floated outside of the loop.

The RTL loop invariant hoisting is done before split1.

i.e.

void fun1(int32_t *x, int n)
{
for (int i = 0; i < (n & -16); i++)
x[i] = (x[i]&(~255)) == 0;
}

now generates:

.L3:
ldr     q0, [x0]
cmhs    v0.4s, v2.4s, v0.4s
and     v0.16b, v1.16b, v0.16b
str     q0, [x0], 16
cmp     x0, x1
bne     .L3

and floats the immediate out of the loop, instead of:

.L3:
ldr     q0, [x0]
bic     v0.4s, #255
cmeq    v0.4s, v0.4s, #0
and     v0.16b, v1.16b, v0.16b
str     q0, [x0], 16
cmp     x0, x1
bne     .L3

Bootstrapped and regtested on aarch64-none-linux-gnu and
x86_64-pc-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* match.pd: New bitmask compare pattern.

gcc/testsuite/ChangeLog:

--- inline copy of patch --
diff --git a/gcc/match.pd b/gcc/match.pd
index 0fcfd0ea62c043dc217d0d560ce5b7e569b70e7d..df9212cb27d172856b9d43b0875262f96e8993c4 100644

--
```

Richard Biener Sept. 30, 2021, 6:17 a.m. UTC | #1
```On Wed, 29 Sep 2021, Tamar Christina wrote:

> Hi All,
>
> This optimizes the case where a mask Y which fulfills ~Y + 1 == pow2 is used to
> clear a some bits and then compared against 0 into one without the masking and
> a compare against a different bit immediate.
>
> We can do this for all unsigned compares and for signed we can do it for
> comparisons of EQ and NE:
>
> (x & (~255)) == 0 becomes x <= 255. Which for leaves it to the target to
> optimally deal with the comparison.
>
> This transformation has to be done in the mid-end because in RTL you don't have
> the signs of the comparison operands and if the target needs an immediate this
> should be floated outside of the loop.
>
> The RTL loop invariant hoisting is done before split1.
>
> i.e.
>
> void fun1(int32_t *x, int n)
> {
>     for (int i = 0; i < (n & -16); i++)
>       x[i] = (x[i]&(~255)) == 0;
> }
>
> now generates:
>
> .L3:
>         ldr     q0, [x0]
>         cmhs    v0.4s, v2.4s, v0.4s
>         and     v0.16b, v1.16b, v0.16b
>         str     q0, [x0], 16
>         cmp     x0, x1
>         bne     .L3
>
> and floats the immediate out of the loop.
>
>
> .L3:
>         ldr     q0, [x0]
>         bic     v0.4s, #255
>         cmeq    v0.4s, v0.4s, #0
>         and     v0.16b, v1.16b, v0.16b
>         str     q0, [x0], 16
>         cmp     x0, x1
>         bne     .L3
>
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> x86_64-pc-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* match.pd: New bitmask compare pattern.
>
> gcc/testsuite/ChangeLog:
>
>
> --- inline copy of patch --
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 0fcfd0ea62c043dc217d0d560ce5b7e569b70e7d..df9212cb27d172856b9d43b0875262f96e8993c4 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -4288,6 +4288,56 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>      (if (ic == ncmp)
>       (ncmp @0 @1))))))
>
> +/* Transform comparisons of the form (X & Y) CMP 0 to X CMP2 Z
> +   where ~Y + 1 == pow2 and Z = ~Y.  */
> +(for cmp (simple_comparison)
> + (simplify
> +  (cmp (bit_and:c @0 VECTOR_CST@1) integer_zerop)

Why not for INTEGER_CST as well?  We do have a related folding (only
for INTEGER_CST) that does

/* A & (2**N - 1) <= 2**K - 1 -> A & (2**N - 2**K) == 0
A & (2**N - 1) >  2**K - 1 -> A & (2**N - 2**K) != 0

which could be extended for integer vectors.  That said, can you please
place the pattern next to the above?

Why does the transform only work for uniform vector constants?  (I see
that the implementation becomes simpler, but then you should also handle
the INTEGER_CST case at least)

> +   (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@1))
> +	&& uniform_vector_p (@1))
> +    (with { tree elt = vector_cst_elt (@1, 0); }
> +     (switch
> +      (if (TYPE_UNSIGNED (TREE_TYPE (@1)) && tree_fits_uhwi_p (elt))

avoid tree_fits_uhwi_p and use wide_int here

> +	(with { unsigned HOST_WIDE_INT diff = tree_to_uhwi (elt);
> +	        tree tdiff = wide_int_to_tree (TREE_TYPE (elt), (~diff) + 1);
> +		tree newval = wide_int_to_tree (TREE_TYPE (elt), ~diff);
> +		tree newmask = build_uniform_cst (TREE_TYPE (@1), newval); }
> +	 (if (integer_pow2p (tdiff))

You don't seem to use 'tdiff' so please do this check in wide_int

> +	  (switch
> +	   /* ((mask & x) < 0) -> 0.  */
> +	   (if (cmp == LT_EXPR)
> +	    { build_zero_cst (TREE_TYPE (@1)); })
> +	   /* ((mask & x) <= 0) -> x < mask.  */
> +	   (if (cmp == LE_EXPR)
> +	    (lt @0 { newmask; }))
> +	   /* ((mask & x) == 0) -> x < mask.  */
> +	   (if (cmp == EQ_EXPR)
> +	    (le @0 { newmask; }))
> +	   /* ((mask & x) != 0) -> x > mask.  */
> +	   (if (cmp == NE_EXPR)
> +	    (gt @0 { newmask; }))
> +	   /* ((mask & x) >= 0) -> x <= mask.  */
> +	   (if (cmp == GE_EXPR)
> +	    (le @0 { newmask; }))
> +	    /* ((mask & x) > 0) -> x < mask.  */
> +	   (if (cmp == GT_EXPR)
> +	    (lt @0 { newmask; }))))))

You can avoid this switch with a lock-step `(for` that maps 'cmp'
to the result comparison code; for simplicity you can either keep
the LT_EXPR special-case or transform to an always true condition,
which will be simplified.

> +      (if (!TYPE_UNSIGNED (TREE_TYPE (@1)) && tree_fits_shwi_p (elt))
> +	(with { unsigned HOST_WIDE_INT diff = tree_to_shwi (elt);
> +		tree ustype = unsigned_type_for (TREE_TYPE (elt));
> +		tree uvtype = unsigned_type_for (TREE_TYPE (@1));
> +	        tree tdiff = wide_int_to_tree (ustype, (~diff) + 1);
> +	        tree udiff = wide_int_to_tree (ustype, ~diff);
> +		tree cst = build_uniform_cst (uvtype, udiff); }
> +	 (if (integer_pow2p (tdiff))
> +	  (switch
> +	    /* ((mask & x) == 0) -> x < mask.  */
> +	    (if (cmp == EQ_EXPR)
> +	     (le (convert:uvtype @0) { cst; }))
> +	    /* ((mask & x) != 0) -> x > mask.  */
> +	    (if (cmp == NE_EXPR)
> +	     (gt (convert:uvtype @0) { cst; })))))))))))
> +
>  /* Transform comparisons of the form X - Y CMP 0 to X CMP Y.
>     ??? The transformation is valid for the other operators if overflow
>     is undefined for the type, but performing it here badly interacts
> new file mode 100644
> index 0000000000000000000000000000000000000000..76a22a2313137a2a75dd711c2c15c2d3a34e15aa
> --- /dev/null
> @@ -0,0 +1,26 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +#define TYPE int32_t
> +
> +/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..32553d7ba2f823f7a21237451990d0a216d2f912
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) != 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) != 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump {>\s*.+\{ 255,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..e10cbf7fabe2dbf7ce436cdf37b0f8b207c58408
> --- /dev/null
> @@ -0,0 +1,17 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O3 -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +typedef unsigned int v4si __attribute__ ((vector_size (16)));
> +
> +__attribute__((noinline, noipa))
> +void fun(v4si *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> new file mode 100644
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..1bcf23ccf1447d6c8c999ed1eb25ba0a450028e1
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) >= 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) >= 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {=\s*.+\{ 1,.+\}} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..6e5a2fca9992efbc01f8dbbc6f95936e86643028
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) > 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) > 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {>\s*.+\{ 255,.+\}} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&`s*.+\{ 4294967040,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..018e7a4348c9fc461106c3d9d01291325d3406c2
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) <= 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) <= 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..798678fb7555052c93abc4ca34f617d640f73bb4
> --- /dev/null
> @@ -0,0 +1,24 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~1)) < 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~1)) < 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {__builtin_memset} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..1dabe834ed57dfa0be48c1dc3dbb226092c79a1a
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~1)) != 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~1)) != 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {>\s*.+\{ 1,.+\}} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967294,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~5)) == 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~5)) == 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-not {<=\s*.+\{ 4294967289,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump {&\s*.+\{ 4294967290,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..2b94065c025e0cbf71a21ac9b9d6314e24b0c2d9
> --- /dev/null
> @@ -0,0 +1,43 @@
> +#include <stdio.h>
> +
> +#ifndef N
> +#define N 50
> +#endif
> +
> +#ifndef TYPE
> +#define TYPE uint32_t
> +#endif
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N);
> +  fun2 (b, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..568c1ffc8bc4148efaeeba7a45a75ecbd3a7a3dd
> --- /dev/null
> @@ -0,0 +1,13 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O2 -save-temps" } */
> +
> +#include <arm_neon.h>
> +
> +uint32x4_t foo (int32x4_t a)
> +{
> +  int32x4_t cst = vdupq_n_s32 (255);
> +  int32x4_t zero = vdupq_n_s32 (0);
> +  return vceqq_s32 (vbicq_s32 (a, cst), zero);
> +}
> +
> +/* { dg-final { scan-assembler-not {\tbic\t} { xfail { aarch64*-*-* } } } } */
>
>
>
```
Tamar Christina Sept. 30, 2021, 9:56 a.m. UTC | #2
```> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Thursday, September 30, 2021 7:18 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> Subject: Re: [PATCH 5/7]middle-end Convert bitclear <imm> + cmp<cc> #0
> into cm<cc2> <imm2>
>
> On Wed, 29 Sep 2021, Tamar Christina wrote:
>
> > Hi All,
> >
> > This optimizes the case where a mask Y which fulfills ~Y + 1 == pow2
> > is used to clear a some bits and then compared against 0 into one
> > without the masking and a compare against a different bit immediate.
> >
> > We can do this for all unsigned compares and for signed we can do it
> > for comparisons of EQ and NE:
> >
> > (x & (~255)) == 0 becomes x <= 255. Which for leaves it to the target
> > to optimally deal with the comparison.
> >
> > This transformation has to be done in the mid-end because in RTL you
> > don't have the signs of the comparison operands and if the target
> > needs an immediate this should be floated outside of the loop.
> >
> > The RTL loop invariant hoisting is done before split1.
> >
> > i.e.
> >
> > void fun1(int32_t *x, int n)
> > {
> >     for (int i = 0; i < (n & -16); i++)
> >       x[i] = (x[i]&(~255)) == 0;
> > }
> >
> > now generates:
> >
> > .L3:
> >         ldr     q0, [x0]
> >         cmhs    v0.4s, v2.4s, v0.4s
> >         and     v0.16b, v1.16b, v0.16b
> >         str     q0, [x0], 16
> >         cmp     x0, x1
> >         bne     .L3
> >
> > and floats the immediate out of the loop.
> >
> >
> > .L3:
> >         ldr     q0, [x0]
> >         bic     v0.4s, #255
> >         cmeq    v0.4s, v0.4s, #0
> >         and     v0.16b, v1.16b, v0.16b
> >         str     q0, [x0], 16
> >         cmp     x0, x1
> >         bne     .L3
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* match.pd: New bitmask compare pattern.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.dg/bic-bitmask-10.c: New test.
> > 	* gcc.dg/bic-bitmask-11.c: New test.
> > 	* gcc.dg/bic-bitmask-12.c: New test.
> > 	* gcc.dg/bic-bitmask-2.c: New test.
> > 	* gcc.dg/bic-bitmask-3.c: New test.
> > 	* gcc.dg/bic-bitmask-4.c: New test.
> > 	* gcc.dg/bic-bitmask-5.c: New test.
> > 	* gcc.dg/bic-bitmask-6.c: New test.
> > 	* gcc.dg/bic-bitmask-7.c: New test.
> > 	* gcc.dg/bic-bitmask-8.c: New test.
> > 	* gcc.dg/bic-bitmask-9.c: New test.
> > 	* gcc.dg/bic-bitmask.h: New test.
> > 	* gcc.target/aarch64/bic-bitmask-1.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/match.pd b/gcc/match.pd index
> >
> 0fcfd0ea62c043dc217d0d560ce5b7e569b70e7d..df9212cb27d172856b9d43b08
> 752
> > 62f96e8993c4 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -4288,6 +4288,56 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >      (if (ic == ncmp)
> >       (ncmp @0 @1))))))
> >
> > +/* Transform comparisons of the form (X & Y) CMP 0 to X CMP2 Z
> > +   where ~Y + 1 == pow2 and Z = ~Y.  */ (for cmp (simple_comparison)
> > +(simplify
> > +  (cmp (bit_and:c @0 VECTOR_CST@1) integer_zerop)
>
> Why not for INTEGER_CST as well?  We do have a related folding (only for
> INTEGER_CST) that does
>

Because of a slight concern about de-optimizing what targets currently generate for the flag-setting variants.
For example, AArch64 generates worse code for foo than it does for bar:

int foo (int x)
{
if (x <= 0xFFFF)
return 1;

return 0;
}

int bar (int x)
{
if (x & ~0xFFFF)
return 1;

return 0;
}

This is because the flag-setting bitmask form was optimized more.  I can of course do this and fix
AArch64, but other targets may have the same issue.  For vectors this was less of a concern since
there's no flag setting there.

Do you still want the scalar version?

Thanks,
Tamar

> /* A & (2**N - 1) <= 2**K - 1 -> A & (2**N - 2**K) == 0
>    A & (2**N - 1) >  2**K - 1 -> A & (2**N - 2**K) != 0
>
> which could be extended for integer vectors.  That said, can you please place
> the pattern next to the above?
>
> Why does the transform only work for uniform vector constants?  (I see that
> the implementation becomes simpler, but then you should also handle the
> INTEGER_CST case at least)
>
> > +   (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@1))
> > +	&& uniform_vector_p (@1))
> > +    (with { tree elt = vector_cst_elt (@1, 0); }
> > +     (switch
> > +      (if (TYPE_UNSIGNED (TREE_TYPE (@1)) && tree_fits_uhwi_p (elt))
>
> avoid tree_fits_uhwi_p and use wide_int here
>
> > +	(with { unsigned HOST_WIDE_INT diff = tree_to_uhwi (elt);
> > +	        tree tdiff = wide_int_to_tree (TREE_TYPE (elt), (~diff) + 1);
> > +		tree newval = wide_int_to_tree (TREE_TYPE (elt), ~diff);
> > +		tree newmask = build_uniform_cst (TREE_TYPE (@1),
> newval); }
> > +	 (if (integer_pow2p (tdiff))
>
> You don't seem to use 'tdiff' so please do this check in wide_int
>
> > +	  (switch
> > +	   /* ((mask & x) < 0) -> 0.  */
> > +	   (if (cmp == LT_EXPR)
> > +	    { build_zero_cst (TREE_TYPE (@1)); })
> > +	   /* ((mask & x) <= 0) -> x < mask.  */
> > +	   (if (cmp == LE_EXPR)
> > +	    (lt @0 { newmask; }))
> > +	   /* ((mask & x) == 0) -> x < mask.  */
> > +	   (if (cmp == EQ_EXPR)
> > +	    (le @0 { newmask; }))
> > +	   /* ((mask & x) != 0) -> x > mask.  */
> > +	   (if (cmp == NE_EXPR)
> > +	    (gt @0 { newmask; }))
> > +	   /* ((mask & x) >= 0) -> x <= mask.  */
> > +	   (if (cmp == GE_EXPR)
> > +	    (le @0 { newmask; }))
> > +	    /* ((mask & x) > 0) -> x < mask.  */
> > +	   (if (cmp == GT_EXPR)
> > +	    (lt @0 { newmask; }))))))
>
> you can avoid this switch with a lock-step (for, that maps 'cmp'
> to the result comparison code (for simplicity you can either keep the LT_EXPR
> special-case or transform to an always true condition which will be simplified).
>
> > +      (if (!TYPE_UNSIGNED (TREE_TYPE (@1)) && tree_fits_shwi_p (elt))
> > +	(with { unsigned HOST_WIDE_INT diff = tree_to_shwi (elt);
> > +		tree ustype = unsigned_type_for (TREE_TYPE (elt));
> > +		tree uvtype = unsigned_type_for (TREE_TYPE (@1));
> > +	        tree tdiff = wide_int_to_tree (ustype, (~diff) + 1);
> > +	        tree udiff = wide_int_to_tree (ustype, ~diff);
> > +		tree cst = build_uniform_cst (uvtype, udiff); }
> > +	 (if (integer_pow2p (tdiff))
> > +	  (switch
> > +	    /* ((mask & x) == 0) -> x < mask.  */
> > +	    (if (cmp == EQ_EXPR)
> > +	     (le (convert:uvtype @0) { cst; }))
> > +	    /* ((mask & x) != 0) -> x > mask.  */
> > +	    (if (cmp == NE_EXPR)
> > +	     (gt (convert:uvtype @0) { cst; })))))))))))
> > +
> >  /* Transform comparisons of the form X - Y CMP 0 to X CMP Y.
> >     ??? The transformation is valid for the other operators if overflow
> >     is undefined for the type, but performing it here badly interacts
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..76a22a2313137a2a75dd711c2c
> 15
> > c2d3a34e15aa
> > --- /dev/null
> > @@ -0,0 +1,26 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > +
> > +#include <stdint.h>
> > +
> > +__attribute__((noinline, noipa))
> > +void fun1(int32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) == 0;
> > +}
> > +
> > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(int32_t
> > +*x, int n) {
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) == 0;
> > +}
> > +
> > +#define TYPE int32_t
> > +
> > +/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
> > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 }
> > +} */
> > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > +aarch64*-*-* } } } } */
> > +
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..32553d7ba2f823f7a212374519
> 90
> > d0a216d2f912
> > --- /dev/null
> > @@ -0,0 +1,25 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > +
> > +#include <stdint.h>
> > +
> > +__attribute__((noinline, noipa))
> > +void fun1(uint32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) != 0;
> > +}
> > +
> > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > +*x, int n) {
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) != 0;
> > +}
> > +
> > +
> > +/* { dg-final { scan-tree-dump {>\s*.+\{ 255,.+\}} dce7 } } */
> > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 }
> > +} */
> > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > +aarch64*-*-* } } } } */
> > +
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..e10cbf7fabe2dbf7ce436cdf37
> b0
> > f8b207c58408
> > --- /dev/null
> > @@ -0,0 +1,17 @@
> > +/* { dg-do assemble } */
> > +/* { dg-options "-O3 -fdump-tree-dce" } */
> > +
> > +#include <stdint.h>
> > +
> > +typedef unsigned int v4si __attribute__ ((vector_size (16)));
> > +
> > +__attribute__((noinline, noipa))
> > +void fun(v4si *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) == 0;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
> > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 }
> > +} */
> > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > +aarch64*-*-* } } } } */
> > new file mode 100644
> > index
> >
> fa
> > ec0be577e13f
> > --- /dev/null
> > @@ -0,0 +1,25 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > +
> > +#include <stdint.h>
> > +
> > +__attribute__((noinline, noipa))
> > +void fun1(uint32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) == 0;
> > +}
> > +
> > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > +*x, int n) {
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) == 0;
> > +}
> > +
> > +
> > +/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } }
> > +*/
> > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 }
> > +} */
> > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > +aarch64*-*-* } } } } */
> > +
> > new file mode 100644
> > index
> >
> fa
> > ec0be577e13f
> > --- /dev/null
> > @@ -0,0 +1,25 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > +
> > +#include <stdint.h>
> > +
> > +__attribute__((noinline, noipa))
> > +void fun1(uint32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) == 0;
> > +}
> > +
> > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > +*x, int n) {
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) == 0;
> > +}
> > +
> > +
> > +/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } }
> > +*/
> > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 }
> > +} */
> > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > +aarch64*-*-* } } } } */
> > +
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..1bcf23ccf1447d6c8c999ed1eb
> 25
> > ba0a450028e1
> > --- /dev/null
> > @@ -0,0 +1,25 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > +
> > +#include <stdint.h>
> > +
> > +__attribute__((noinline, noipa))
> > +void fun1(uint32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) >= 0;
> > +}
> > +
> > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > +*x, int n) {
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) >= 0;
> > +}
> > +
> > +
> > +/* { dg-final { scan-tree-dump-times {=\s*.+\{ 1,.+\}} 1 dce7 } } */
> > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 }
> > +} */
> > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > +aarch64*-*-* } } } } */
> > +
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..6e5a2fca9992efbc01f8dbbc6f
> 95
> > 936e86643028
> > --- /dev/null
> > @@ -0,0 +1,25 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > +
> > +#include <stdint.h>
> > +
> > +__attribute__((noinline, noipa))
> > +void fun1(uint32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) > 0;
> > +}
> > +
> > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > +*x, int n) {
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) > 0;
> > +}
> > +
> > +
> > +/* { dg-final { scan-tree-dump-times {>\s*.+\{ 255,.+\}} 1 dce7 } }
> > +*/
> > +/* { dg-final { scan-tree-dump-not {&`s*.+\{ 4294967040,.+\}} dce7 }
> > +} */
> > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > +aarch64*-*-* } } } } */
> > +
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..018e7a4348c9fc461106c3d9d0
> 12
> > 91325d3406c2
> > --- /dev/null
> > @@ -0,0 +1,25 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > +
> > +#include <stdint.h>
> > +
> > +__attribute__((noinline, noipa))
> > +void fun1(uint32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) <= 0;
> > +}
> > +
> > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > +*x, int n) {
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~255)) <= 0;
> > +}
> > +
> > +
> > +/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } }
> > +*/
> > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 }
> > +} */
> > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > +aarch64*-*-* } } } } */
> > +
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..798678fb7555052c93abc4ca34
> f6
> > 17d640f73bb4
> > --- /dev/null
> > @@ -0,0 +1,24 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > +
> > +#include <stdint.h>
> > +
> > +__attribute__((noinline, noipa))
> > +void fun1(uint32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~1)) < 0;
> > +}
> > +
> > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > +*x, int n) {
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~1)) < 0;
> > +}
> > +
> > +
> > +/* { dg-final { scan-tree-dump-times {__builtin_memset} 1 dce7 } } */
> > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > +aarch64*-*-* } } } } */
> > +
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..1dabe834ed57dfa0be48c1dc3
> dbb
> > 226092c79a1a
> > --- /dev/null
> > @@ -0,0 +1,25 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > +
> > +#include <stdint.h>
> > +
> > +__attribute__((noinline, noipa))
> > +void fun1(uint32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~1)) != 0;
> > +}
> > +
> > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > +*x, int n) {
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~1)) != 0;
> > +}
> > +
> > +
> > +/* { dg-final { scan-tree-dump-times {>\s*.+\{ 1,.+\}} 1 dce7 } } */
> > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967294,.+\}} dce7 }
> > +} */
> > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > +aarch64*-*-* } } } } */
> > +
> > new file mode 100644
> > index
> >
> ea
> > 26bb6b693e49
> > --- /dev/null
> > @@ -0,0 +1,25 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > +
> > +#include <stdint.h>
> > +
> > +__attribute__((noinline, noipa))
> > +void fun1(uint32_t *x, int n)
> > +{
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~5)) == 0;
> > +}
> > +
> > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > +*x, int n) {
> > +    for (int i = 0; i < (n & -16); i++)
> > +      x[i] = (x[i]&(~5)) == 0;
> > +}
> > +
> > +
> > +/* { dg-final { scan-tree-dump-not {<=\s*.+\{ 4294967289,.+\}} dce7 }
> > +} */
> > +/* { dg-final { scan-tree-dump {&\s*.+\{ 4294967290,.+\}} dce7 } } */
> > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > +aarch64*-*-* } } } } */
> > +
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..2b94065c025e0cbf71a21ac9b9
> d6
> > 314e24b0c2d9
> > --- /dev/null
> > @@ -0,0 +1,43 @@
> > +#include <stdio.h>
> > +
> > +#ifndef N
> > +#define N 50
> > +#endif
> > +
> > +#ifndef TYPE
> > +#define TYPE uint32_t
> > +#endif
> > +
> > +#ifndef DEBUG
> > +#define DEBUG 0
> > +#endif
> > +
> > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > +
> > +int main ()
> > +{
> > +  TYPE a[N];
> > +  TYPE b[N];
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      a[i] = BASE + i * 13;
> > +      b[i] = BASE + i * 13;
> > +      if (DEBUG)
> > +        printf ("%d: 0x%x\n", i, a[i]);
> > +    }
> > +
> > +  fun1 (a, N);
> > +  fun2 (b, N);
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      if (DEBUG)
> > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > +
> > +      if (a[i] != b[i])
> > +        __builtin_abort ();
> > +    }
> > +  return 0;
> > +}
> > +
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..568c1ffc8bc4148efaeeba7a45
> a7
> > 5ecbd3a7a3dd
> > --- /dev/null
> > @@ -0,0 +1,13 @@
> > +/* { dg-do assemble } */
> > +/* { dg-options "-O2 -save-temps" } */
> > +
> > +#include <arm_neon.h>
> > +
> > +uint32x4_t foo (int32x4_t a)
> > +{
> > +  int32x4_t cst = vdupq_n_s32 (255);
> > +  int32x4_t zero = vdupq_n_s32 (0);
> > +  return vceqq_s32 (vbicq_s32 (a, cst), zero); }
> > +
> > +/* { dg-final { scan-assembler-not {\tbic\t} { xfail { aarch64*-*-* }
> > +} } } */
> >
> >
> >
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
```
Richard Biener Sept. 30, 2021, 10:26 a.m. UTC | #3
```On Thu, 30 Sep 2021, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Thursday, September 30, 2021 7:18 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> > Subject: Re: [PATCH 5/7]middle-end Convert bitclear <imm> + cmp<cc> #0
> > into cm<cc2> <imm2>
> >
> > On Wed, 29 Sep 2021, Tamar Christina wrote:
> >
> > > Hi All,
> > >
> > > This optimizes the case where a mask Y which fulfills ~Y + 1 == pow2
> > > > is used to clear some bits and then compared against 0 into one
> > > without the masking and a compare against a different bit immediate.
> > >
> > > We can do this for all unsigned compares and for signed we can do it
> > > for comparisons of EQ and NE:
> > >
> > > > (x & (~255)) == 0 becomes x <= 255, which leaves it to the target
> > > to optimally deal with the comparison.
> > >
> > > This transformation has to be done in the mid-end because in RTL you
> > > don't have the signs of the comparison operands and if the target
> > > needs an immediate this should be floated outside of the loop.
> > >
> > > The RTL loop invariant hoisting is done before split1.
> > >
> > > i.e.
> > >
> > > void fun1(int32_t *x, int n)
> > > {
> > >     for (int i = 0; i < (n & -16); i++)
> > >       x[i] = (x[i]&(~255)) == 0;
> > > }
> > >
> > > now generates:
> > >
> > > .L3:
> > >         ldr     q0, [x0]
> > >         cmhs    v0.4s, v2.4s, v0.4s
> > >         and     v0.16b, v1.16b, v0.16b
> > >         str     q0, [x0], 16
> > >         cmp     x0, x1
> > >         bne     .L3
> > >
> > > and floats the immediate out of the loop.
> > >
> > >
> > > .L3:
> > >         ldr     q0, [x0]
> > >         bic     v0.4s, #255
> > >         cmeq    v0.4s, v0.4s, #0
> > >         and     v0.16b, v1.16b, v0.16b
> > >         str     q0, [x0], 16
> > >         cmp     x0, x1
> > >         bne     .L3
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* match.pd: New bitmask compare pattern.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > 	* gcc.dg/bic-bitmask-10.c: New test.
> > > 	* gcc.dg/bic-bitmask-11.c: New test.
> > > 	* gcc.dg/bic-bitmask-12.c: New test.
> > > 	* gcc.dg/bic-bitmask-2.c: New test.
> > > 	* gcc.dg/bic-bitmask-3.c: New test.
> > > 	* gcc.dg/bic-bitmask-4.c: New test.
> > > 	* gcc.dg/bic-bitmask-5.c: New test.
> > > 	* gcc.dg/bic-bitmask-6.c: New test.
> > > 	* gcc.dg/bic-bitmask-7.c: New test.
> > > 	* gcc.dg/bic-bitmask-8.c: New test.
> > > 	* gcc.dg/bic-bitmask-9.c: New test.
> > > 	* gcc.dg/bic-bitmask.h: New test.
> > > 	* gcc.target/aarch64/bic-bitmask-1.c: New test.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > >
> > 0fcfd0ea62c043dc217d0d560ce5b7e569b70e7d..df9212cb27d172856b9d43b08
> > 752
> > > 62f96e8993c4 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -4288,6 +4288,56 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >      (if (ic == ncmp)
> > >       (ncmp @0 @1))))))
> > >
> > > +/* Transform comparisons of the form (X & Y) CMP 0 to X CMP2 Z
> > > +   where ~Y + 1 == pow2 and Z = ~Y.  */ (for cmp (simple_comparison)
> > > +(simplify
> > > +  (cmp (bit_and:c @0 VECTOR_CST@1) integer_zerop)
> >
> > Why not for INTEGER_CST as well?  We do have a related folding (only for
> > INTEGER_CST) that does
> >
>
> Because of a slight concern to de-optimize what targets currently generate for the flag setting variants.
> So for example AArch64 generates worse code for foo than it does bar
>
> int foo (int x)
> {
>     if (x <= 0xFFFF)
>       return 1;
>
>     return 0;
> }
>
> int bar (int x)
> {
>     if (x & ~0xFFFF)
>       return 1;
>
>     return 0;
> }
>
> Because the flag setting bitmask was optimized more.  I can of course do this and fix
> AArch64 but other targets may have the same issue.  For vectors this was less of a concern since
> there's no flag setting there.
>
> Do you still want the scalar version?

Yes, the simplification result is simpler and thus more canonical on
GIMPLE.  On x86 we generate

xorl    %eax, %eax
cmpl    \$65535, %edi
setle   %al
ret

vs

xorl    %eax, %eax
andl    \$-65536, %edi
setne   %al
ret

which are equivalent I think (and would easily be transformed
using a peephole if required).

Richard.

> Thanks,
> Tamar
>
> > /* A & (2**N - 1) <= 2**K - 1 -> A & (2**N - 2**K) == 0
> >    A & (2**N - 1) >  2**K - 1 -> A & (2**N - 2**K) != 0
> >
> > which could be extended for integer vectors.  That said, can you please place
> > the pattern next to the above?
> >
> > Why does the transform only work for uniform vector constants?  (I see that
> > the implementation becomes simpler, but then you should also handle the
> > INTEGER_CST case at least)
> >
> > > +   (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@1))
> > > +	&& uniform_vector_p (@1))
> > > +    (with { tree elt = vector_cst_elt (@1, 0); }
> > > +     (switch
> > > +      (if (TYPE_UNSIGNED (TREE_TYPE (@1)) && tree_fits_uhwi_p (elt))
> >
> > avoid tree_fits_uhwi_p and use wide_int here
> >
> > > +	(with { unsigned HOST_WIDE_INT diff = tree_to_uhwi (elt);
> > > +	        tree tdiff = wide_int_to_tree (TREE_TYPE (elt), (~diff) + 1);
> > > +		tree newval = wide_int_to_tree (TREE_TYPE (elt), ~diff);
> > > +		tree newmask = build_uniform_cst (TREE_TYPE (@1),
> > newval); }
> > > +	 (if (integer_pow2p (tdiff))
> >
> > You don't seem to use 'tdiff' so please do this check in wide_int
> >
> > > +	  (switch
> > > +	   /* ((mask & x) < 0) -> 0.  */
> > > +	   (if (cmp == LT_EXPR)
> > > +	    { build_zero_cst (TREE_TYPE (@1)); })
> > > +	   /* ((mask & x) <= 0) -> x < mask.  */
> > > +	   (if (cmp == LE_EXPR)
> > > +	    (lt @0 { newmask; }))
> > > +	   /* ((mask & x) == 0) -> x < mask.  */
> > > +	   (if (cmp == EQ_EXPR)
> > > +	    (le @0 { newmask; }))
> > > +	   /* ((mask & x) != 0) -> x > mask.  */
> > > +	   (if (cmp == NE_EXPR)
> > > +	    (gt @0 { newmask; }))
> > > +	   /* ((mask & x) >= 0) -> x <= mask.  */
> > > +	   (if (cmp == GE_EXPR)
> > > +	    (le @0 { newmask; }))
> > > +	    /* ((mask & x) > 0) -> x < mask.  */
> > > +	   (if (cmp == GT_EXPR)
> > > +	    (lt @0 { newmask; }))))))
> >
> > you can avoid this switch with a lock-step (for, that maps 'cmp'
> > to the result comparison code (for simplicity you can either keep the LT_EXPR
> > special-case or transform to an always true condition which will be simplified).
> >
> > > +      (if (!TYPE_UNSIGNED (TREE_TYPE (@1)) && tree_fits_shwi_p (elt))
> > > +	(with { unsigned HOST_WIDE_INT diff = tree_to_shwi (elt);
> > > +		tree ustype = unsigned_type_for (TREE_TYPE (elt));
> > > +		tree uvtype = unsigned_type_for (TREE_TYPE (@1));
> > > +	        tree tdiff = wide_int_to_tree (ustype, (~diff) + 1);
> > > +	        tree udiff = wide_int_to_tree (ustype, ~diff);
> > > +		tree cst = build_uniform_cst (uvtype, udiff); }
> > > +	 (if (integer_pow2p (tdiff))
> > > +	  (switch
> > > +	    /* ((mask & x) == 0) -> x < mask.  */
> > > +	    (if (cmp == EQ_EXPR)
> > > +	     (le (convert:uvtype @0) { cst; }))
> > > +	    /* ((mask & x) != 0) -> x > mask.  */
> > > +	    (if (cmp == NE_EXPR)
> > > +	     (gt (convert:uvtype @0) { cst; })))))))))))
> > > +
> > >  /* Transform comparisons of the form X - Y CMP 0 to X CMP Y.
> > >     ??? The transformation is valid for the other operators if overflow
> > >     is undefined for the type, but performing it here badly interacts
> > > diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-10.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..76a22a2313137a2a75dd711c2c
> > 15
> > > c2d3a34e15aa
> > > --- /dev/null
> > > @@ -0,0 +1,26 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +__attribute__((noinline, noipa))
> > > +void fun1(int32_t *x, int n)
> > > +{
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) == 0;
> > > +}
> > > +
> > > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(int32_t
> > > +*x, int n) {
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) == 0;
> > > +}
> > > +
> > > +#define TYPE int32_t
> > > +
> > > +/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
> > > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 }
> > > +} */
> > > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > > +aarch64*-*-* } } } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-11.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..32553d7ba2f823f7a212374519
> > 90
> > > d0a216d2f912
> > > --- /dev/null
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +__attribute__((noinline, noipa))
> > > +void fun1(uint32_t *x, int n)
> > > +{
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) != 0;
> > > +}
> > > +
> > > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > > +*x, int n) {
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) != 0;
> > > +}
> > > +
> > > +
> > > +/* { dg-final { scan-tree-dump {>\s*.+\{ 255,.+\}} dce7 } } */
> > > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 }
> > > +} */
> > > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > > +aarch64*-*-* } } } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-12.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..e10cbf7fabe2dbf7ce436cdf37
> > b0
> > > f8b207c58408
> > > --- /dev/null
> > > @@ -0,0 +1,17 @@
> > > +/* { dg-do assemble } */
> > > +/* { dg-options "-O3 -fdump-tree-dce" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +typedef unsigned int v4si __attribute__ ((vector_size (16)));
> > > +
> > > +__attribute__((noinline, noipa))
> > > +void fun(v4si *x, int n)
> > > +{
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) == 0;
> > > +}
> > > +
> > > +/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
> > > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 }
> > > +} */
> > > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > > +aarch64*-*-* } } } } */
> > > diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-2.c
> > > new file mode 100644
> > > index
> > >
> > fa
> > > ec0be577e13f
> > > --- /dev/null
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +__attribute__((noinline, noipa))
> > > +void fun1(uint32_t *x, int n)
> > > +{
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) == 0;
> > > +}
> > > +
> > > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > > +*x, int n) {
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) == 0;
> > > +}
> > > +
> > > +
> > > +/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } }
> > > +*/
> > > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 }
> > > +} */
> > > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > > +aarch64*-*-* } } } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-3.c
> > > new file mode 100644
> > > index
> > >
> > fa
> > > ec0be577e13f
> > > --- /dev/null
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +__attribute__((noinline, noipa))
> > > +void fun1(uint32_t *x, int n)
> > > +{
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) == 0;
> > > +}
> > > +
> > > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > > +*x, int n) {
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) == 0;
> > > +}
> > > +
> > > +
> > > +/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } }
> > > +*/
> > > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 }
> > > +} */
> > > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > > +aarch64*-*-* } } } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-4.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..1bcf23ccf1447d6c8c999ed1eb
> > 25
> > > ba0a450028e1
> > > --- /dev/null
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +__attribute__((noinline, noipa))
> > > +void fun1(uint32_t *x, int n)
> > > +{
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) >= 0;
> > > +}
> > > +
> > > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > > +*x, int n) {
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) >= 0;
> > > +}
> > > +
> > > +
> > > +/* { dg-final { scan-tree-dump-times {=\s*.+\{ 1,.+\}} 1 dce7 } } */
> > > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 }
> > > +} */
> > > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > > +aarch64*-*-* } } } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-5.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..6e5a2fca9992efbc01f8dbbc6f
> > 95
> > > 936e86643028
> > > --- /dev/null
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +__attribute__((noinline, noipa))
> > > +void fun1(uint32_t *x, int n)
> > > +{
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) > 0;
> > > +}
> > > +
> > > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > > +*x, int n) {
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) > 0;
> > > +}
> > > +
> > > +
> > > +/* { dg-final { scan-tree-dump-times {>\s*.+\{ 255,.+\}} 1 dce7 } }
> > > +*/
> > > +/* { dg-final { scan-tree-dump-not {&`s*.+\{ 4294967040,.+\}} dce7 }
> > > +} */
> > > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > > +aarch64*-*-* } } } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-6.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..018e7a4348c9fc461106c3d9d0
> > 12
> > > 91325d3406c2
> > > --- /dev/null
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +__attribute__((noinline, noipa))
> > > +void fun1(uint32_t *x, int n)
> > > +{
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) <= 0;
> > > +}
> > > +
> > > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > > +*x, int n) {
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~255)) <= 0;
> > > +}
> > > +
> > > +
> > > +/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } }
> > > +*/
> > > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 }
> > > +} */
> > > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > > +aarch64*-*-* } } } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-7.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..798678fb7555052c93abc4ca34
> > f6
> > > 17d640f73bb4
> > > --- /dev/null
> > > @@ -0,0 +1,24 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +__attribute__((noinline, noipa))
> > > +void fun1(uint32_t *x, int n)
> > > +{
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~1)) < 0;
> > > +}
> > > +
> > > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > > +*x, int n) {
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~1)) < 0;
> > > +}
> > > +
> > > +
> > > +/* { dg-final { scan-tree-dump-times {__builtin_memset} 1 dce7 } } */
> > > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > > +aarch64*-*-* } } } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-8.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..1dabe834ed57dfa0be48c1dc3
> > dbb
> > > 226092c79a1a
> > > --- /dev/null
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +__attribute__((noinline, noipa))
> > > +void fun1(uint32_t *x, int n)
> > > +{
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~1)) != 0;
> > > +}
> > > +
> > > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > > +*x, int n) {
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~1)) != 0;
> > > +}
> > > +
> > > +
> > > +/* { dg-final { scan-tree-dump-times {>\s*.+\{ 1,.+\}} 1 dce7 } } */
> > > +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967294,.+\}} dce7 }
> > > +} */
> > > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > > +aarch64*-*-* } } } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-9.c
> > > new file mode 100644
> > > index
> > >
> > ea
> > > 26bb6b693e49
> > > --- /dev/null
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +__attribute__((noinline, noipa))
> > > +void fun1(uint32_t *x, int n)
> > > +{
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~5)) == 0;
> > > +}
> > > +
> > > +__attribute__((noinline, noipa, optimize("O1"))) void fun2(uint32_t
> > > +*x, int n) {
> > > +    for (int i = 0; i < (n & -16); i++)
> > > +      x[i] = (x[i]&(~5)) == 0;
> > > +}
> > > +
> > > +
> > > +/* { dg-final { scan-tree-dump-not {<=\s*.+\{ 4294967289,.+\}} dce7 }
> > > +} */
> > > +/* { dg-final { scan-tree-dump {&\s*.+\{ 4294967290,.+\}} dce7 } } */
> > > +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target {
> > > +aarch64*-*-* } } } } */
> > > +
> > > diff --git a/gcc/testsuite/gcc.dg/bic-bitmask.h
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..2b94065c025e0cbf71a21ac9b9
> > d6
> > > 314e24b0c2d9
> > > --- /dev/null
> > > @@ -0,0 +1,43 @@
> > > +#include <stdio.h>
> > > +
> > > +#ifndef N
> > > +#define N 50
> > > +#endif
> > > +
> > > +#ifndef TYPE
> > > +#define TYPE uint32_t
> > > +#endif
> > > +
> > > +#ifndef DEBUG
> > > +#define DEBUG 0
> > > +#endif
> > > +
> > > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > > +
> > > +int main ()
> > > +{
> > > +  TYPE a[N];
> > > +  TYPE b[N];
> > > +
> > > +  for (int i = 0; i < N; ++i)
> > > +    {
> > > +      a[i] = BASE + i * 13;
> > > +      b[i] = BASE + i * 13;
> > > +      if (DEBUG)
> > > +        printf ("%d: 0x%x\n", i, a[i]);
> > > +    }
> > > +
> > > +  fun1 (a, N);
> > > +  fun2 (b, N);
> > > +
> > > +  for (int i = 0; i < N; ++i)
> > > +    {
> > > +      if (DEBUG)
> > > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > > +
> > > +      if (a[i] != b[i])
> > > +        __builtin_abort ();
> > > +    }
> > > +  return 0;
> > > +}
> > > +
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/bic-bitmask-1.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..568c1ffc8bc4148efaeeba7a45
> > a7
> > > 5ecbd3a7a3dd
> > > --- /dev/null
> > > @@ -0,0 +1,13 @@
> > > +/* { dg-do assemble } */
> > > +/* { dg-options "-O2 -save-temps" } */
> > > +
> > > +#include <arm_neon.h>
> > > +
> > > +uint32x4_t foo (int32x4_t a)
> > > +{
> > > +  int32x4_t cst = vdupq_n_s32 (255);
> > > +  int32x4_t zero = vdupq_n_s32 (0);
> > > +  return vceqq_s32 (vbicq_s32 (a, cst), zero); }
> > > +
> > > +/* { dg-final { scan-assembler-not {\tbic\t} { xfail { aarch64*-*-* }
> > > +} } } */
> > >
> > >
> > >
> >
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> > Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
>
```
Tamar Christina Oct. 5, 2021, 12:55 p.m. UTC | #4
```Hi All,

Here's a new version of the patch handling both scalar and vector modes
and non-uniform constant vectors.

Bootstrapped Regtested on aarch64-none-linux-gnu,
x86_64-pc-linux-gnu and no regressions.

In order to not break IVopts and CSE I have added a
requirement for the scalar version to be single use.

Thanks,
Tamar

gcc/ChangeLog:

* match.pd: Use it in new bitmask compare pattern.

gcc/testsuite/ChangeLog:

--- inline copy of patch --

diff --git a/gcc/match.pd b/gcc/match.pd
index 0fcfd0ea62c043dc217d0d560ce5b7e569b70e7d..7d2a24dbc5e9644a09968f877e12a824d8ba1caa 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -37,7 +37,8 @@ along with GCC; see the file COPYING3.  If not see
integer_pow2p
uniform_integer_cst_p
HONOR_NANS
-   uniform_vector_p)
+   uniform_vector_p
+   bitmask_inv_cst_vector_p)

/* Operator lists.  */
(define_operator_list tcc_comparison
@@ -4900,6 +4901,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(eqcmp (bit_and @1 { wide_int_to_tree (ty, mask - rhs); })
{ build_zero_cst (ty); }))))))

+/* Transform comparisons of the form (X & Y) CMP 0 to X CMP2 Z
+   where ~Y + 1 == pow2 and Z = ~Y.  */
+(for cst (VECTOR_CST INTEGER_CST)
+ (for cmp (le eq ne ge gt)
+      icmp (le le gt le gt)
+ (simplify
+  (cmp (bit_and:c@2 @0 cst@1) integer_zerop)
+   (with { tree csts = bitmask_inv_cst_vector_p (@1); }
+     (switch
+      (if (csts && TYPE_UNSIGNED (TREE_TYPE (@1))
+	   && (VECTOR_TYPE_P (TREE_TYPE (@1)) || single_use (@2)))
+       (icmp @0 { csts; }))
+      (if (csts && !TYPE_UNSIGNED (TREE_TYPE (@1))
+	   && (cmp == EQ_EXPR || cmp == NE_EXPR)
+	   && (VECTOR_TYPE_P (TREE_TYPE (@1)) || single_use (@2)))
+       (with { tree utype = unsigned_type_for (TREE_TYPE (@1)); }
+	(icmp (convert:utype @0) { csts; }))))))))
+
/* -A CMP -B -> B CMP A.  */
(for cmp (tcc_comparison)
scmp (swapped_tcc_comparison)
new file mode 100644
index 0000000000000000000000000000000000000000..76a22a2313137a2a75dd711c2c15c2d3a34e15aa
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+#define TYPE int32_t
+
+/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..32553d7ba2f823f7a21237451990d0a216d2f912
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) != 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) != 0;
+}
+
+
+/* { dg-final { scan-tree-dump {>\s*.+\{ 255,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..e10cbf7fabe2dbf7ce436cdf37b0f8b207c58408
--- /dev/null
@@ -0,0 +1,17 @@
+/* { dg-do assemble } */
+/* { dg-options "-O3 -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+typedef unsigned int v4si __attribute__ ((vector_size (16)));
+
+__attribute__((noinline, noipa))
+void fun(v4si *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
new file mode 100644
index 0000000000000000000000000000000000000000..8ebaa30238c761b8831685209a7490f06591c000
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O0 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {<=\s* 255} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s* 4294967040} dce7 } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..0a2789433f8bc45a590d136179b8ee4ec5cda1c1
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O0")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {<=\s* 255} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s* 4294967040} dce7 } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..53a2c986b00f159ae5fa839798850ac42e9b9504
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) >= 0;
+}
+
+__attribute__((noinline, noipa, optimize("O0")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) >= 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {=\s* 1} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s* 4294967040} dce7 } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..fff4670e2a47106c614c1224b8e8aac091d6e821
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) > 0;
+}
+
+__attribute__((noinline, noipa, optimize("O0")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) > 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {>\s* 255} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s* 4294967040} dce7 } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..7b877fde017de0fb7aeabd4152a1593f07d52e71
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) <= 0;
+}
+
+__attribute__((noinline, noipa, optimize("O0")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) <= 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {<=\s* 255} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s* 4294967040} dce7 } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..8455bf5286a7cc9f08713489e29b5f2b6f6fc012
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) < 0;
+}
+
+__attribute__((noinline, noipa, optimize("O0")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) < 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {= 0} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) != 0;
+}
+
+__attribute__((noinline, noipa, optimize("O0")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) != 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {>\s* 1} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s* 4294967294} dce7 } } */
+
new file mode 100644
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..befd25cb4aac6fe206110e7ca80816dd6fc0ed94
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~5)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O0")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~5)) == 0;
+}
+
+
+/* { dg-final { scan-tree-dump-not {<=\s* 4294967289} dce7 } } */
+/* { dg-final { scan-tree-dump {&\s* 4294967290} dce7 } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..b037ffc248eef1509c642abb0087b77882679150
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O0")))
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+#define TYPE int32_t
+
+/* { dg-final { scan-tree-dump {<=\s* 255} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s* 4294967290} dce7 } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..20027a6367a1dcec12492435f260250a7b54aca1
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) != 0;
+}
+
+__attribute__((noinline, noipa, optimize("O0")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) != 0;
+}
+
+
+/* { dg-final { scan-tree-dump {>\s* 255} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s* 4294967290} dce7 } } */
+
new file mode 100644
--- /dev/null
@@ -0,0 +1,16 @@
+/* { dg-do assemble } */
+/* { dg-options "-O1 -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+typedef unsigned int v4si __attribute__ ((vector_size (16)));
+
+__attribute__((noinline, noipa))
+v4si fun(v4si x)
+{
+    v4si mask = { 255, 15, 1, 0xFFFF };
+    v4si zeros = {0};
+    return (x & ~mask) == zeros;
+}
+
+/* { dg-final { scan-tree-dump {<=\s*.+\{ 255, 15, 1, 65535 \}} dce7 } } */
new file mode 100644
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..1bcf23ccf1447d6c8c999ed1eb25ba0a450028e1
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) >= 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) >= 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {=\s*.+\{ 1,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..d053727076abedefdecfda7c4fea6f92d54a94a5
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) > 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) > 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {>\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..018e7a4348c9fc461106c3d9d01291325d3406c2
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) <= 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) <= 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..798678fb7555052c93abc4ca34f617d640f73bb4
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) < 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) < 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {__builtin_memset} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..1dabe834ed57dfa0be48c1dc3dbb226092c79a1a
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) != 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) != 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {>\s*.+\{ 1,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967294,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~5)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~5)) == 0;
+}
+
+
+/* { dg-final { scan-tree-dump-not {<=\s*.+\{ 4294967289,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..faf80b974db07a7d817a615cc64a35f1020e9764
--- /dev/null
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N);
+  fun2 (b, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
new file mode 100644
index 0000000000000000000000000000000000000000..568c1ffc8bc4148efaeeba7a45a75ecbd3a7a3dd
--- /dev/null
@@ -0,0 +1,13 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -save-temps" } */
+
+#include <arm_neon.h>
+
+uint32x4_t foo (int32x4_t a)
+{
+  int32x4_t cst = vdupq_n_s32 (255);
+  int32x4_t zero = vdupq_n_s32 (0);
+  return vceqq_s32 (vbicq_s32 (a, cst), zero);
+}
+
+/* { dg-final { scan-assembler-not {\tbic\t} { xfail { aarch64*-*-* } } } } */
diff --git a/gcc/tree.h b/gcc/tree.h
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -4862,6 +4862,11 @@ extern bool integer_minus_onep (const_tree);

extern bool integer_pow2p (const_tree);

+/* Checks to see if T is a constant or a constant vector and if each element E
+   adheres to ~E + 1 == pow2 then return ~E otherwise NULL_TREE.  */
+
+extern tree bitmask_inv_cst_vector_p (tree);
+
/* integer_nonzerop (tree x) is nonzero if X is an integer constant
with a nonzero value.  */

diff --git a/gcc/tree.c b/gcc/tree.c
index cba3bca41b3a50557939267b7a06df04b5d042b7..801f121a368b088d0f769f16f2ed29e320e71556 100644
--- a/gcc/tree.c
+++ b/gcc/tree.c
@@ -10246,6 +10246,59 @@ uniform_integer_cst_p (tree t)
return NULL_TREE;
}

+/* Checks to see if T is a constant or a constant vector and if each element E
+   adheres to ~E + 1 == pow2 then return ~E otherwise NULL_TREE.  */
+
+tree
+bitmask_inv_cst_vector_p (tree t)
+{
+
+  tree_code code = TREE_CODE (t);
+  tree type = TREE_TYPE (t);
+
+  if (!INTEGRAL_TYPE_P (type)
+      && !VECTOR_INTEGER_TYPE_P (type))
+    return NULL_TREE;
+
+  unsigned HOST_WIDE_INT nelts = 1;
+  tree cst;
+  unsigned int idx = 0;
+  bool uniform = uniform_integer_cst_p (t);
+  tree newtype = unsigned_type_for (type);
+  tree_vector_builder builder;
+  if (code == INTEGER_CST)
+    cst = t;
+  else
+    {
+      if (!VECTOR_CST_NELTS (t).is_constant (&nelts))
+	return NULL_TREE;
+
+      cst = vector_cst_elt (t, 0);
+      builder.new_vector (newtype, nelts, 1);
+    }
+
+  tree ty = unsigned_type_for (TREE_TYPE (cst));
+
+  do {
+    if (idx > 0)
+      cst = vector_cst_elt (t, idx);
+    wide_int icst = wi::to_wide (cst);
+    wide_int inv =  wi::bit_not (icst);
+    icst = wi::add (1, inv);
+    if (wi::popcount (icst) != 1)
+      return NULL_TREE;
+
+    tree newcst = wide_int_to_tree (ty, inv);
+
+    if (uniform)
+      return build_uniform_cst (newtype, newcst);
+
+    builder.quick_push (newcst);
+  } while (++idx < nelts);
+
+  return builder.build ();
+}
+
/* If VECTOR_CST T has a single nonzero element, return the index of that
element, otherwise return -1.  */
```
Richard Biener Oct. 13, 2021, 12:17 p.m. UTC | #5
```On Tue, 5 Oct 2021, Tamar Christina wrote:

> Hi All,
>
> Here's a new version of the patch handling both scalar and vector modes
> and non-uniform constant vectors.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> x86_64-pc-linux-gnu and no regressions.
>
> In order to not break IVopts and CSE I have added a
> requirement for the scalar version to be single use.

OK.

Thanks,
Richard.

> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* tree.c (bitmask_inv_cst_vector_p): New.
> 	* tree.h (bitmask_inv_cst_vector_p): New.
> 	* match.pd: Use it in new bitmask compare pattern.
>
> gcc/testsuite/ChangeLog:
>
>
> --- inline copy of patch --
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 0fcfd0ea62c043dc217d0d560ce5b7e569b70e7d..7d2a24dbc5e9644a09968f877e12a824d8ba1caa 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -37,7 +37,8 @@ along with GCC; see the file COPYING3.  If not see
>     integer_pow2p
>     uniform_integer_cst_p
>     HONOR_NANS
> -   uniform_vector_p)
> +   uniform_vector_p
> +   bitmask_inv_cst_vector_p)
>
>  /* Operator lists.  */
>  (define_operator_list tcc_comparison
> @@ -4900,6 +4901,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>        (eqcmp (bit_and @1 { wide_int_to_tree (ty, mask - rhs); })
>  	     { build_zero_cst (ty); }))))))
>
> +/* Transform comparisons of the form (X & Y) CMP 0 to X CMP2 Z
> +   where ~Y + 1 == pow2 and Z = ~Y.  */
> +(for cst (VECTOR_CST INTEGER_CST)
> + (for cmp (le eq ne ge gt)
> +      icmp (le le gt le gt)
> + (simplify
> +  (cmp (bit_and:c@2 @0 cst@1) integer_zerop)
> +   (with { tree csts = bitmask_inv_cst_vector_p (@1); }
> +     (switch
> +      (if (csts && TYPE_UNSIGNED (TREE_TYPE (@1))
> +	   && (VECTOR_TYPE_P (TREE_TYPE (@1)) || single_use (@2)))
> +       (icmp @0 { csts; }))
> +      (if (csts && !TYPE_UNSIGNED (TREE_TYPE (@1))
> +	   && (cmp == EQ_EXPR || cmp == NE_EXPR)
> +	   && (VECTOR_TYPE_P (TREE_TYPE (@1)) || single_use (@2)))
> +       (with { tree utype = unsigned_type_for (TREE_TYPE (@1)); }
> +	(icmp (convert:utype @0) { csts; }))))))))
> +
>  /* -A CMP -B -> B CMP A.  */
>  (for cmp (tcc_comparison)
>       scmp (swapped_tcc_comparison)
> new file mode 100644
> index 0000000000000000000000000000000000000000..76a22a2313137a2a75dd711c2c15c2d3a34e15aa
> --- /dev/null
> @@ -0,0 +1,26 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +#define TYPE int32_t
> +
> +/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..32553d7ba2f823f7a21237451990d0a216d2f912
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) != 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) != 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump {>\s*.+\{ 255,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..e10cbf7fabe2dbf7ce436cdf37b0f8b207c58408
> --- /dev/null
> @@ -0,0 +1,17 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O3 -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +typedef unsigned int v4si __attribute__ ((vector_size (16)));
> +
> +__attribute__((noinline, noipa))
> +void fun(v4si *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> new file mode 100644
> index 0000000000000000000000000000000000000000..8ebaa30238c761b8831685209a7490f06591c000
> --- /dev/null
> @@ -0,0 +1,24 @@
> +/* { dg-do run } */
> +/* { dg-options "-O0 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {<=\s* 255} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s* 4294967040} dce7 } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..0a2789433f8bc45a590d136179b8ee4ec5cda1c1
> --- /dev/null
> @@ -0,0 +1,24 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O0")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {<=\s* 255} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s* 4294967040} dce7 } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..53a2c986b00f159ae5fa839798850ac42e9b9504
> --- /dev/null
> @@ -0,0 +1,24 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) >= 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O0")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) >= 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {=\s* 1} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s* 4294967040} dce7 } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..fff4670e2a47106c614c1224b8e8aac091d6e821
> --- /dev/null
> @@ -0,0 +1,24 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) > 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O0")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) > 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {>\s* 255} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s* 4294967040} dce7 } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..7b877fde017de0fb7aeabd4152a1593f07d52e71
> --- /dev/null
> @@ -0,0 +1,24 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) <= 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O0")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) <= 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {<=\s* 255} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s* 4294967040} dce7 } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..8455bf5286a7cc9f08713489e29b5f2b6f6fc012
> --- /dev/null
> @@ -0,0 +1,24 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~1)) < 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O0")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~1)) < 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {= 0} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> --- /dev/null
> @@ -0,0 +1,24 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~1)) != 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O0")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~1)) != 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {>\s* 1} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s* 4294967294} dce7 } } */
> +
> new file mode 100644
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..befd25cb4aac6fe206110e7ca80816dd6fc0ed94
> --- /dev/null
> @@ -0,0 +1,24 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~5)) == 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O0")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~5)) == 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-not {<=\s* 4294967289} dce7 } } */
> +/* { dg-final { scan-tree-dump {&\s* 4294967290} dce7 } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..b037ffc248eef1509c642abb0087b77882679150
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O0")))
> +void fun2(int32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +#define TYPE int32_t
> +
> +/* { dg-final { scan-tree-dump {<=\s* 255} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s* 4294967290} dce7 } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..20027a6367a1dcec12492435f260250a7b54aca1
> --- /dev/null
> @@ -0,0 +1,24 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) != 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O0")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) != 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump {>\s* 255} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s* 4294967290} dce7 } } */
> +
> new file mode 100644
> --- /dev/null
> @@ -0,0 +1,16 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O1 -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +typedef unsigned int v4si __attribute__ ((vector_size (16)));
> +
> +__attribute__((noinline, noipa))
> +v4si fun(v4si x)
> +{
> +    v4si mask = { 255, 15, 1, 0xFFFF };
> +    v4si zeros = {0};
> +    return (x & ~mask) == zeros;
> +}
> +
> +/* { dg-final { scan-tree-dump {<=\s*.+\{ 255, 15, 1, 65535 \}} dce7 } } */
> new file mode 100644
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) == 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..1bcf23ccf1447d6c8c999ed1eb25ba0a450028e1
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) >= 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) >= 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {=\s*.+\{ 1,.+\}} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..d053727076abedefdecfda7c4fea6f92d54a94a5
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) > 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) > 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {>\s*.+\{ 255,.+\}} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..018e7a4348c9fc461106c3d9d01291325d3406c2
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) <= 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~255)) <= 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..798678fb7555052c93abc4ca34f617d640f73bb4
> --- /dev/null
> @@ -0,0 +1,24 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~1)) < 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~1)) < 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {__builtin_memset} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..1dabe834ed57dfa0be48c1dc3dbb226092c79a1a
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~1)) != 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~1)) != 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times {>\s*.+\{ 1,.+\}} 1 dce7 } } */
> +/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967294,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> --- /dev/null
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
> +
> +#include <stdint.h>
> +
> +__attribute__((noinline, noipa))
> +void fun1(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~5)) == 0;
> +}
> +
> +__attribute__((noinline, noipa, optimize("O1")))
> +void fun2(uint32_t *x, int n)
> +{
> +    for (int i = 0; i < (n & -16); i++)
> +      x[i] = (x[i]&(~5)) == 0;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-not {<=\s*.+\{ 4294967289,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump {&\s*.+\{ 4294967290,.+\}} dce7 } } */
> +/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..faf80b974db07a7d817a615cc64a35f1020e9764
> --- /dev/null
> @@ -0,0 +1,43 @@
> +#include <stdio.h>
> +
> +#ifndef N
> +#define N 65
> +#endif
> +
> +#ifndef TYPE
> +#define TYPE uint32_t
> +#endif
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N);
> +  fun2 (b, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> new file mode 100644
> index 0000000000000000000000000000000000000000..568c1ffc8bc4148efaeeba7a45a75ecbd3a7a3dd
> --- /dev/null
> @@ -0,0 +1,13 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O2 -save-temps" } */
> +
> +#include <arm_neon.h>
> +
> +uint32x4_t foo (int32x4_t a)
> +{
> +  int32x4_t cst = vdupq_n_s32 (255);
> +  int32x4_t zero = vdupq_n_s32 (0);
> +  return vceqq_s32 (vbicq_s32 (a, cst), zero);
> +}
> +
> +/* { dg-final { scan-assembler-not {\tbic\t} { xfail { aarch64*-*-* } } } } */
> diff --git a/gcc/tree.h b/gcc/tree.h
> --- a/gcc/tree.h
> +++ b/gcc/tree.h
> @@ -4862,6 +4862,11 @@ extern bool integer_minus_onep (const_tree);
>
>  extern bool integer_pow2p (const_tree);
>
> +/* Checks to see if T is a constant or a constant vector and if each element E
> +   adheres to ~E + 1 == pow2 then return ~E otherwise NULL_TREE.  */
> +
> +extern tree bitmask_inv_cst_vector_p (tree);
> +
>  /* integer_nonzerop (tree x) is nonzero if X is an integer constant
>     with a nonzero value.  */
>
> diff --git a/gcc/tree.c b/gcc/tree.c
> index cba3bca41b3a50557939267b7a06df04b5d042b7..801f121a368b088d0f769f16f2ed29e320e71556 100644
> --- a/gcc/tree.c
> +++ b/gcc/tree.c
> @@ -10246,6 +10246,59 @@ uniform_integer_cst_p (tree t)
>    return NULL_TREE;
>  }
>
> +/* Checks to see if T is a constant or a constant vector and if each element E
> +   adheres to ~E + 1 == pow2 then return ~E otherwise NULL_TREE.  */
> +
> +tree
> +bitmask_inv_cst_vector_p (tree t)
> +{
> +
> +  tree_code code = TREE_CODE (t);
> +  tree type = TREE_TYPE (t);
> +
> +  if (!INTEGRAL_TYPE_P (type)
> +      && !VECTOR_INTEGER_TYPE_P (type))
> +    return NULL_TREE;
> +
> +  unsigned HOST_WIDE_INT nelts = 1;
> +  tree cst;
> +  unsigned int idx = 0;
> +  bool uniform = uniform_integer_cst_p (t);
> +  tree newtype = unsigned_type_for (type);
> +  tree_vector_builder builder;
> +  if (code == INTEGER_CST)
> +    cst = t;
> +  else
> +    {
> +      if (!VECTOR_CST_NELTS (t).is_constant (&nelts))
> +	return NULL_TREE;
> +
> +      cst = vector_cst_elt (t, 0);
> +      builder.new_vector (newtype, nelts, 1);
> +    }
> +
> +  tree ty = unsigned_type_for (TREE_TYPE (cst));
> +
> +  do {
> +    if (idx > 0)
> +      cst = vector_cst_elt (t, idx);
> +    wide_int icst = wi::to_wide (cst);
> +    wide_int inv =  wi::bit_not (icst);
> +    icst = wi::add (1, inv);
> +    if (wi::popcount (icst) != 1)
> +      return NULL_TREE;
> +
> +    tree newcst = wide_int_to_tree (ty, inv);
> +
> +    if (uniform)
> +      return build_uniform_cst (newtype, newcst);
> +
> +    builder.quick_push (newcst);
> +  } while (++idx < nelts);
> +
> +  return builder.build ();
> +}
> +
>  /* If VECTOR_CST T has a single nonzero element, return the index of that
>     element, otherwise return -1.  */
>
>
```

## Patch

```diff --git a/gcc/match.pd b/gcc/match.pd
index 0fcfd0ea62c043dc217d0d560ce5b7e569b70e7d..df9212cb27d172856b9d43b0875262f96e8993c4 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -4288,6 +4288,56 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(if (ic == ncmp)
(ncmp @0 @1))))))

+/* Transform comparisons of the form (X & Y) CMP 0 to X CMP2 Z
+   where ~Y + 1 == pow2 and Z = ~Y.  */
+(for cmp (simple_comparison)
+ (simplify
+  (cmp (bit_and:c @0 VECTOR_CST@1) integer_zerop)
+   (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@1))
+	&& uniform_vector_p (@1))
+    (with { tree elt = vector_cst_elt (@1, 0); }
+     (switch
+      (if (TYPE_UNSIGNED (TREE_TYPE (@1)) && tree_fits_uhwi_p (elt))
+	(with { unsigned HOST_WIDE_INT diff = tree_to_uhwi (elt);
+	        tree tdiff = wide_int_to_tree (TREE_TYPE (elt), (~diff) + 1);
+		tree newval = wide_int_to_tree (TREE_TYPE (elt), ~diff);
+		tree newmask = build_uniform_cst (TREE_TYPE (@1), newval); }
+	 (if (integer_pow2p (tdiff))
+	  (switch
+	   /* ((mask & x) < 0) -> 0.  */
+	   (if (cmp == LT_EXPR)
+	    { build_zero_cst (TREE_TYPE (@1)); })
+	   /* ((mask & x) <= 0) -> x < mask.  */
+	   (if (cmp == LE_EXPR)
+	    (lt @0 { newmask; }))
+	   /* ((mask & x) == 0) -> x <= ~mask.  */
+	   (if (cmp == EQ_EXPR)
+	    (le @0 { newmask; }))
+	   /* ((mask & x) != 0) -> x > ~mask.  */
+	   (if (cmp == NE_EXPR)
+	    (gt @0 { newmask; }))
+	   /* ((mask & x) >= 0) -> x <= mask.  */
+	   (if (cmp == GE_EXPR)
+	    (le @0 { newmask; }))
+	    /* ((mask & x) > 0) -> x < mask.  */
+	   (if (cmp == GT_EXPR)
+	    (lt @0 { newmask; }))))))
+      (if (!TYPE_UNSIGNED (TREE_TYPE (@1)) && tree_fits_shwi_p (elt))
+	(with { unsigned HOST_WIDE_INT diff = tree_to_shwi (elt);
+		tree ustype = unsigned_type_for (TREE_TYPE (elt));
+		tree uvtype = unsigned_type_for (TREE_TYPE (@1));
+	        tree tdiff = wide_int_to_tree (ustype, (~diff) + 1);
+	        tree udiff = wide_int_to_tree (ustype, ~diff);
+		tree cst = build_uniform_cst (uvtype, udiff); }
+	 (if (integer_pow2p (tdiff))
+	  (switch
+	    /* ((mask & x) == 0) -> (unsigned) x <= ~mask.  */
+	    (if (cmp == EQ_EXPR)
+	     (le (convert:uvtype @0) { cst; }))
+	    /* ((mask & x) != 0) -> (unsigned) x > ~mask.  */
+	    (if (cmp == NE_EXPR)
+	     (gt (convert:uvtype @0) { cst; })))))))))))
+
/* Transform comparisons of the form X - Y CMP 0 to X CMP Y.
??? The transformation is valid for the other operators if overflow
is undefined for the type, but performing it here badly interacts
new file mode 100644
index 0000000000000000000000000000000000000000..76a22a2313137a2a75dd711c2c15c2d3a34e15aa
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+#define TYPE int32_t
+
+/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..32553d7ba2f823f7a21237451990d0a216d2f912
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) != 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) != 0;
+}
+
+
+/* { dg-final { scan-tree-dump {>\s*.+\{ 255,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..e10cbf7fabe2dbf7ce436cdf37b0f8b207c58408
--- /dev/null
@@ -0,0 +1,17 @@
+/* { dg-do assemble } */
+/* { dg-options "-O3 -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+typedef unsigned int v4si __attribute__ ((vector_size (16)));
+
+__attribute__((noinline, noipa))
+void fun(v4si *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
new file mode 100644
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..1bcf23ccf1447d6c8c999ed1eb25ba0a450028e1
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) >= 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) >= 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {=\s*.+\{ 1,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..6e5a2fca9992efbc01f8dbbc6f95936e86643028
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) > 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) > 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {>\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..018e7a4348c9fc461106c3d9d01291325d3406c2
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) <= 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) <= 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..798678fb7555052c93abc4ca34f617d640f73bb4
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) < 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) < 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {__builtin_memset} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..1dabe834ed57dfa0be48c1dc3dbb226092c79a1a
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) != 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) != 0;
+}
+
+
+/* { dg-final { scan-tree-dump-times {>\s*.+\{ 1,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967294,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
--- /dev/null
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~5)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~5)) == 0;
+}
+
+
+/* { dg-final { scan-tree-dump-not {<=\s*.+\{ 4294967289,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
new file mode 100644
index 0000000000000000000000000000000000000000..2b94065c025e0cbf71a21ac9b9d6314e24b0c2d9
--- /dev/null
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 50
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N);
+  fun2 (b, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
new file mode 100644
index 0000000000000000000000000000000000000000..568c1ffc8bc4148efaeeba7a45a75ecbd3a7a3dd
--- /dev/null
@@ -0,0 +1,13 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -save-temps" } */
+
+#include <arm_neon.h>
+
+uint32x4_t foo (int32x4_t a)
+{
+  int32x4_t cst = vdupq_n_s32 (255);
+  int32x4_t zero = vdupq_n_s32 (0);
+  return vceqq_s32 (vbicq_s32 (a, cst), zero);
+}
+
+/* { dg-final { scan-assembler-not {\tbic\t} { xfail { aarch64*-*-* } } } } */

```