match.pd: Simplify 1 / X for integer X [PR95424]

Message ID CALHvHFX5=L5kqWE7mpiCTJ7OAKwM4CPqu-J57uMKikNDirbgsQ@mail.gmail.com
State New
Headers
Series match.pd: Simplify 1 / X for integer X [PR95424] |

Commit Message

Zhao Wei Liew Jan. 5, 2022, 6:14 a.m. UTC
  match.pd/95424: Simplify 1 / X for integer X

This patch implements an optimization for the following C++ code:

int f(int x) {
    return 1 / x;
}

int f(unsigned int x) {
    return 1 / x;
}

Before this patch, x86-64 gcc -std=c++20 -O3 produces the following
assembly:

f(int):
    xor edx, edx
    mov eax, 1
    idiv edi
    ret
f(unsigned int):
    xor edx, edx
    mov eax, 1
    div edi
    ret

In comparison, clang++ -std=c++20 -O3 produces the following assembly:

f(int):
    lea ecx, [rdi + 1]
    xor eax, eax
    cmp ecx, 3
    cmovb eax, edi
    ret
f(unsigned int):
    xor eax, eax
    cmp edi, 1
    sete al
    ret

Clang's output is more efficient as it avoids expensive div operations.

With this patch, GCC now produces the following assembly:
f(int):
    lea eax, [rdi + 1]
    cmp eax, 2
    mov eax, 0
    cmovbe eax, edi
    ret
f(unsigned int):
    xor eax, eax
    cmp edi, 1
    sete al
    ret

which is virtualy identical to Clang's assembly output. Any slight
differences
in the output for f(int) is related to another possible missed optimization.

gcc/ChangeLog:

        * match.pd: Simplify 1 / X where X is an integer.

gcc/testsuite/ChangeLog:

        * gcc.dg/tree-ssa/divide-6.c: New test.
        * gcc.dg/tree-ssa/divide-7.c: New test.
  

Comments

Richard Biener Jan. 5, 2022, 8:17 a.m. UTC | #1
On Wed, Jan 5, 2022 at 7:15 AM Zhao Wei Liew via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> match.pd/95424: Simplify 1 / X for integer X
>
> This patch implements an optimization for the following C++ code:
>
> int f(int x) {
>     return 1 / x;
> }
>
> int f(unsigned int x) {
>     return 1 / x;
> }
>
> Before this patch, x86-64 gcc -std=c++20 -O3 produces the following
> assembly:
>
> f(int):
>     xor edx, edx
>     mov eax, 1
>     idiv edi
>     ret
> f(unsigned int):
>     xor edx, edx
>     mov eax, 1
>     div edi
>     ret
>
> In comparison, clang++ -std=c++20 -O3 produces the following assembly:
>
> f(int):
>     lea ecx, [rdi + 1]
>     xor eax, eax
>     cmp ecx, 3
>     cmovb eax, edi
>     ret
> f(unsigned int):
>     xor eax, eax
>     cmp edi, 1
>     sete al
>     ret
>
> Clang's output is more efficient as it avoids expensive div operations.
>
> With this patch, GCC now produces the following assembly:
> f(int):
>     lea eax, [rdi + 1]
>     cmp eax, 2
>     mov eax, 0
>     cmovbe eax, edi
>     ret
> f(unsigned int):
>     xor eax, eax
>     cmp edi, 1
>     sete al
>     ret
>
> which is virtualy identical to Clang's assembly output. Any slight
> differences
> in the output for f(int) is related to another possible missed optimization.
>
> gcc/ChangeLog:
>
>         * match.pd: Simplify 1 / X where X is an integer.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/tree-ssa/divide-6.c: New test.
>         * gcc.dg/tree-ssa/divide-7.c: New test.
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 84c9b918041..5edae1818bb 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -422,7 +422,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>     (div:C @0 (negate @0))
>     (if ((INTEGRAL_TYPE_P (type) || VECTOR_INTEGER_TYPE_P (type))
>   && TYPE_OVERFLOW_UNDEFINED (type))
> -    { build_minus_one_cst (type); })))
> +    { build_minus_one_cst (type); }))
> +
> + /* 1 / X -> X == 1 for unsigned integer X
> +    1 / X -> X >= -1 && X <= 1 ? X : 0 for signed integer X
> +    But not for 1 / 0 so that we can get proper warnings and errors. */
> + (simplify
> +   (div integer_onep@0 @1)
> +   (switch
> +     (if (!integer_zerop (@1)
> +          && INTEGRAL_TYPE_P (TREE_TYPE (@1))

you can use INTEGRAL_TYPE_P (type), the type of @0, @1 and the
result ('type') are the same.

> +          && TYPE_UNSIGNED (TREE_TYPE (@1)))
> +      (eq @0 @1))
> +     (if (!integer_zerop (@1)
> +          && INTEGRAL_TYPE_P (TREE_TYPE (@1))
> +          && !TYPE_UNSIGNED (TREE_TYPE (@1)))

Please refactor as

    (if (INTEGRAL_TYPE_P (type)
         && !integer_zerop (@1))
     (if (TYPE_UNSIGNED (....))
      (eq @0 @1)
      (cond (...

> +      (cond (bit_and (ge @1 { build_minus_one_cst (integer_type_node); })
> +                     (le @1 { build_one_cst (integer_type_node); }))

You should use build_[minus_]one_cst (type) to get -1/1 of the correct
type here.

X >= -1 && X <= 1 is (hopefully?) going to be simplified
as  (unsigned)X + 1 <= 2, it might be good to simplify it this way
here as well?

Finally I'm not sure whether 1 /[ceil] 2 isn't 1, similar -1 /[floor]
2 should be -1
so the transform looks to be correct only for TRUNC_DIV_EXPR, not all 'div'?

Thanks,
Richard.

> +        @1 { build_zero_cst (type); })))))
>
>  /* For unsigned integral types, FLOOR_DIV_EXPR is the same as
>     TRUNC_DIV_EXPR.  Rewrite into the latter in this case.  */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
> b/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
> new file mode 100644
> index 00000000000..a9fc4c04058
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-optimized" } */
> +
> +unsigned int f(unsigned int x) {
> +  return 1 / x;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "1 / x_..D.;" "optimized" } } */
> +/* { dg-final { scan-tree-dump "x_..D. == 1;" "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
> b/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
> new file mode 100644
> index 00000000000..285279af7c2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-optimized" } */
> +
> +int f(int x) {
> +  return 1 / x;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "1 / x_..D.;" "optimized" } } */
> +/* { dg-final { scan-tree-dump ".. <= 2 ? x_..D. : 0;" "optimized" } } */
  
Zhao Wei Liew Jan. 5, 2022, 9:17 a.m. UTC | #2
> X >= -1 && X <= 1 is (hopefully?) going to be simplified
> as  (unsigned)X + 1 <= 2, it might be good to simplify it this way
> here as well?

Yup, GCC does simplify it that way in the end, so I didn't really bother to
simplify it here. That said, I'm open to simplifying it here as well, but
I'm not sure how to do the unsigned cast.
Besides that, thanks for the rest of your suggestions! I'm testing the
changes and will post a v2 soon.

On Wed, 5 Jan 2022 at 16:18, Richard Biener <richard.guenther@gmail.com>
wrote:
>
> On Wed, Jan 5, 2022 at 7:15 AM Zhao Wei Liew via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > match.pd/95424: Simplify 1 / X for integer X
> >
> > This patch implements an optimization for the following C++ code:
> >
> > int f(int x) {
> >     return 1 / x;
> > }
> >
> > int f(unsigned int x) {
> >     return 1 / x;
> > }
> >
> > Before this patch, x86-64 gcc -std=c++20 -O3 produces the following
> > assembly:
> >
> > f(int):
> >     xor edx, edx
> >     mov eax, 1
> >     idiv edi
> >     ret
> > f(unsigned int):
> >     xor edx, edx
> >     mov eax, 1
> >     div edi
> >     ret
> >
> > In comparison, clang++ -std=c++20 -O3 produces the following assembly:
> >
> > f(int):
> >     lea ecx, [rdi + 1]
> >     xor eax, eax
> >     cmp ecx, 3
> >     cmovb eax, edi
> >     ret
> > f(unsigned int):
> >     xor eax, eax
> >     cmp edi, 1
> >     sete al
> >     ret
> >
> > Clang's output is more efficient as it avoids expensive div operations.
> >
> > With this patch, GCC now produces the following assembly:
> > f(int):
> >     lea eax, [rdi + 1]
> >     cmp eax, 2
> >     mov eax, 0
> >     cmovbe eax, edi
> >     ret
> > f(unsigned int):
> >     xor eax, eax
> >     cmp edi, 1
> >     sete al
> >     ret
> >
> > which is virtualy identical to Clang's assembly output. Any slight
> > differences
> > in the output for f(int) is related to another possible missed
optimization.
> >
> > gcc/ChangeLog:
> >
> >         * match.pd: Simplify 1 / X where X is an integer.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.dg/tree-ssa/divide-6.c: New test.
> >         * gcc.dg/tree-ssa/divide-7.c: New test.
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 84c9b918041..5edae1818bb 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -422,7 +422,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >     (div:C @0 (negate @0))
> >     (if ((INTEGRAL_TYPE_P (type) || VECTOR_INTEGER_TYPE_P (type))
> >   && TYPE_OVERFLOW_UNDEFINED (type))
> > -    { build_minus_one_cst (type); })))
> > +    { build_minus_one_cst (type); }))
> > +
> > + /* 1 / X -> X == 1 for unsigned integer X
> > +    1 / X -> X >= -1 && X <= 1 ? X : 0 for signed integer X
> > +    But not for 1 / 0 so that we can get proper warnings and errors. */
> > + (simplify
> > +   (div integer_onep@0 @1)
> > +   (switch
> > +     (if (!integer_zerop (@1)
> > +          && INTEGRAL_TYPE_P (TREE_TYPE (@1))
>
> you can use INTEGRAL_TYPE_P (type), the type of @0, @1 and the
> result ('type') are the same.
>
> > +          && TYPE_UNSIGNED (TREE_TYPE (@1)))
> > +      (eq @0 @1))
> > +     (if (!integer_zerop (@1)
> > +          && INTEGRAL_TYPE_P (TREE_TYPE (@1))
> > +          && !TYPE_UNSIGNED (TREE_TYPE (@1)))
>
> Please refactor as
>
>     (if (INTEGRAL_TYPE_P (type)
>          && !integer_zerop (@1))
>      (if (TYPE_UNSIGNED (....))
>       (eq @0 @1)
>       (cond (...
>
> > +      (cond (bit_and (ge @1 { build_minus_one_cst (integer_type_node);
})
> > +                     (le @1 { build_one_cst (integer_type_node); }))
>
> You should use build_[minus_]one_cst (type) to get -1/1 of the correct
> type here.
>
> X >= -1 && X <= 1 is (hopefully?) going to be simplified
> as  (unsigned)X + 1 <= 2, it might be good to simplify it this way
> here as well?
>
> Finally I'm not sure whether 1 /[ceil] 2 isn't 1, similar -1 /[floor]
> 2 should be -1
> so the transform looks to be correct only for TRUNC_DIV_EXPR, not all
'div'?
>
> Thanks,
> Richard.
>
> > +        @1 { build_zero_cst (type); })))))
> >
> >  /* For unsigned integral types, FLOOR_DIV_EXPR is the same as
> >     TRUNC_DIV_EXPR.  Rewrite into the latter in this case.  */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
> > b/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
> > new file mode 100644
> > index 00000000000..a9fc4c04058
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
> > @@ -0,0 +1,9 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O -fdump-tree-optimized" } */
> > +
> > +unsigned int f(unsigned int x) {
> > +  return 1 / x;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-not "1 / x_..D.;" "optimized" } } */
> > +/* { dg-final { scan-tree-dump "x_..D. == 1;" "optimized" } } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
> > b/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
> > new file mode 100644
> > index 00000000000..285279af7c2
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
> > @@ -0,0 +1,9 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O -fdump-tree-optimized" } */
> > +
> > +int f(int x) {
> > +  return 1 / x;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-not "1 / x_..D.;" "optimized" } } */
> > +/* { dg-final { scan-tree-dump ".. <= 2 ? x_..D. : 0;" "optimized" } }
*/
  
Richard Biener Jan. 5, 2022, 9:38 a.m. UTC | #3
On Wed, Jan 5, 2022 at 10:18 AM Zhao Wei Liew <zhaoweiliew@gmail.com> wrote:
>
> > X >= -1 && X <= 1 is (hopefully?) going to be simplified
> > as  (unsigned)X + 1 <= 2, it might be good to simplify it this way
> > here as well?
>
> Yup, GCC does simplify it that way in the end, so I didn't really bother to simplify it here. That said, I'm open to simplifying it here as well, but I'm not sure how to do the unsigned cast.

You'd do sth like
  (with
    {  tree utype = unsigned_type_for (type); }
    (cond (le (plus (convert:utype @0) { build_one_cst (utype); }) {
build_int_cst (utype, 2); }) ...)

extra tricky will be 1 bit integer types, I guess it might be easiest
to exclude them
and special case them separately - X / Y is always X for them I think,
for signed
1 bit types X / Y is always undefined when X is not 0 (the division
overflows or is by zero).
Not sure if worth though ;)  (might appear when bools end up divided
by bools ...)

> Besides that, thanks for the rest of your suggestions! I'm testing the changes and will post a v2 soon.
>
> On Wed, 5 Jan 2022 at 16:18, Richard Biener <richard.guenther@gmail.com> wrote:
> >
> > On Wed, Jan 5, 2022 at 7:15 AM Zhao Wei Liew via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > match.pd/95424: Simplify 1 / X for integer X
> > >
> > > This patch implements an optimization for the following C++ code:
> > >
> > > int f(int x) {
> > >     return 1 / x;
> > > }
> > >
> > > int f(unsigned int x) {
> > >     return 1 / x;
> > > }
> > >
> > > Before this patch, x86-64 gcc -std=c++20 -O3 produces the following
> > > assembly:
> > >
> > > f(int):
> > >     xor edx, edx
> > >     mov eax, 1
> > >     idiv edi
> > >     ret
> > > f(unsigned int):
> > >     xor edx, edx
> > >     mov eax, 1
> > >     div edi
> > >     ret
> > >
> > > In comparison, clang++ -std=c++20 -O3 produces the following assembly:
> > >
> > > f(int):
> > >     lea ecx, [rdi + 1]
> > >     xor eax, eax
> > >     cmp ecx, 3
> > >     cmovb eax, edi
> > >     ret
> > > f(unsigned int):
> > >     xor eax, eax
> > >     cmp edi, 1
> > >     sete al
> > >     ret
> > >
> > > Clang's output is more efficient as it avoids expensive div operations.
> > >
> > > With this patch, GCC now produces the following assembly:
> > > f(int):
> > >     lea eax, [rdi + 1]
> > >     cmp eax, 2
> > >     mov eax, 0
> > >     cmovbe eax, edi
> > >     ret
> > > f(unsigned int):
> > >     xor eax, eax
> > >     cmp edi, 1
> > >     sete al
> > >     ret
> > >
> > > which is virtualy identical to Clang's assembly output. Any slight
> > > differences
> > > in the output for f(int) is related to another possible missed optimization.
> > >
> > > gcc/ChangeLog:
> > >
> > >         * match.pd: Simplify 1 / X where X is an integer.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.dg/tree-ssa/divide-6.c: New test.
> > >         * gcc.dg/tree-ssa/divide-7.c: New test.
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index 84c9b918041..5edae1818bb 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -422,7 +422,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >     (div:C @0 (negate @0))
> > >     (if ((INTEGRAL_TYPE_P (type) || VECTOR_INTEGER_TYPE_P (type))
> > >   && TYPE_OVERFLOW_UNDEFINED (type))
> > > -    { build_minus_one_cst (type); })))
> > > +    { build_minus_one_cst (type); }))
> > > +
> > > + /* 1 / X -> X == 1 for unsigned integer X
> > > +    1 / X -> X >= -1 && X <= 1 ? X : 0 for signed integer X
> > > +    But not for 1 / 0 so that we can get proper warnings and errors. */
> > > + (simplify
> > > +   (div integer_onep@0 @1)
> > > +   (switch
> > > +     (if (!integer_zerop (@1)
> > > +          && INTEGRAL_TYPE_P (TREE_TYPE (@1))
> >
> > you can use INTEGRAL_TYPE_P (type), the type of @0, @1 and the
> > result ('type') are the same.
> >
> > > +          && TYPE_UNSIGNED (TREE_TYPE (@1)))
> > > +      (eq @0 @1))
> > > +     (if (!integer_zerop (@1)
> > > +          && INTEGRAL_TYPE_P (TREE_TYPE (@1))
> > > +          && !TYPE_UNSIGNED (TREE_TYPE (@1)))
> >
> > Please refactor as
> >
> >     (if (INTEGRAL_TYPE_P (type)
> >          && !integer_zerop (@1))
> >      (if (TYPE_UNSIGNED (....))
> >       (eq @0 @1)
> >       (cond (...
> >
> > > +      (cond (bit_and (ge @1 { build_minus_one_cst (integer_type_node); })
> > > +                     (le @1 { build_one_cst (integer_type_node); }))
> >
> > You should use build_[minus_]one_cst (type) to get -1/1 of the correct
> > type here.
> >
> > X >= -1 && X <= 1 is (hopefully?) going to be simplified
> > as  (unsigned)X + 1 <= 2, it might be good to simplify it this way
> > here as well?
> >
> > Finally I'm not sure whether 1 /[ceil] 2 isn't 1, similar -1 /[floor]
> > 2 should be -1
> > so the transform looks to be correct only for TRUNC_DIV_EXPR, not all 'div'?
> >
> > Thanks,
> > Richard.
> >
> > > +        @1 { build_zero_cst (type); })))))
> > >
> > >  /* For unsigned integral types, FLOOR_DIV_EXPR is the same as
> > >     TRUNC_DIV_EXPR.  Rewrite into the latter in this case.  */
> > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
> > > b/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
> > > new file mode 100644
> > > index 00000000000..a9fc4c04058
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
> > > @@ -0,0 +1,9 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O -fdump-tree-optimized" } */
> > > +
> > > +unsigned int f(unsigned int x) {
> > > +  return 1 / x;
> > > +}
> > > +
> > > +/* { dg-final { scan-tree-dump-not "1 / x_..D.;" "optimized" } } */
> > > +/* { dg-final { scan-tree-dump "x_..D. == 1;" "optimized" } } */
> > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
> > > b/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
> > > new file mode 100644
> > > index 00000000000..285279af7c2
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
> > > @@ -0,0 +1,9 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O -fdump-tree-optimized" } */
> > > +
> > > +int f(int x) {
> > > +  return 1 / x;
> > > +}
> > > +
> > > +/* { dg-final { scan-tree-dump-not "1 / x_..D.;" "optimized" } } */
> > > +/* { dg-final { scan-tree-dump ".. <= 2 ? x_..D. : 0;" "optimized" } } */
  
Jakub Jelinek Jan. 5, 2022, 9:42 a.m. UTC | #4
On Wed, Jan 05, 2022 at 10:38:53AM +0100, Richard Biener via Gcc-patches wrote:
> On Wed, Jan 5, 2022 at 10:18 AM Zhao Wei Liew <zhaoweiliew@gmail.com> wrote:
> >
> > > X >= -1 && X <= 1 is (hopefully?) going to be simplified
> > > as  (unsigned)X + 1 <= 2, it might be good to simplify it this way
> > > here as well?
> >
> > Yup, GCC does simplify it that way in the end, so I didn't really bother to simplify it here. That said, I'm open to simplifying it here as well, but I'm not sure how to do the unsigned cast.
> 
> You'd do sth like
>   (with
>     {  tree utype = unsigned_type_for (type); }
>     (cond (le (plus (convert:utype @0) { build_one_cst (utype); }) {
> build_int_cst (utype, 2); }) ...)
> 
> extra tricky will be 1 bit integer types, I guess it might be easiest
> to exclude them
> and special case them separately - X / Y is always X for them I think,

Note, we already have:
 /* X / bool_range_Y is X.  */
 (simplify
  (div @0 SSA_NAME@1)
  (if (INTEGRAL_TYPE_P (type) && ssa_name_has_boolean_range (@1))
   @0))
for those.

	Jakub
  
Richard Biener Jan. 5, 2022, 9:55 a.m. UTC | #5
On Wed, Jan 5, 2022 at 10:42 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Wed, Jan 05, 2022 at 10:38:53AM +0100, Richard Biener via Gcc-patches wrote:
> > On Wed, Jan 5, 2022 at 10:18 AM Zhao Wei Liew <zhaoweiliew@gmail.com> wrote:
> > >
> > > > X >= -1 && X <= 1 is (hopefully?) going to be simplified
> > > > as  (unsigned)X + 1 <= 2, it might be good to simplify it this way
> > > > here as well?
> > >
> > > Yup, GCC does simplify it that way in the end, so I didn't really bother to simplify it here. That said, I'm open to simplifying it here as well, but I'm not sure how to do the unsigned cast.
> >
> > You'd do sth like
> >   (with
> >     {  tree utype = unsigned_type_for (type); }
> >     (cond (le (plus (convert:utype @0) { build_one_cst (utype); }) {
> > build_int_cst (utype, 2); }) ...)
> >
> > extra tricky will be 1 bit integer types, I guess it might be easiest
> > to exclude them
> > and special case them separately - X / Y is always X for them I think,
>
> Note, we already have:
>  /* X / bool_range_Y is X.  */
>  (simplify
>   (div @0 SSA_NAME@1)
>   (if (INTEGRAL_TYPE_P (type) && ssa_name_has_boolean_range (@1))
>    @0))
> for those.

Ah, it might not handle the signed : 1 case though since -1 is not in the
bool range.  We could generalize the above though.

Richard.

>
>         Jakub
>
  
Zhao Wei Liew Jan. 6, 2022, 9:52 a.m. UTC | #6
On Wed, 5 Jan 2022 at 17:55, Richard Biener <richard.guenther@gmail.com>
wrote:

> On Wed, Jan 5, 2022 at 10:42 AM Jakub Jelinek <jakub@redhat.com> wrote:
> >
> > On Wed, Jan 05, 2022 at 10:38:53AM +0100, Richard Biener via Gcc-patches
> wrote:
> > > On Wed, Jan 5, 2022 at 10:18 AM Zhao Wei Liew <zhaoweiliew@gmail.com>
> wrote:
> > > >
> > > > > X >= -1 && X <= 1 is (hopefully?) going to be simplified
> > > > > as  (unsigned)X + 1 <= 2, it might be good to simplify it this way
> > > > > here as well?
> > > >
> > > > Yup, GCC does simplify it that way in the end, so I didn't really
> bother to simplify it here. That said, I'm open to simplifying it here as
> well, but I'm not sure how to do the unsigned cast.
> > >
> > > You'd do sth like
> > >   (with
> > >     {  tree utype = unsigned_type_for (type); }
> > >     (cond (le (plus (convert:utype @0) { build_one_cst (utype); }) {
> > > build_int_cst (utype, 2); }) ...)
> > >
> > > extra tricky will be 1 bit integer types, I guess it might be easiest
> > > to exclude them
> > > and special case them separately - X / Y is always X for them I think,
> >
> > Note, we already have:
> >  /* X / bool_range_Y is X.  */
> >  (simplify
> >   (div @0 SSA_NAME@1)
> >   (if (INTEGRAL_TYPE_P (type) && ssa_name_has_boolean_range (@1))
> >    @0))
> > for those.
>
> Ah, it might not handle the signed : 1 case though since -1 is not in the
> bool range.  We could generalize the above though.
>
> Richard.
>
> >
> >         Jakub
> >
>

Perhaps it is possible to exclude 1-bit cases and rely on other existing
simplifications to deal with them?
In particular, GCC seems to already optimally simplify the signed and
unsigned 1-bit cases [1].

[1]:
struct S {
  unsigned int x: 1;
  int y: 1;
};
unsigned int fu(S s) {
  return 1 / s.x;
}
int fi(S s) {
  return 1 / s.y;
}

fu(S):
        mov     eax, 1
        ret
fi(S):
        mov     eax, -1
        ret
  
Jakub Jelinek Jan. 6, 2022, 10:04 a.m. UTC | #7
On Thu, Jan 06, 2022 at 05:52:03PM +0800, Zhao Wei Liew wrote:
> On Wed, 5 Jan 2022 at 17:55, Richard Biener <richard.guenther@gmail.com>
> wrote:
> 
> > On Wed, Jan 5, 2022 at 10:42 AM Jakub Jelinek <jakub@redhat.com> wrote:
> > >
> > > On Wed, Jan 05, 2022 at 10:38:53AM +0100, Richard Biener via Gcc-patches
> > wrote:
> > > > On Wed, Jan 5, 2022 at 10:18 AM Zhao Wei Liew <zhaoweiliew@gmail.com>
> > wrote:
> > > > >
> > > > > > X >= -1 && X <= 1 is (hopefully?) going to be simplified
> > > > > > as  (unsigned)X + 1 <= 2, it might be good to simplify it this way
> > > > > > here as well?
> > > > >
> > > > > Yup, GCC does simplify it that way in the end, so I didn't really
> > bother to simplify it here. That said, I'm open to simplifying it here as
> > well, but I'm not sure how to do the unsigned cast.
> > > >
> > > > You'd do sth like
> > > >   (with
> > > >     {  tree utype = unsigned_type_for (type); }
> > > >     (cond (le (plus (convert:utype @0) { build_one_cst (utype); }) {
> > > > build_int_cst (utype, 2); }) ...)
> > > >
> > > > extra tricky will be 1 bit integer types, I guess it might be easiest
> > > > to exclude them
> > > > and special case them separately - X / Y is always X for them I think,
> > >
> > > Note, we already have:
> > >  /* X / bool_range_Y is X.  */
> > >  (simplify
> > >   (div @0 SSA_NAME@1)
> > >   (if (INTEGRAL_TYPE_P (type) && ssa_name_has_boolean_range (@1))
> > >    @0))
> > > for those.
> >
> > Ah, it might not handle the signed : 1 case though since -1 is not in the
> > bool range.  We could generalize the above though.
> >
> > Richard.
> >
> > >
> > >         Jakub
> > >
> >
> 
> Perhaps it is possible to exclude 1-bit cases and rely on other existing
> simplifications to deal with them?
> In particular, GCC seems to already optimally simplify the signed and
> unsigned 1-bit cases [1].

Sure, I thought that was what Richi was suggesting, so ad some
 && TYPE_PRECISION (...) > 1
somewhere (after making sure it is INTEGRAL_TYPE_P), or element_precision
if it is ANY_INTEGRAL_TYPE_P and could be vector type.

The discussion was about whether we optimize even those : 1 cases
sufficiently but that can be done in different simplifications...

	Jakub
  

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index 84c9b918041..5edae1818bb 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -422,7 +422,24 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    (div:C @0 (negate @0))
    (if ((INTEGRAL_TYPE_P (type) || VECTOR_INTEGER_TYPE_P (type))
  && TYPE_OVERFLOW_UNDEFINED (type))
-    { build_minus_one_cst (type); })))
+    { build_minus_one_cst (type); }))
+
+ /* 1 / X -> X == 1 for unsigned integer X
+    1 / X -> X >= -1 && X <= 1 ? X : 0 for signed integer X
+    But not for 1 / 0 so that we can get proper warnings and errors. */
+ (simplify
+   (div integer_onep@0 @1)
+   (switch
+     (if (!integer_zerop (@1)
+          && INTEGRAL_TYPE_P (TREE_TYPE (@1))
+          && TYPE_UNSIGNED (TREE_TYPE (@1)))
+      (eq @0 @1))
+     (if (!integer_zerop (@1)
+          && INTEGRAL_TYPE_P (TREE_TYPE (@1))
+          && !TYPE_UNSIGNED (TREE_TYPE (@1)))
+      (cond (bit_and (ge @1 { build_minus_one_cst (integer_type_node); })
+                     (le @1 { build_one_cst (integer_type_node); }))
+        @1 { build_zero_cst (type); })))))

 /* For unsigned integral types, FLOOR_DIV_EXPR is the same as
    TRUNC_DIV_EXPR.  Rewrite into the latter in this case.  */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
b/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
new file mode 100644
index 00000000000..a9fc4c04058
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-optimized" } */
+
+unsigned int f(unsigned int x) {
+  return 1 / x;
+}
+
+/* { dg-final { scan-tree-dump-not "1 / x_..D.;" "optimized" } } */
+/* { dg-final { scan-tree-dump "x_..D. == 1;" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
b/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
new file mode 100644
index 00000000000..285279af7c2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-optimized" } */
+
+int f(int x) {
+  return 1 / x;
+}
+
+/* { dg-final { scan-tree-dump-not "1 / x_..D.;" "optimized" } } */
+/* { dg-final { scan-tree-dump ".. <= 2 ? x_..D. : 0;" "optimized" } } */