diff mbox series

[v2] match.pd: Simplify 1 / X for integer X [PR95424]

Message ID	CALHvHFVD_NKGBn3MGhWSz-3Lt41fgkZooqdeXeViyHn2QUH2-w@mail.gmail.com
State	New
Headers	DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 09E60385800C MIME-Version: 1.0 Date: Thu, 6 Jan 2022 18:35:55 +0800 Message-ID: <CALHvHFVD_NKGBn3MGhWSz-3Lt41fgkZooqdeXeViyHn2QUH2-w@mail.gmail.com> Subject: [PATCH v2] match.pd: Simplify 1 / X for integer X [PR95424] To: GCC Patches <gcc-patches@gcc.gnu.org> Content-Type: text/plain; charset="UTF-8" Precedence: list From: Zhao Wei Liew via Gcc-patches <gcc-patches@gcc.gnu.org> Reply-To: Zhao Wei Liew <zhaoweiliew@gmail.com> Cc: Jakub Jelinek <jakub@redhat.com> Errors-To: gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org Sender: "Gcc-patches" <gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org>
Series	[v2] match.pd: Simplify 1 / X for integer X [PR95424] \| [v2] match.pd: Simplify 1 / X for integer X [PR95424]

Commit Message

Zhao Wei Liew Jan. 6, 2022, 10:35 a.m. UTC

  This patch implements an optimization for the following C++ code:

int f(int x) {
    return 1 / x;
}

int f(unsigned int x) {
    return 1 / x;
}

Before this patch, x86-64 gcc -std=c++20 -O3 produces the following
assembly:

f(int):
    xor edx, edx
    mov eax, 1
    idiv edi
    ret
f(unsigned int):
    xor edx, edx
    mov eax, 1
    div edi
    ret

In comparison, clang++ -std=c++20 -O3 produces the following assembly:

f(int):
    lea ecx, [rdi + 1]
    xor eax, eax
    cmp ecx, 3
    cmovb eax, edi
    ret
f(unsigned int):
    xor eax, eax
    cmp edi, 1
    sete al
    ret

Clang's output is more efficient as it avoids expensive div operations.

With this patch, GCC now produces the following assembly:

f(int):
    lea eax, [rdi + 1]
    cmp eax, 2
    mov eax, 0
    cmovbe eax, edi
    ret
f(unsigned int):
    xor eax, eax
    cmp edi, 1
    sete al
    ret

which is virtually identical to Clang's assembly output. Any slight
differences in the output for f(int) are possibly related to a different
missed optimization.

v1: https://gcc.gnu.org/pipermail/gcc-patches/2022-January/587634.html
Changes from v1:
1. Refactor common if conditions.
2. Use build_[minus_]one_cst (type) to get -1/1 of the correct type.
3. Match only for TRUNC_DIV_EXPR and TYPE_PRECISION (type) > 1.

gcc/ChangeLog:

* match.pd: Simplify 1 / X where X is an integer.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/divide-6.c: New test.
* gcc.dg/tree-ssa/divide-7.c: New test.
---
 gcc/match.pd                             | 15 +++++++++++++++
 gcc/testsuite/gcc.dg/tree-ssa/divide-6.c |  9 +++++++++
 gcc/testsuite/gcc.dg/tree-ssa/divide-7.c |  9 +++++++++
 3 files changed, 33 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/divide-7.c

Comments

Richard Biener Jan. 10, 2022, 1:03 p.m. UTC | #1

On Thu, Jan 6, 2022 at 11:36 AM Zhao Wei Liew <zhaoweiliew@gmail.com> wrote:
>
> This patch implements an optimization for the following C++ code:
>
> int f(int x) {
>     return 1 / x;
> }
>
> int f(unsigned int x) {
>     return 1 / x;
> }
>
> Before this patch, x86-64 gcc -std=c++20 -O3 produces the following assembly:
>
> f(int):
>     xor edx, edx
>     mov eax, 1
>     idiv edi
>     ret
> f(unsigned int):
>     xor edx, edx
>     mov eax, 1
>     div edi
>     ret
>
> In comparison, clang++ -std=c++20 -O3 produces the following assembly:
>
> f(int):
>     lea ecx, [rdi + 1]
>     xor eax, eax
>     cmp ecx, 3
>     cmovb eax, edi
>     ret
> f(unsigned int):
>     xor eax, eax
>     cmp edi, 1
>     sete al
>     ret
>
> Clang's output is more efficient as it avoids expensive div operations.
>
> With this patch, GCC now produces the following assembly:
>
> f(int):
>     lea eax, [rdi + 1]
>     cmp eax, 2
>     mov eax, 0
>     cmovbe eax, edi
>     ret
> f(unsigned int):
>     xor eax, eax
>     cmp edi, 1
>     sete al
>     ret
>
> which is virtually identical to Clang's assembly output. Any slight differences
> in the output for f(int) is possibly related to a different missed optimization.
>
> v1: https://gcc.gnu.org/pipermail/gcc-patches/2022-January/587634.html
> Changes from v1:
> 1. Refactor common if conditions.
> 2. Use build_[minus_]one_cst (type) to get -1/1 of the correct type.
> 3. Match only for TRUNC_DIV_EXPR and TYPE_PRECISION (type) > 1.
>
> gcc/ChangeLog:
>
> * match.pd: Simplify 1 / X where X is an integer.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/divide-6.c: New test.
> * gcc.dg/tree-ssa/divide-7.c: New test.
> ---
>  gcc/match.pd                             | 15 +++++++++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/divide-6.c |  9 +++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/divide-7.c |  9 +++++++++
>  3 files changed, 33 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 84c9b918041..52a0f77f455 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -432,6 +432,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>        && TYPE_UNSIGNED (type))
>    (trunc_div @0 @1)))
>
> + /* 1 / X -> X == 1 for unsigned integer X.
> +    1 / X -> X >= -1 && X <= 1 ? X : 0 for signed integer X.
> +    But not for 1 / 0 so that we can get proper warnings and errors,
> +    and not for 1-bit integers as they are edge cases better handled elsewhere. */
> +(simplify
> +  (trunc_div integer_onep@0 @1)
> +  (if (INTEGRAL_TYPE_P (type) && !integer_zerop (@1) && TYPE_PRECISION (type) > 1)
> +    (switch
> +      (if (TYPE_UNSIGNED (type))
> +        (eq @1 { build_one_cst (type); }))
> +      (if (!TYPE_UNSIGNED (type))

           (if (TYPE_UNSIGNED (type))
            (... A ...)
            (... B ...))

works like if (x) A else B, that's shorter and faster than the switch variant.

OK with that change.

Thanks,
Richard.

> +        (with { tree utype = unsigned_type_for (type); }
> +          (cond (le (plus (convert:utype @1) { build_one_cst (utype); }) { build_int_cst (utype, 2); })
> +            @1 { build_zero_cst (type); }))))))
> +
>  /* Combine two successive divisions.  Note that combining ceil_div
>     and floor_div is trickier and combining round_div even more so.  */
>  (for div (trunc_div exact_div)
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c b/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
> new file mode 100644
> index 00000000000..a9fc4c04058
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-optimized" } */
> +
> +unsigned int f(unsigned int x) {
> +  return 1 / x;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "1 / x_..D.;" "optimized" } } */
> +/* { dg-final { scan-tree-dump "x_..D. == 1;" "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c b/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
> new file mode 100644
> index 00000000000..285279af7c2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-optimized" } */
> +
> +int f(int x) {
> +  return 1 / x;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "1 / x_..D.;" "optimized" } } */
> +/* { dg-final { scan-tree-dump ".. <= 2 ? x_..D. : 0;" "optimized" } } */
> --
> 2.17.1
>

diff mbox series

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index 84c9b918041..52a0f77f455 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -432,6 +432,21 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
       && TYPE_UNSIGNED (type))
   (trunc_div @0 @1)))

+ /* 1 / X -> X == 1 for unsigned integer X.
+    1 / X -> X >= -1 && X <= 1 ? X : 0 for signed integer X.
+    But not for 1 / 0 so that we can get proper warnings and errors,
+    and not for 1-bit integers as they are edge cases better handled elsewhere. */
+(simplify
+  (trunc_div integer_onep@0 @1)
+  (if (INTEGRAL_TYPE_P (type) && !integer_zerop (@1) && TYPE_PRECISION (type) > 1)
+    (switch
+      (if (TYPE_UNSIGNED (type))
+        (eq @1 { build_one_cst (type); }))
+      (if (!TYPE_UNSIGNED (type))
+        (with { tree utype = unsigned_type_for (type); }
+          (cond (le (plus (convert:utype @1) { build_one_cst (utype); }) { build_int_cst (utype, 2); })
+            @1 { build_zero_cst (type); }))))))
+
 /* Combine two successive divisions.  Note that combining ceil_div
    and floor_div is trickier and combining round_div even more so.  */
 (for div (trunc_div exact_div)
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c b/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
new file mode 100644
index 00000000000..a9fc4c04058
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/divide-6.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-optimized" } */
+
+unsigned int f(unsigned int x) {
+  return 1 / x;
+}
+
+/* { dg-final { scan-tree-dump-not "1 / x_..D.;" "optimized" } } */
+/* { dg-final { scan-tree-dump "x_..D. == 1;" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c b/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
new file mode 100644
index 00000000000..285279af7c2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/divide-7.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-optimized" } */
+
+int f(int x) {
+  return 1 / x;
+}
+
+/* { dg-final { scan-tree-dump-not "1 / x_..D.;" "optimized" } } */
+/* { dg-final { scan-tree-dump ".. <= 2 ? x_..D. : 0;" "optimized" } } */