Simplify (B * v + C) * D -> BD * v + CD when B, C, D are all INTEGER_CST.
Commit Message
>> + (mult:c (plus:c@4 (mult:c@5 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
>since canonicalization puts INTEGER_CSTs last the :c should not be necessary.
Changed.
>> + (if (single_use (@4)
>> + && single_use (@5))
>since the resulting expression is not simple using :s instead of
>single_use (..) should
>work as well.
Changed.
> when we go from (a + CST1) * CST2 to a * CST2 + CST1*CST2 we have
> to worry about CST1 == -a which would make (a+CST1) * INT_MAX
> not overflow but a * INT_MAX + CST1 * INT_MAX might. Is the
> overflow check for CST1 * INT_MAX sufficient to rule out
> that a * CST2 does not overflow when (a + CST1) * CST2 does not
> overflow? Consider a == 2, CST1 == -1, CST2 == INT_MAX,
> here 1 * INT_MAX does not overflow, nor does -1 * INT_MAX, but
> 2 * INT_MAX overflows and thus the resulting expression invokes
> undefined behavior.
>
> The same issue probably arises for the first pattern outer half
> which looks like (a' + CST2) * CST3 with a' = a * CST1?
>
> The appropriate solution might be to perform the arithmetic
> in an unsigned type with the implication that has on value-range
> analysis.
Yes, the patch is updated based on value-range analysis.
Update the patch.
Similar for (v + B) * C + D -> C * v + BCD.
Don't simplify it when there's overflow and overflow is UB for type v.
gcc/ChangeLog:
PR tree-optimization/53533
* match.pd: Simplify (B * v + C) * D -> BD * v + CD and
(v + B) * C + D -> C * v + BCD when B,C,D are all INTEGER_CST,
and there's no overflow or !TYPE_OVERFLOW_UNDEFINED.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr53533-1.c: New test.
* gcc.target/i386/pr53533-2.c: New test.
* gcc.target/i386/pr53533-3.c: New test.
* gcc.target/i386/pr53533-4.c: New test.
* gcc.target/i386/pr53533-5.c: New test.
* gcc.dg/vect/slp-11a.c: Adjust testcase.
---
gcc/match.pd | 82 +++++++++++++++++++++++
gcc/testsuite/gcc.dg/vect/slp-11a.c | 10 +--
gcc/testsuite/gcc.target/i386/pr53533-1.c | 23 +++++++
gcc/testsuite/gcc.target/i386/pr53533-2.c | 46 +++++++++++++
gcc/testsuite/gcc.target/i386/pr53533-3.c | 24 +++++++
gcc/testsuite/gcc.target/i386/pr53533-4.c | 46 +++++++++++++
gcc/testsuite/gcc.target/i386/pr53533-5.c | 22 ++++++
7 files changed, 248 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-4.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-5.c
Comments
On Tue, Jun 7, 2022 at 9:54 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> >> + (mult:c (plus:c@4 (mult:c@5 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
>
> >since canonicalization puts INTEGER_CSTs last the :c should not be necessary.
>
> Changed.
>
> >> + (if (single_use (@4)
> >> + && single_use (@5))
>
> >since the resulting expression is not simple using :s instead of
> >single_use (..) should
> >work as well.
>
> Changed.
>
> > when we go from (a + CST1) * CST2 to a * CST2 + CST1*CST2 we have
> > to worry about CST1 == -a which would make (a+CST1) * INT_MAX
> > not overflow but a * INT_MAX + CST1 * INT_MAX might. Is the
> > overflow check for CST1 * INT_MAX sufficient to rule out
> > that a * CST2 does not overflow when (a + CST1) * CST2 does not
> > overflow? Consider a == 2, CST1 == -1, CST2 == INT_MAX,
> > here 1 * INT_MAX does not overflow, nor does -1 * INT_MAX, but
> > 2 * INT_MAX overflows and thus the resulting expression invokes
> > undefined behavior.
> >
> > The same issue probably arises for the first pattern outer half
> > which looks like (a' + CST2) * CST3 with a' = a * CST1?
> >
> > The appropriate solution might be to perform the arithmetic
> > in an unsigned type with the implication that has on value-range
> > analysis.
>
> Yes, the patch is updated based on value-range analysis.
>
> Update the patch.
>
> Similar for (v + B) * C + D -> C * v + BCD.
> Don't simplify it when there's overflow and overflow is UB for type v.
>
> gcc/ChangeLog:
>
> PR tree-optimization/53533
> * match.pd: Simplify (B * v + C) * D -> BD * v + CD and
> (v + B) * C + D -> C * v + BCD when B,C,D are all INTEGER_CST,
> and there's no overflow or !TYPE_OVERFLOW_UNDEFINED.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr53533-1.c: New test.
> * gcc.target/i386/pr53533-2.c: New test.
> * gcc.target/i386/pr53533-3.c: New test.
> * gcc.target/i386/pr53533-4.c: New test.
> * gcc.target/i386/pr53533-5.c: New test.
> * gcc.dg/vect/slp-11a.c: Adjust testcase.
> ---
> gcc/match.pd | 82 +++++++++++++++++++++++
> gcc/testsuite/gcc.dg/vect/slp-11a.c | 10 +--
> gcc/testsuite/gcc.target/i386/pr53533-1.c | 23 +++++++
> gcc/testsuite/gcc.target/i386/pr53533-2.c | 46 +++++++++++++
> gcc/testsuite/gcc.target/i386/pr53533-3.c | 24 +++++++
> gcc/testsuite/gcc.target/i386/pr53533-4.c | 46 +++++++++++++
> gcc/testsuite/gcc.target/i386/pr53533-5.c | 22 ++++++
> 7 files changed, 248 insertions(+), 5 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-3.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-4.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-5.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 44a385b912d..54f53a1f988 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -489,6 +489,88 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> (if (!overflow || TYPE_OVERFLOW_WRAPS (type))
> (mult @0 { wide_int_to_tree (type, mul); }))))
>
> +/* Similar to above, but there could be an extra add/sub between
> +   successive multiplications. */
> +(simplify
> + (mult (plus:s (mult:s@4 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
> + (with {
> + bool overflowed = true;
> + wi::overflow_type ovf1, ovf2;
> + wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@3),
> + TYPE_SIGN (type), &ovf1);
> + wide_int add = wi::mul (wi::to_wide (@2), wi::to_wide (@3),
> + TYPE_SIGN (type), &ovf2);
> + if (TYPE_OVERFLOW_UNDEFINED (type))
> + {
> +#if GIMPLE
> + value_range vr0;
> + if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE
> + && get_global_range_query ()->range_of_expr (vr0, @4)
> + && vr0.kind () == VR_RANGE)
> + {
> + wide_int wmin0 = vr0.lower_bound ();
> + wide_int wmax0 = vr0.upper_bound ();
> + wmin0 = wi::mul (wmin0, wi::to_wide (@3), TYPE_SIGN (type), &ovf1);
> + wmax0 = wi::mul (wmax0, wi::to_wide (@3), TYPE_SIGN (type), &ovf2);
> + if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
> + {
> + wi::add (wmin0, add, TYPE_SIGN (type), &ovf1);
> + wi::add (wmax0, add, TYPE_SIGN (type), &ovf2);
> + if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
> + overflowed = false;
I wonder if we have an API available for these kind of queries already?
Possibly value_range expr_range (MULT_EXPR, vr0, vr1, &ovf),
computing the result range of the operation on ranges vr0 and vr1
indicating overflow behavior in 'ovf' (similar as to the wide_int APIs)?
Andrew?
I see match.pd already has similar code in a few places.
> + }
> + }
> +#endif
> + }
> + else
> + overflowed = false;
> + }
> + /* Skip folding on overflow. */
> + (if (!overflowed)
> + (plus (mult @0 { wide_int_to_tree (type, mul); })
> + { wide_int_to_tree (type, add); }))))
> +
> +/* Similar to above, but a multiplication between successive additions. */
> +(simplify
> + (plus (mult:s (plus:s @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
> + (with {
> + bool overflowed = true;
> + wi::overflow_type ovf1;
> + wi::overflow_type ovf2;
> + wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@2),
> + TYPE_SIGN (type), &ovf1);
> + wide_int add = wi::add (mul, wi::to_wide (@3),
> + TYPE_SIGN (type), &ovf2);
> + if (TYPE_OVERFLOW_UNDEFINED (type))
> + {
> +#if GIMPLE
> + value_range vr0;
> + if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE
> + && get_global_range_query ()->range_of_expr (vr0, @0)
> + && vr0.kind () == VR_RANGE)
> + {
> + wide_int wmin0 = vr0.lower_bound ();
> + wide_int wmax0 = vr0.upper_bound ();
> + wmin0 = wi::mul (wmin0, wi::to_wide (@2), TYPE_SIGN (type), &ovf1);
> + wmax0 = wi::mul (wmax0, wi::to_wide (@2), TYPE_SIGN (type), &ovf2);
> + if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
> + {
> + wi::add (wmin0, mul, TYPE_SIGN (type), &ovf1);
> + wi::add (wmax0, mul, TYPE_SIGN (type), &ovf2);
> + if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
> + overflowed = false;
> + }
> + }
> +#endif
> + }
> + else
> + overflowed = false;
> +
unnecessary vertical space
OK with that fixed.
Thanks,
Richard.
> + }
> + /* Skip folding on overflow. */
> + (if (!overflowed)
> + (plus (mult @0 @2) { wide_int_to_tree (type, add); }))))
> +
> /* Optimize A / A to 1.0 if we don't care about
> NaNs or Infinities. */
> (simplify
> diff --git a/gcc/testsuite/gcc.dg/vect/slp-11a.c b/gcc/testsuite/gcc.dg/vect/slp-11a.c
> index bcd3c861ca4..e6632fa77be 100644
> --- a/gcc/testsuite/gcc.dg/vect/slp-11a.c
> +++ b/gcc/testsuite/gcc.dg/vect/slp-11a.c
> @@ -9,14 +9,14 @@ int
> main1 ()
> {
> int i;
> - unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
> - unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
> + int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
> + int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
>
> /* Different operations - not SLPable. */
> for (i = 0; i < N; i++)
> {
> a0 = in[i*8] + 5;
> - a1 = in[i*8 + 1] * 6;
> + a1 = in[i*8 + 1] * 51072;
> a2 = in[i*8 + 2] + 7;
> a3 = in[i*8 + 3] + 8;
> a4 = in[i*8 + 4] + 9;
> @@ -25,7 +25,7 @@ main1 ()
> a7 = in[i*8 + 7] + 12;
>
> b0 = a0 * 3;
> - b1 = a1 * 2;
> + b1 = a1 * 51072;
> b2 = a2 * 12;
> b3 = a3 * 5;
> b4 = a4 * 8;
> @@ -47,7 +47,7 @@ main1 ()
> for (i = 0; i < N; i++)
> {
> if (out[i*8] != (in[i*8] + 5) * 3 - 2
> - || out[i*8 + 1] != (in[i*8 + 1] * 6) * 2 - 3
> + || out[i*8 + 1] != (in[i*8 + 1] * 51072) * 51072 - 3
> || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2
> || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1
> || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8
> diff --git a/gcc/testsuite/gcc.target/i386/pr53533-1.c b/gcc/testsuite/gcc.target/i386/pr53533-1.c
> new file mode 100644
> index 00000000000..095de665366
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr53533-1.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1" } */
> +/* { dg-final { scan-assembler-times "imull\[ \t\]" "1" } } */
> +/* { dg-final { scan-assembler-times "(?:addl|subl)\[ \t\]" "1" { target { ! ia32 } } } } */
> +
> +void
> +__attribute__((noipa))
> +foo (unsigned a[256], unsigned b[256])
> +{
> + int i;
> + for (i = 0; i < 256; ++i)
> + {
> + unsigned tmp = a[i] + 12345U;
> + tmp *= 914237U;
> + tmp += 12332U;
> + tmp *= 914237U;
> + tmp += 12332U;
> + tmp *= 914237U;
> + tmp -= 13U;
> + tmp *= 8000U;
> + b[i] = tmp;
> + }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr53533-2.c b/gcc/testsuite/gcc.target/i386/pr53533-2.c
> new file mode 100644
> index 00000000000..c31b6ff4dec
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr53533-2.c
> @@ -0,0 +1,46 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +
> +#include "pr53533-1.c"
> +
> +void
> +__attribute__((optimize("-O0")))
> +foo1 (unsigned a[256], unsigned b[256])
> +{
> + int i;
> + for (i = 0; i < 256; ++i)
> + {
> + unsigned tmp = a[i] + 12345U;
> + tmp *= 914237U;
> + tmp += 12332U;
> + tmp *= 914237U;
> + tmp += 12332U;
> + tmp *= 914237U;
> + tmp -= 13U;
> + tmp *= 8000U;
> + b[i] = tmp;
> + }
> +}
> +
> +int main()
> +{
> + unsigned int a[256];
> + unsigned int b[256];
> + unsigned int c[256];
> + for (unsigned int i = 0; i != 256; i++)
> + {
> + b[i] = 0;
> + c[i] = 1;
> + a[i] = i * i - 10 * i + 33;
> + }
> + foo (a, b);
> + foo1 (a, c);
> +
> + for (unsigned int i = 0; i != 256; i++)
> + {
> + if (b[i] != c[i])
> + __builtin_abort ();
> + }
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr53533-3.c b/gcc/testsuite/gcc.target/i386/pr53533-3.c
> new file mode 100644
> index 00000000000..3b260d134e9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr53533-3.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -fwrapv" } */
> +/* { dg-final { scan-assembler-times "imull\[ \t\]" "1" } } */
> +/* { dg-final { scan-assembler-times "(?:addl|subl)\[ \t\]" "1" { target { ! ia32 } } } } */
> +
> +void
> +__attribute__((noipa))
> +foo (int a[256], int b[256])
> +{
> + int i;
> + for (i = 0; i < 256; ++i)
> + {
> + int tmp = a[i] + 12345;
> + tmp *= 914237;
> + tmp += 12332;
> + tmp *= 914237;
> + tmp += 12332;
> + tmp *= 914237;
> + tmp -= 13;
> + tmp *= 8000;
> + b[i] = tmp;
> + }
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/i386/pr53533-4.c b/gcc/testsuite/gcc.target/i386/pr53533-4.c
> new file mode 100644
> index 00000000000..c29f90a44dc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr53533-4.c
> @@ -0,0 +1,46 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fwrapv" } */
> +
> +#include "pr53533-3.c"
> +
> +void
> +__attribute__((optimize("-O0")))
> +foo1 (int a[256], int b[256])
> +{
> + int i;
> + for (i = 0; i < 256; ++i)
> + {
> + int tmp = a[i] + 12345;
> + tmp *= 914237;
> + tmp += 12332;
> + tmp *= 914237;
> + tmp += 12332;
> + tmp *= 914237;
> + tmp -= 13;
> + tmp *= 8000;
> + b[i] = tmp;
> + }
> +}
> +
> +int main()
> +{
> + int a[256];
> + int b[256];
> + int c[256];
> + for (int i = 0; i != 256; i++)
> + {
> + b[i] = 0;
> + c[i] = 1;
> + a[i] = i * i - 10 * i + 33;
> + }
> + foo (a, b);
> + foo1 (a, c);
> +
> + for (unsigned int i = 0; i != 256; i++)
> + {
> + if (b[i] != c[i])
> + __builtin_abort ();
> + }
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr53533-5.c b/gcc/testsuite/gcc.target/i386/pr53533-5.c
> new file mode 100644
> index 00000000000..56fe4f44064
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr53533-5.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-times {(?n)\* 2147483647} 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)\* 1073741823} 1 "optimized" } } */
> +
> +#define INT_MAX 2147483647
> +int
> +foo (int a)
> +{
> + /* When a == -2, there's no overflow for (a + 1) * INT_MAX - 1.
> + but overflow for a * INT_MAX + (INT_MAX - 1).
> +     Don't simplify it.  */
> + return (a + 1) * INT_MAX - 1;
> +}
> +
> +int
> +foo1 (int a)
> +{
> + /* Be conservative here, don't simplify this as long as
> + a * 2147483646 may overflow. */
> + return 1073741823 * (a * 2 + 1);
> +}
> --
> 2.18.1
>
On 6/15/22 04:24, Richard Biener wrote:
>
>> diff --git a/gcc/match.pd b/gcc/match.pd
>> index 44a385b912d..54f53a1f988 100644
>> --- a/gcc/match.pd
>> +++ b/gcc/match.pd
>> @@ -489,6 +489,88 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>> (if (!overflow || TYPE_OVERFLOW_WRAPS (type))
>> (mult @0 { wide_int_to_tree (type, mul); }))))
>>
>> +/* Similar to above, but there could be an extra add/sub between
>> +   successive multiplications. */
>> +(simplify
>> + (mult (plus:s (mult:s@4 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
>> + (with {
>> + bool overflowed = true;
>> + wi::overflow_type ovf1, ovf2;
>> + wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@3),
>> + TYPE_SIGN (type), &ovf1);
>> + wide_int add = wi::mul (wi::to_wide (@2), wi::to_wide (@3),
>> + TYPE_SIGN (type), &ovf2);
>> + if (TYPE_OVERFLOW_UNDEFINED (type))
>> + {
>> +#if GIMPLE
>> + value_range vr0;
>> + if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE
>> + && get_global_range_query ()->range_of_expr (vr0, @4)
>> + && vr0.kind () == VR_RANGE)
>> + {
>> + wide_int wmin0 = vr0.lower_bound ();
>> + wide_int wmax0 = vr0.upper_bound ();
>> + wmin0 = wi::mul (wmin0, wi::to_wide (@3), TYPE_SIGN (type), &ovf1);
>> + wmax0 = wi::mul (wmax0, wi::to_wide (@3), TYPE_SIGN (type), &ovf2);
>> + if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
>> + {
>> + wi::add (wmin0, add, TYPE_SIGN (type), &ovf1);
>> + wi::add (wmax0, add, TYPE_SIGN (type), &ovf2);
>> + if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
>> + overflowed = false;
> I wonder if we have an API available for these kind of queries already?
> Possibly value_range expr_range (MULT_EXPR, vr0, vr1, &ovf),
> computing the result range of the operation on ranges vr0 and vr1
> indicating overflow behavior in 'ovf' (similar as to the wide_int APIs)?
>
> Andrew?
>
> I see match.pd already has similar code in a few places.
The overflow characteristic is currently "hidden" in the API. ie, you
can do
range_op_handler (MULT_EXPR, type).fold_range (r, type, vr0, vr1);
and get the result in 'r', which typically, if it overflows for MULT, is
varying I think. (We give up easily. :-) But the fact that an overflow
may have happened is not explicitly provided.
If this was something generally useful, we could consider adding an
optional flag to the API which indicates whether an overflow may happen.
Then you'd have something like:
range_op_handler (MULT_EXPR, type).fold_range (r, type, vr0, vr1, &ovf);
The original range-ops implementation way back when did carry an
overflow around, but it was removed early on. Someone would have to go
in and tweak all the fold_range routines to set the flag correctly, if
it was passed in. If it's going to work for one it would have to work for
all.
I do believe the information is all contained in every relevant range-op
routine, we just don't propagate it back to the caller. Most of the
"generic" routines have a wi_op_overflows () method which determines
whether any pair of wide-ints overflow the operation, we'd just have to
return the flag in those cases. The rest of the cases would have to be
audited.
And in todays multi-type environment, we'd have to handle it for the
other types as well.. so an integer specific value like wi::OVF_* may
not be appropriate. Perhaps just a boolean flag indicating if the
result is an accurate calculation, or integrates possibly out of range
values.
Andrew
PS. We could also simplify the API slightly for general consumption via
the gimple-range-fold.h routines. they currently support just gimple
statements:
bool fold_range (vrange &r, gimple *s, vrange &r1, vrange &r2);
but we could probably also manage something like:
bool fold_range (vrange &r, MULT_EXPR, vrange &r1, vrange &r2);
where we key off the types of r1 and r2 when possible... and return
false otherwise. Then if we have extended the range-ops API to include
an overflow, we could attach it here too.
Andrew
@@ -489,6 +489,88 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(if (!overflow || TYPE_OVERFLOW_WRAPS (type))
(mult @0 { wide_int_to_tree (type, mul); }))))
+/* Similar to above, but there could be an extra add/sub between
+   successive multiplications. */
+(simplify
+ (mult (plus:s (mult:s@4 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
+ (with {
+ bool overflowed = true;
+ wi::overflow_type ovf1, ovf2;
+ wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@3),
+ TYPE_SIGN (type), &ovf1);
+ wide_int add = wi::mul (wi::to_wide (@2), wi::to_wide (@3),
+ TYPE_SIGN (type), &ovf2);
+ if (TYPE_OVERFLOW_UNDEFINED (type))
+ {
+#if GIMPLE
+ value_range vr0;
+ if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE
+ && get_global_range_query ()->range_of_expr (vr0, @4)
+ && vr0.kind () == VR_RANGE)
+ {
+ wide_int wmin0 = vr0.lower_bound ();
+ wide_int wmax0 = vr0.upper_bound ();
+ wmin0 = wi::mul (wmin0, wi::to_wide (@3), TYPE_SIGN (type), &ovf1);
+ wmax0 = wi::mul (wmax0, wi::to_wide (@3), TYPE_SIGN (type), &ovf2);
+ if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
+ {
+ wi::add (wmin0, add, TYPE_SIGN (type), &ovf1);
+ wi::add (wmax0, add, TYPE_SIGN (type), &ovf2);
+ if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
+ overflowed = false;
+ }
+ }
+#endif
+ }
+ else
+ overflowed = false;
+ }
+ /* Skip folding on overflow. */
+ (if (!overflowed)
+ (plus (mult @0 { wide_int_to_tree (type, mul); })
+ { wide_int_to_tree (type, add); }))))
+
+/* Similar to above, but a multiplication between successive additions. */
+(simplify
+ (plus (mult:s (plus:s @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
+ (with {
+ bool overflowed = true;
+ wi::overflow_type ovf1;
+ wi::overflow_type ovf2;
+ wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@2),
+ TYPE_SIGN (type), &ovf1);
+ wide_int add = wi::add (mul, wi::to_wide (@3),
+ TYPE_SIGN (type), &ovf2);
+ if (TYPE_OVERFLOW_UNDEFINED (type))
+ {
+#if GIMPLE
+ value_range vr0;
+ if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE
+ && get_global_range_query ()->range_of_expr (vr0, @0)
+ && vr0.kind () == VR_RANGE)
+ {
+ wide_int wmin0 = vr0.lower_bound ();
+ wide_int wmax0 = vr0.upper_bound ();
+ wmin0 = wi::mul (wmin0, wi::to_wide (@2), TYPE_SIGN (type), &ovf1);
+ wmax0 = wi::mul (wmax0, wi::to_wide (@2), TYPE_SIGN (type), &ovf2);
+ if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
+ {
+ wi::add (wmin0, mul, TYPE_SIGN (type), &ovf1);
+ wi::add (wmax0, mul, TYPE_SIGN (type), &ovf2);
+ if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
+ overflowed = false;
+ }
+ }
+#endif
+ }
+ else
+ overflowed = false;
+
+ }
+ /* Skip folding on overflow. */
+ (if (!overflowed)
+ (plus (mult @0 @2) { wide_int_to_tree (type, add); }))))
+
/* Optimize A / A to 1.0 if we don't care about
NaNs or Infinities. */
(simplify
@@ -9,14 +9,14 @@ int
main1 ()
{
int i;
- unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
- unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
+ int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
/* Different operations - not SLPable. */
for (i = 0; i < N; i++)
{
a0 = in[i*8] + 5;
- a1 = in[i*8 + 1] * 6;
+ a1 = in[i*8 + 1] * 51072;
a2 = in[i*8 + 2] + 7;
a3 = in[i*8 + 3] + 8;
a4 = in[i*8 + 4] + 9;
@@ -25,7 +25,7 @@ main1 ()
a7 = in[i*8 + 7] + 12;
b0 = a0 * 3;
- b1 = a1 * 2;
+ b1 = a1 * 51072;
b2 = a2 * 12;
b3 = a3 * 5;
b4 = a4 * 8;
@@ -47,7 +47,7 @@ main1 ()
for (i = 0; i < N; i++)
{
if (out[i*8] != (in[i*8] + 5) * 3 - 2
- || out[i*8 + 1] != (in[i*8 + 1] * 6) * 2 - 3
+ || out[i*8 + 1] != (in[i*8 + 1] * 51072) * 51072 - 3
|| out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2
|| out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1
|| out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8
new file mode 100644
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* { dg-final { scan-assembler-times "imull\[ \t\]" "1" } } */
+/* { dg-final { scan-assembler-times "(?:addl|subl)\[ \t\]" "1" { target { ! ia32 } } } } */
+
+void
+__attribute__((noipa))
+foo (unsigned a[256], unsigned b[256])
+{
+ int i;
+ for (i = 0; i < 256; ++i)
+ {
+ unsigned tmp = a[i] + 12345U;
+ tmp *= 914237U;
+ tmp += 12332U;
+ tmp *= 914237U;
+ tmp += 12332U;
+ tmp *= 914237U;
+ tmp -= 13U;
+ tmp *= 8000U;
+ b[i] = tmp;
+ }
+}
new file mode 100644
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+#include "pr53533-1.c"
+
+void
+__attribute__((optimize("-O0")))
+foo1 (unsigned a[256], unsigned b[256])
+{
+ int i;
+ for (i = 0; i < 256; ++i)
+ {
+ unsigned tmp = a[i] + 12345U;
+ tmp *= 914237U;
+ tmp += 12332U;
+ tmp *= 914237U;
+ tmp += 12332U;
+ tmp *= 914237U;
+ tmp -= 13U;
+ tmp *= 8000U;
+ b[i] = tmp;
+ }
+}
+
+int main()
+{
+ unsigned int a[256];
+ unsigned int b[256];
+ unsigned int c[256];
+ for (unsigned int i = 0; i != 256; i++)
+ {
+ b[i] = 0;
+ c[i] = 1;
+ a[i] = i * i - 10 * i + 33;
+ }
+ foo (a, b);
+ foo1 (a, c);
+
+ for (unsigned int i = 0; i != 256; i++)
+ {
+ if (b[i] != c[i])
+ __builtin_abort ();
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fwrapv" } */
+/* { dg-final { scan-assembler-times "imull\[ \t\]" "1" } } */
+/* { dg-final { scan-assembler-times "(?:addl|subl)\[ \t\]" "1" { target { ! ia32 } } } } */
+
+void
+__attribute__((noipa))
+foo (int a[256], int b[256])
+{
+ int i;
+ for (i = 0; i < 256; ++i)
+ {
+ int tmp = a[i] + 12345;
+ tmp *= 914237;
+ tmp += 12332;
+ tmp *= 914237;
+ tmp += 12332;
+ tmp *= 914237;
+ tmp -= 13;
+ tmp *= 8000;
+ b[i] = tmp;
+ }
+}
+
new file mode 100644
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fwrapv" } */
+
+#include "pr53533-3.c"
+
+void
+__attribute__((optimize("-O0")))
+foo1 (int a[256], int b[256])
+{
+ int i;
+ for (i = 0; i < 256; ++i)
+ {
+ int tmp = a[i] + 12345;
+ tmp *= 914237;
+ tmp += 12332;
+ tmp *= 914237;
+ tmp += 12332;
+ tmp *= 914237;
+ tmp -= 13;
+ tmp *= 8000;
+ b[i] = tmp;
+ }
+}
+
+int main()
+{
+ int a[256];
+ int b[256];
+ int c[256];
+ for (int i = 0; i != 256; i++)
+ {
+ b[i] = 0;
+ c[i] = 1;
+ a[i] = i * i - 10 * i + 33;
+ }
+ foo (a, b);
+ foo1 (a, c);
+
+ for (unsigned int i = 0; i != 256; i++)
+ {
+ if (b[i] != c[i])
+ __builtin_abort ();
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times {(?n)\* 2147483647} 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times {(?n)\* 1073741823} 1 "optimized" } } */
+
+#define INT_MAX 2147483647
+int
+foo (int a)
+{
+ /* When a == -2, there's no overflow for (a + 1) * INT_MAX - 1.
+ but overflow for a * INT_MAX + (INT_MAX - 1).
+     Don't simplify it.  */
+ return (a + 1) * INT_MAX - 1;
+}
+
+int
+foo1 (int a)
+{
+ /* Be conservative here, don't simplify this as long as
+ a * 2147483646 may overflow. */
+ return 1073741823 * (a * 2 + 1);
+}