Simplify (B * v + C) * D -> BD* v + CD when B, C, D are all INTEGER_CST.

Message ID 20220607075431.24764-1-hongtao.liu@intel.com
State New
Headers
Series Simplify (B * v + C) * D -> BD* v + CD when B, C, D are all INTEGER_CST. |

Commit Message

Liu, Hongtao June 7, 2022, 7:54 a.m. UTC
  >> + (mult:c (plus:c@4 (mult:c@5 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)

>since canonicalization puts INTEGER_CSTs last the :c should not be necessary.

Changed.

>> + (if (single_use (@4)
>> +      && single_use (@5))

>since the resulting expression is not simple using :s instead of
>single_use (..) should
>work as well.

Changed.

> when we go from (a + CST1) * CST2 to a * CST2 + CST1*CST2 we have
> to worry about CST1 == -a which would make (a+CST1) * INT_MAX
> not overflow but a * INT_MAX + CST1 * INT_MAX might.  Is the
> overflow check for CST1 * INT_MAX sufficient to rule out
> that a * CST2 does not overflow when (a + CST1) * CST2 does not
> overflow?  Consider a == 2, CST1 == -1, CST2 == INT_MAX,
> here 1 * INT_MAX does not overflow, nor does -1 * INT_MAX, but
> 2 * INT_MAX overflows and thus the resulting expression invokes
> undefined behavior.
>
> The same issue probably arises for the first pattern outer half
> which looks like (a' + CST2) * CST3 with a' = a * CST1?
>
> The appropriate solution might be to perform the arithmetic
> in an unsigned type with the implication that has on value-range
> analysis.

Yes, the patch patched based on value-range analysis.

Update the patch.

Similar for (v + B) * C + D -> C * v + BCD.
Don't simplify it when there's overflow and overflow is UB for type v.

gcc/ChangeLog:

	PR tree-optimization/53533
	* match.pd: Simplify (B * v + C) * D -> BD * v + CD and
	(v + B) * C + D -> C * v + BCD when B,C,D are all INTEGER_CST,
	and there's no overflow or !TYPE_OVERFLOW_UNDEFINED.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr53533-1.c: New test.
	* gcc.target/i386/pr53533-2.c: New test.
	* gcc.target/i386/pr53533-3.c: New test.
	* gcc.target/i386/pr53533-4.c: New test.
	* gcc.target/i386/pr53533-5.c: New test.
	* gcc.dg/vect/slp-11a.c: Adjust testcase.
---
 gcc/match.pd                              | 82 +++++++++++++++++++++++
 gcc/testsuite/gcc.dg/vect/slp-11a.c       | 10 +--
 gcc/testsuite/gcc.target/i386/pr53533-1.c | 23 +++++++
 gcc/testsuite/gcc.target/i386/pr53533-2.c | 46 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr53533-3.c | 24 +++++++
 gcc/testsuite/gcc.target/i386/pr53533-4.c | 46 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr53533-5.c | 22 ++++++
 7 files changed, 248 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-5.c
  

Comments

Richard Biener June 15, 2022, 8:24 a.m. UTC | #1
On Tue, Jun 7, 2022 at 9:54 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> >> + (mult:c (plus:c@4 (mult:c@5 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
>
> >since canonicalization puts INTEGER_CSTs last the :c should not be necessary.
>
> Changed.
>
> >> + (if (single_use (@4)
> >> +      && single_use (@5))
>
> >since the resulting expression is not simple using :s instead of
> >single_use (..) should
> >work as well.
>
> Changed.
>
> > when we go from (a + CST1) * CST2 to a * CST2 + CST1*CST2 we have
> > to worry about CST1 == -a which would make (a+CST1) * INT_MAX
> > not overflow but a * INT_MAX + CST1 * INT_MAX might.  Is the
> > overflow check for CST1 * INT_MAX sufficient to rule out
> > that a * CST2 does not overflow when (a + CST1) * CST2 does not
> > overflow?  Consider a == 2, CST1 == -1, CST2 == INT_MAX,
> > here 1 * INT_MAX does not overflow, nor does -1 * INT_MAX, but
> > 2 * INT_MAX overflows and thus the resulting expression invokes
> > undefined behavior.
> >
> > The same issue probably arises for the first pattern outer half
> > which looks like (a' + CST2) * CST3 with a' = a * CST1?
> >
> > The appropriate solution might be to perform the arithmetic
> > in an unsigned type with the implication that has on value-range
> > analysis.
>
> Yes, the patch patched based on value-range analysis.
>
> Update the patch.
>
> Similar for (v + B) * C + D -> C * v + BCD.
> Don't simplify it when there's overflow and overflow is UB for type v.
>
> gcc/ChangeLog:
>
>         PR tree-optimization/53533
>         * match.pd: Simplify (B * v + C) * D -> BD * v + CD and
>         (v + B) * C + D -> C * v + BCD when B,C,D are all INTEGER_CST,
>         and there's no overflow or !TYPE_OVERFLOW_UNDEFINED.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr53533-1.c: New test.
>         * gcc.target/i386/pr53533-2.c: New test.
>         * gcc.target/i386/pr53533-3.c: New test.
>         * gcc.target/i386/pr53533-4.c: New test.
>         * gcc.target/i386/pr53533-5.c: New test.
>         * gcc.dg/vect/slp-11a.c: Adjust testcase.
> ---
>  gcc/match.pd                              | 82 +++++++++++++++++++++++
>  gcc/testsuite/gcc.dg/vect/slp-11a.c       | 10 +--
>  gcc/testsuite/gcc.target/i386/pr53533-1.c | 23 +++++++
>  gcc/testsuite/gcc.target/i386/pr53533-2.c | 46 +++++++++++++
>  gcc/testsuite/gcc.target/i386/pr53533-3.c | 24 +++++++
>  gcc/testsuite/gcc.target/i386/pr53533-4.c | 46 +++++++++++++
>  gcc/testsuite/gcc.target/i386/pr53533-5.c | 22 ++++++
>  7 files changed, 248 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-5.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 44a385b912d..54f53a1f988 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -489,6 +489,88 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>    (if (!overflow || TYPE_OVERFLOW_WRAPS (type))
>     (mult @0 { wide_int_to_tree (type, mul); }))))
>
> +/* Similar to above, but there could be an extra add/sub between
> +   successive multuiplications.  */
> +(simplify
> + (mult (plus:s (mult:s@4 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
> + (with {
> +   bool overflowed = true;
> +   wi::overflow_type ovf1, ovf2;
> +   wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@3),
> +                          TYPE_SIGN (type), &ovf1);
> +   wide_int add = wi::mul (wi::to_wide (@2), wi::to_wide (@3),
> +                          TYPE_SIGN (type), &ovf2);
> +  if (TYPE_OVERFLOW_UNDEFINED (type))
> +    {
> +#if GIMPLE
> +      value_range vr0;
> +      if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE
> +         && get_global_range_query ()->range_of_expr (vr0, @4)
> +         && vr0.kind () == VR_RANGE)
> +       {
> +         wide_int wmin0 = vr0.lower_bound ();
> +         wide_int wmax0 = vr0.upper_bound ();
> +         wmin0 = wi::mul (wmin0, wi::to_wide (@3), TYPE_SIGN (type), &ovf1);
> +         wmax0 = wi::mul (wmax0, wi::to_wide (@3), TYPE_SIGN (type), &ovf2);
> +         if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
> +           {
> +             wi::add (wmin0, add, TYPE_SIGN (type), &ovf1);
> +             wi::add (wmax0, add, TYPE_SIGN (type), &ovf2);
> +             if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
> +               overflowed = false;

I wonder if we have an API available for these kind of queries already?
Possibly value_range expr_range (MULT_EXPR, vr0, vr1, &ovf),
computing the result range of the operation on ranges vr0 and vr1
indicating overflow behavior in 'ovf' (similar as to the wide_int APIs)?

Andrew?

I see match.pd already has similar code in a few places.

> +           }
> +       }
> +#endif
> +    }
> +  else
> +   overflowed = false;
> + }
> +  /* Skip folding on overflow.  */
> +  (if (!overflowed)
> +   (plus (mult @0 { wide_int_to_tree (type, mul); })
> +        { wide_int_to_tree (type, add); }))))
> +
> +/* Similar to above, but a multiplication between successive additions.  */
> +(simplify
> + (plus (mult:s (plus:s @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
> + (with {
> +   bool overflowed = true;
> +   wi::overflow_type ovf1;
> +   wi::overflow_type ovf2;
> +   wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@2),
> +                          TYPE_SIGN (type), &ovf1);
> +   wide_int add = wi::add (mul, wi::to_wide (@3),
> +                          TYPE_SIGN (type), &ovf2);
> +  if (TYPE_OVERFLOW_UNDEFINED (type))
> +    {
> +#if GIMPLE
> +      value_range vr0;
> +      if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE
> +         && get_global_range_query ()->range_of_expr (vr0, @0)
> +         && vr0.kind () == VR_RANGE)
> +       {
> +         wide_int wmin0 = vr0.lower_bound ();
> +         wide_int wmax0 = vr0.upper_bound ();
> +         wmin0 = wi::mul (wmin0, wi::to_wide (@2), TYPE_SIGN (type), &ovf1);
> +         wmax0 = wi::mul (wmax0, wi::to_wide (@2), TYPE_SIGN (type), &ovf2);
> +         if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
> +           {
> +             wi::add (wmin0, mul, TYPE_SIGN (type), &ovf1);
> +             wi::add (wmax0, mul, TYPE_SIGN (type), &ovf2);
> +             if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
> +               overflowed = false;
> +           }
> +       }
> +#endif
> +    }
> +  else
> +   overflowed = false;
> +

unnecessary vertical space

OK with that fixed.

Thanks,
Richard.

> + }
> +  /* Skip folding on overflow.  */
> +  (if (!overflowed)
> +   (plus (mult @0 @2) { wide_int_to_tree (type, add); }))))
> +
>  /* Optimize A / A to 1.0 if we don't care about
>     NaNs or Infinities.  */
>  (simplify
> diff --git a/gcc/testsuite/gcc.dg/vect/slp-11a.c b/gcc/testsuite/gcc.dg/vect/slp-11a.c
> index bcd3c861ca4..e6632fa77be 100644
> --- a/gcc/testsuite/gcc.dg/vect/slp-11a.c
> +++ b/gcc/testsuite/gcc.dg/vect/slp-11a.c
> @@ -9,14 +9,14 @@ int
>  main1 ()
>  {
>    int i;
> -  unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
> -  unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
> +  int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
> +  int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
>
>    /* Different operations - not SLPable.  */
>    for (i = 0; i < N; i++)
>      {
>        a0 = in[i*8] + 5;
> -      a1 = in[i*8 + 1] * 6;
> +      a1 = in[i*8 + 1] * 51072;
>        a2 = in[i*8 + 2] + 7;
>        a3 = in[i*8 + 3] + 8;
>        a4 = in[i*8 + 4] + 9;
> @@ -25,7 +25,7 @@ main1 ()
>        a7 = in[i*8 + 7] + 12;
>
>        b0 = a0 * 3;
> -      b1 = a1 * 2;
> +      b1 = a1 * 51072;
>        b2 = a2 * 12;
>        b3 = a3 * 5;
>        b4 = a4 * 8;
> @@ -47,7 +47,7 @@ main1 ()
>    for (i = 0; i < N; i++)
>      {
>        if (out[i*8] !=  (in[i*8] + 5) * 3 - 2
> -         || out[i*8 + 1] != (in[i*8 + 1] * 6) * 2 - 3
> +         || out[i*8 + 1] != (in[i*8 + 1] * 51072) * 51072 - 3
>           || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2
>           || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1
>           || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8
> diff --git a/gcc/testsuite/gcc.target/i386/pr53533-1.c b/gcc/testsuite/gcc.target/i386/pr53533-1.c
> new file mode 100644
> index 00000000000..095de665366
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr53533-1.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1" } */
> +/* { dg-final { scan-assembler-times "imull\[ \t\]" "1" } } */
> +/* { dg-final { scan-assembler-times "(?:addl|subl)\[ \t\]" "1" { target { ! ia32 } } } } */
> +
> +void
> +__attribute__((noipa))
> +foo (unsigned a[256], unsigned b[256])
> +{
> +  int i;
> +  for (i = 0; i < 256; ++i)
> +    {
> +      unsigned tmp = a[i] + 12345U;
> +      tmp *= 914237U;
> +      tmp += 12332U;
> +      tmp *= 914237U;
> +      tmp += 12332U;
> +      tmp *= 914237U;
> +      tmp -= 13U;
> +      tmp *= 8000U;
> +      b[i] = tmp;
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr53533-2.c b/gcc/testsuite/gcc.target/i386/pr53533-2.c
> new file mode 100644
> index 00000000000..c31b6ff4dec
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr53533-2.c
> @@ -0,0 +1,46 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +
> +#include "pr53533-1.c"
> +
> +void
> +__attribute__((optimize("-O0")))
> +foo1 (unsigned a[256], unsigned b[256])
> +{
> +  int i;
> +  for (i = 0; i < 256; ++i)
> +    {
> +      unsigned tmp = a[i] + 12345U;
> +      tmp *= 914237U;
> +      tmp += 12332U;
> +      tmp *= 914237U;
> +      tmp += 12332U;
> +      tmp *= 914237U;
> +      tmp -= 13U;
> +      tmp *= 8000U;
> +      b[i] = tmp;
> +    }
> +}
> +
> +int main()
> +{
> +  unsigned int a[256];
> +  unsigned int b[256];
> +  unsigned int c[256];
> +  for (unsigned int i = 0; i != 256; i++)
> +    {
> +      b[i] = 0;
> +      c[i] = 1;
> +      a[i] = i * i - 10 * i + 33;
> +    }
> +  foo (a, b);
> +  foo1 (a, c);
> +
> +  for (unsigned int i = 0; i != 256; i++)
> +    {
> +      if (b[i] != c[i])
> +       __builtin_abort ();
> +    }
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr53533-3.c b/gcc/testsuite/gcc.target/i386/pr53533-3.c
> new file mode 100644
> index 00000000000..3b260d134e9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr53533-3.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -fwrapv" } */
> +/* { dg-final { scan-assembler-times "imull\[ \t\]" "1" } } */
> +/* { dg-final { scan-assembler-times "(?:addl|subl)\[ \t\]" "1" { target { ! ia32 } } } } */
> +
> +void
> +__attribute__((noipa))
> +foo (int a[256], int b[256])
> +{
> +  int i;
> +  for (i = 0; i < 256; ++i)
> +    {
> +      int tmp = a[i] + 12345;
> +      tmp *= 914237;
> +      tmp += 12332;
> +      tmp *= 914237;
> +      tmp += 12332;
> +      tmp *= 914237;
> +      tmp -= 13;
> +      tmp *= 8000;
> +      b[i] = tmp;
> +    }
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/i386/pr53533-4.c b/gcc/testsuite/gcc.target/i386/pr53533-4.c
> new file mode 100644
> index 00000000000..c29f90a44dc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr53533-4.c
> @@ -0,0 +1,46 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fwrapv" } */
> +
> +#include "pr53533-3.c"
> +
> +void
> +__attribute__((optimize("-O0")))
> +foo1 (int a[256], int b[256])
> +{
> +  int i;
> +  for (i = 0; i < 256; ++i)
> +    {
> +      int tmp = a[i] + 12345;
> +      tmp *= 914237;
> +      tmp += 12332;
> +      tmp *= 914237;
> +      tmp += 12332;
> +      tmp *= 914237;
> +      tmp -= 13;
> +      tmp *= 8000;
> +      b[i] = tmp;
> +    }
> +}
> +
> +int main()
> +{
> +  int a[256];
> +  int b[256];
> +  int c[256];
> +  for (int i = 0; i != 256; i++)
> +    {
> +      b[i] = 0;
> +      c[i] = 1;
> +      a[i] = i * i - 10 * i + 33;
> +    }
> +  foo (a, b);
> +  foo1 (a, c);
> +
> +  for (unsigned int i = 0; i != 256; i++)
> +    {
> +      if (b[i] != c[i])
> +       __builtin_abort ();
> +    }
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr53533-5.c b/gcc/testsuite/gcc.target/i386/pr53533-5.c
> new file mode 100644
> index 00000000000..56fe4f44064
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr53533-5.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-times {(?n)\* 2147483647} 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)\* 1073741823} 1 "optimized" } } */
> +
> +#define INT_MAX 2147483647
> +int
> +foo (int a)
> +{
> +  /* When a == -2, there's no overflow for (a + 1) * INT_MAX - 1.
> +     but overflow for a * INT_MAX + (INT_MAX - 1).
> +     Don't simpify it.  */
> +  return (a + 1) * INT_MAX - 1;
> +}
> +
> +int
> +foo1 (int a)
> +{
> +  /* Be conservative here, don't simplify this as long as
> +     a * 2147483646 may overflow.  */
> +  return 1073741823 * (a * 2 + 1);
> +}
> --
> 2.18.1
>
  
Andrew MacLeod June 15, 2022, 1:29 p.m. UTC | #2
On 6/15/22 04:24, Richard Biener wrote:
>
>> diff --git a/gcc/match.pd b/gcc/match.pd
>> index 44a385b912d..54f53a1f988 100644
>> --- a/gcc/match.pd
>> +++ b/gcc/match.pd
>> @@ -489,6 +489,88 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>     (if (!overflow || TYPE_OVERFLOW_WRAPS (type))
>>      (mult @0 { wide_int_to_tree (type, mul); }))))
>>
>> +/* Similar to above, but there could be an extra add/sub between
>> +   successive multuiplications.  */
>> +(simplify
>> + (mult (plus:s (mult:s@4 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
>> + (with {
>> +   bool overflowed = true;
>> +   wi::overflow_type ovf1, ovf2;
>> +   wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@3),
>> +                          TYPE_SIGN (type), &ovf1);
>> +   wide_int add = wi::mul (wi::to_wide (@2), wi::to_wide (@3),
>> +                          TYPE_SIGN (type), &ovf2);
>> +  if (TYPE_OVERFLOW_UNDEFINED (type))
>> +    {
>> +#if GIMPLE
>> +      value_range vr0;
>> +      if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE
>> +         && get_global_range_query ()->range_of_expr (vr0, @4)
>> +         && vr0.kind () == VR_RANGE)
>> +       {
>> +         wide_int wmin0 = vr0.lower_bound ();
>> +         wide_int wmax0 = vr0.upper_bound ();
>> +         wmin0 = wi::mul (wmin0, wi::to_wide (@3), TYPE_SIGN (type), &ovf1);
>> +         wmax0 = wi::mul (wmax0, wi::to_wide (@3), TYPE_SIGN (type), &ovf2);
>> +         if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
>> +           {
>> +             wi::add (wmin0, add, TYPE_SIGN (type), &ovf1);
>> +             wi::add (wmax0, add, TYPE_SIGN (type), &ovf2);
>> +             if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
>> +               overflowed = false;
> I wonder if we have an API available for these kind of queries already?
> Possibly value_range expr_range (MULT_EXPR, vr0, vr1, &ovf),
> computing the result range of the operation on ranges vr0 and vr1
> indicating overflow behavior in 'ovf' (similar as to the wide_int APIs)?
>
> Andrew?
>
> I see match.pd already has similar code in a few places.

The overflow characteristic is currently "hidden" in the API. ie, you 
can do

   range_op_handler (MULT_EXPR, type).fold_range (r, type, vr0, vr1);

and get the result in 'r', which typcically if it overflows for MULT is 
varying I think.  (we give up easily :-)  BUt the fact than an overflow 
may have happened is not explicitly provided.

If this was something generally useful, we could consider adding an 
optional flag to the API which indicates is an overflow may happen.  
Then you'd have something like:
   range_op_handler (MULT_EXPR, type).fold_range (r, type, vr0, vr1, &ovf);

The original range-ops implementation way back when did carry an 
overflow around, but it was removed early on.  Someone would have to go 
in and tweak all the fold_range routines to set the flag correctly, if 
it was passed in. If its going to work for one it would have to work for 
all.

I do believe the information is all contained in every relevant range-op 
routine, we just don't propagate it back to the caller. Most of the 
"generic" routines have a wi_op_overflows () method which determines 
whether any pair of wide-ints overflow the operation, we'd just have to 
return the  flag in those cases.  The rest of the cases would have to be 
audited.

And in todays multi-type environment, we'd have to handle it for the 
other types as well.. so an integer specific value like wi::OVF_*  may 
not be appropriate.   Perhaps just a boolean flag indicating if the 
result is an accurate calculation, or integrates possibly out of range 
values.

Andrew

PS.  We could also simplify the API slightly for general consumption via 
the gimple-range-fold.h routines. they currently support just gimple 
statements:
      bool fold_range (vrange &r, gimple *s, vrange &r1, vrange &r2);
but we could probably also manage something like:
     bool fold_range (vrange &r, MULT_EXPR, vrange &r1, vrange &r2);
where we key off the types of r1 and r2 when possible... and return 
false otherwise.   Then if we have extended the range-ops API to include 
an overflow, we could attach it here too.

Andrew
  

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index 44a385b912d..54f53a1f988 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -489,6 +489,88 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (if (!overflow || TYPE_OVERFLOW_WRAPS (type))
    (mult @0 { wide_int_to_tree (type, mul); }))))
 
+/* Similar to above, but there could be an extra add/sub between
+   successive multuiplications.  */
+(simplify
+ (mult (plus:s (mult:s@4 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
+ (with {
+   bool overflowed = true;
+   wi::overflow_type ovf1, ovf2;
+   wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@3),
+			   TYPE_SIGN (type), &ovf1);
+   wide_int add = wi::mul (wi::to_wide (@2), wi::to_wide (@3),
+			   TYPE_SIGN (type), &ovf2);
+  if (TYPE_OVERFLOW_UNDEFINED (type))
+    {
+#if GIMPLE
+      value_range vr0;
+      if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE
+	  && get_global_range_query ()->range_of_expr (vr0, @4)
+	  && vr0.kind () == VR_RANGE)
+	{
+	  wide_int wmin0 = vr0.lower_bound ();
+	  wide_int wmax0 = vr0.upper_bound ();
+	  wmin0 = wi::mul (wmin0, wi::to_wide (@3), TYPE_SIGN (type), &ovf1);
+	  wmax0 = wi::mul (wmax0, wi::to_wide (@3), TYPE_SIGN (type), &ovf2);
+	  if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
+	    {
+	      wi::add (wmin0, add, TYPE_SIGN (type), &ovf1);
+	      wi::add (wmax0, add, TYPE_SIGN (type), &ovf2);
+	      if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
+		overflowed = false;
+	    }
+	}
+#endif
+    }
+  else
+   overflowed = false;
+ }
+  /* Skip folding on overflow.  */
+  (if (!overflowed)
+   (plus (mult @0 { wide_int_to_tree (type, mul); })
+	 { wide_int_to_tree (type, add); }))))
+
+/* Similar to above, but a multiplication between successive additions.  */
+(simplify
+ (plus (mult:s (plus:s @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
+ (with {
+   bool overflowed = true;
+   wi::overflow_type ovf1;
+   wi::overflow_type ovf2;
+   wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@2),
+			   TYPE_SIGN (type), &ovf1);
+   wide_int add = wi::add (mul, wi::to_wide (@3),
+			   TYPE_SIGN (type), &ovf2);
+  if (TYPE_OVERFLOW_UNDEFINED (type))
+    {
+#if GIMPLE
+      value_range vr0;
+      if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE
+	  && get_global_range_query ()->range_of_expr (vr0, @0)
+	  && vr0.kind () == VR_RANGE)
+	{
+	  wide_int wmin0 = vr0.lower_bound ();
+	  wide_int wmax0 = vr0.upper_bound ();
+	  wmin0 = wi::mul (wmin0, wi::to_wide (@2), TYPE_SIGN (type), &ovf1);
+	  wmax0 = wi::mul (wmax0, wi::to_wide (@2), TYPE_SIGN (type), &ovf2);
+	  if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
+	    {
+	      wi::add (wmin0, mul, TYPE_SIGN (type), &ovf1);
+	      wi::add (wmax0, mul, TYPE_SIGN (type), &ovf2);
+	      if (ovf1 == wi::OVF_NONE && ovf2 == wi::OVF_NONE)
+		overflowed = false;
+	    }
+	}
+#endif
+    }
+  else
+   overflowed = false;
+
+ }
+  /* Skip folding on overflow.  */
+  (if (!overflowed)
+   (plus (mult @0 @2) { wide_int_to_tree (type, add); }))))
+
 /* Optimize A / A to 1.0 if we don't care about
    NaNs or Infinities.  */
 (simplify
diff --git a/gcc/testsuite/gcc.dg/vect/slp-11a.c b/gcc/testsuite/gcc.dg/vect/slp-11a.c
index bcd3c861ca4..e6632fa77be 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-11a.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-11a.c
@@ -9,14 +9,14 @@  int
 main1 ()
 {
   int i;
-  unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
-  unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+  int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
+  int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
 
   /* Different operations - not SLPable.  */
   for (i = 0; i < N; i++)
     {
       a0 = in[i*8] + 5;
-      a1 = in[i*8 + 1] * 6;
+      a1 = in[i*8 + 1] * 51072;
       a2 = in[i*8 + 2] + 7;
       a3 = in[i*8 + 3] + 8;
       a4 = in[i*8 + 4] + 9;
@@ -25,7 +25,7 @@  main1 ()
       a7 = in[i*8 + 7] + 12;
 
       b0 = a0 * 3;
-      b1 = a1 * 2;
+      b1 = a1 * 51072;
       b2 = a2 * 12;
       b3 = a3 * 5;
       b4 = a4 * 8;
@@ -47,7 +47,7 @@  main1 ()
   for (i = 0; i < N; i++)
     {
       if (out[i*8] !=  (in[i*8] + 5) * 3 - 2
-         || out[i*8 + 1] != (in[i*8 + 1] * 6) * 2 - 3
+         || out[i*8 + 1] != (in[i*8 + 1] * 51072) * 51072 - 3
          || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2
          || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1
          || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8
diff --git a/gcc/testsuite/gcc.target/i386/pr53533-1.c b/gcc/testsuite/gcc.target/i386/pr53533-1.c
new file mode 100644
index 00000000000..095de665366
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53533-1.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* { dg-final { scan-assembler-times "imull\[ \t\]" "1" } } */
+/* { dg-final { scan-assembler-times "(?:addl|subl)\[ \t\]" "1" { target { ! ia32 } } } } */
+
+void
+__attribute__((noipa))
+foo (unsigned a[256], unsigned b[256])
+{
+  int i;
+  for (i = 0; i < 256; ++i)
+    {
+      unsigned tmp = a[i] + 12345U;
+      tmp *= 914237U;
+      tmp += 12332U;
+      tmp *= 914237U;
+      tmp += 12332U;
+      tmp *= 914237U;
+      tmp -= 13U;
+      tmp *= 8000U;
+      b[i] = tmp;
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr53533-2.c b/gcc/testsuite/gcc.target/i386/pr53533-2.c
new file mode 100644
index 00000000000..c31b6ff4dec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53533-2.c
@@ -0,0 +1,46 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+#include "pr53533-1.c"
+
+void
+__attribute__((optimize("-O0")))
+foo1 (unsigned a[256], unsigned b[256])
+{
+  int i;
+  for (i = 0; i < 256; ++i)
+    {
+      unsigned tmp = a[i] + 12345U;
+      tmp *= 914237U;
+      tmp += 12332U;
+      tmp *= 914237U;
+      tmp += 12332U;
+      tmp *= 914237U;
+      tmp -= 13U;
+      tmp *= 8000U;
+      b[i] = tmp;
+    }
+}
+
+int main()
+{
+  unsigned int a[256];
+  unsigned int b[256];
+  unsigned int c[256];
+  for (unsigned int i = 0; i != 256; i++)
+    {
+      b[i] = 0;
+      c[i] = 1;
+      a[i] = i * i - 10 * i + 33;
+    }
+  foo (a, b);
+  foo1 (a, c);
+
+  for (unsigned int i = 0; i != 256; i++)
+    {
+      if (b[i] != c[i])
+	__builtin_abort ();
+    }
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr53533-3.c b/gcc/testsuite/gcc.target/i386/pr53533-3.c
new file mode 100644
index 00000000000..3b260d134e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53533-3.c
@@ -0,0 +1,24 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O1 -fwrapv" } */
+/* { dg-final { scan-assembler-times "imull\[ \t\]" "1" } } */
+/* { dg-final { scan-assembler-times "(?:addl|subl)\[ \t\]" "1" { target { ! ia32 } } } } */
+
+void
+__attribute__((noipa))
+foo (int a[256], int b[256])
+{
+  int i;
+  for (i = 0; i < 256; ++i)
+    {
+      int tmp = a[i] + 12345;
+      tmp *= 914237;
+      tmp += 12332;
+      tmp *= 914237;
+      tmp += 12332;
+      tmp *= 914237;
+      tmp -= 13;
+      tmp *= 8000;
+      b[i] = tmp;
+    }
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr53533-4.c b/gcc/testsuite/gcc.target/i386/pr53533-4.c
new file mode 100644
index 00000000000..c29f90a44dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53533-4.c
@@ -0,0 +1,46 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -fwrapv" } */
+
+#include "pr53533-3.c"
+
+void
+__attribute__((optimize("-O0")))
+foo1 (int a[256], int b[256])
+{
+  int i;
+  for (i = 0; i < 256; ++i)
+    {
+      int tmp = a[i] + 12345;
+      tmp *= 914237;
+      tmp += 12332;
+      tmp *= 914237;
+      tmp += 12332;
+      tmp *= 914237;
+      tmp -= 13;
+      tmp *= 8000;
+      b[i] = tmp;
+    }
+}
+
+int main()
+{
+  int a[256];
+  int b[256];
+  int c[256];
+  for (int i = 0; i != 256; i++)
+    {
+      b[i] = 0;
+      c[i] = 1;
+      a[i] = i * i - 10 * i + 33;
+    }
+  foo (a, b);
+  foo1 (a, c);
+
+  for (unsigned int i = 0; i != 256; i++)
+    {
+      if (b[i] != c[i])
+	__builtin_abort ();
+    }
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr53533-5.c b/gcc/testsuite/gcc.target/i386/pr53533-5.c
new file mode 100644
index 00000000000..56fe4f44064
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53533-5.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times {(?n)\* 2147483647} 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times {(?n)\* 1073741823} 1 "optimized" } } */
+
+#define INT_MAX 2147483647
+int
+foo (int a)
+{
+  /* When a == -2, there's no overflow for (a + 1) * INT_MAX - 1.
+     but overflow for a * INT_MAX + (INT_MAX - 1).
+     Don't simpify it.  */
+  return (a + 1) * INT_MAX - 1;
+}
+
+int
+foo1 (int a)
+{
+  /* Be conservative here, don't simplify this as long as
+     a * 2147483646 may overflow.  */
+  return 1073741823 * (a * 2 + 1);
+}