On Fri, Feb 18, 2022 at 7:20 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch improves GCC's scalar evolution and final value replacement
> optimizations by supporting triangular/quadratic/trapezoid chrecs, which
> resolves both PR middle-end/65855 and PR c/80852, but alas not (yet)
> PR tree-optimization/46186.
>
> I've listed Richard Biener as co-author, as this solution is based
> heavily on his proposed patch in comment #4 of PR 65855 from 2015,
> but with several significant changes. The most important change is
> a correction to the formula used. For the scalar evolution {a, +, {b, +, c}},
> there was an off-by-one error, so chrec_apply should not return
> a + b*x + c*x*(x+1)/2, but a + b*x + c*x*(x-1)/2, which explains
> why the original patch didn't produce the expected results.
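>
> For the record, the corrected formula follows by summing the
> increments: after x iterations the additions applied are b, b+c,
> b+2c, ..., b+(x-1)*c, which total b*x + c*(0+1+...+(x-1))
> = b*x + c*x*(x-1)/2, hence the x-1 rather than x+1.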
>
> Another significant set of changes involves handling the various
> type mismatches and overflows. In chrec_apply, the evolution is
> evaluated after x iterations (where x has an unsigned integer type,
> called orig_type in this patch), but a, b and c may be of any type,
> including floating point (see PR tree-opt/23391) and including
> types that trap on overflow with -ftrapv (see PR tree-opt/79721),
> and in the case of pointer arithmetic, b and c may have a
> different type (sizetype) from a! Additionally, the division by
> two in "c*x*(x-1)/2" requires the correct top bit in modulo
> arithmetic, which means that the multiplication typically needs
> to be performed with more precision (in a wider mode) than
> orig_type [unless c is an even integer constant, or x (the
> number of iterations) is a known constant at compile-time].
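>
> As a concrete example of the lost top bit: with a 32-bit unsigned
> x of 65537, x*(x-1) is 2^32 + 2^16, which wraps to 2^16, so halving
> yields 2^15 rather than the correct 2^31 + 2^15.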
>
> Finally, final value replacement is only an effective optimization
> if the expression used to compute the result of the loop is cheaper
> than the loop itself, hence chrec_apply needs to return an optimized
> folded tree with the minimal number of operators. For example, when
> b == c, this patch calculates "a + c*x*(x+1)/2" (since b*x +
> b*x*(x-1)/2 folds to b*x*(x+1)/2); when c is even, we can perform
> the division at compile-time; and when x is a non-trivial expression,
> we wrap it in a SAVE_EXPR so that the lowering to gimple can reuse
> the common subexpression.
>
> Correctly handling all of the corner cases results in a patch
> significantly larger than the original "proof-of-concept".
> There's one remaining refinement, marked as TODO in the patch,
> which is to also support unsigned 64-bit to 128-bit widening
> multiplications (umulditi3) on targets that provide them, such
> as x86_64 (the approach used by LLVM); this patch provides the
> core target-independent functionality. [That last piece would
> resolve the testcase in PR tree-opt/46186, which requires 128-bit
> TImode operations that are not available on all backend targets.]
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check with no new failures. Ok for mainline?
+/* Fold the (exact) division of chrec by two. */
+
+static tree
+chrec_fold_divide_by_2 (tree type, tree op)
+{
+ if (SCALAR_FLOAT_TYPE_P (type))
+ return fold_build2 (MULT_EXPR, type, op, build_real (type, dconsthalf));
+ return fold_build2 (RSHIFT_EXPR, type, op, build_int_cst (type, 1));
any reason to not use EXACT_DIV_EXPR?
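For the integer case that would be something like (untested):

  return fold_build2 (EXACT_DIV_EXPR, type, op,
                      build_int_cst (type, 2));

which also preserves the exact-division property for later folding.
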
+/* Indicate that a chrec subexpression is used multiple times. */
+
+static tree
+chrec_save_expr (tree t)
+{
+ if (is_gimple_min_invariant (t))
+ return t;
+ t = save_expr (t);
+ if (TREE_CODE (t) == SAVE_EXPR)
+ TREE_SIDE_EFFECTS (t) = 0;
+ return t;
why clear TREE_SIDE_EFFECTS here?  IIRC SCEV analysis simply
re-uses subtrees (thus generates a graph) without wrapping the
shared leaves in SAVE_EXPR (and then unshare_expr before
gimplification blows it all up again, of course...).
+ /* Determine whether c is even. */
+ tree half_c = NULL_TREE;
+ if (TREE_CODE (c) == INTEGER_CST
+ && (wi::to_widest (c) & 1) == 0)
wi::to_wide should work as well and is cheaper
+ /* Try to narrow the original type, by stripping zero extensions. */
+ tree orig_type = TREE_TYPE (x);
+ if (TREE_CODE (x) == NOP_EXPR)
+ {
so we do have get_narrower (), can we use that?
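Untested sketch of what I have in mind, assuming get_narrower's
unsignedp output is enough to tell us the stripped conversions
were zero extensions:

  int unsignedp;
  tree nx = get_narrower (x, &unsignedp);
  if (unsignedp && TREE_TYPE (nx) != TREE_TYPE (x))
    {
      orig_type = TREE_TYPE (nx);
      x = nx;
    }
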
+ tree wide_type;
+ /* Prefer to perform multiplications in type CTYPE. */
+ if (INTEGRAL_TYPE_P (ctype)
+ && TYPE_UNSIGNED (ctype)
+ && TYPE_PRECISION (ctype) > prec)
+ wide_type = ctype;
+ else if (TYPE_PRECISION (unsigned_type_node) > prec)
+ wide_type = unsigned_type_node;
+ else if (TYPE_PRECISION (long_unsigned_type_node) > prec)
+ wide_type = long_unsigned_type_node;
+ else if (TYPE_PRECISION (long_long_unsigned_type_node) > prec)
+ wide_type = long_long_unsigned_type_node;
+ else
+ /* TODO: Try TImode on targets that support umul_widen_optab. */
+ return chrec_dont_know;
can we use mode_for_size () and type_for_mode () instead of hard-coding
these? You can look for a umul_optab if we do not want to risk
libgcc calls.
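Untested, but I was thinking of something along these lines (using
smul_optab for the low-part multiply, which is sign-agnostic):

  scalar_int_mode wide_mode;
  if (INTEGRAL_TYPE_P (ctype)
      && TYPE_UNSIGNED (ctype)
      && TYPE_PRECISION (ctype) > prec)
    wide_type = ctype;
  else if (int_mode_for_size (prec + 1, 0).exists (&wide_mode)
           && optab_handler (smul_optab, wide_mode) != CODE_FOR_nothing)
    wide_type = lang_hooks.types.type_for_mode (wide_mode, 1);
  else
    /* TODO: Try TImode on targets that support umul_widen_optab.  */
    return chrec_dont_know;
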
There's some code that looks like it can be factored. Maybe my eyes
are mis-matching things of course:
+ res = chrec_fold_plus (wide_type, save_x, one);
+ res = chrec_fold_multiply (wide_type, res, save_x);
+ if (half_c)
+ {
+ res = chrec_convert (ctype, res, NULL);
+ res = chrec_fold_multiply (ctype, half_c, res);
+ }
+ else
+ {
+ res = chrec_fold_divide_by_2 (wide_type, res);
+ res = chrec_convert (ctype, res, NULL);
+ res = chrec_fold_multiply (ctype, b, res);
+ }
+ /* + a */
+ if (ctype != type)
+ res = chrec_convert (type, res, NULL);
+ res = chrec_fold_plus (TREE_TYPE (a), a, res);
+ return res;
looks like a common tail with one optional + b*x?
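Completely untested, but a helper along these lines (the name is
just a placeholder) looks like it could serve all three copies:

  /* Finish A [+ B*X] + M*T/2 in TYPE, where T was computed in
     WIDE_TYPE and HALF_M, if non-NULL, is M/2.  B may be NULL_TREE
     when no linear term is wanted.  */
  static tree
  chrec_apply_tail (tree type, tree ctype, tree wide_type,
                    tree a, tree b, tree m, tree half_m,
                    tree t, tree x)
  {
    tree res;
    if (half_m)
      {
        res = chrec_convert (ctype, t, NULL);
        res = chrec_fold_multiply (ctype, half_m, res);
      }
    else
      {
        res = chrec_fold_divide_by_2 (wide_type, t);
        res = chrec_convert (ctype, res, NULL);
        res = chrec_fold_multiply (ctype, m, res);
      }
    if (b)
      {
        /* + b*x */
        tree tmp = chrec_convert (ctype, x, NULL);
        tmp = chrec_fold_multiply (ctype, b, tmp);
        res = chrec_fold_plus (ctype, res, tmp);
      }
    /* + a */
    if (ctype != type)
      res = chrec_convert (type, res, NULL);
    return chrec_fold_plus (TREE_TYPE (a), a, res);
  }
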
@@ -3469,6 +3472,19 @@ expression_expensive_p (tree expr,
hash_map<tree, uint64_t> &cache,
cost += op_cost + 1;
return false;
+ case tcc_expression:
+ if (code == SAVE_EXPR)
+ {
+ if (expression_expensive_p (TREE_OPERAND (expr, 0), cache, op_cost))
+ return true;
+ /* Assume the SAVE_EXPR is instantiated twice. */
+ op_cost /= 2;
+ *cache.get (expr) += op_cost;
+ cost += op_cost;
+ return false;
+ }
+ return true;
+
default:
return true;
}
I think expression_expensive_p assumes we unshare_expr () (correctly,
because we do...) and thus counts each use of a multi-use expression.
With SAVE_EXPRs it should instead have operated in a
walk_tree_without_duplicates mode.  Maybe you should bite the bullet
and wrap expression_expensive_p, allocating a pointer-set so we
simply do not visit SAVE_EXPRs multiple times (and do not record
them in 'cache')?  I added the cache explicitly to keep
expression_expensive_p from being exponential while still making it
compute the cost that unshare_expr will produce.  Maybe the
SAVE_EXPR behavior can be handled more precisely?
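I.e., something like the following (untested; 'visited' would be a
hash_set<tree> threaded through the recursion alongside 'cache'):

      case tcc_expression:
        if (code == SAVE_EXPR)
          {
            /* Charge the operand only the first time we reach this
               SAVE_EXPR; subsequent uses are free.  */
            if (visited.add (expr))
              return false;
            return expression_expensive_p (TREE_OPERAND (expr, 0),
                                           cache, visited, cost);
          }
        return true;
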
Looks like stage1 material btw.
Thanks,
Richard.
>
> 2022-02-18 Roger Sayle <roger@nextmovesoftware.com>
> Richard Biener <rguenther@suse.de>
>
> gcc/ChangeLog
> PR target/65855
> PR c/80852
> * tree-chrec.cc (chrec_fold_divide_by_2): New function to
> divide a chrec by two, honoring the type of the chrec.
> (chrec_save_expr): New function to wrap a chrec in a
> SAVE_EXPR, but without TREE_SIDE_EFFECTS.
> (chrec_apply_triangular): New helper function of chrec_apply
> to evaluate a quadratic/triangular chrec.
> (chrec_apply): Expand/clarify comment before function.
> Handle application of any chrec after zero iterations, i.e. A.
> Recursively handle cases where the iteration count is
> conditional. Handle quadratic/triangular chrecs by calling
> the new chrec_apply_triangular function.
> (chrec_convert_rhs): Handle conversion of integer constants
> to scalar floating point types here (moved from chrec_apply).
> * tree-scalar-evolution.cc (interpret_expr): Handle SAVE_EXPR
> by (tail call) recursion.
> (expression_expensive_p): Calculate the cost of a SAVE_EXPR
> as half the cost of its operand, i.e. assume it is reused.
>
> gcc/testsuite/ChangeLog
> PR target/65855
> PR c/80852
> * gcc.dg/tree-ssa/pr65855.c: New test case.
> * gcc.dg/tree-ssa/pr80852.c: New test case.
> * gcc.dg/vect/vect-iv-11.c: Update to reflect that the loop is
> no longer vectorized, but calculated by final value replacement.
>
> Roger
> --
>
@@ -573,6 +573,246 @@ chrec_evaluate (unsigned var, tree chrec, tree n, unsigned int k)
chrec_convert (ctype, chrec, NULL), binomial_n_k);
}
+
+/* Fold the (exact) division of chrec by two. */
+
+static tree
+chrec_fold_divide_by_2 (tree type, tree op)
+{
+ if (SCALAR_FLOAT_TYPE_P (type))
+ return fold_build2 (MULT_EXPR, type, op, build_real (type, dconsthalf));
+ return fold_build2 (RSHIFT_EXPR, type, op, build_int_cst (type, 1));
+}
+
+
+/* Indicate that a chrec subexpression is used multiple times. */
+
+static tree
+chrec_save_expr (tree t)
+{
+ if (is_gimple_min_invariant (t))
+ return t;
+ t = save_expr (t);
+ if (TREE_CODE (t) == SAVE_EXPR)
+ TREE_SIDE_EFFECTS (t) = 0;
+ return t;
+}
+
+
+/* Helper function of chrec_apply. Evaluate a CHREC of the
+ form {A, +, {B, +, C}} at X to A + B*X + C*X*(X-1)/2.
+ The tricky part is avoiding information loss on overflow
+ and the potential mismatch of types between A, B and C
+ (of type TYPE) and X which is an unsigned integral type. */
+
+static tree
+chrec_apply_triangular (tree type, tree a, tree b, tree c, tree x)
+{
+ tree res, tmp;
+
+ /* Determine whether c is even. */
+ tree half_c = NULL_TREE;
+ if (TREE_CODE (c) == INTEGER_CST
+ && (wi::to_widest (c) & 1) == 0)
+ half_c = chrec_fold_divide_by_2 (type, c);
+
+ /* Avoid introducing trapping arithmetic with -ftrapv, cf. PR 79721. */
+ tree ctype = type;
+ if (INTEGRAL_TYPE_P (type) && !TYPE_OVERFLOW_WRAPS (type))
+ {
+ ctype = unsigned_type_for (type);
+ b = chrec_convert (ctype, b, NULL);
+ c = chrec_convert (ctype, c, NULL);
+ if (half_c)
+ half_c = chrec_convert (ctype, half_c, NULL);
+ }
+
+ /* First, handle the easy case where X is an integer constant,
+ so we can evaluate X*(X-1)/2 using widest_int precision at
+ compile-time. */
+ if (TREE_CODE (x) == INTEGER_CST)
+ {
+ wi::tree_to_widest_ref wix = wi::to_widest (x);
+ widest_int wit = wix*(wix-1)/2;
+ if (operand_equal_p (b, c))
+ {
+ wit += wix;
+ res = SCALAR_FLOAT_TYPE_P (ctype)
+ ? build_real_from_wide (ctype, wit, UNSIGNED)
+ : wide_int_to_tree (ctype, wit);
+ res = chrec_fold_multiply (ctype, b, res);
+ }
+ else
+ {
+ if (SCALAR_FLOAT_TYPE_P (ctype))
+ {
+ res = build_real_from_wide (ctype, wit, UNSIGNED);
+ tmp = build_real_from_wide (ctype, wix, UNSIGNED);
+ }
+ else
+ {
+ res = wide_int_to_tree (ctype, wit);
+ tmp = wide_int_to_tree (ctype, wix);
+ }
+ res = chrec_fold_multiply (ctype, c, res);
+ tmp = chrec_fold_multiply (ctype, b, tmp);
+ res = chrec_fold_plus (ctype, res, tmp);
+ }
+ if (ctype != type)
+ res = chrec_convert (type, res, NULL);
+ res = chrec_fold_plus (TREE_TYPE (a), a, res);
+ return res;
+ }
+
+ /* Alas, calculating C*X*(X-1)/2 can wrap/overflow incorrectly, unless
+ (i) C is an even integer constant, or (ii) the multiplications are
+ performed in a wider mode. */
+
+ /* Try to narrow the original type, by stripping zero extensions. */
+ tree orig_type = TREE_TYPE (x);
+ if (TREE_CODE (x) == NOP_EXPR)
+ {
+ tree nx = TREE_OPERAND (x, 0);
+ tree nt = TREE_TYPE (nx);
+ if (INTEGRAL_TYPE_P (nt)
+ && TYPE_UNSIGNED (nt)
+ && TYPE_PRECISION (nt) < TYPE_PRECISION (orig_type))
+ {
+ orig_type = nt;
+ x = nx;
+ }
+ }
+ else if (TREE_CODE (x) == PLUS_EXPR
+ && integer_all_onesp (TREE_OPERAND (x, 1))
+ && TREE_CODE (TREE_OPERAND (x, 0)) == NOP_EXPR)
+ {
+ /* We know the number of iterations can't be negative. */
+ tree nt = TREE_TYPE (TREE_OPERAND (TREE_OPERAND (x, 0), 0));
+ if (INTEGRAL_TYPE_P (nt)
+ && TYPE_UNSIGNED (nt)
+ && TYPE_PRECISION (nt) < TYPE_PRECISION (orig_type))
+ {
+ orig_type = nt;
+ x = fold_convert (nt, x);
+ }
+ }
+
+ /* We require an unsigned type with more precision than PREC. */
+ int prec = TYPE_PRECISION (orig_type);
+ if (half_c)
+ prec--;
+
+ tree wide_type;
+ /* Prefer to perform multiplications in type CTYPE. */
+ if (INTEGRAL_TYPE_P (ctype)
+ && TYPE_UNSIGNED (ctype)
+ && TYPE_PRECISION (ctype) > prec)
+ wide_type = ctype;
+ else if (TYPE_PRECISION (unsigned_type_node) > prec)
+ wide_type = unsigned_type_node;
+ else if (TYPE_PRECISION (long_unsigned_type_node) > prec)
+ wide_type = long_unsigned_type_node;
+ else if (TYPE_PRECISION (long_long_unsigned_type_node) > prec)
+ wide_type = long_long_unsigned_type_node;
+ else
+ /* TODO: Try TImode on targets that support umul_widen_optab. */
+ return chrec_dont_know;
+
+ /* X is used multiple times, so wrap it in a SAVE_EXPR. */
+ x = chrec_convert_rhs (wide_type, x, NULL);
+ tree save_x = chrec_save_expr (x);
+
+ /* When B == C, "{A, +, {B, +, B}} (x)" -> "A + B*X*(X+1)/2". */
+ if (operand_equal_p (b, c))
+ {
+ /* b*x*(x+1)/2 */
+ tree one = build_int_cst (wide_type, 1);
+ res = chrec_fold_plus (wide_type, save_x, one);
+ res = chrec_fold_multiply (wide_type, res, save_x);
+ if (half_c)
+ {
+ res = chrec_convert (ctype, res, NULL);
+ res = chrec_fold_multiply (ctype, half_c, res);
+ }
+ else
+ {
+ res = chrec_fold_divide_by_2 (wide_type, res);
+ res = chrec_convert (ctype, res, NULL);
+ res = chrec_fold_multiply (ctype, b, res);
+ }
+ /* + a */
+ if (ctype != type)
+ res = chrec_convert (type, res, NULL);
+ res = chrec_fold_plus (TREE_TYPE (a), a, res);
+ return res;
+ }
+
+ /* When 2B is a multiple of C, "{A, +, {B, +, C}} (x)" may be
+ optimized to "A + C*X*(X+TMP)/2" where TMP is "2B/C-1". */
+ if (TREE_CODE (b) == INTEGER_CST && TREE_CODE (c) == INTEGER_CST)
+ {
+ wi::tree_to_widest_ref wic = wi::to_widest (c);
+ wi::tree_to_widest_ref wib = wi::to_widest (b);
+ signop sign = TYPE_SIGN (type);
+ wi::overflow_type overflow = wi::OVF_NONE;
+ widest_int wib2 = wi::add (wib, wib, sign, &overflow);
+ if (wi::multiple_of_p (wib2, wic, sign))
+ {
+ widest_int tmp = wi::div_trunc (wib2, wic, sign, &overflow);
+ tmp = wi::sub (tmp, 1, sign, &overflow);
+ if (!overflow && wi::fits_to_tree_p (tmp, orig_type))
+ {
+ /* c*x*(x+tmp)/2 */
+ res = chrec_fold_plus (wide_type, save_x,
+ wide_int_to_tree (wide_type, tmp));
+ res = chrec_fold_multiply (wide_type, res, save_x);
+ if (half_c)
+ {
+ res = chrec_convert (ctype, res, NULL);
+ res = chrec_fold_multiply (ctype, res, half_c);
+ }
+ else
+ {
+ res = chrec_fold_divide_by_2 (wide_type, res);
+ res = chrec_convert (ctype, res, NULL);
+ res = chrec_fold_multiply (ctype, res, c);
+ }
+ /* + a */
+ if (ctype != type)
+ res = chrec_convert (type, res, NULL);
+ res = chrec_fold_plus (TREE_TYPE (a), a, res);
+ return res;
+ }
+ }
+ }
+
+ /* c*x*(x-1)/2 */
+ tree one = build_int_cst (wide_type, 1);
+ res = chrec_fold_minus (wide_type, save_x, one);
+ res = chrec_fold_multiply (wide_type, res, save_x);
+ if (half_c)
+ {
+ res = chrec_convert (ctype, res, NULL);
+ res = chrec_fold_multiply (ctype, half_c, res);
+ }
+ else
+ {
+ res = chrec_fold_divide_by_2 (wide_type, res);
+ res = chrec_convert (ctype, res, NULL);
+ res = chrec_fold_multiply (ctype, c, res);
+ }
+ /* + b*x */
+ tmp = chrec_convert (ctype, save_x, NULL);
+ tmp = chrec_fold_multiply (ctype, b, tmp);
+ res = chrec_fold_plus (ctype, res, tmp);
+ /* + a */
+ if (ctype != type)
+ res = chrec_convert (type, res, NULL);
+ res = chrec_fold_plus (TREE_TYPE (a), a, res);
+ return res;
+}
+
+
/* Evaluates "CHREC (X)" when the varying variable is VAR.
Example: Given the following parameters,
@@ -581,7 +821,8 @@ chrec_evaluate (unsigned var, tree chrec, tree n, unsigned int k)
x = 10
The result is given by the Newton's interpolating formula:
- 3 * \binom{10}{0} + 4 * \binom{10}{1}.
+ 3 * \binom{10}{0} + 4 * \binom{10}{1}, which is 3 + 4 * 10 = 43.
+ In general, {a, +, b} (x) = a + b*x.
*/
tree
@@ -604,13 +845,12 @@ chrec_apply (unsigned var,
if (dump_file && (dump_flags & TDF_SCEV))
fprintf (dump_file, "(chrec_apply \n");
- if (TREE_CODE (x) == INTEGER_CST && SCALAR_FLOAT_TYPE_P (type))
- x = build_real_from_int_cst (type, x);
-
switch (TREE_CODE (chrec))
{
case POLYNOMIAL_CHREC:
- if (evolution_function_is_affine_p (chrec))
+ if (integer_zerop (x) && CHREC_VARIABLE (chrec) == var)
+ res = CHREC_LEFT (chrec);
+ else if (evolution_function_is_affine_p (chrec))
{
if (CHREC_VARIABLE (chrec) != var)
return build_polynomial_chrec
@@ -623,6 +863,29 @@ chrec_apply (unsigned var,
res = chrec_fold_multiply (TREE_TYPE (x), CHREC_RIGHT (chrec), x);
res = chrec_fold_plus (type, CHREC_LEFT (chrec), res);
}
+ else if (TREE_CODE (x) == COND_EXPR
+ && integer_zerop (TREE_OPERAND (x, 2))
+ && CHREC_VARIABLE (chrec) == var
+ && TREE_CODE (CHREC_LEFT (chrec)) == INTEGER_CST)
+ {
+ /* Optimize "{a, +, ...} (cond ? x : 0)" by hoisting the ternary
+ operator as "cond ? ({ a, +, ...} (x)) : a" for simple "a". */
+ res = chrec_apply (var, chrec, TREE_OPERAND (x, 1));
+ if (res != chrec_dont_know)
+ res = fold_build3 (COND_EXPR, type, TREE_OPERAND (x, 0),
+ res, CHREC_LEFT (chrec));
+ }
+ else if (CHREC_VARIABLE (chrec) == var
+ && evolution_function_is_affine_p (CHREC_RIGHT (chrec))
+ && CHREC_VARIABLE (CHREC_RIGHT (chrec)) == var)
+ {
+ /* "{a, +, {b, +, c} } (x)" -> "a + b*x + c*x*(x-1)/2". */
+ tree chrecr = CHREC_RIGHT (chrec);
+ res = chrec_apply_triangular (TREE_TYPE (chrecr),
+ CHREC_LEFT (chrec),
+ CHREC_LEFT (chrecr),
+ CHREC_RIGHT (chrecr), x);
+ }
else if (TREE_CODE (x) == INTEGER_CST
&& tree_int_cst_sgn (x) == 1)
/* testsuite/.../ssa-chrec-38.c. */
@@ -632,7 +895,7 @@ chrec_apply (unsigned var,
break;
CASE_CONVERT:
- res = chrec_convert (TREE_TYPE (chrec),
+ res = chrec_convert (type,
chrec_apply (var, TREE_OPERAND (chrec, 0), x),
NULL);
break;
@@ -1401,6 +1664,10 @@ chrec_convert_rhs (tree type, tree chrec, gimple *at_stmt)
if (POINTER_TYPE_P (type))
type = sizetype;
+ if (TREE_CODE (chrec) == INTEGER_CST
+ && SCALAR_FLOAT_TYPE_P (type))
+ return build_real_from_int_cst (type, chrec);
+
return chrec_convert (type, chrec, at_stmt);
}
@@ -1901,6 +1901,9 @@ interpret_expr (class loop *loop, gimple *at_stmt, tree expr)
|| get_gimple_rhs_class (TREE_CODE (expr)) == GIMPLE_TERNARY_RHS)
return chrec_dont_know;
+ if (TREE_CODE (expr) == SAVE_EXPR)
+ return interpret_expr (loop, at_stmt, TREE_OPERAND (expr, 0));
+
extract_ops_from_tree (expr, &code, &op0, &op1);
return interpret_rhs_expr (loop, at_stmt, type,
@@ -3469,6 +3472,19 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
cost += op_cost + 1;
return false;
+ case tcc_expression:
+ if (code == SAVE_EXPR)
+ {
+ if (expression_expensive_p (TREE_OPERAND (expr, 0), cache, op_cost))
+ return true;
+ /* Assume the SAVE_EXPR is instantiated twice. */
+ op_cost /= 2;
+ *cache.get (expr) += op_cost;
+ cost += op_cost;
+ return false;
+ }
+ return true;
+
default:
return true;
}
new file mode 100644
@@ -0,0 +1,14 @@
+/* PR middle-end/65855 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+unsigned long foo(unsigned int n)
+{
+ unsigned long t = 0;
+ for (unsigned long i=1; i<=n; i++)
+ t += i;
+ return t;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " goto " 2 "optimized" } } */
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+int foo(int num) {
+ int a = 0;
+ for (int x = 0; x < num; x+=2) {
+ if (!(x % 2)) {
+ a += x;
+ }
+ }
+ return a;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " goto " 2 "optimized" } } */
@@ -9,7 +9,8 @@ main1 (int len)
int s = 0;
int i = len;
- /* vectorization of induction with reduction. */
+ /* This used to test vectorization of induction with reduction.
+ * It now checks correctness of scev's final value replacement. */
for ( ; i > 1; i -=2)
s += i;
@@ -28,4 +29,3 @@ int main (void)
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */