ivopts: Improve code generated for very simple loops.
Commit Message
This patch tidies up the code that GCC generates for simple loops,
by selecting/generating a simpler loop bound expression in ivopts.
The original motivation came from looking at the following loop (from
gcc.target/i386/pr90178.c)
int *find_ptr (int* mem, int sz, int val)
{
for (int i = 0; i < sz; i++)
if (mem[i] == val)
return &mem[i];
return 0;
}
which GCC currently compiles to:
find_ptr:
movq %rdi, %rax
testl %esi, %esi
jle .L4
leal -1(%rsi), %ecx
leaq 4(%rdi,%rcx,4), %rcx
jmp .L3
.L7: addq $4, %rax
cmpq %rcx, %rax
je .L4
.L3: cmpl %edx, (%rax)
jne .L7
ret
.L4: xorl %eax, %eax
ret
Notice the relatively complex leal/leaq instructions, that result
from ivopts using the following expression for the loop bound:
inv_expr 2: ((unsigned long) ((unsigned int) sz_8(D) + 4294967295)
* 4 + (unsigned long) mem_9(D)) + 4
which results from NITERS being (unsigned int) sz_8(D) + 4294967295,
i.e. (sz - 1), and the logic in cand_value_at determining the bound
as BASE + NITERS*STEP at the start of the final iteration and as
BASE + NITERS*STEP + STEP at the end of the final iteration.
Ideally, we'd like the middle-end optimizers to simplify
BASE + NITERS*STEP + STEP as BASE + (NITERS+1)*STEP, especially
when NITERS already has the form BOUND-1, but with type conversions
and possible overflow to worry about, the above "inv_expr 2" is the
best that can be done by fold (without additional context information).
This patch improves ivopts' cand_value_at by passing it the data
structure that explains how the NITERS expression was derived,
instead of just the tree expression itself. This allows us to peek
under the surface to check that NITERS+1 doesn't overflow, and in
this patch to use the SSA_NAME already holding the required value.
In the motivating loop above, inv_expr 2 now becomes:
(unsigned long) sz_8(D) * 4 + (unsigned long) mem_9(D)
And as a result, on x86_64 we now generate:
find_ptr:
movq %rdi, %rax
testl %esi, %esi
jle .L4
movslq %esi, %rsi
leaq (%rdi,%rsi,4), %rcx
jmp .L3
.L7: addq $4, %rax
cmpq %rcx, %rax
je .L4
.L3: cmpl %edx, (%rax)
jne .L7
ret
.L4: xorl %eax, %eax
ret
This improvement required one minor tweak to GCC's testsuite for
gcc.dg/wrapped-binop-simplify.c, where we again generate better
code, and therefore no longer find as many optimization opportunities
in later passes (vrp2).
Previously:
void v1 (unsigned long *in, unsigned long *out, unsigned int n)
{
int i;
for (i = 0; i < n; i++) {
out[i] = in[i];
}
}
on x86_64 generated:
v1: testl %edx, %edx
je .L1
movl %edx, %edx
xorl %eax, %eax
.L3: movq (%rdi,%rax,8), %rcx
movq %rcx, (%rsi,%rax,8)
addq $1, %rax
cmpq %rax, %rdx
jne .L3
.L1: ret
and now instead generates:
v1: testl %edx, %edx
je .L1
movl %edx, %edx
xorl %eax, %eax
leaq 0(,%rdx,8), %rcx
.L3: movq (%rdi,%rax), %rdx
movq %rdx, (%rsi,%rax)
addq $8, %rax
cmpq %rax, %rcx
jne .L3
.L1: ret
This patch has been tested on x86_64-pc-linux-gnu with a make bootstrap
and make -k check with no new failures. Ok for mainline?
2021-11-15 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* tree-ssa-loop-ivopts.c (cand_value_at): Take a class
tree_niter_desc* argument instead of just a tree for NITER.
If we require the iv candidate value at the end of the final
loop iteration, try using the original loop bound as the
NITER for sufficiently simple loops.
(may_eliminate_iv): Update (only) call to cand_value_at.
gcc/testsuite/ChangeLog
* gcc.dg/wrapped-binop-simplify.c: Update expected test result.
Thanks in advance,
Roger
--
Comments
On Mon, Nov 15, 2021 at 2:04 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch tidies up the code that GCC generates for simple loops,
> by selecting/generating a simpler loop bound expression in ivopts.
> The original motivation came from looking at the following loop (from
> gcc.target/i386/pr90178.c)
>
> int *find_ptr (int* mem, int sz, int val)
> {
> for (int i = 0; i < sz; i++)
> if (mem[i] == val)
> return &mem[i];
> return 0;
> }
>
> which GCC currently compiles to:
>
> find_ptr:
> movq %rdi, %rax
> testl %esi, %esi
> jle .L4
> leal -1(%rsi), %ecx
> leaq 4(%rdi,%rcx,4), %rcx
> jmp .L3
> .L7: addq $4, %rax
> cmpq %rcx, %rax
> je .L4
> .L3: cmpl %edx, (%rax)
> jne .L7
> ret
> .L4: xorl %eax, %eax
> ret
>
> Notice the relatively complex leal/leaq instructions, that result
> from ivopts using the following expression for the loop bound:
> inv_expr 2: ((unsigned long) ((unsigned int) sz_8(D) + 4294967295)
> * 4 + (unsigned long) mem_9(D)) + 4
>
> which results from NITERS being (unsigned int) sz_8(D) + 4294967295,
> i.e. (sz - 1), and the logic in cand_value_at determining the bound
> as BASE + NITERS*STEP at the start of the final iteration and as
> BASE + NITERS*STEP + STEP at the end of the final iteration.
>
> Ideally, we'd like the middle-end optimizers to simplify
> BASE + NITERS*STEP + STEP as BASE + (NITERS+1)*STEP, especially
> when NITERS already has the form BOUND-1, but with type conversions
> and possible overflow to worry about, the above "inv_expr 2" is the
> best that can be done by fold (without additional context information).
>
> This patch improves ivopts' cand_value_at by instead of using just
> the tree expression for NITERS, passing the data structure that
> explains how that expression was derived. This allows us to peek
> under the surface to check that NITERS+1 doesn't overflow, and in
> this patch to use the SSA_NAME already holding the required value.
>
> In the motivating loop above, inv_expr 2 now becomes:
> (unsigned long) sz_8(D) * 4 + (unsigned long) mem_9(D)
>
> And as a result, on x86_64 we now generate:
>
> find_ptr:
> movq %rdi, %rax
> testl %esi, %esi
> jle .L4
> movslq %esi, %rsi
> leaq (%rdi,%rsi,4), %rcx
> jmp .L3
> .L7: addq $4, %rax
> cmpq %rcx, %rax
> je .L4
> .L3: cmpl %edx, (%rax)
> jne .L7
> ret
> .L4: xorl %eax, %eax
> ret
>
>
> This improvement required one minor tweak to GCC's testsuite for
> gcc.dg/wrapped-binop-simplify.c, where we again generate better
> code, and therefore no longer find as many optimization opportunities
> in later passes (vrp2).
>
> Previously:
>
> void v1 (unsigned long *in, unsigned long *out, unsigned int n)
> {
> int i;
> for (i = 0; i < n; i++) {
> out[i] = in[i];
> }
> }
>
> on x86_64 generated:
> v1: testl %edx, %edx
> je .L1
> movl %edx, %edx
> xorl %eax, %eax
> .L3: movq (%rdi,%rax,8), %rcx
> movq %rcx, (%rsi,%rax,8)
> addq $1, %rax
> cmpq %rax, %rdx
> jne .L3
> .L1: ret
>
> and now instead generates:
> v1: testl %edx, %edx
> je .L1
> movl %edx, %edx
> xorl %eax, %eax
> leaq 0(,%rdx,8), %rcx
> .L3: movq (%rdi,%rax), %rdx
> movq %rdx, (%rsi,%rax)
> addq $8, %rax
> cmpq %rax, %rcx
> jne .L3
> .L1: ret
Is that actually better? IIRC the addressing modes are both complex
and we now have an extra lea? For this case I see we generate
_15 = n_10(D) + 4294967295;
_8 = (unsigned long) _15;
_7 = _8 + 1;
where n is unsigned int so if we know that n is not zero we can simplify the
addition and conveniently the loop header test provides this guarantee.
IIRC there were some attempts to enhance match.pd for some
cases of such expressions.
>
> This patch has been tested on x86_64-pc-linux-gnu with a make bootstrap
> and make -k check with no new failures. Ok for mainline?
+ /* If AFTER_ADJUST is required, the code below generates the equivalent
+ * of BASE + NITER * STEP + STEP, when ideally we'd prefer the expression
+ * BASE + (NITER + 1) * STEP, especially when NITER is often of the form
+ * SSA_NAME - 1. Unfortunately, guaranteeing that adding 1 to NITER
+ * doesn't overflow is tricky, so we peek inside the TREE_NITER_DESC
+ * class for common idioms that we know are safe. */
No '* ' at the start of each continuation line.
+ if (after_adjust
+ && desc->control.no_overflow
+ && integer_onep (desc->control.step)
+ && integer_onep (desc->control.base)
+ && desc->cmp == LT_EXPR
+ && TREE_CODE (desc->bound) == SSA_NAME)
+ {
+ niter = desc->bound;
+ after_adjust = false;
+ }
I wonder if the non-overflowing can be captured by
integer_onep (iv->step)
&& max_stmt_executions (loop, &max)
and if we then do (niter + 1) * step instead of niter*step + step
would that do the same? That said, given we have 'niter' as
expression we might even be able to use rangers
range_of_expr to tell that '(niter + 1) * step' does not overflow?
That said - what the change does is actually ensure that
we CSE niter + 1 with the bound of the simplified exit test?
The patch doesn't add any testcase.
Thanks,
Richard.
>
> 2021-11-15 Roger Sayle <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
> * tree-ssa-loop-ivopts.c (cand_value_at): Take a class
> tree_niter_desc* argument instead of just a tree for NITER.
> If we require the iv candidate value at the end of the final
> loop iteration, try using the original loop bound as the
> NITER for sufficiently simple loops.
> (may_eliminate_iv): Update (only) call to cand_value_at.
>
> gcc/testsuite
> * gcc.dg/wrapped-binop-simplify.c: Update expected test result.
>
>
> Thanks in advance,
> Roger
> --
>
@@ -5034,28 +5034,48 @@ determine_group_iv_cost_address (struct ivopts_data *data,
return !sum_cost.infinite_cost_p ();
}
-/* Computes value of candidate CAND at position AT in iteration NITER, and
- stores it to VAL. */
+/* Computes value of candidate CAND at position AT in iteration DESC->NITER,
+ and stores it to VAL. */
static void
-cand_value_at (class loop *loop, struct iv_cand *cand, gimple *at, tree niter,
- aff_tree *val)
+cand_value_at (class loop *loop, struct iv_cand *cand, gimple *at,
+ class tree_niter_desc *desc, aff_tree *val)
{
aff_tree step, delta, nit;
struct iv *iv = cand->iv;
tree type = TREE_TYPE (iv->base);
+ tree niter = desc->niter;
+ bool after_adjust = stmt_after_increment (loop, cand, at);
tree steptype;
+
if (POINTER_TYPE_P (type))
steptype = sizetype;
else
steptype = unsigned_type_for (type);
+ /* If AFTER_ADJUST is required, the code below generates the equivalent
+ * of BASE + NITER * STEP + STEP, when ideally we'd prefer the expression
+ * BASE + (NITER + 1) * STEP, especially when NITER is often of the form
+ * SSA_NAME - 1. Unfortunately, guaranteeing that adding 1 to NITER
+ * doesn't overflow is tricky, so we peek inside the TREE_NITER_DESC
+ * class for common idioms that we know are safe. */
+ if (after_adjust
+ && desc->control.no_overflow
+ && integer_onep (desc->control.step)
+ && integer_onep (desc->control.base)
+ && desc->cmp == LT_EXPR
+ && TREE_CODE (desc->bound) == SSA_NAME)
+ {
+ niter = desc->bound;
+ after_adjust = false;
+ }
+
tree_to_aff_combination (iv->step, TREE_TYPE (iv->step), &step);
aff_combination_convert (&step, steptype);
tree_to_aff_combination (niter, TREE_TYPE (niter), &nit);
aff_combination_convert (&nit, steptype);
aff_combination_mult (&nit, &step, &delta);
- if (stmt_after_increment (loop, cand, at))
+ if (after_adjust)
aff_combination_add (&delta, &step);
tree_to_aff_combination (iv->base, type, val);
@@ -5406,7 +5426,7 @@ may_eliminate_iv (struct ivopts_data *data,
return true;
}
- cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
+ cand_value_at (loop, cand, use->stmt, desc, &bnd);
*bound = fold_convert (TREE_TYPE (cand->iv->base),
aff_combination_to_tree (&bnd));
@@ -1,6 +1,6 @@
/* { dg-do compile { target { { i?86-*-* x86_64-*-* s390*-*-* } && lp64 } } } */
/* { dg-options "-O2 -fdump-tree-vrp2-details" } */
-/* { dg-final { scan-tree-dump-times "gimple_simplified to" 4 "vrp2" } } */
+/* { dg-final { scan-tree-dump-times "gimple_simplified to" 1 "vrp2" } } */
void v1 (unsigned long *in, unsigned long *out, unsigned int n)
{