[RTL/fwprop] Allow propagations from inner loop to outer loop.
Commit Message
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
gcc/ChangeLog:
PR rtl/103750
* cfgloop.h (loop_contains_p): New function.
* fwprop.c (forward_propagate_into): Allow propagations from
inner loop to outer loop.
gcc/testsuite/ChangeLog:
* g++.target/i386/pr103750-fwprop-1.C: New test.
---
gcc/cfgloop.h | 12 +++++++++
gcc/fwprop.c | 7 +++--
.../g++.target/i386/pr103750-fwprop-1.C | 26 +++++++++++++++++++
3 files changed, 43 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/g++.target/i386/pr103750-fwprop-1.C
Comments
On Wed, Jan 5, 2022 at 6:39 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
> PR rtl/103750
> * cfgloop.h (loop_contains_p): New function.
> * fwprop.c (forward_propagate_into): Allow propagations from
> inner loop to outer loop.
>
> gcc/testsuite/ChangeLog:
>
> * g++.target/i386/pr103750-fwprop-1.C: New test.
> ---
> gcc/cfgloop.h | 12 +++++++++
> gcc/fwprop.c | 7 +++--
> .../g++.target/i386/pr103750-fwprop-1.C | 26 +++++++++++++++++++
> 3 files changed, 43 insertions(+), 2 deletions(-)
> create mode 100644 gcc/testsuite/g++.target/i386/pr103750-fwprop-1.C
>
> diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
> index d2714e20cb0..e8fe0cedd5f 100644
> --- a/gcc/cfgloop.h
> +++ b/gcc/cfgloop.h
> @@ -908,6 +908,18 @@ loop_outermost (class loop *loop)
> return (*loop->superloops)[1];
> }
>
> +/* Returns true if loop OUTER contains loop INNER. */
That's flow_loop_nested_p (loop *outer, loop *inner), which
is implemented in O(1). Note that its behavior for outer == inner
might be different (I didn't check your implementation too closely).
Otherwise looks OK to me.
Thanks,
Richard.
> +static inline bool
> +loop_contains_p (class loop* outer, class loop* inner)
> +{
> + unsigned n = vec_safe_length (inner->superloops);
> +
> + for (unsigned i = 0; i != n; i++)
> + if ((*inner->superloops)[i] == outer)
> + return true;
> + return false;
> +}
> +
> extern void record_niter_bound (class loop *, const widest_int &, bool, bool);
> extern HOST_WIDE_INT get_estimated_loop_iterations_int (class loop *);
> extern HOST_WIDE_INT get_max_loop_iterations_int (const class loop *);
> diff --git a/gcc/fwprop.c b/gcc/fwprop.c
> index 2eab4fd4614..aed48e7273f 100644
> --- a/gcc/fwprop.c
> +++ b/gcc/fwprop.c
> @@ -866,10 +866,13 @@ forward_propagate_into (use_info *use, bool reg_prop_only = false)
> rtx src = SET_SRC (def_set);
>
> /* Allow propagations into a loop only for reg-to-reg copies, since
> - replacing one register by another shouldn't increase the cost. */
> + replacing one register by another shouldn't increase the cost.
> + Propagations from inner loop to outer loop should be also ok. */
> struct loop *def_loop = def_insn->bb ()->cfg_bb ()->loop_father;
> struct loop *use_loop = use->bb ()->cfg_bb ()->loop_father;
> - if ((reg_prop_only || def_loop != use_loop)
> + if ((reg_prop_only
> + || (use_loop && def_loop != use_loop
> + && !loop_contains_p (use_loop, def_loop)))
> && (!reg_single_def_p (dest) || !reg_single_def_p (src)))
> return false;
>
> diff --git a/gcc/testsuite/g++.target/i386/pr103750-fwprop-1.C b/gcc/testsuite/g++.target/i386/pr103750-fwprop-1.C
> new file mode 100644
> index 00000000000..26987d307aa
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr103750-fwprop-1.C
> @@ -0,0 +1,26 @@
> +/* PR target/103750. */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -std=c++1y -march=cannonlake -fdump-rtl-fwprop1" } */
> +/* { dg-final { scan-rtl-dump-not "subreg:HI\[ \\\(\]*reg:SI\[^\n]*\n\[^\n]*UNSPEC_TZCNT" "fwprop1" } } */
> +
> +#include<immintrin.h>
> +const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept
> +{
> + __m256i mch256 = _mm256_set1_epi16(c);
> + for ( ; n < e; n += 32) {
> + __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
> + __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
> + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
> + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
> + if (_kortestz_mask16_u8(mask1, mask2))
> + continue;
> +
> + unsigned idx = _tzcnt_u32(mask1);
> + if (mask1 == 0) {
> + idx = __tzcnt_u16(mask2);
> + n += 16;
> + }
> + return n + idx;
> + }
> + return e;
> +}
> --
> 2.18.1
>
@@ -908,6 +908,18 @@ loop_outermost (class loop *loop)
return (*loop->superloops)[1];
}
+/* Returns true if loop OUTER contains loop INNER. */
+static inline bool
+loop_contains_p (class loop* outer, class loop* inner)
+{
+ unsigned n = vec_safe_length (inner->superloops);
+
+ for (unsigned i = 0; i != n; i++)
+ if ((*inner->superloops)[i] == outer)
+ return true;
+ return false;
+}
+
extern void record_niter_bound (class loop *, const widest_int &, bool, bool);
extern HOST_WIDE_INT get_estimated_loop_iterations_int (class loop *);
extern HOST_WIDE_INT get_max_loop_iterations_int (const class loop *);
@@ -866,10 +866,13 @@ forward_propagate_into (use_info *use, bool reg_prop_only = false)
rtx src = SET_SRC (def_set);
/* Allow propagations into a loop only for reg-to-reg copies, since
- replacing one register by another shouldn't increase the cost. */
+ replacing one register by another shouldn't increase the cost.
+ Propagations from inner loop to outer loop should be also ok. */
struct loop *def_loop = def_insn->bb ()->cfg_bb ()->loop_father;
struct loop *use_loop = use->bb ()->cfg_bb ()->loop_father;
- if ((reg_prop_only || def_loop != use_loop)
+ if ((reg_prop_only
+ || (use_loop && def_loop != use_loop
+ && !loop_contains_p (use_loop, def_loop)))
&& (!reg_single_def_p (dest) || !reg_single_def_p (src)))
return false;
new file mode 100644
@@ -0,0 +1,26 @@
+/* PR target/103750. */
+/* { dg-do compile } */
+/* { dg-options "-O2 -std=c++1y -march=cannonlake -fdump-rtl-fwprop1" } */
+/* { dg-final { scan-rtl-dump-not "subreg:HI\[ \\\(\]*reg:SI\[^\n]*\n\[^\n]*UNSPEC_TZCNT" "fwprop1" } } */
+
+#include<immintrin.h>
+const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept
+{
+ __m256i mch256 = _mm256_set1_epi16(c);
+ for ( ; n < e; n += 32) {
+ __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
+ __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
+ __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
+ __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
+ if (_kortestz_mask16_u8(mask1, mask2))
+ continue;
+
+ unsigned idx = _tzcnt_u32(mask1);
+ if (mask1 == 0) {
+ idx = __tzcnt_u16(mask2);
+ n += 16;
+ }
+ return n + idx;
+ }
+ return e;
+}