x86: Check each component of source operand for AVX_U128_DIRTY
Commit Message
commit 9775e465c1fbfc32656de77c618c61acf5bd905d
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Jul 27 07:46:04 2021 -0700
x86: Don't set AVX_U128_DIRTY when zeroing YMM/ZMM register
called ix86_check_avx_upper_register to check the mode of the source operand.
But ix86_check_avx_upper_register doesn't work on a source operand such as
  (vec_select:V2DI (reg/v:V4DI 23 xmm3 [orig:91 ymm ] [91])
      (parallel [
              (const_int 2 [0x2])
              (const_int 3 [0x3])
          ]))
Add ix86_avx_u128_mode_source to check the mode of each component of the
source operand.
gcc/
PR target/104441
* config/i386/i386.cc (ix86_avx_u128_mode_source): New function.
(ix86_avx_u128_mode_needed): Return AVX_U128_ANY for debug INSN.
Call ix86_avx_u128_mode_source to check mode for each component
of source operand.
gcc/testsuite/
PR target/104441
* gcc.target/i386/pr104441-1a.c: New test.
* gcc.target/i386/pr104441-1b.c: Likewise.
---
gcc/config/i386/i386.cc | 145 +++++++++++---------
gcc/testsuite/gcc.target/i386/pr104441-1a.c | 57 ++++++++
gcc/testsuite/gcc.target/i386/pr104441-1b.c | 32 +++++
3 files changed, 168 insertions(+), 66 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr104441-1a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr104441-1b.c
Comments
On Wed, Feb 9, 2022 at 10:53 AM H.J. Lu via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> commit 9775e465c1fbfc32656de77c618c61acf5bd905d
> Author: H.J. Lu <hjl.tools@gmail.com>
> Date: Tue Jul 27 07:46:04 2021 -0700
>
> x86: Don't set AVX_U128_DIRTY when zeroing YMM/ZMM register
>
> called ix86_check_avx_upper_register to check mode on source operand.
> But ix86_check_avx_upper_register doesn't work on source operand like
>
The new function ix86_avx_u128_mode_source simply takes the code from the
*else if (ix86_check_avx_upper_register (src))* branch; that code is now
applied to each component of src that satisfies
ix86_check_avx_upper_register, which seems reasonable.
The patch LGTM.
> (vec_select:V2DI (reg/v:V4DI 23 xmm3 [orig:91 ymm ] [91])
> (parallel [
> (const_int 2 [0x2])
> (const_int 3 [0x3])
> ]))
>
> Add ix86_avx_u128_mode_source to check mode for each component of source
> operand.
>
> gcc/
>
> PR target/104441
> * config/i386/i386.cc (ix86_avx_u128_mode_source): New function.
> (ix86_avx_u128_mode_needed): Return AVX_U128_ANY for debug INSN.
> Call ix86_avx_u128_mode_source to check mode for each component
> of source operand.
>
> gcc/testsuite/
>
> PR target/104441
> * gcc.target/i386/pr104441-1a.c: New test.
> * gcc.target/i386/pr104441-1b.c: Likewise.
> ---
> gcc/config/i386/i386.cc | 145 +++++++++++---------
> gcc/testsuite/gcc.target/i386/pr104441-1a.c | 57 ++++++++
> gcc/testsuite/gcc.target/i386/pr104441-1b.c | 32 +++++
> 3 files changed, 168 insertions(+), 66 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr104441-1a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr104441-1b.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index dd5584fb8ed..2d87acca7ff 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -14365,11 +14365,82 @@ ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
> }
> }
>
> +/* For YMM/ZMM store or YMM/ZMM extract. Return mode for the source
> + operand of SRC DEFs in the same basic block before INSN. */
> +
> +static int
> +ix86_avx_u128_mode_source (rtx_insn *insn, const_rtx src)
> +{
> + basic_block bb = BLOCK_FOR_INSN (insn);
> + rtx_insn *end = BB_END (bb);
> +
> + /* Return AVX_U128_DIRTY if there is no DEF in the same basic
> + block. */
> + int status = AVX_U128_DIRTY;
> +
> + for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src));
> + def; def = DF_REF_NEXT_REG (def))
> + if (DF_REF_BB (def) == bb)
> + {
> + /* Ignore DEF from different basic blocks. */
> + rtx_insn *def_insn = DF_REF_INSN (def);
> +
> + /* Check if DEF_INSN is before INSN. */
> + rtx_insn *next;
> + for (next = NEXT_INSN (def_insn);
> + next != nullptr && next != end && next != insn;
> + next = NEXT_INSN (next))
> + ;
> +
> + /* Skip if DEF_INSN isn't before INSN. */
> + if (next != insn)
> + continue;
> +
> + /* Return AVX_U128_DIRTY if the source operand of DEF_INSN
> + isn't constant zero. */
> +
> + if (CALL_P (def_insn))
> + {
> + bool avx_upper_reg_found = false;
> + note_stores (def_insn,
> + ix86_check_avx_upper_stores,
> + &avx_upper_reg_found);
> +
> + /* Return AVX_U128_DIRTY if call returns AVX. */
> + if (avx_upper_reg_found)
> + return AVX_U128_DIRTY;
> +
> + continue;
> + }
> +
> + rtx set = single_set (def_insn);
> + if (!set)
> + return AVX_U128_DIRTY;
> +
> + rtx dest = SET_DEST (set);
> +
> + /* Skip if DEF_INSN is not an AVX load. Return AVX_U128_DIRTY
> + if the source operand isn't constant zero. */
> + if (ix86_check_avx_upper_register (dest)
> + && standard_sse_constant_p (SET_SRC (set),
> + GET_MODE (dest)) != 1)
> + return AVX_U128_DIRTY;
> +
> + /* We get here only if all AVX loads are from constant zero. */
> + status = AVX_U128_ANY;
> + }
> +
> + return status;
> +}
> +
> /* Return needed mode for entity in optimize_mode_switching pass. */
>
> static int
> ix86_avx_u128_mode_needed (rtx_insn *insn)
> {
> + if (DEBUG_INSN_P (insn))
> + return AVX_U128_ANY;
> +
> if (CALL_P (insn))
> {
> rtx link;
> @@ -14409,6 +14480,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
> return AVX_U128_CLEAN;
> }
>
> + subrtx_iterator::array_type array;
> +
> rtx set = single_set (insn);
> if (set)
> {
> @@ -14423,74 +14496,15 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
> else
> return AVX_U128_ANY;
> }
> - else if (ix86_check_avx_upper_register (src))
> + else
> {
> - /* This is an YMM/ZMM store. Check for the source operand
> - of SRC DEFs in the same basic block before INSN. */
> - basic_block bb = BLOCK_FOR_INSN (insn);
> - rtx_insn *end = BB_END (bb);
> -
> - /* Return AVX_U128_DIRTY if there is no DEF in the same basic
> - block. */
> - int status = AVX_U128_DIRTY;
> -
> - for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src));
> - def; def = DF_REF_NEXT_REG (def))
> - if (DF_REF_BB (def) == bb)
> + FOR_EACH_SUBRTX (iter, array, src, NONCONST)
> + if (ix86_check_avx_upper_register (*iter))
> {
> - /* Ignore DEF from different basic blocks. */
> - rtx_insn *def_insn = DF_REF_INSN (def);
> -
> - /* Check if DEF_INSN is before INSN. */
> - rtx_insn *next;
> - for (next = NEXT_INSN (def_insn);
> - next != nullptr && next != end && next != insn;
> - next = NEXT_INSN (next))
> - ;
> -
> - /* Skip if DEF_INSN isn't before INSN. */
> - if (next != insn)
> - continue;
> -
> - /* Return AVX_U128_DIRTY if the source operand of
> - DEF_INSN isn't constant zero. */
> -
> - if (CALL_P (def_insn))
> - {
> - bool avx_upper_reg_found = false;
> - note_stores (def_insn, ix86_check_avx_upper_stores,
> - &avx_upper_reg_found);
> -
> - /* Return AVX_U128_DIRTY if call returns AVX. */
> - if (avx_upper_reg_found)
> - return AVX_U128_DIRTY;
> -
> - continue;
> - }
> -
> - set = single_set (def_insn);
> - if (!set)
> - return AVX_U128_DIRTY;
> -
> - dest = SET_DEST (set);
> -
> - /* Skip if DEF_INSN is not an AVX load. */
> - if (ix86_check_avx_upper_register (dest))
> - {
> - src = SET_SRC (set);
> - /* Return AVX_U128_DIRTY if the source operand isn't
> - constant zero. */
> - if (standard_sse_constant_p (src, GET_MODE (dest))
> - != 1)
> - return AVX_U128_DIRTY;
> - }
> -
> - /* We get here only if all AVX loads are from constant
> - zero. */
> - status = AVX_U128_ANY;
> + int status = ix86_avx_u128_mode_source (insn, *iter);
> + if (status == AVX_U128_DIRTY)
> + return status;
> }
> -
> - return status;
> }
>
> /* This isn't YMM/ZMM load/store. */
> @@ -14501,7 +14515,6 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
> Hardware changes state only when a 256bit register is written to,
> but we need to prevent the compiler from moving optimal insertion
> point above eventual read from 256bit or 512 bit register. */
> - subrtx_iterator::array_type array;
> FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
> if (ix86_check_avx_upper_register (*iter))
> return AVX_U128_DIRTY;
> diff --git a/gcc/testsuite/gcc.target/i386/pr104441-1a.c b/gcc/testsuite/gcc.target/i386/pr104441-1a.c
> new file mode 100644
> index 00000000000..f4d263205f8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104441-1a.c
> @@ -0,0 +1,57 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mtune=skylake -Wno-attributes" } */
> +
> +#include <x86intrin.h>
> +#include <stdint.h>
> +
> +__attribute__((always_inline, target("avx2")))
> +static __m256i
> +load8bit_4x4_avx2(const uint8_t *const src, const uint32_t stride)
> +{
> + __m128i src01, src23;
> + src01 = _mm_cvtsi32_si128(*(int32_t*)(src + 0 * stride));
> + src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);
> + return _mm256_setr_m128i(src01, src23);
> +}
> +
> +__attribute__ ((noinline, noipa, target("avx2")))
> +uint32_t
> +compute4x_m_sad_avx2_intrin(uint8_t *src, uint32_t src_stride,
> + uint8_t *ref, uint32_t ref_stride,
> + uint32_t height)
> +{
> + __m128i xmm0;
> + __m256i ymm = _mm256_setzero_si256();
> + uint32_t y;
> +
> + for (y = 0; y < height; y += 4) {
> + const __m256i src0123 = load8bit_4x4_avx2(src, src_stride);
> + const __m256i ref0123 = load8bit_4x4_avx2(ref, ref_stride);
> + ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123));
> + src += src_stride << 2;
> + ref += ref_stride << 2;
> + }
> +
> + xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
> + _mm256_extracti128_si256(ymm, 1));
> +
> + return (uint32_t)_mm_cvtsi128_si32(xmm0);
> +}
> +
> +/* Expect assembly like:
> +
> + vextracti128 $0x1, %ymm3, %xmm3
> + vpaddd %xmm3, %xmm0, %xmm0
> + vmovd %xmm0, %eax
> + vzeroupper
> +
> +rather than:
> +
> + vzeroupper
> + vextracti128 $0x1, %ymm3, %xmm3
> + vpaddd %xmm3, %xmm0, %xmm0
> + vmovd %xmm0, %eax
> +
> + */
> +
> +/* { dg-final { scan-assembler "\[ \t\]+vextracti128\[ \t\]+\[^\n\]+\n\[ \t\]+vpaddd\[ \t\]+\[^\n\]+\n\[ \t\]+vmovd\[ \t\]+\[^\n\]+\n\[ \t\]+vzeroupper" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr104441-1b.c b/gcc/testsuite/gcc.target/i386/pr104441-1b.c
> new file mode 100644
> index 00000000000..0b8a796d93c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104441-1b.c
> @@ -0,0 +1,32 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -mvzeroupper -Wno-attributes" } */
> +
> +#include "pr104441-1a.c"
> +
> +#define ARRAY_SIZE 255
> +
> +__attribute__ ((noinline, noipa))
> +static void
> +do_test (void)
> +{
> + uint8_t src[ARRAY_SIZE];
> + uint8_t ref[ARRAY_SIZE];
> + uint32_t x;
> + uint32_t i;
> + for (i = 0; i < ARRAY_SIZE; i++)
> + {
> + src[i] = i;
> + ref[i] = i;
> + }
> + x = compute4x_m_sad_avx2_intrin(src, 64 >> 2, ref, 64, 4);
> + if (x != 0x240)
> + __builtin_abort ();
> +}
> +
> +int
> +main ()
> +{
> + if (__builtin_cpu_supports ("avx2"))
> + do_test ();
> + return 0;
> +}
> --
> 2.34.1
>
@@ -14365,11 +14365,82 @@ ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
}
}
+/* For YMM/ZMM store or YMM/ZMM extract. Return mode for the source
+ operand of SRC DEFs in the same basic block before INSN. */
+
+static int
+ix86_avx_u128_mode_source (rtx_insn *insn, const_rtx src)
+{
+ basic_block bb = BLOCK_FOR_INSN (insn);
+ rtx_insn *end = BB_END (bb);
+
+ /* Return AVX_U128_DIRTY if there is no DEF in the same basic
+ block. */
+ int status = AVX_U128_DIRTY;
+
+ for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src));
+ def; def = DF_REF_NEXT_REG (def))
+ if (DF_REF_BB (def) == bb)
+ {
+ /* Ignore DEF from different basic blocks. */
+ rtx_insn *def_insn = DF_REF_INSN (def);
+
+ /* Check if DEF_INSN is before INSN. */
+ rtx_insn *next;
+ for (next = NEXT_INSN (def_insn);
+ next != nullptr && next != end && next != insn;
+ next = NEXT_INSN (next))
+ ;
+
+ /* Skip if DEF_INSN isn't before INSN. */
+ if (next != insn)
+ continue;
+
+ /* Return AVX_U128_DIRTY if the source operand of DEF_INSN
+ isn't constant zero. */
+
+ if (CALL_P (def_insn))
+ {
+ bool avx_upper_reg_found = false;
+ note_stores (def_insn,
+ ix86_check_avx_upper_stores,
+ &avx_upper_reg_found);
+
+ /* Return AVX_U128_DIRTY if call returns AVX. */
+ if (avx_upper_reg_found)
+ return AVX_U128_DIRTY;
+
+ continue;
+ }
+
+ rtx set = single_set (def_insn);
+ if (!set)
+ return AVX_U128_DIRTY;
+
+ rtx dest = SET_DEST (set);
+
+ /* Skip if DEF_INSN is not an AVX load. Return AVX_U128_DIRTY
+ if the source operand isn't constant zero. */
+ if (ix86_check_avx_upper_register (dest)
+ && standard_sse_constant_p (SET_SRC (set),
+ GET_MODE (dest)) != 1)
+ return AVX_U128_DIRTY;
+
+ /* We get here only if all AVX loads are from constant zero. */
+ status = AVX_U128_ANY;
+ }
+
+ return status;
+}
+
/* Return needed mode for entity in optimize_mode_switching pass. */
static int
ix86_avx_u128_mode_needed (rtx_insn *insn)
{
+ if (DEBUG_INSN_P (insn))
+ return AVX_U128_ANY;
+
if (CALL_P (insn))
{
rtx link;
@@ -14409,6 +14480,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
return AVX_U128_CLEAN;
}
+ subrtx_iterator::array_type array;
+
rtx set = single_set (insn);
if (set)
{
@@ -14423,74 +14496,15 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
else
return AVX_U128_ANY;
}
- else if (ix86_check_avx_upper_register (src))
+ else
{
- /* This is an YMM/ZMM store. Check for the source operand
- of SRC DEFs in the same basic block before INSN. */
- basic_block bb = BLOCK_FOR_INSN (insn);
- rtx_insn *end = BB_END (bb);
-
- /* Return AVX_U128_DIRTY if there is no DEF in the same basic
- block. */
- int status = AVX_U128_DIRTY;
-
- for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src));
- def; def = DF_REF_NEXT_REG (def))
- if (DF_REF_BB (def) == bb)
+ FOR_EACH_SUBRTX (iter, array, src, NONCONST)
+ if (ix86_check_avx_upper_register (*iter))
{
- /* Ignore DEF from different basic blocks. */
- rtx_insn *def_insn = DF_REF_INSN (def);
-
- /* Check if DEF_INSN is before INSN. */
- rtx_insn *next;
- for (next = NEXT_INSN (def_insn);
- next != nullptr && next != end && next != insn;
- next = NEXT_INSN (next))
- ;
-
- /* Skip if DEF_INSN isn't before INSN. */
- if (next != insn)
- continue;
-
- /* Return AVX_U128_DIRTY if the source operand of
- DEF_INSN isn't constant zero. */
-
- if (CALL_P (def_insn))
- {
- bool avx_upper_reg_found = false;
- note_stores (def_insn, ix86_check_avx_upper_stores,
- &avx_upper_reg_found);
-
- /* Return AVX_U128_DIRTY if call returns AVX. */
- if (avx_upper_reg_found)
- return AVX_U128_DIRTY;
-
- continue;
- }
-
- set = single_set (def_insn);
- if (!set)
- return AVX_U128_DIRTY;
-
- dest = SET_DEST (set);
-
- /* Skip if DEF_INSN is not an AVX load. */
- if (ix86_check_avx_upper_register (dest))
- {
- src = SET_SRC (set);
- /* Return AVX_U128_DIRTY if the source operand isn't
- constant zero. */
- if (standard_sse_constant_p (src, GET_MODE (dest))
- != 1)
- return AVX_U128_DIRTY;
- }
-
- /* We get here only if all AVX loads are from constant
- zero. */
- status = AVX_U128_ANY;
+ int status = ix86_avx_u128_mode_source (insn, *iter);
+ if (status == AVX_U128_DIRTY)
+ return status;
}
-
- return status;
}
/* This isn't YMM/ZMM load/store. */
@@ -14501,7 +14515,6 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
Hardware changes state only when a 256bit register is written to,
but we need to prevent the compiler from moving optimal insertion
point above eventual read from 256bit or 512 bit register. */
- subrtx_iterator::array_type array;
FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
if (ix86_check_avx_upper_register (*iter))
return AVX_U128_DIRTY;
new file mode 100644
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mtune=skylake -Wno-attributes" } */
+
+#include <x86intrin.h>
+#include <stdint.h>
+
+__attribute__((always_inline, target("avx2")))
+static __m256i
+load8bit_4x4_avx2(const uint8_t *const src, const uint32_t stride)
+{
+ __m128i src01, src23;
+ src01 = _mm_cvtsi32_si128(*(int32_t*)(src + 0 * stride));
+ src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);
+ return _mm256_setr_m128i(src01, src23);
+}
+
+__attribute__ ((noinline, noipa, target("avx2")))
+uint32_t
+compute4x_m_sad_avx2_intrin(uint8_t *src, uint32_t src_stride,
+ uint8_t *ref, uint32_t ref_stride,
+ uint32_t height)
+{
+ __m128i xmm0;
+ __m256i ymm = _mm256_setzero_si256();
+ uint32_t y;
+
+ for (y = 0; y < height; y += 4) {
+ const __m256i src0123 = load8bit_4x4_avx2(src, src_stride);
+ const __m256i ref0123 = load8bit_4x4_avx2(ref, ref_stride);
+ ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123));
+ src += src_stride << 2;
+ ref += ref_stride << 2;
+ }
+
+ xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
+ _mm256_extracti128_si256(ymm, 1));
+
+ return (uint32_t)_mm_cvtsi128_si32(xmm0);
+}
+
+/* Expect assembly like:
+
+ vextracti128 $0x1, %ymm3, %xmm3
+ vpaddd %xmm3, %xmm0, %xmm0
+ vmovd %xmm0, %eax
+ vzeroupper
+
+rather than:
+
+ vzeroupper
+ vextracti128 $0x1, %ymm3, %xmm3
+ vpaddd %xmm3, %xmm0, %xmm0
+ vmovd %xmm0, %eax
+
+ */
+
+/* { dg-final { scan-assembler "\[ \t\]+vextracti128\[ \t\]+\[^\n\]+\n\[ \t\]+vpaddd\[ \t\]+\[^\n\]+\n\[ \t\]+vmovd\[ \t\]+\[^\n\]+\n\[ \t\]+vzeroupper" } } */
new file mode 100644
@@ -0,0 +1,32 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mvzeroupper -Wno-attributes" } */
+
+#include "pr104441-1a.c"
+
+#define ARRAY_SIZE 255
+
+__attribute__ ((noinline, noipa))
+static void
+do_test (void)
+{
+ uint8_t src[ARRAY_SIZE];
+ uint8_t ref[ARRAY_SIZE];
+ uint32_t x;
+ uint32_t i;
+ for (i = 0; i < ARRAY_SIZE; i++)
+ {
+ src[i] = i;
+ ref[i] = i;
+ }
+ x = compute4x_m_sad_avx2_intrin(src, 64 >> 2, ref, 64, 4);
+ if (x != 0x240)
+ __builtin_abort ();
+}
+
+int
+main ()
+{
+ if (__builtin_cpu_supports ("avx2"))
+ do_test ();
+ return 0;
+}