x86: Check each component of source operand for AVX_U128_DIRTY

Message ID 20220209025303.3063-1-hjl.tools@gmail.com
State Committed
Commit 5390a2f191682dae3c6d1e1deac20e05be413514
Headers
Series x86: Check each component of source operand for AVX_U128_DIRTY |

Commit Message

H.J. Lu Feb. 9, 2022, 2:53 a.m. UTC
  commit 9775e465c1fbfc32656de77c618c61acf5bd905d
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Jul 27 07:46:04 2021 -0700

    x86: Don't set AVX_U128_DIRTY when zeroing YMM/ZMM register

called ix86_check_avx_upper_register to check the mode of the source operand.
But ix86_check_avx_upper_register doesn't work on a source operand such as

(vec_select:V2DI (reg/v:V4DI 23 xmm3 [orig:91 ymm ] [91])
    (parallel [
            (const_int 2 [0x2])
            (const_int 3 [0x3])
        ]))

Add ix86_avx_u128_mode_source to check mode for each component of source
operand.

gcc/

	PR target/104441
	* config/i386/i386.cc (ix86_avx_u128_mode_source): New function.
	(ix86_avx_u128_mode_needed): Return AVX_U128_ANY for debug INSN.
	Call ix86_avx_u128_mode_source to check mode for each component
	of source operand.

gcc/testsuite/

	PR target/104441
	* gcc.target/i386/pr104441-1a.c: New test.
	* gcc.target/i386/pr104441-1b.c: Likewise.
---
 gcc/config/i386/i386.cc                     | 145 +++++++++++---------
 gcc/testsuite/gcc.target/i386/pr104441-1a.c |  57 ++++++++
 gcc/testsuite/gcc.target/i386/pr104441-1b.c |  32 +++++
 3 files changed, 168 insertions(+), 66 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104441-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104441-1b.c
  

Comments

Hongtao Liu Feb. 9, 2022, 4:40 a.m. UTC | #1
On Wed, Feb 9, 2022 at 10:53 AM H.J. Lu via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> commit 9775e465c1fbfc32656de77c618c61acf5bd905d
> Author: H.J. Lu <hjl.tools@gmail.com>
> Date:   Tue Jul 27 07:46:04 2021 -0700
>
>     x86: Don't set AVX_U128_DIRTY when zeroing YMM/ZMM register
>
> called ix86_check_avx_upper_register to check mode on source operand.
> But ix86_check_avx_upper_register doesn't work on source operand like
>
The new function ix86_avx_u128_mode_source just factors out the code from the
*else if (ix86_check_avx_upper_register (src))* branch so that it can be
applied to each component of src that meets the
ix86_check_avx_upper_register condition, which seems reasonable.

The patch LGTM.
> (vec_select:V2DI (reg/v:V4DI 23 xmm3 [orig:91 ymm ] [91])
>     (parallel [
>             (const_int 2 [0x2])
>             (const_int 3 [0x3])
>         ]))
>
> Add ix86_avx_u128_mode_source to check mode for each component of source
> operand.
>
> gcc/
>
>         PR target/104441
>         * config/i386/i386.cc (ix86_avx_u128_mode_source): New function.
>         (ix86_avx_u128_mode_needed): Return AVX_U128_ANY for debug INSN.
>         Call ix86_avx_u128_mode_source to check mode for each component
>         of source operand.
>
> gcc/testsuite/
>
>         PR target/104441
>         * gcc.target/i386/pr104441-1a.c: New test.
>         * gcc.target/i386/pr104441-1b.c: Likewise.
> ---
>  gcc/config/i386/i386.cc                     | 145 +++++++++++---------
>  gcc/testsuite/gcc.target/i386/pr104441-1a.c |  57 ++++++++
>  gcc/testsuite/gcc.target/i386/pr104441-1b.c |  32 +++++
>  3 files changed, 168 insertions(+), 66 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr104441-1a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr104441-1b.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index dd5584fb8ed..2d87acca7ff 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -14365,11 +14365,82 @@ ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
>      }
>   }
>
> +/* For YMM/ZMM store or YMM/ZMM extract.  Return mode for the source
> +   operand of SRC DEFs in the same basic block before INSN.  */
> +
> +static int
> +ix86_avx_u128_mode_source (rtx_insn *insn, const_rtx src)
> +{
> +  basic_block bb = BLOCK_FOR_INSN (insn);
> +  rtx_insn *end = BB_END (bb);
> +
> +  /* Return AVX_U128_DIRTY if there is no DEF in the same basic
> +     block.  */
> +  int status = AVX_U128_DIRTY;
> +
> +  for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src));
> +       def; def = DF_REF_NEXT_REG (def))
> +    if (DF_REF_BB (def) == bb)
> +      {
> +       /* Ignore DEF from different basic blocks.  */
> +       rtx_insn *def_insn = DF_REF_INSN (def);
> +
> +       /* Check if DEF_INSN is before INSN.  */
> +       rtx_insn *next;
> +       for (next = NEXT_INSN (def_insn);
> +            next != nullptr && next != end && next != insn;
> +            next = NEXT_INSN (next))
> +         ;
> +
> +       /* Skip if DEF_INSN isn't before INSN.  */
> +       if (next != insn)
> +         continue;
> +
> +       /* Return AVX_U128_DIRTY if the source operand of DEF_INSN
> +          isn't constant zero.  */
> +
> +       if (CALL_P (def_insn))
> +         {
> +           bool avx_upper_reg_found = false;
> +           note_stores (def_insn,
> +                        ix86_check_avx_upper_stores,
> +                        &avx_upper_reg_found);
> +
> +           /* Return AVX_U128_DIRTY if call returns AVX.  */
> +           if (avx_upper_reg_found)
> +             return AVX_U128_DIRTY;
> +
> +           continue;
> +         }
> +
> +       rtx set = single_set (def_insn);
> +       if (!set)
> +         return AVX_U128_DIRTY;
> +
> +       rtx dest = SET_DEST (set);
> +
> +       /* Skip if DEF_INSN is not an AVX load.  Return AVX_U128_DIRTY
> +          if the source operand isn't constant zero.  */
> +       if (ix86_check_avx_upper_register (dest)
> +           && standard_sse_constant_p (SET_SRC (set),
> +                                       GET_MODE (dest)) != 1)
> +         return AVX_U128_DIRTY;
> +
> +       /* We get here only if all AVX loads are from constant zero.  */
> +       status = AVX_U128_ANY;
> +      }
> +
> +  return status;
> +}
> +
>  /* Return needed mode for entity in optimize_mode_switching pass.  */
>
>  static int
>  ix86_avx_u128_mode_needed (rtx_insn *insn)
>  {
> +  if (DEBUG_INSN_P (insn))
> +    return AVX_U128_ANY;
> +
>    if (CALL_P (insn))
>      {
>        rtx link;
> @@ -14409,6 +14480,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
>        return AVX_U128_CLEAN;
>      }
>
> +  subrtx_iterator::array_type array;
> +
>    rtx set = single_set (insn);
>    if (set)
>      {
> @@ -14423,74 +14496,15 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
>           else
>             return AVX_U128_ANY;
>         }
> -      else if (ix86_check_avx_upper_register (src))
> +      else
>         {
> -         /* This is an YMM/ZMM store.  Check for the source operand
> -            of SRC DEFs in the same basic block before INSN.  */
> -         basic_block bb = BLOCK_FOR_INSN (insn);
> -         rtx_insn *end = BB_END (bb);
> -
> -         /* Return AVX_U128_DIRTY if there is no DEF in the same basic
> -            block.  */
> -         int status = AVX_U128_DIRTY;
> -
> -         for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src));
> -              def; def = DF_REF_NEXT_REG (def))
> -           if (DF_REF_BB (def) == bb)
> +         FOR_EACH_SUBRTX (iter, array, src, NONCONST)
> +           if (ix86_check_avx_upper_register (*iter))
>               {
> -               /* Ignore DEF from different basic blocks.  */
> -               rtx_insn *def_insn = DF_REF_INSN (def);
> -
> -               /* Check if DEF_INSN is before INSN.  */
> -               rtx_insn *next;
> -               for (next = NEXT_INSN (def_insn);
> -                    next != nullptr && next != end && next != insn;
> -                    next = NEXT_INSN (next))
> -                 ;
> -
> -               /* Skip if DEF_INSN isn't before INSN.  */
> -               if (next != insn)
> -                 continue;
> -
> -               /* Return AVX_U128_DIRTY if the source operand of
> -                  DEF_INSN isn't constant zero.  */
> -
> -               if (CALL_P (def_insn))
> -                 {
> -                   bool avx_upper_reg_found = false;
> -                   note_stores (def_insn, ix86_check_avx_upper_stores,
> -                                &avx_upper_reg_found);
> -
> -                   /* Return AVX_U128_DIRTY if call returns AVX.  */
> -                   if (avx_upper_reg_found)
> -                     return AVX_U128_DIRTY;
> -
> -                   continue;
> -                 }
> -
> -               set = single_set (def_insn);
> -               if (!set)
> -                 return AVX_U128_DIRTY;
> -
> -               dest = SET_DEST (set);
> -
> -               /* Skip if DEF_INSN is not an AVX load.  */
> -               if (ix86_check_avx_upper_register (dest))
> -                 {
> -                   src = SET_SRC (set);
> -                   /* Return AVX_U128_DIRTY if the source operand isn't
> -                      constant zero.  */
> -                   if (standard_sse_constant_p (src, GET_MODE (dest))
> -                       != 1)
> -                     return AVX_U128_DIRTY;
> -                 }
> -
> -               /* We get here only if all AVX loads are from constant
> -                  zero.  */
> -               status = AVX_U128_ANY;
> +               int status = ix86_avx_u128_mode_source (insn, *iter);
> +               if (status == AVX_U128_DIRTY)
> +                 return status;
>               }
> -
> -         return status;
>         }
>
>        /* This isn't YMM/ZMM load/store.  */
> @@ -14501,7 +14515,6 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
>       Hardware changes state only when a 256bit register is written to,
>       but we need to prevent the compiler from moving optimal insertion
>       point above eventual read from 256bit or 512 bit register.  */
> -  subrtx_iterator::array_type array;
>    FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
>      if (ix86_check_avx_upper_register (*iter))
>        return AVX_U128_DIRTY;
> diff --git a/gcc/testsuite/gcc.target/i386/pr104441-1a.c b/gcc/testsuite/gcc.target/i386/pr104441-1a.c
> new file mode 100644
> index 00000000000..f4d263205f8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104441-1a.c
> @@ -0,0 +1,57 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mtune=skylake -Wno-attributes" } */
> +
> +#include <x86intrin.h>
> +#include <stdint.h>
> +
> +__attribute__((always_inline, target("avx2")))
> +static __m256i
> +load8bit_4x4_avx2(const uint8_t *const src, const uint32_t stride)
> +{
> +  __m128i src01, src23;
> +  src01 = _mm_cvtsi32_si128(*(int32_t*)(src + 0 * stride));
> +  src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);
> +  return _mm256_setr_m128i(src01, src23);
> +}
> +
> +__attribute__ ((noinline, noipa, target("avx2")))
> +uint32_t
> +compute4x_m_sad_avx2_intrin(uint8_t *src, uint32_t src_stride,
> +                           uint8_t *ref, uint32_t ref_stride,
> +                           uint32_t height)
> +{
> +  __m128i xmm0;
> +  __m256i ymm = _mm256_setzero_si256();
> +  uint32_t y;
> +
> +  for (y = 0; y < height; y += 4) {
> +    const __m256i src0123 = load8bit_4x4_avx2(src, src_stride);
> +    const __m256i ref0123 = load8bit_4x4_avx2(ref, ref_stride);
> +    ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123));
> +    src += src_stride << 2;
> +    ref += ref_stride << 2;
> +  }
> +
> +  xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
> +                      _mm256_extracti128_si256(ymm, 1));
> +
> +  return (uint32_t)_mm_cvtsi128_si32(xmm0);
> +}
> +
> +/* Expect assembly like:
> +
> +       vextracti128    $0x1, %ymm3, %xmm3
> +       vpaddd  %xmm3, %xmm0, %xmm0
> +       vmovd   %xmm0, %eax
> +       vzeroupper
> +
> +rather than:
> +
> +       vzeroupper
> +       vextracti128    $0x1, %ymm3, %xmm3
> +       vpaddd  %xmm3, %xmm0, %xmm0
> +       vmovd   %xmm0, %eax
> +
> + */
> +
> +/* { dg-final { scan-assembler "\[ \t\]+vextracti128\[ \t\]+\[^\n\]+\n\[ \t\]+vpaddd\[ \t\]+\[^\n\]+\n\[ \t\]+vmovd\[ \t\]+\[^\n\]+\n\[ \t\]+vzeroupper" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr104441-1b.c b/gcc/testsuite/gcc.target/i386/pr104441-1b.c
> new file mode 100644
> index 00000000000..0b8a796d93c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104441-1b.c
> @@ -0,0 +1,32 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -mvzeroupper -Wno-attributes" } */
> +
> +#include "pr104441-1a.c"
> +
> +#define ARRAY_SIZE 255
> +
> +__attribute__ ((noinline, noipa))
> +static void
> +do_test (void)
> +{
> +  uint8_t src[ARRAY_SIZE];
> +  uint8_t ref[ARRAY_SIZE];
> +  uint32_t x;
> +  uint32_t i;
> +  for (i = 0; i < ARRAY_SIZE; i++)
> +    {
> +      src[i] = i;
> +      ref[i] = i;
> +    }
> +  x = compute4x_m_sad_avx2_intrin(src, 64 >> 2, ref, 64, 4);
> +  if (x != 0x240)
> +    __builtin_abort ();
> +}
> +
> +int
> +main ()
> +{
> +  if (__builtin_cpu_supports ("avx2"))
> +    do_test ();
> +  return 0;
> +}
> --
> 2.34.1
>
  

Patch

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index dd5584fb8ed..2d87acca7ff 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14365,11 +14365,82 @@  ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
     }
  }
 
+/* For YMM/ZMM store or YMM/ZMM extract.  Return mode for the source
+   operand of SRC DEFs in the same basic block before INSN.  */
+
+static int
+ix86_avx_u128_mode_source (rtx_insn *insn, const_rtx src)
+{
+  basic_block bb = BLOCK_FOR_INSN (insn);
+  rtx_insn *end = BB_END (bb);
+
+  /* Return AVX_U128_DIRTY if there is no DEF in the same basic
+     block.  */
+  int status = AVX_U128_DIRTY;
+
+  for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src));
+       def; def = DF_REF_NEXT_REG (def))
+    if (DF_REF_BB (def) == bb)
+      {
+	/* Ignore DEF from different basic blocks.  */
+	rtx_insn *def_insn = DF_REF_INSN (def);
+
+	/* Check if DEF_INSN is before INSN.  */
+	rtx_insn *next;
+	for (next = NEXT_INSN (def_insn);
+	     next != nullptr && next != end && next != insn;
+	     next = NEXT_INSN (next))
+	  ;
+
+	/* Skip if DEF_INSN isn't before INSN.  */
+	if (next != insn)
+	  continue;
+
+	/* Return AVX_U128_DIRTY if the source operand of DEF_INSN
+	   isn't constant zero.  */
+
+	if (CALL_P (def_insn))
+	  {
+	    bool avx_upper_reg_found = false;
+	    note_stores (def_insn,
+			 ix86_check_avx_upper_stores,
+			 &avx_upper_reg_found);
+
+	    /* Return AVX_U128_DIRTY if call returns AVX.  */
+	    if (avx_upper_reg_found)
+	      return AVX_U128_DIRTY;
+
+	    continue;
+	  }
+
+	rtx set = single_set (def_insn);
+	if (!set)
+	  return AVX_U128_DIRTY;
+
+	rtx dest = SET_DEST (set);
+
+	/* Skip if DEF_INSN is not an AVX load.  Return AVX_U128_DIRTY
+	   if the source operand isn't constant zero.  */
+	if (ix86_check_avx_upper_register (dest)
+	    && standard_sse_constant_p (SET_SRC (set),
+					GET_MODE (dest)) != 1)
+	  return AVX_U128_DIRTY;
+
+	/* We get here only if all AVX loads are from constant zero.  */
+	status = AVX_U128_ANY;
+      }
+
+  return status;
+}
+
 /* Return needed mode for entity in optimize_mode_switching pass.  */
 
 static int
 ix86_avx_u128_mode_needed (rtx_insn *insn)
 {
+  if (DEBUG_INSN_P (insn))
+    return AVX_U128_ANY;
+
   if (CALL_P (insn))
     {
       rtx link;
@@ -14409,6 +14480,8 @@  ix86_avx_u128_mode_needed (rtx_insn *insn)
       return AVX_U128_CLEAN;
     }
 
+  subrtx_iterator::array_type array;
+
   rtx set = single_set (insn);
   if (set)
     {
@@ -14423,74 +14496,15 @@  ix86_avx_u128_mode_needed (rtx_insn *insn)
 	  else
 	    return AVX_U128_ANY;
 	}
-      else if (ix86_check_avx_upper_register (src))
+      else
 	{
-	  /* This is an YMM/ZMM store.  Check for the source operand
-	     of SRC DEFs in the same basic block before INSN.  */
-	  basic_block bb = BLOCK_FOR_INSN (insn);
-	  rtx_insn *end = BB_END (bb);
-
-	  /* Return AVX_U128_DIRTY if there is no DEF in the same basic
-	     block.  */
-	  int status = AVX_U128_DIRTY;
-
-	  for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src));
-	       def; def = DF_REF_NEXT_REG (def))
-	    if (DF_REF_BB (def) == bb)
+	  FOR_EACH_SUBRTX (iter, array, src, NONCONST)
+	    if (ix86_check_avx_upper_register (*iter))
 	      {
-		/* Ignore DEF from different basic blocks.  */
-		rtx_insn *def_insn = DF_REF_INSN (def);
-
-		/* Check if DEF_INSN is before INSN.  */
-		rtx_insn *next;
-		for (next = NEXT_INSN (def_insn);
-		     next != nullptr && next != end && next != insn;
-		     next = NEXT_INSN (next))
-		  ;
-
-		/* Skip if DEF_INSN isn't before INSN.  */
-		if (next != insn)
-		  continue;
-
-		/* Return AVX_U128_DIRTY if the source operand of
-		   DEF_INSN isn't constant zero.  */
-
-		if (CALL_P (def_insn))
-		  {
-		    bool avx_upper_reg_found = false;
-		    note_stores (def_insn, ix86_check_avx_upper_stores,
-				 &avx_upper_reg_found);
-
-		    /* Return AVX_U128_DIRTY if call returns AVX.  */
-		    if (avx_upper_reg_found)
-		      return AVX_U128_DIRTY;
-
-		    continue;
-		  }
-
-		set = single_set (def_insn);
-		if (!set)
-		  return AVX_U128_DIRTY;
-
-		dest = SET_DEST (set);
-
-		/* Skip if DEF_INSN is not an AVX load.  */
-		if (ix86_check_avx_upper_register (dest))
-		  {
-		    src = SET_SRC (set);
-		    /* Return AVX_U128_DIRTY if the source operand isn't
-		       constant zero.  */
-		    if (standard_sse_constant_p (src, GET_MODE (dest))
-			!= 1)
-		      return AVX_U128_DIRTY;
-		  }
-
-		/* We get here only if all AVX loads are from constant
-		   zero.  */
-		status = AVX_U128_ANY;
+		int status = ix86_avx_u128_mode_source (insn, *iter);
+		if (status == AVX_U128_DIRTY)
+		  return status;
 	      }
-
-	  return status;
 	}
 
       /* This isn't YMM/ZMM load/store.  */
@@ -14501,7 +14515,6 @@  ix86_avx_u128_mode_needed (rtx_insn *insn)
      Hardware changes state only when a 256bit register is written to,
      but we need to prevent the compiler from moving optimal insertion
      point above eventual read from 256bit or 512 bit register.  */
-  subrtx_iterator::array_type array;
   FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
     if (ix86_check_avx_upper_register (*iter))
       return AVX_U128_DIRTY;
diff --git a/gcc/testsuite/gcc.target/i386/pr104441-1a.c b/gcc/testsuite/gcc.target/i386/pr104441-1a.c
new file mode 100644
index 00000000000..f4d263205f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104441-1a.c
@@ -0,0 +1,57 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -mtune=skylake -Wno-attributes" } */
+
+#include <x86intrin.h>
+#include <stdint.h>
+
+__attribute__((always_inline, target("avx2")))
+static __m256i
+load8bit_4x4_avx2(const uint8_t *const src, const uint32_t stride)
+{
+  __m128i src01, src23;
+  src01 = _mm_cvtsi32_si128(*(int32_t*)(src + 0 * stride));
+  src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);
+  return _mm256_setr_m128i(src01, src23);
+}
+
+__attribute__ ((noinline, noipa, target("avx2")))
+uint32_t
+compute4x_m_sad_avx2_intrin(uint8_t *src, uint32_t src_stride,
+			    uint8_t *ref, uint32_t ref_stride,
+			    uint32_t height)
+{
+  __m128i xmm0;
+  __m256i ymm = _mm256_setzero_si256();
+  uint32_t y;
+
+  for (y = 0; y < height; y += 4) {
+    const __m256i src0123 = load8bit_4x4_avx2(src, src_stride);
+    const __m256i ref0123 = load8bit_4x4_avx2(ref, ref_stride);
+    ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123));
+    src += src_stride << 2;
+    ref += ref_stride << 2;
+  }
+
+  xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
+		       _mm256_extracti128_si256(ymm, 1));
+
+  return (uint32_t)_mm_cvtsi128_si32(xmm0);
+}  
+
+/* Expect assembly like:
+
+	vextracti128	$0x1, %ymm3, %xmm3
+	vpaddd	%xmm3, %xmm0, %xmm0
+	vmovd	%xmm0, %eax
+	vzeroupper
+
+rather than:
+
+	vzeroupper
+	vextracti128	$0x1, %ymm3, %xmm3
+	vpaddd	%xmm3, %xmm0, %xmm0
+	vmovd	%xmm0, %eax
+
+ */
+
+/* { dg-final { scan-assembler "\[ \t\]+vextracti128\[ \t\]+\[^\n\]+\n\[ \t\]+vpaddd\[ \t\]+\[^\n\]+\n\[ \t\]+vmovd\[ \t\]+\[^\n\]+\n\[ \t\]+vzeroupper" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr104441-1b.c b/gcc/testsuite/gcc.target/i386/pr104441-1b.c
new file mode 100644
index 00000000000..0b8a796d93c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104441-1b.c
@@ -0,0 +1,32 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 -mvzeroupper -Wno-attributes" } */
+
+#include "pr104441-1a.c"
+
+#define ARRAY_SIZE 255
+
+__attribute__ ((noinline, noipa))
+static void
+do_test (void)
+{
+  uint8_t src[ARRAY_SIZE];
+  uint8_t ref[ARRAY_SIZE];
+  uint32_t x;
+  uint32_t i;
+  for (i = 0; i < ARRAY_SIZE; i++)
+    {
+      src[i] = i;
+      ref[i] = i;
+    }
+  x = compute4x_m_sad_avx2_intrin(src, 64 >> 2, ref, 64, 4);
+  if (x != 0x240)
+    __builtin_abort ();
+}
+
+int
+main ()
+{
+  if (__builtin_cpu_supports ("avx2"))
+    do_test ();
+  return 0;
+}