Don't assume it's AVX_U128_CLEAN after call_insn whose abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS.

Message ID 20231208021655.1595917-1-hongtao.liu@intel.com
State Committed
Commit fc62716fe8d1d60a9f1c6906e5a4845b3331b828
Headers
Series Don't assume it's AVX_U128_CLEAN after call_insn whose abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS. |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Testing passed

Commit Message

Liu, Hongtao Dec. 8, 2023, 2:16 a.m. UTC
  If the function desn't clobber any sse registers or only clobber
128-bit part, then vzeroupper isn't issued before the function exit.
the status not CLEAN but ANY after the function.

Also for sibling_call, it's safe to issue an vzeroupper. Also there
could be missing vzeroupper since there's no mode_exit for
sibling_call_p.

Compared to the patch in the PR, this patch add sibling_call part.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk and backport?

gcc/ChangeLog:

	PR target/112891
	* config/i386/i386.cc (ix86_avx_u128_mode_after): Return
	AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to
	align with ix86_avx_u128_mode_needed.
	(ix86_avx_u128_mode_needed): Return AVX_U128_ClEAN for
	sibling_call.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr112891.c: New test.
	* gcc.target/i386/pr112891-2.c: New test.
---
 gcc/config/i386/i386.cc                    | 22 +++++++++++++---
 gcc/testsuite/gcc.target/i386/pr112891-2.c | 30 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr112891.c   | 29 +++++++++++++++++++++
 3 files changed, 78 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112891-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112891.c
  

Comments

Hongtao Liu Dec. 12, 2023, 5:30 a.m. UTC | #1
On Fri, Dec 8, 2023 at 10:17 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> If the function desn't clobber any sse registers or only clobber
> 128-bit part, then vzeroupper isn't issued before the function exit.
> the status not CLEAN but ANY after the function.
>
> Also for sibling_call, it's safe to issue an vzeroupper. Also there
> could be missing vzeroupper since there's no mode_exit for
> sibling_call_p.
>
> Compared to the patch in the PR, this patch add sibling_call part.
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk and backport?
Part of this has been approved  in the PR, and for the sibling_call
part, i think it should be reasonable.
So i'm going to commit the patch.
>
> gcc/ChangeLog:
>
>         PR target/112891
>         * config/i386/i386.cc (ix86_avx_u128_mode_after): Return
>         AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to
>         align with ix86_avx_u128_mode_needed.
>         (ix86_avx_u128_mode_needed): Return AVX_U128_ClEAN for
>         sibling_call.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr112891.c: New test.
>         * gcc.target/i386/pr112891-2.c: New test.
> ---
>  gcc/config/i386/i386.cc                    | 22 +++++++++++++---
>  gcc/testsuite/gcc.target/i386/pr112891-2.c | 30 ++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr112891.c   | 29 +++++++++++++++++++++
>  3 files changed, 78 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr112891-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr112891.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 7c5cab4e2c6..fe259cdb789 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -15038,8 +15038,12 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
>          vzeroupper if all SSE registers are clobbered.  */
>        const function_abi &abi = insn_callee_abi (insn);
>        if (vzeroupper_pattern (PATTERN (insn), VOIDmode)
> -         || !hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
> -                                    abi.mode_clobbers (V4DImode)))
> +         /* Should be safe to issue an vzeroupper before sibling_call_p.
> +            Also there not mode_exit for sibling_call, so there could be
> +            missing vzeroupper for that.  */
> +         || !(SIBLING_CALL_P (insn)
> +              || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
> +                                        abi.mode_clobbers (V4DImode))))
>         return AVX_U128_ANY;
>
>        return AVX_U128_CLEAN;
> @@ -15177,7 +15181,19 @@ ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
>        bool avx_upper_reg_found = false;
>        note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found);
>
> -      return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
> +      if (avx_upper_reg_found)
> +       return AVX_U128_DIRTY;
> +
> +      /* If the function desn't clobber any sse registers or only clobber
> +        128-bit part, Then vzeroupper isn't issued before the function exit.
> +        the status not CLEAN but ANY after the function.  */
> +      const function_abi &abi = insn_callee_abi (insn);
> +      if (!(SIBLING_CALL_P (insn)
> +           || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
> +                                     abi.mode_clobbers (V4DImode))))
> +       return AVX_U128_ANY;
> +
> +      return  AVX_U128_CLEAN;
>      }
>
>    /* Otherwise, return current mode.  Remember that if insn
> diff --git a/gcc/testsuite/gcc.target/i386/pr112891-2.c b/gcc/testsuite/gcc.target/i386/pr112891-2.c
> new file mode 100644
> index 00000000000..164c3985d50
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr112891-2.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx2 -O3" } */
> +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
> +
> +void
> +__attribute__((noinline))
> +bar (double* a)
> +{
> +  a[0] = 1.0;
> +  a[1] = 2.0;
> +}
> +
> +double
> +__attribute__((noinline))
> +foo (double* __restrict a, double* b)
> +{
> +  a[0] += b[0];
> +  a[1] += b[1];
> +  a[2] += b[2];
> +  a[3] += b[3];
> +  bar (b);
> +  return a[5] + b[5];
> +}
> +
> +double
> +foo1 (double* __restrict a, double* b)
> +{
> +  double c = foo (a, b);
> +  return __builtin_exp (c);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr112891.c b/gcc/testsuite/gcc.target/i386/pr112891.c
> new file mode 100644
> index 00000000000..dbf6c67948a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr112891.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx2 -O3" } */
> +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
> +
> +void
> +__attribute__((noinline))
> +bar (double* a)
> +{
> +  a[0] = 1.0;
> +  a[1] = 2.0;
> +}
> +
> +void
> +__attribute__((noinline))
> +foo (double* __restrict a, double* b)
> +{
> +  a[0] += b[0];
> +  a[1] += b[1];
> +  a[2] += b[2];
> +  a[3] += b[3];
> +  bar (b);
> +}
> +
> +double
> +foo1 (double* __restrict a, double* b)
> +{
> +  foo (a, b);
> +  return __builtin_exp (b[1]);
> +}
> --
> 2.31.1
>
  

Patch

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 7c5cab4e2c6..fe259cdb789 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -15038,8 +15038,12 @@  ix86_avx_u128_mode_needed (rtx_insn *insn)
 	 vzeroupper if all SSE registers are clobbered.  */
       const function_abi &abi = insn_callee_abi (insn);
       if (vzeroupper_pattern (PATTERN (insn), VOIDmode)
-	  || !hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
-				     abi.mode_clobbers (V4DImode)))
+	  /* Should be safe to issue an vzeroupper before sibling_call_p.
+	     Also there not mode_exit for sibling_call, so there could be
+	     missing vzeroupper for that.  */
+	  || !(SIBLING_CALL_P (insn)
+	       || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
+					 abi.mode_clobbers (V4DImode))))
 	return AVX_U128_ANY;
 
       return AVX_U128_CLEAN;
@@ -15177,7 +15181,19 @@  ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
       bool avx_upper_reg_found = false;
       note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found);
 
-      return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
+      if (avx_upper_reg_found)
+	return AVX_U128_DIRTY;
+
+      /* If the function desn't clobber any sse registers or only clobber
+	 128-bit part, Then vzeroupper isn't issued before the function exit.
+	 the status not CLEAN but ANY after the function.  */
+      const function_abi &abi = insn_callee_abi (insn);
+      if (!(SIBLING_CALL_P (insn)
+	    || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
+				      abi.mode_clobbers (V4DImode))))
+	return AVX_U128_ANY;
+
+      return  AVX_U128_CLEAN;
     }
 
   /* Otherwise, return current mode.  Remember that if insn
diff --git a/gcc/testsuite/gcc.target/i386/pr112891-2.c b/gcc/testsuite/gcc.target/i386/pr112891-2.c
new file mode 100644
index 00000000000..164c3985d50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112891-2.c
@@ -0,0 +1,30 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O3" } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
+
+void
+__attribute__((noinline))
+bar (double* a)
+{
+  a[0] = 1.0;
+  a[1] = 2.0;
+}
+
+double
+__attribute__((noinline))
+foo (double* __restrict a, double* b)
+{
+  a[0] += b[0];
+  a[1] += b[1];
+  a[2] += b[2];
+  a[3] += b[3];
+  bar (b);
+  return a[5] + b[5];
+}
+
+double
+foo1 (double* __restrict a, double* b)
+{
+  double c = foo (a, b);
+  return __builtin_exp (c);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr112891.c b/gcc/testsuite/gcc.target/i386/pr112891.c
new file mode 100644
index 00000000000..dbf6c67948a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112891.c
@@ -0,0 +1,29 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O3" } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
+
+void
+__attribute__((noinline))
+bar (double* a)
+{
+  a[0] = 1.0;
+  a[1] = 2.0;
+}
+
+void
+__attribute__((noinline))
+foo (double* __restrict a, double* b)
+{
+  a[0] += b[0];
+  a[1] += b[1];
+  a[2] += b[2];
+  a[3] += b[3];
+  bar (b);
+}
+
+double
+foo1 (double* __restrict a, double* b)
+{
+  foo (a, b);
+  return __builtin_exp (b[1]);
+}