Split vector load from parm_decl to elemental loads to avoid STLF stalls.

Message ID 20220401064634.16091-1-hongtao.liu@intel.com
State New
Headers
Series Split vector load from parm_decl to elemental loads to avoid STLF stalls. |

Commit Message

Liu, Hongtao April 1, 2022, 6:46 a.m. UTC
  Update in V2:
1. Use get_insns instead of FOR_EACH_BB_CFUN and FOR_BB_INSNS.
2. Return for any_uncondjump_p and ANY_RETURN_P.
3. Add dump info for splitting instruction.
4. Restrict ix86_split_stlf_stall_load under TARGET_SSE2.

Since the CFG is freed before machine_reorg, the window can only be
estimated roughly from the instruction layout.
Based on an experiment on CLX, the window size is set to 64.

Currently only V2DFmode loads are handled, since they don't need any
scratch registers, and this is sufficient to recover cray performance
at -O2 compared to GCC 11.

gcc/ChangeLog:

	PR target/101908
	* config/i386/i386.cc (ix86_split_stlf_stall_load): New
	function.
	(ix86_reorg): Call ix86_split_stlf_stall_load.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr101908-1.c: New test.
	* gcc.target/i386/pr101908-2.c: New test.
---
 gcc/config/i386/i386.cc                    | 60 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++
 gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++
 3 files changed, 84 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
  

Comments

Richard Biener April 1, 2022, 6:53 a.m. UTC | #1
On Fri, Apr 1, 2022 at 8:47 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Update in V2:
> 1. Use get_insns instead of FOR_EACH_BB_CFUN and FOR_BB_INSNS.
> 2. Return for any_uncondjump_p and ANY_RETURN_P.
> 3. Add dump info for spliting instruction.
> 4. Restrict ix86_split_stlf_stall_load under TARGET_SSE2.
>
> Since cfg is freed before machine_reorg, just do a rough calculation
> of the window according to the layout.
> Also according to an experiment on CLX, set window size to 64.
>
> Currently only handle V2DFmode load since it doesn't need any scratch
> registers, and it's sufficient to recover cray performance for -O2
> compared to GCC11.
>
> gcc/ChangeLog:
>
>         PR target/101908
>         * config/i386/i386.cc (ix86_split_stlf_stall_load): New
>         function
>         (ix86_reorg): Call ix86_split_stlf_stall_load.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr101908-1.c: New test.
>         * gcc.target/i386/pr101908-2.c: New test.
> ---
>  gcc/config/i386/i386.cc                    | 60 ++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++
>  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++
>  3 files changed, 84 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 5a561966eb4..c88a689f32b 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -21933,6 +21933,64 @@ ix86_seh_fixup_eh_fallthru (void)
>        emit_insn_after (gen_nops (const1_rtx), insn);
>      }
>  }
> +/* Split vector load from parm_decl to elemental loads to avoid STLF
> +   stalls.  */
> +static void
> +ix86_split_stlf_stall_load ()
> +{
> +  rtx_insn* insn, *start = get_insns ();
> +  unsigned window = 0;
> +
> +  for (insn = start; insn; insn = NEXT_INSN (insn))
> +    {
> +      if (!NONDEBUG_INSN_P (insn))
> +       continue;
> +      window++;
> +      /* Insert 64 vaddps %xmm18, %xmm19, %xmm20(no dependence between each
> +        other, just emulate for pipeline) before stalled load, stlf stall
> +        case is as fast as no stall cases on CLX.
> +        Since CFG is freed before machine_reorg, just do a rough
> +        calculation of the window according to the layout.  */
> +      if (window > 64)

I think we want to turn the '64' into a --param at least.  You can add

-param=x86-stlf-window-ninsns=

into i386.opt (see -param= examples in aarch64/ for example).

> +       return;
> +
> +      if (any_uncondjump_p (insn)
> +         || ANY_RETURN_P (PATTERN (insn)))

You made a point about calls - does any_uncondjump_p cover them?

otherwise I think this is fine, Honza, do you agree?

Thanks,
Richard.

> +       return;
> +
> +      rtx set = single_set (insn);
> +      if (!set)
> +       continue;
> +      rtx src = SET_SRC (set);
> +      if (!MEM_P (src)
> +         /* Only handle V2DFmode load since it doesn't need any scratch
> +            register.  */
> +         || GET_MODE (src) != E_V2DFmode
> +         || !MEM_EXPR (src)
> +         || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)
> +       continue;
> +
> +      rtx zero = CONST0_RTX (V2DFmode);
> +      rtx dest = SET_DEST (set);
> +      rtx m = adjust_address (src, DFmode, 0);
> +      rtx loadlpd = gen_sse2_loadlpd (dest, zero, m);
> +      emit_insn_before (loadlpd, insn);
> +      m = adjust_address (src, DFmode, 8);
> +      rtx loadhpd = gen_sse2_loadhpd (dest, dest, m);
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       {
> +         fputs ("Due to potential STLF stall, split instruction:\n",
> +                dump_file);
> +         print_rtl_single (dump_file, insn);
> +         fputs ("To:\n", dump_file);
> +         print_rtl_single (dump_file, loadlpd);
> +         print_rtl_single (dump_file, loadhpd);
> +       }
> +      PATTERN (insn) = loadhpd;
> +      INSN_CODE (insn) = -1;
> +      gcc_assert (recog_memoized (insn) != -1);
> +    }
> +}
>
>  /* Implement machine specific optimizations.  We implement padding of returns
>     for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
> @@ -21948,6 +22006,8 @@ ix86_reorg (void)
>
>    if (optimize && optimize_function_for_speed_p (cfun))
>      {
> +      if (TARGET_SSE2)
> +       ix86_split_stlf_stall_load ();
>        if (TARGET_PAD_SHORT_FUNCTION)
>         ix86_pad_short_function ();
>        else if (TARGET_PAD_RETURNS)
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> new file mode 100644
> index 00000000000..33d9684f0ad
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
> +
> +struct X { double x[2]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X* x, struct X* y)
> +{
> +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> new file mode 100644
> index 00000000000..45060b73c06
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
> +
> +struct X { double x[4]; };
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df __attribute__((noipa))
> +foo (struct X x, struct X y)
> +{
> +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> +}
> --
> 2.18.1
>
  
Hongtao Liu April 1, 2022, 7:14 a.m. UTC | #2
On Fri, Apr 1, 2022 at 2:54 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Fri, Apr 1, 2022 at 8:47 AM liuhongt via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Update in V2:
> > 1. Use get_insns instead of FOR_EACH_BB_CFUN and FOR_BB_INSNS.
> > 2. Return for any_uncondjump_p and ANY_RETURN_P.
> > 3. Add dump info for spliting instruction.
> > 4. Restrict ix86_split_stlf_stall_load under TARGET_SSE2.
> >
> > Since cfg is freed before machine_reorg, just do a rough calculation
> > of the window according to the layout.
> > Also according to an experiment on CLX, set window size to 64.
> >
> > Currently only handle V2DFmode load since it doesn't need any scratch
> > registers, and it's sufficient to recover cray performance for -O2
> > compared to GCC11.
> >
> > gcc/ChangeLog:
> >
> >         PR target/101908
> >         * config/i386/i386.cc (ix86_split_stlf_stall_load): New
> >         function
> >         (ix86_reorg): Call ix86_split_stlf_stall_load.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr101908-1.c: New test.
> >         * gcc.target/i386/pr101908-2.c: New test.
> > ---
> >  gcc/config/i386/i386.cc                    | 60 ++++++++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++
> >  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++
> >  3 files changed, 84 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index 5a561966eb4..c88a689f32b 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -21933,6 +21933,64 @@ ix86_seh_fixup_eh_fallthru (void)
> >        emit_insn_after (gen_nops (const1_rtx), insn);
> >      }
> >  }
> > +/* Split vector load from parm_decl to elemental loads to avoid STLF
> > +   stalls.  */
> > +static void
> > +ix86_split_stlf_stall_load ()
> > +{
> > +  rtx_insn* insn, *start = get_insns ();
> > +  unsigned window = 0;
> > +
> > +  for (insn = start; insn; insn = NEXT_INSN (insn))
> > +    {
> > +      if (!NONDEBUG_INSN_P (insn))
> > +       continue;
> > +      window++;
> > +      /* Insert 64 vaddps %xmm18, %xmm19, %xmm20(no dependence between each
> > +        other, just emulate for pipeline) before stalled load, stlf stall
> > +        case is as fast as no stall cases on CLX.
> > +        Since CFG is freed before machine_reorg, just do a rough
> > +        calculation of the window according to the layout.  */
> > +      if (window > 64)
>
> I think we want to turn the '64' into a --param at least.  You can add
>
> -param=x86-stlf-window-ninsns=
>
> into i386.opt (see -param= examples in aarch64/ for example).
Sure.
>
> > +       return;
> > +
> > +      if (any_uncondjump_p (insn)
> > +         || ANY_RETURN_P (PATTERN (insn)))
>
> You made a point about calls - does any_uncondjump_p cover them?
>
No, I prefer excluding calls which could take sufficient time to
compensate for the STLF stall.
> otherwise I think this is fine, Honza, do you agree?
>
> Thanks,
> Richard.
>
> > +       return;
> > +
> > +      rtx set = single_set (insn);
> > +      if (!set)
> > +       continue;
> > +      rtx src = SET_SRC (set);
> > +      if (!MEM_P (src)
> > +         /* Only handle V2DFmode load since it doesn't need any scratch
> > +            register.  */
> > +         || GET_MODE (src) != E_V2DFmode
> > +         || !MEM_EXPR (src)
> > +         || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)
> > +       continue;
> > +
> > +      rtx zero = CONST0_RTX (V2DFmode);
> > +      rtx dest = SET_DEST (set);
> > +      rtx m = adjust_address (src, DFmode, 0);
> > +      rtx loadlpd = gen_sse2_loadlpd (dest, zero, m);
> > +      emit_insn_before (loadlpd, insn);
> > +      m = adjust_address (src, DFmode, 8);
> > +      rtx loadhpd = gen_sse2_loadhpd (dest, dest, m);
> > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > +       {
> > +         fputs ("Due to potential STLF stall, split instruction:\n",
> > +                dump_file);
> > +         print_rtl_single (dump_file, insn);
> > +         fputs ("To:\n", dump_file);
> > +         print_rtl_single (dump_file, loadlpd);
> > +         print_rtl_single (dump_file, loadhpd);
> > +       }
> > +      PATTERN (insn) = loadhpd;
> > +      INSN_CODE (insn) = -1;
> > +      gcc_assert (recog_memoized (insn) != -1);
> > +    }
> > +}
> >
> >  /* Implement machine specific optimizations.  We implement padding of returns
> >     for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
> > @@ -21948,6 +22006,8 @@ ix86_reorg (void)
> >
> >    if (optimize && optimize_function_for_speed_p (cfun))
> >      {
> > +      if (TARGET_SSE2)
> > +       ix86_split_stlf_stall_load ();
> >        if (TARGET_PAD_SHORT_FUNCTION)
> >         ix86_pad_short_function ();
> >        else if (TARGET_PAD_RETURNS)
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > new file mode 100644
> > index 00000000000..33d9684f0ad
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -msse2 -mno-avx" } */
> > +/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
> > +
> > +struct X { double x[2]; };
> > +typedef double v2df __attribute__((vector_size(16)));
> > +
> > +v2df __attribute__((noipa))
> > +foo (struct X* x, struct X* y)
> > +{
> > +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > new file mode 100644
> > index 00000000000..45060b73c06
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -msse2 -mno-avx" } */
> > +/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
> > +
> > +struct X { double x[4]; };
> > +typedef double v2df __attribute__((vector_size(16)));
> > +
> > +v2df __attribute__((noipa))
> > +foo (struct X x, struct X y)
> > +{
> > +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> > +}
> > --
> > 2.18.1
> >
  
Richard Biener April 1, 2022, 7:20 a.m. UTC | #3
On Fri, Apr 1, 2022 at 9:14 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Apr 1, 2022 at 2:54 PM Richard Biener via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Fri, Apr 1, 2022 at 8:47 AM liuhongt via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > Update in V2:
> > > 1. Use get_insns instead of FOR_EACH_BB_CFUN and FOR_BB_INSNS.
> > > 2. Return for any_uncondjump_p and ANY_RETURN_P.
> > > 3. Add dump info for spliting instruction.
> > > 4. Restrict ix86_split_stlf_stall_load under TARGET_SSE2.
> > >
> > > Since cfg is freed before machine_reorg, just do a rough calculation
> > > of the window according to the layout.
> > > Also according to an experiment on CLX, set window size to 64.
> > >
> > > Currently only handle V2DFmode load since it doesn't need any scratch
> > > registers, and it's sufficient to recover cray performance for -O2
> > > compared to GCC11.
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR target/101908
> > >         * config/i386/i386.cc (ix86_split_stlf_stall_load): New
> > >         function
> > >         (ix86_reorg): Call ix86_split_stlf_stall_load.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/i386/pr101908-1.c: New test.
> > >         * gcc.target/i386/pr101908-2.c: New test.
> > > ---
> > >  gcc/config/i386/i386.cc                    | 60 ++++++++++++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++
> > >  gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++
> > >  3 files changed, 84 insertions(+)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c
> > >
> > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > index 5a561966eb4..c88a689f32b 100644
> > > --- a/gcc/config/i386/i386.cc
> > > +++ b/gcc/config/i386/i386.cc
> > > @@ -21933,6 +21933,64 @@ ix86_seh_fixup_eh_fallthru (void)
> > >        emit_insn_after (gen_nops (const1_rtx), insn);
> > >      }
> > >  }
> > > +/* Split vector load from parm_decl to elemental loads to avoid STLF
> > > +   stalls.  */
> > > +static void
> > > +ix86_split_stlf_stall_load ()
> > > +{
> > > +  rtx_insn* insn, *start = get_insns ();
> > > +  unsigned window = 0;
> > > +
> > > +  for (insn = start; insn; insn = NEXT_INSN (insn))
> > > +    {
> > > +      if (!NONDEBUG_INSN_P (insn))
> > > +       continue;
> > > +      window++;
> > > +      /* Insert 64 vaddps %xmm18, %xmm19, %xmm20(no dependence between each
> > > +        other, just emulate for pipeline) before stalled load, stlf stall
> > > +        case is as fast as no stall cases on CLX.
> > > +        Since CFG is freed before machine_reorg, just do a rough
> > > +        calculation of the window according to the layout.  */
> > > +      if (window > 64)
> >
> > I think we want to turn the '64' into a --param at least.  You can add
> >
> > -param=x86-stlf-window-ninsns=
> >
> > into i386.opt (see -param= examples in aarch64/ for example).
> Sure.
> >
> > > +       return;
> > > +
> > > +      if (any_uncondjump_p (insn)
> > > +         || ANY_RETURN_P (PATTERN (insn)))
> >
> > You made a point about calls - does any_uncondjump_p cover them?
> >
> No, I prefer excluding calls which could take sufficient time to
> compensate for the STLF stall.

So I guess CALL_P (insn) could check for them, I agree we can stop looking
at calls.

> > otherwise I think this is fine, Honza, do you agree?
> >
> > Thanks,
> > Richard.
> >
> > > +       return;
> > > +
> > > +      rtx set = single_set (insn);
> > > +      if (!set)
> > > +       continue;
> > > +      rtx src = SET_SRC (set);
> > > +      if (!MEM_P (src)
> > > +         /* Only handle V2DFmode load since it doesn't need any scratch
> > > +            register.  */
> > > +         || GET_MODE (src) != E_V2DFmode
> > > +         || !MEM_EXPR (src)
> > > +         || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)
> > > +       continue;
> > > +
> > > +      rtx zero = CONST0_RTX (V2DFmode);
> > > +      rtx dest = SET_DEST (set);
> > > +      rtx m = adjust_address (src, DFmode, 0);
> > > +      rtx loadlpd = gen_sse2_loadlpd (dest, zero, m);
> > > +      emit_insn_before (loadlpd, insn);
> > > +      m = adjust_address (src, DFmode, 8);
> > > +      rtx loadhpd = gen_sse2_loadhpd (dest, dest, m);
> > > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > > +       {
> > > +         fputs ("Due to potential STLF stall, split instruction:\n",
> > > +                dump_file);
> > > +         print_rtl_single (dump_file, insn);
> > > +         fputs ("To:\n", dump_file);
> > > +         print_rtl_single (dump_file, loadlpd);
> > > +         print_rtl_single (dump_file, loadhpd);
> > > +       }
> > > +      PATTERN (insn) = loadhpd;
> > > +      INSN_CODE (insn) = -1;
> > > +      gcc_assert (recog_memoized (insn) != -1);
> > > +    }
> > > +}
> > >
> > >  /* Implement machine specific optimizations.  We implement padding of returns
> > >     for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
> > > @@ -21948,6 +22006,8 @@ ix86_reorg (void)
> > >
> > >    if (optimize && optimize_function_for_speed_p (cfun))
> > >      {
> > > +      if (TARGET_SSE2)
> > > +       ix86_split_stlf_stall_load ();
> > >        if (TARGET_PAD_SHORT_FUNCTION)
> > >         ix86_pad_short_function ();
> > >        else if (TARGET_PAD_RETURNS)
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > > new file mode 100644
> > > index 00000000000..33d9684f0ad
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
> > > @@ -0,0 +1,12 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -msse2 -mno-avx" } */
> > > +/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
> > > +
> > > +struct X { double x[2]; };
> > > +typedef double v2df __attribute__((vector_size(16)));
> > > +
> > > +v2df __attribute__((noipa))
> > > +foo (struct X* x, struct X* y)
> > > +{
> > > +  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > > new file mode 100644
> > > index 00000000000..45060b73c06
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
> > > @@ -0,0 +1,12 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -msse2 -mno-avx" } */
> > > +/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
> > > +
> > > +struct X { double x[4]; };
> > > +typedef double v2df __attribute__((vector_size(16)));
> > > +
> > > +v2df __attribute__((noipa))
> > > +foo (struct X x, struct X y)
> > > +{
> > > +  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
> > > +}
> > > --
> > > 2.18.1
> > >
>
>
>
> --
> BR,
> Hongtao
  

Patch

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 5a561966eb4..c88a689f32b 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21933,6 +21933,64 @@  ix86_seh_fixup_eh_fallthru (void)
       emit_insn_after (gen_nops (const1_rtx), insn);
     }
 }
+/* Split vector load from parm_decl to elemental loads to avoid STLF
+   stalls.  */
+static void
+ix86_split_stlf_stall_load ()
+{
+  rtx_insn* insn, *start = get_insns ();
+  unsigned window = 0;
+
+  for (insn = start; insn; insn = NEXT_INSN (insn))
+    {
+      if (!NONDEBUG_INSN_P (insn))
+	continue;
+      window++;
+      /* Insert 64 vaddps %xmm18, %xmm19, %xmm20(no dependence between each
+	 other, just emulate for pipeline) before stalled load, stlf stall
+	 case is as fast as no stall cases on CLX.
+	 Since CFG is freed before machine_reorg, just do a rough
+	 calculation of the window according to the layout.  */
+      if (window > 64)
+	return;
+
+      if (any_uncondjump_p (insn)
+	  || ANY_RETURN_P (PATTERN (insn)))
+	return;
+
+      rtx set = single_set (insn);
+      if (!set)
+	continue;
+      rtx src = SET_SRC (set);
+      if (!MEM_P (src)
+	  /* Only handle V2DFmode load since it doesn't need any scratch
+	     register.  */
+	  || GET_MODE (src) != E_V2DFmode
+	  || !MEM_EXPR (src)
+	  || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)
+	continue;
+
+      rtx zero = CONST0_RTX (V2DFmode);
+      rtx dest = SET_DEST (set);
+      rtx m = adjust_address (src, DFmode, 0);
+      rtx loadlpd = gen_sse2_loadlpd (dest, zero, m);
+      emit_insn_before (loadlpd, insn);
+      m = adjust_address (src, DFmode, 8);
+      rtx loadhpd = gen_sse2_loadhpd (dest, dest, m);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fputs ("Due to potential STLF stall, split instruction:\n",
+		 dump_file);
+	  print_rtl_single (dump_file, insn);
+	  fputs ("To:\n", dump_file);
+	  print_rtl_single (dump_file, loadlpd);
+	  print_rtl_single (dump_file, loadhpd);
+	}
+      PATTERN (insn) = loadhpd;
+      INSN_CODE (insn) = -1;
+      gcc_assert (recog_memoized (insn) != -1);
+    }
+}
 
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
@@ -21948,6 +22006,8 @@  ix86_reorg (void)
 
   if (optimize && optimize_function_for_speed_p (cfun))
     {
+      if (TARGET_SSE2)
+	ix86_split_stlf_stall_load ();
       if (TARGET_PAD_SHORT_FUNCTION)
 	ix86_pad_short_function ();
       else if (TARGET_PAD_RETURNS)
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
new file mode 100644
index 00000000000..33d9684f0ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
+
+struct X { double x[2]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X* x, struct X* y)
+{
+  return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
new file mode 100644
index 00000000000..45060b73c06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } }  */
+
+struct X { double x[4]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+  return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}