[i386] Optimize v4si broadcast for noavx512vl.

Message ID 20220304022839.33024-1-hongtao.liu@intel.com
State New
Headers
Series [i386] Optimize v4si broadcast for noavx512vl. |

Commit Message

Liu, Hongtao March 4, 2022, 2:28 a.m. UTC
  This is an incremental patch based on [1]; it enables the optimization shown below:

-       vbroadcastss    .LC1(%rip), %xmm0
+       movl    $-45, %edx
+       vmovd   %edx, %xmm0
+       vpshufd $0, %xmm0, %xmm0

According to a microbenchmark, the new sequence is faster than a broadcast from memory.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591162.html

Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR target/104704
	* config/i386/sse.md (*vec_dupv4si): Add alternative $r and
	corresponding post_reload splitter.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr100865-8a.c: Adjust testcase.
	* gcc.target/i386/pr100865-8c.c: Ditto.
	* gcc.target/i386/pr100865-9c.c: Ditto.
---
 gcc/config/i386/sse.md                      | 41 ++++++++++++++++-----
 gcc/testsuite/gcc.target/i386/pr100865-8a.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr100865-8c.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr100865-9c.c |  2 +-
 4 files changed, 35 insertions(+), 12 deletions(-)
  

Comments

Hongtao Liu March 4, 2022, 2:30 a.m. UTC | #1
On Fri, Mar 4, 2022 at 10:29 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This is incremental patch based on [1], it enables optimization as below
>
> -       vbroadcastss    .LC1(%rip), %xmm0
> +       movl    $-45, %edx
> +       vmovd   %edx, %xmm0
> +       vpshufd $0, %xmm0, %xmm0
>
> According to microbenchmark, it's faster than broadcast from memory.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591162.html.
>
> Bootstrapped and regtest on x86_64-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/104704
>         * config/i386/sse.md (*vec_dupv4si): Add alternative $r and
>         corresponding post_reload splitter.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr100865-8a.c: Adjust testcase.
>         * gcc.target/i386/pr100865-8c.c: Ditto.
>         * gcc.target/i386/pr100865-9c.c: Ditto.
> ---
>  gcc/config/i386/sse.md                      | 41 ++++++++++++++++-----
>  gcc/testsuite/gcc.target/i386/pr100865-8a.c |  2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-8c.c |  2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-9c.c |  2 +-
>  4 files changed, 35 insertions(+), 12 deletions(-)
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 3066ea3734a..d124545aa5d 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -25121,20 +25121,43 @@ (define_insn "vec_dupv4sf"
>     (set_attr "mode" "V4SF")])
>
>  (define_insn "*vec_dupv4si"
> -  [(set (match_operand:V4SI 0 "register_operand"     "=v,v,x")
> +  [(set (match_operand:V4SI 0 "register_operand"     "=v,v,x,v")
>         (vec_duplicate:V4SI
> -         (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))]
> +         (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0,$r")))]
>    "TARGET_SSE"
>    "@
>     %vpshufd\t{$0, %1, %0|%0, %1, 0}
>     vbroadcastss\t{%1, %0|%0, %1}
> -   shufps\t{$0, %0, %0|%0, %0, 0}"
> -  [(set_attr "isa" "sse2,avx,noavx")
> -   (set_attr "type" "sselog1,ssemov,sselog1")
> -   (set_attr "length_immediate" "1,0,1")
> -   (set_attr "prefix_extra" "0,1,*")
> -   (set_attr "prefix" "maybe_vex,maybe_evex,orig")
> -   (set_attr "mode" "TI,V4SF,V4SF")])
> +   shufps\t{$0, %0, %0|%0, %0, 0}
> +   #"
> +  [(set_attr "isa" "sse2,avx,noavx,noavx512vl")
> +   (set_attr "type" "sselog1,ssemov,sselog1,sselog1")
> +   (set_attr "length_immediate" "1,0,1,1")
> +   (set_attr "prefix_extra" "0,1,*,0")
> +   (set_attr "prefix" "maybe_vex,maybe_evex,orig,maybe_vex")
> +   (set_attr "mode" "TI,V4SF,V4SF,TI")
> +   (set (attr "preferred_for_speed")
> +     (cond [(eq_attr "alternative" "3")
> +             (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
> +          ]
> +          (symbol_ref "true")))])
> +
> +(define_split
> +  [(set (match_operand:V4SI 0 "sse_reg_operand")
> +       (vec_duplicate:V4SI
> +         (match_operand:SI 1 "general_reg_operand")))]
> +  "TARGET_SSE && reload_completed
> +   /* Disable this splitter if avx512vl_vec_dup_gprv4si insn is
> +      available, because then we can broadcast from GPRs directly.  */
> +   && !TARGET_AVX512VL"
> +  [(const_int 0)]
> +{
> +  emit_insn (gen_vec_setv4si_0 (gen_lowpart (V4SImode, operands[0]),
> +                               CONST0_RTX (V4SImode),
> +                               gen_lowpart (SImode, operands[1])));
> +  emit_insn (gen_vec_duplicatev4si (operands[0], operands[0]));
> +  DONE;
> +})
>
>  (define_insn "*vec_dupv2di"
>    [(set (match_operand:V2DI 0 "register_operand"     "=x,v,v,x")
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8a.c b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> index 911b14d4a25..544a14db6f7 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> @@ -20,5 +20,5 @@ foo (void)
>      array[i] = MK_CONST128_BROADCAST_SIGNED (-45);
>  }
>
> -/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8c.c b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> index 00682edb8c9..efee0488614 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> @@ -3,5 +3,5 @@
>
>  #include "pr100865-8a.c"
>
> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9c.c b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> index 8ffcdc1629d..e6f25902c1d 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> @@ -3,5 +3,5 @@
>
>  #include "pr100865-9a.c"
>
> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> --
> 2.18.1
>
  
Uros Bizjak March 4, 2022, 1:02 p.m. UTC | #2
On Fri, Mar 4, 2022 at 3:28 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> This is incremental patch based on [1], it enables optimization as below
>
> -       vbroadcastss    .LC1(%rip), %xmm0
> +       movl    $-45, %edx
> +       vmovd   %edx, %xmm0
> +       vpshufd $0, %xmm0, %xmm0
>
> According to microbenchmark, it's faster than broadcast from memory.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591162.html.
>
> Bootstrapped and regtest on x86_64-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/104704
>         * config/i386/sse.md (*vec_dupv4si): Add alternative $r and
>         corresponding post_reload splitter.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr100865-8a.c: Adjust testcase.
>         * gcc.target/i386/pr100865-8c.c: Ditto.
>         * gcc.target/i386/pr100865-9c.c: Ditto.
> ---
>  gcc/config/i386/sse.md                      | 41 ++++++++++++++++-----
>  gcc/testsuite/gcc.target/i386/pr100865-8a.c |  2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-8c.c |  2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-9c.c |  2 +-
>  4 files changed, 35 insertions(+), 12 deletions(-)
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 3066ea3734a..d124545aa5d 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -25121,20 +25121,43 @@ (define_insn "vec_dupv4sf"
>     (set_attr "mode" "V4SF")])
>
>  (define_insn "*vec_dupv4si"
> -  [(set (match_operand:V4SI 0 "register_operand"     "=v,v,x")
> +  [(set (match_operand:V4SI 0 "register_operand"     "=v,v,x,v")
>         (vec_duplicate:V4SI
> -         (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))]
> +         (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0,$r")))]
>    "TARGET_SSE"
>    "@
>     %vpshufd\t{$0, %1, %0|%0, %1, 0}
>     vbroadcastss\t{%1, %0|%0, %1}
> -   shufps\t{$0, %0, %0|%0, %0, 0}"
> -  [(set_attr "isa" "sse2,avx,noavx")
> -   (set_attr "type" "sselog1,ssemov,sselog1")
> -   (set_attr "length_immediate" "1,0,1")
> -   (set_attr "prefix_extra" "0,1,*")
> -   (set_attr "prefix" "maybe_vex,maybe_evex,orig")
> -   (set_attr "mode" "TI,V4SF,V4SF")])
> +   shufps\t{$0, %0, %0|%0, %0, 0}
> +   #"
> +  [(set_attr "isa" "sse2,avx,noavx,noavx512vl")
> +   (set_attr "type" "sselog1,ssemov,sselog1,sselog1")
> +   (set_attr "length_immediate" "1,0,1,1")
> +   (set_attr "prefix_extra" "0,1,*,0")
> +   (set_attr "prefix" "maybe_vex,maybe_evex,orig,maybe_vex")
> +   (set_attr "mode" "TI,V4SF,V4SF,TI")
> +   (set (attr "preferred_for_speed")
> +     (cond [(eq_attr "alternative" "3")
> +             (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
> +          ]
> +          (symbol_ref "true")))])

What happens if you set preferred_for_speed to false for alternative 1?

> +(define_split
> +  [(set (match_operand:V4SI 0 "sse_reg_operand")
> +       (vec_duplicate:V4SI
> +         (match_operand:SI 1 "general_reg_operand")))]
> +  "TARGET_SSE && reload_completed
> +   /* Disable this splitter if avx512vl_vec_dup_gprv4si insn is
> +      available, because then we can broadcast from GPRs directly.  */

I think avx512vl_vec_dup_gprv4si should be merged with the above
pattern instead.

Uros.

> +   && !TARGET_AVX512VL"
> +  [(const_int 0)]
> +{
> +  emit_insn (gen_vec_setv4si_0 (gen_lowpart (V4SImode, operands[0]),
> +                               CONST0_RTX (V4SImode),
> +                               gen_lowpart (SImode, operands[1])));
> +  emit_insn (gen_vec_duplicatev4si (operands[0], operands[0]));
> +  DONE;
> +})
>
>  (define_insn "*vec_dupv2di"
>    [(set (match_operand:V2DI 0 "register_operand"     "=x,v,v,x")
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8a.c b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> index 911b14d4a25..544a14db6f7 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> @@ -20,5 +20,5 @@ foo (void)
>      array[i] = MK_CONST128_BROADCAST_SIGNED (-45);
>  }
>
> -/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8c.c b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> index 00682edb8c9..efee0488614 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> @@ -3,5 +3,5 @@
>
>  #include "pr100865-8a.c"
>
> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9c.c b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> index 8ffcdc1629d..e6f25902c1d 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> @@ -3,5 +3,5 @@
>
>  #include "pr100865-9a.c"
>
> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> --
> 2.18.1
>
  
Richard Biener March 4, 2022, 4:39 p.m. UTC | #3
> Am 04.03.2022 um 03:30 schrieb Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org>:
> 
> On Fri, Mar 4, 2022 at 10:29 AM liuhongt via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>> 
>> This is incremental patch based on [1], it enables optimization as below
>> 
>> -       vbroadcastss    .LC1(%rip), %xmm0
>> +       movl    $-45, %edx
>> +       vmovd   %edx, %xmm0
>> +       vpshufd $0, %xmm0, %xmm0
>> 
>> According to microbenchmark, it's faster than broadcast from memory

Is that true even on AMD uarchs?

>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591162.html.
>> 
>> Bootstrapped and regtest on x86_64-linux-gnu{-m32,}.
>> Ok for trunk?
>> 
>> gcc/ChangeLog:
>> 
>>        PR target/104704
>>        * config/i386/sse.md (*vec_dupv4si): Add alternative $r and
>>        corresponding post_reload splitter.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>>        * gcc.target/i386/pr100865-8a.c: Adjust testcase.
>>        * gcc.target/i386/pr100865-8c.c: Ditto.
>>        * gcc.target/i386/pr100865-9c.c: Ditto.
>> ---
>> gcc/config/i386/sse.md                      | 41 ++++++++++++++++-----
>> gcc/testsuite/gcc.target/i386/pr100865-8a.c |  2 +-
>> gcc/testsuite/gcc.target/i386/pr100865-8c.c |  2 +-
>> gcc/testsuite/gcc.target/i386/pr100865-9c.c |  2 +-
>> 4 files changed, 35 insertions(+), 12 deletions(-)
>> 
>> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>> index 3066ea3734a..d124545aa5d 100644
>> --- a/gcc/config/i386/sse.md
>> +++ b/gcc/config/i386/sse.md
>> @@ -25121,20 +25121,43 @@ (define_insn "vec_dupv4sf"
>>    (set_attr "mode" "V4SF")])
>> 
>> (define_insn "*vec_dupv4si"
>> -  [(set (match_operand:V4SI 0 "register_operand"     "=v,v,x")
>> +  [(set (match_operand:V4SI 0 "register_operand"     "=v,v,x,v")
>>        (vec_duplicate:V4SI
>> -         (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))]
>> +         (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0,$r")))]
>>   "TARGET_SSE"
>>   "@
>>    %vpshufd\t{$0, %1, %0|%0, %1, 0}
>>    vbroadcastss\t{%1, %0|%0, %1}
>> -   shufps\t{$0, %0, %0|%0, %0, 0}"
>> -  [(set_attr "isa" "sse2,avx,noavx")
>> -   (set_attr "type" "sselog1,ssemov,sselog1")
>> -   (set_attr "length_immediate" "1,0,1")
>> -   (set_attr "prefix_extra" "0,1,*")
>> -   (set_attr "prefix" "maybe_vex,maybe_evex,orig")
>> -   (set_attr "mode" "TI,V4SF,V4SF")])
>> +   shufps\t{$0, %0, %0|%0, %0, 0}
>> +   #"
>> +  [(set_attr "isa" "sse2,avx,noavx,noavx512vl")
>> +   (set_attr "type" "sselog1,ssemov,sselog1,sselog1")
>> +   (set_attr "length_immediate" "1,0,1,1")
>> +   (set_attr "prefix_extra" "0,1,*,0")
>> +   (set_attr "prefix" "maybe_vex,maybe_evex,orig,maybe_vex")
>> +   (set_attr "mode" "TI,V4SF,V4SF,TI")
>> +   (set (attr "preferred_for_speed")
>> +     (cond [(eq_attr "alternative" "3")
>> +             (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
>> +          ]
>> +          (symbol_ref "true")))])
>> +
>> +(define_split
>> +  [(set (match_operand:V4SI 0 "sse_reg_operand")
>> +       (vec_duplicate:V4SI
>> +         (match_operand:SI 1 "general_reg_operand")))]
>> +  "TARGET_SSE && reload_completed
>> +   /* Disable this splitter if avx512vl_vec_dup_gprv4si insn is
>> +      available, because then we can broadcast from GPRs directly.  */
>> +   && !TARGET_AVX512VL"
>> +  [(const_int 0)]
>> +{
>> +  emit_insn (gen_vec_setv4si_0 (gen_lowpart (V4SImode, operands[0]),
>> +                               CONST0_RTX (V4SImode),
>> +                               gen_lowpart (SImode, operands[1])));
>> +  emit_insn (gen_vec_duplicatev4si (operands[0], operands[0]));
>> +  DONE;
>> +})
>> 
>> (define_insn "*vec_dupv2di"
>>   [(set (match_operand:V2DI 0 "register_operand"     "=x,v,v,x")
>> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8a.c b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
>> index 911b14d4a25..544a14db6f7 100644
>> --- a/gcc/testsuite/gcc.target/i386/pr100865-8a.c
>> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
>> @@ -20,5 +20,5 @@ foo (void)
>>     array[i] = MK_CONST128_BROADCAST_SIGNED (-45);
>> }
>> 
>> -/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
>> +/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>> /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
>> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8c.c b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
>> index 00682edb8c9..efee0488614 100644
>> --- a/gcc/testsuite/gcc.target/i386/pr100865-8c.c
>> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
>> @@ -3,5 +3,5 @@
>> 
>> #include "pr100865-8a.c"
>> 
>> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
>> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>> /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
>> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9c.c b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
>> index 8ffcdc1629d..e6f25902c1d 100644
>> --- a/gcc/testsuite/gcc.target/i386/pr100865-9c.c
>> +++ b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
>> @@ -3,5 +3,5 @@
>> 
>> #include "pr100865-9a.c"
>> 
>> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
>> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>> /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
>> --
>> 2.18.1
>> 
> 
> 
> -- 
> BR,
> Hongtao
  
H.J. Lu March 4, 2022, 4:50 p.m. UTC | #4
On Fri, Mar 4, 2022 at 8:40 AM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
>
> > Am 04.03.2022 um 03:30 schrieb Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org>:
> >
> > On Fri, Mar 4, 2022 at 10:29 AM liuhongt via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> This is incremental patch based on [1], it enables optimization as below
> >>
> >> -       vbroadcastss    .LC1(%rip), %xmm0
> >> +       movl    $-45, %edx
> >> +       vmovd   %edx, %xmm0
> >> +       vpshufd $0, %xmm0, %xmm0
> >>
> >> According to microbenchmark, it's faster than broadcast from memory
>
> Is that true even on AMD uarchs?

Please check TARGET_INTER_UNIT_MOVES_TO_VEC.

> >> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591162.html.
> >>
> >> Bootstrapped and regtest on x86_64-linux-gnu{-m32,}.
> >> Ok for trunk?
> >>
> >> gcc/ChangeLog:
> >>
> >>        PR target/104704
> >>        * config/i386/sse.md (*vec_dupv4si): Add alternative $r and
> >>        corresponding post_reload splitter.
> >>
> >> gcc/testsuite/ChangeLog:
> >>
> >>        * gcc.target/i386/pr100865-8a.c: Adjust testcase.
> >>        * gcc.target/i386/pr100865-8c.c: Ditto.
> >>        * gcc.target/i386/pr100865-9c.c: Ditto.
> >> ---
> >> gcc/config/i386/sse.md                      | 41 ++++++++++++++++-----
> >> gcc/testsuite/gcc.target/i386/pr100865-8a.c |  2 +-
> >> gcc/testsuite/gcc.target/i386/pr100865-8c.c |  2 +-
> >> gcc/testsuite/gcc.target/i386/pr100865-9c.c |  2 +-
> >> 4 files changed, 35 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> >> index 3066ea3734a..d124545aa5d 100644
> >> --- a/gcc/config/i386/sse.md
> >> +++ b/gcc/config/i386/sse.md
> >> @@ -25121,20 +25121,43 @@ (define_insn "vec_dupv4sf"
> >>    (set_attr "mode" "V4SF")])
> >>
> >> (define_insn "*vec_dupv4si"
> >> -  [(set (match_operand:V4SI 0 "register_operand"     "=v,v,x")
> >> +  [(set (match_operand:V4SI 0 "register_operand"     "=v,v,x,v")
> >>        (vec_duplicate:V4SI
> >> -         (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))]
> >> +         (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0,$r")))]
> >>   "TARGET_SSE"
> >>   "@
> >>    %vpshufd\t{$0, %1, %0|%0, %1, 0}
> >>    vbroadcastss\t{%1, %0|%0, %1}
> >> -   shufps\t{$0, %0, %0|%0, %0, 0}"
> >> -  [(set_attr "isa" "sse2,avx,noavx")
> >> -   (set_attr "type" "sselog1,ssemov,sselog1")
> >> -   (set_attr "length_immediate" "1,0,1")
> >> -   (set_attr "prefix_extra" "0,1,*")
> >> -   (set_attr "prefix" "maybe_vex,maybe_evex,orig")
> >> -   (set_attr "mode" "TI,V4SF,V4SF")])
> >> +   shufps\t{$0, %0, %0|%0, %0, 0}
> >> +   #"
> >> +  [(set_attr "isa" "sse2,avx,noavx,noavx512vl")
> >> +   (set_attr "type" "sselog1,ssemov,sselog1,sselog1")
> >> +   (set_attr "length_immediate" "1,0,1,1")
> >> +   (set_attr "prefix_extra" "0,1,*,0")
> >> +   (set_attr "prefix" "maybe_vex,maybe_evex,orig,maybe_vex")
> >> +   (set_attr "mode" "TI,V4SF,V4SF,TI")
> >> +   (set (attr "preferred_for_speed")
> >> +     (cond [(eq_attr "alternative" "3")
> >> +             (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
> >> +          ]
> >> +          (symbol_ref "true")))])
> >> +
> >> +(define_split
> >> +  [(set (match_operand:V4SI 0 "sse_reg_operand")
> >> +       (vec_duplicate:V4SI
> >> +         (match_operand:SI 1 "general_reg_operand")))]
> >> +  "TARGET_SSE && reload_completed
> >> +   /* Disable this splitter if avx512vl_vec_dup_gprv4si insn is
> >> +      available, because then we can broadcast from GPRs directly.  */
> >> +   && !TARGET_AVX512VL"
> >> +  [(const_int 0)]
> >> +{
> >> +  emit_insn (gen_vec_setv4si_0 (gen_lowpart (V4SImode, operands[0]),
> >> +                               CONST0_RTX (V4SImode),
> >> +                               gen_lowpart (SImode, operands[1])));
> >> +  emit_insn (gen_vec_duplicatev4si (operands[0], operands[0]));
> >> +  DONE;
> >> +})
> >>
> >> (define_insn "*vec_dupv2di"
> >>   [(set (match_operand:V2DI 0 "register_operand"     "=x,v,v,x")
> >> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8a.c b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> >> index 911b14d4a25..544a14db6f7 100644
> >> --- a/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> >> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> >> @@ -20,5 +20,5 @@ foo (void)
> >>     array[i] = MK_CONST128_BROADCAST_SIGNED (-45);
> >> }
> >>
> >> -/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> >> +/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> >> /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> >> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8c.c b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> >> index 00682edb8c9..efee0488614 100644
> >> --- a/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> >> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> >> @@ -3,5 +3,5 @@
> >>
> >> #include "pr100865-8a.c"
> >>
> >> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> >> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> >> /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> >> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9c.c b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> >> index 8ffcdc1629d..e6f25902c1d 100644
> >> --- a/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> >> +++ b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> >> @@ -3,5 +3,5 @@
> >>
> >> #include "pr100865-9a.c"
> >>
> >> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> >> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> >> /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> >> --
> >> 2.18.1
> >>
> >
> >
> > --
> > BR,
> > Hongtao
  

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3066ea3734a..d124545aa5d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -25121,20 +25121,43 @@  (define_insn "vec_dupv4sf"
    (set_attr "mode" "V4SF")])
 
 (define_insn "*vec_dupv4si"
-  [(set (match_operand:V4SI 0 "register_operand"     "=v,v,x")
+  [(set (match_operand:V4SI 0 "register_operand"     "=v,v,x,v")
 	(vec_duplicate:V4SI
-	  (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))]
+	  (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0,$r")))]
   "TARGET_SSE"
   "@
    %vpshufd\t{$0, %1, %0|%0, %1, 0}
    vbroadcastss\t{%1, %0|%0, %1}
-   shufps\t{$0, %0, %0|%0, %0, 0}"
-  [(set_attr "isa" "sse2,avx,noavx")
-   (set_attr "type" "sselog1,ssemov,sselog1")
-   (set_attr "length_immediate" "1,0,1")
-   (set_attr "prefix_extra" "0,1,*")
-   (set_attr "prefix" "maybe_vex,maybe_evex,orig")
-   (set_attr "mode" "TI,V4SF,V4SF")])
+   shufps\t{$0, %0, %0|%0, %0, 0}
+   #"
+  [(set_attr "isa" "sse2,avx,noavx,noavx512vl")
+   (set_attr "type" "sselog1,ssemov,sselog1,sselog1")
+   (set_attr "length_immediate" "1,0,1,1")
+   (set_attr "prefix_extra" "0,1,*,0")
+   (set_attr "prefix" "maybe_vex,maybe_evex,orig,maybe_vex")
+   (set_attr "mode" "TI,V4SF,V4SF,TI")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "3")
+	      (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
+	   (symbol_ref "true")))])
+
+(define_split
+  [(set (match_operand:V4SI 0 "sse_reg_operand")
+	(vec_duplicate:V4SI
+	  (match_operand:SI 1 "general_reg_operand")))]
+  "TARGET_SSE && reload_completed
+   /* Disable this splitter if avx512vl_vec_dup_gprv4si insn is
+      available, because then we can broadcast from GPRs directly.  */
+   && !TARGET_AVX512VL"
+  [(const_int 0)]
+{
+  emit_insn (gen_vec_setv4si_0 (gen_lowpart (V4SImode, operands[0]),
+				CONST0_RTX (V4SImode),
+				gen_lowpart (SImode, operands[1])));
+  emit_insn (gen_vec_duplicatev4si (operands[0], operands[0]));
+  DONE;
+})
 
 (define_insn "*vec_dupv2di"
   [(set (match_operand:V2DI 0 "register_operand"     "=x,v,v,x")
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8a.c b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
index 911b14d4a25..544a14db6f7 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-8a.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
@@ -20,5 +20,5 @@  foo (void)
     array[i] = MK_CONST128_BROADCAST_SIGNED (-45);
 }
 
-/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8c.c b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
index 00682edb8c9..efee0488614 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-8c.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
@@ -3,5 +3,5 @@ 
 
 #include "pr100865-8a.c"
 
-/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9c.c b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
index 8ffcdc1629d..e6f25902c1d 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-9c.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
@@ -3,5 +3,5 @@ 
 
 #include "pr100865-9a.c"
 
-/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */