x86: Don't align destination for a single instruction

Message ID 20250914191444.3965672-1-hjl.tools@gmail.com
State New
Headers
Series x86: Don't align destination for a single instruction |

Commit Message

H.J. Lu Sept. 14, 2025, 7:14 p.m. UTC
  If a single instruction can store or move the whole block of memory, use
a vector instruction and don't align the destination.

gcc/

	PR target/121934
	* config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): If a
	single instruction can store or move the whole block of memory,
	use vector instruction and don't align destination.

gcc/testsuite/

	PR target/121934
	* gcc.target/i386/pr121934-1a.c: New test.
	* gcc.target/i386/pr121934-1b.c: Likewise.
	* gcc.target/i386/pr121934-2a.c: Likewise.
	* gcc.target/i386/pr121934-2b.c: Likewise.
	* gcc.target/i386/pr121934-3a.c: Likewise.
	* gcc.target/i386/pr121934-3b.c: Likewise.
	* gcc.target/i386/pr121934-4a.c: Likewise.
	* gcc.target/i386/pr121934-4b.c: Likewise.
	* gcc.target/i386/pr121934-5a.c: Likewise.
	* gcc.target/i386/pr121934-5b.c: Likewise.

Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
---
 gcc/config/i386/i386-expand.cc              | 62 +++++++++++++--------
 gcc/testsuite/gcc.target/i386/pr121934-1a.c | 22 ++++++++
 gcc/testsuite/gcc.target/i386/pr121934-1b.c |  7 +++
 gcc/testsuite/gcc.target/i386/pr121934-2a.c | 23 ++++++++
 gcc/testsuite/gcc.target/i386/pr121934-2b.c |  7 +++
 gcc/testsuite/gcc.target/i386/pr121934-3a.c | 23 ++++++++
 gcc/testsuite/gcc.target/i386/pr121934-3b.c |  7 +++
 gcc/testsuite/gcc.target/i386/pr121934-4a.c | 23 ++++++++
 gcc/testsuite/gcc.target/i386/pr121934-4b.c |  7 +++
 gcc/testsuite/gcc.target/i386/pr121934-5a.c | 23 ++++++++
 gcc/testsuite/gcc.target/i386/pr121934-5b.c |  7 +++
 11 files changed, 187 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5b.c
  

Comments

Uros Bizjak Sept. 15, 2025, 5:57 a.m. UTC | #1
On Sun, Sep 14, 2025 at 9:14 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> If a single instruction can store or move the whole block of memory, use
> vector instruction and don't align destination.
>
> gcc/
>
>         PR target/121934
>         * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): If a
>         single instruction can store or move the whole block of memory,
>         use vector instruction and don't align destination.
>
> gcc/testsuite/
>
>         PR target/121934
>         * gcc.target/i386/pr121934-1a.c: New test.
>         * gcc.target/i386/pr121934-1b.c: Likewise.
>         * gcc.target/i386/pr121934-2a.c: Likewise.
>         * gcc.target/i386/pr121934-2b.c: Likewise.
>         * gcc.target/i386/pr121934-3a.c: Likewise.
>         * gcc.target/i386/pr121934-3b.c: Likewise.
>         * gcc.target/i386/pr121934-4a.c: Likewise.
>         * gcc.target/i386/pr121934-4b.c: Likewise.
>         * gcc.target/i386/pr121934-5a.c: Likewise.
>         * gcc.target/i386/pr121934-5b.c: Likewise.

OK.

Thanks,
Uros.

>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
>  gcc/config/i386/i386-expand.cc              | 62 +++++++++++++--------
>  gcc/testsuite/gcc.target/i386/pr121934-1a.c | 22 ++++++++
>  gcc/testsuite/gcc.target/i386/pr121934-1b.c |  7 +++
>  gcc/testsuite/gcc.target/i386/pr121934-2a.c | 23 ++++++++
>  gcc/testsuite/gcc.target/i386/pr121934-2b.c |  7 +++
>  gcc/testsuite/gcc.target/i386/pr121934-3a.c | 23 ++++++++
>  gcc/testsuite/gcc.target/i386/pr121934-3b.c |  7 +++
>  gcc/testsuite/gcc.target/i386/pr121934-4a.c | 23 ++++++++
>  gcc/testsuite/gcc.target/i386/pr121934-4b.c |  7 +++
>  gcc/testsuite/gcc.target/i386/pr121934-5a.c | 23 ++++++++
>  gcc/testsuite/gcc.target/i386/pr121934-5b.c |  7 +++
>  11 files changed, 187 insertions(+), 24 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5b.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index dc26b3452cb..b0b9e6da946 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -9552,9 +9552,20 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
>    if (!issetmem)
>      srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
>
> +  bool aligned_dstmem = false;
> +  unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
> +  bool single_insn_p = count && count <= nunits;
> +  if (single_insn_p)
> +    {
> +      /* If it can be done with a single instruction, use vector
> +        instruction and don't align destination.  */
> +      alg = vector_loop;
> +      noalign = true;
> +      dynamic_check = -1;
> +    }
> +
>    unroll_factor = 1;
>    move_mode = word_mode;
> -  int nunits;
>    switch (alg)
>      {
>      case libcall:
> @@ -9576,7 +9587,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
>        need_zero_guard = true;
>        unroll_factor = 4;
>        /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes.  */
> -      nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
>        nunits /= GET_MODE_SIZE (word_mode);
>        if (nunits > 1)
>         {
> @@ -9629,28 +9639,32 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
>      }
>    gcc_assert (desired_align >= 1 && align >= 1);
>
> -  /* Misaligned move sequences handle both prologue and epilogue at once.
> -     Default code generation results in a smaller code for large alignments
> -     and also avoids redundant job when sizes are known precisely.  */
> -  misaligned_prologue_used
> -    = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
> -       && MAX (desired_align, epilogue_size_needed) <= 32
> -       && desired_align <= epilogue_size_needed
> -       && ((desired_align > align && !align_bytes)
> -          || (!count && epilogue_size_needed > 1)));
> -
> -  /* Destination is aligned after the misaligned prologue.  */
> -  bool aligned_dstmem = misaligned_prologue_used;
> -
> -  if (noalign && !misaligned_prologue_used)
> -    {
> -      /* Also use misaligned prologue if alignment isn't needed and
> -        destination isn't aligned.   Since alignment isn't needed,
> -        the destination after prologue won't be aligned.  */
> -      aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
> -                       <= MEM_ALIGN (dst));
> -      if (!aligned_dstmem)
> -       misaligned_prologue_used = true;
> +  if (!single_insn_p)
> +    {
> +      /* Misaligned move sequences handle both prologue and epilogue
> +        at once.  Default code generation results in a smaller code
> +        for large alignments and also avoids redundant job when sizes
> +        are known precisely.  */
> +      misaligned_prologue_used
> +       = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
> +          && MAX (desired_align, epilogue_size_needed) <= 32
> +          && desired_align <= epilogue_size_needed
> +          && ((desired_align > align && !align_bytes)
> +              || (!count && epilogue_size_needed > 1)));
> +
> +      /* Destination is aligned after the misaligned prologue.  */
> +      aligned_dstmem = misaligned_prologue_used;
> +
> +      if (noalign && !misaligned_prologue_used)
> +       {
> +         /* Also use misaligned prologue if alignment isn't needed and
> +            destination isn't aligned.   Since alignment isn't needed,
> +            the destination after prologue won't be aligned.  */
> +         aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
> +                           <= MEM_ALIGN (dst));
> +         if (!aligned_dstmem)
> +           misaligned_prologue_used = true;
> +       }
>      }
>
>    /* Do the cheap promotion to allow better CSE across the
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1a.c b/gcc/testsuite/gcc.target/i386/pr121934-1a.c
> new file mode 100644
> index 00000000000..6b6881367db
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-1a.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */
> +
> +extern int f();
> +int a, b, c, d[3];
> +void g() {
> +  int h;
> +  if (f()) {
> +    if (b)
> +    i:
> +      c > 0;
> +    a = 0;
> +    for (h = 0; h < 3; h++) {
> +      if (a != 1)
> +        __builtin_printf("0\n");
> +      d[h] = -1;
> +    }
> +    goto i;
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1b.c b/gcc/testsuite/gcc.target/i386/pr121934-1b.c
> new file mode 100644
> index 00000000000..47381ec3476
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-1b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-1a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2a.c b/gcc/testsuite/gcc.target/i386/pr121934-2a.c
> new file mode 100644
> index 00000000000..49def11aa4e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-2a.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */
> +
> +extern int f();
> +int a, b, c;
> +long long int d[3];
> +void g() {
> +  int h;
> +  if (f()) {
> +    if (b)
> +    i:
> +      c > 0;
> +    a = 0;
> +    for (h = 0; h < 3; h++) {
> +      if (a != 1)
> +        __builtin_printf("0\n");
> +      d[h] = (long long int) -1;
> +    }
> +    goto i;
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2b.c b/gcc/testsuite/gcc.target/i386/pr121934-2b.c
> new file mode 100644
> index 00000000000..1c634dfe420
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-2b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-2a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3a.c b/gcc/testsuite/gcc.target/i386/pr121934-3a.c
> new file mode 100644
> index 00000000000..0c04b69c0d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-3a.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2" } */
> +
> +extern int f();
> +int a, b, c;
> +_BitInt(128) d[3];
> +void g() {
> +  int h;
> +  if (f()) {
> +    if (b)
> +    i:
> +      c > 0;
> +    a = 0;
> +    for (h = 0; h < 3; h++) {
> +      if (a != 1)
> +        __builtin_printf("0\n");
> +      d[h] = (_BitInt(128)) -1;
> +    }
> +    goto i;
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3b.c b/gcc/testsuite/gcc.target/i386/pr121934-3b.c
> new file mode 100644
> index 00000000000..ff4b0831cea
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-3b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-3a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4a.c b/gcc/testsuite/gcc.target/i386/pr121934-4a.c
> new file mode 100644
> index 00000000000..5aa3e069cff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-4a.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256" } */
> +
> +extern int f();
> +int a, b, c;
> +_BitInt(256) d[3];
> +void g() {
> +  int h;
> +  if (f()) {
> +    if (b)
> +    i:
> +      c > 0;
> +    a = 0;
> +    for (h = 0; h < 3; h++) {
> +      if (a != 1)
> +        __builtin_printf("0\n");
> +      d[h] = (_BitInt(256)) -1;
> +    }
> +    goto i;
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4b.c b/gcc/testsuite/gcc.target/i386/pr121934-4b.c
> new file mode 100644
> index 00000000000..5f8241dcad5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-4b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-4a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5a.c b/gcc/testsuite/gcc.target/i386/pr121934-5a.c
> new file mode 100644
> index 00000000000..10be0dd4343
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-5a.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512" } */
> +
> +extern int f();
> +int a, b, c;
> +_BitInt(512) d[3];
> +void g() {
> +  int h;
> +  if (f()) {
> +    if (b)
> +    i:
> +      c > 0;
> +    a = 0;
> +    for (h = 0; h < 3; h++) {
> +      if (a != 1)
> +        __builtin_printf("0\n");
> +      d[h] = (_BitInt(512)) -1;
> +    }
> +    goto i;
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5b.c b/gcc/testsuite/gcc.target/i386/pr121934-5b.c
> new file mode 100644
> index 00000000000..6a45a8a7a8b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-5b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-5a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> --
> 2.51.0
>
  
Uros Bizjak Sept. 15, 2025, 6 a.m. UTC | #2
On Mon, Sep 15, 2025 at 7:57 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Sun, Sep 14, 2025 at 9:14 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > If a single instruction can store or move the whole block of memory, use
> > vector instruction and don't align destination.
> >
> > gcc/
> >
> >         PR target/121934
> >         * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): If a
> >         single instruction can store or move the whole block of memory,
> >         use vector instruction and don't align destination.
> >
> > gcc/testsuite/
> >
> >         PR target/121934
> >         * gcc.target/i386/pr121934-1a.c: New test.
> >         * gcc.target/i386/pr121934-1b.c: Likewise.
> >         * gcc.target/i386/pr121934-2a.c: Likewise.
> >         * gcc.target/i386/pr121934-2b.c: Likewise.
> >         * gcc.target/i386/pr121934-3a.c: Likewise.
> >         * gcc.target/i386/pr121934-3b.c: Likewise.
> >         * gcc.target/i386/pr121934-4a.c: Likewise.
> >         * gcc.target/i386/pr121934-4b.c: Likewise.
> >         * gcc.target/i386/pr121934-5a.c: Likewise.
> >         * gcc.target/i386/pr121934-5b.c: Likewise.
>
> OK.
>
> Thanks,
> Uros.
>
> >
> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> >  gcc/config/i386/i386-expand.cc              | 62 +++++++++++++--------
> >  gcc/testsuite/gcc.target/i386/pr121934-1a.c | 22 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr121934-1b.c |  7 +++
> >  gcc/testsuite/gcc.target/i386/pr121934-2a.c | 23 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr121934-2b.c |  7 +++
> >  gcc/testsuite/gcc.target/i386/pr121934-3a.c | 23 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr121934-3b.c |  7 +++
> >  gcc/testsuite/gcc.target/i386/pr121934-4a.c | 23 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr121934-4b.c |  7 +++
> >  gcc/testsuite/gcc.target/i386/pr121934-5a.c | 23 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr121934-5b.c |  7 +++
> >  11 files changed, 187 insertions(+), 24 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5b.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > index dc26b3452cb..b0b9e6da946 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -9552,9 +9552,20 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
> >    if (!issetmem)
> >      srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
> >
> > +  bool aligned_dstmem = false;
> > +  unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
> > +  bool single_insn_p = count && count <= nunits;

Should the above also consider X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL
and/or X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL?

Uros.
  
H.J. Lu Sept. 15, 2025, 11:56 a.m. UTC | #3
On Sun, Sep 14, 2025 at 11:00 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Mon, Sep 15, 2025 at 7:57 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Sun, Sep 14, 2025 at 9:14 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > If a single instruction can store or move the whole block of memory, use
> > > vector instruction and don't align destination.
> > >
> > > gcc/
> > >
> > >         PR target/121934
> > >         * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): If a
> > >         single instruction can store or move the whole block of memory,
> > >         use vector instruction and don't align destination.
> > >
> > > gcc/testsuite/
> > >
> > >         PR target/121934
> > >         * gcc.target/i386/pr121934-1a.c: New test.
> > >         * gcc.target/i386/pr121934-1b.c: Likewise.
> > >         * gcc.target/i386/pr121934-2a.c: Likewise.
> > >         * gcc.target/i386/pr121934-2b.c: Likewise.
> > >         * gcc.target/i386/pr121934-3a.c: Likewise.
> > >         * gcc.target/i386/pr121934-3b.c: Likewise.
> > >         * gcc.target/i386/pr121934-4a.c: Likewise.
> > >         * gcc.target/i386/pr121934-4b.c: Likewise.
> > >         * gcc.target/i386/pr121934-5a.c: Likewise.
> > >         * gcc.target/i386/pr121934-5b.c: Likewise.
> >
> > OK.
> >
> > Thanks,
> > Uros.
> >
> > >
> > > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > > ---
> > >  gcc/config/i386/i386-expand.cc              | 62 +++++++++++++--------
> > >  gcc/testsuite/gcc.target/i386/pr121934-1a.c | 22 ++++++++
> > >  gcc/testsuite/gcc.target/i386/pr121934-1b.c |  7 +++
> > >  gcc/testsuite/gcc.target/i386/pr121934-2a.c | 23 ++++++++
> > >  gcc/testsuite/gcc.target/i386/pr121934-2b.c |  7 +++
> > >  gcc/testsuite/gcc.target/i386/pr121934-3a.c | 23 ++++++++
> > >  gcc/testsuite/gcc.target/i386/pr121934-3b.c |  7 +++
> > >  gcc/testsuite/gcc.target/i386/pr121934-4a.c | 23 ++++++++
> > >  gcc/testsuite/gcc.target/i386/pr121934-4b.c |  7 +++
> > >  gcc/testsuite/gcc.target/i386/pr121934-5a.c | 23 ++++++++
> > >  gcc/testsuite/gcc.target/i386/pr121934-5b.c |  7 +++
> > >  11 files changed, 187 insertions(+), 24 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5b.c
> > >
> > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > > index dc26b3452cb..b0b9e6da946 100644
> > > --- a/gcc/config/i386/i386-expand.cc
> > > +++ b/gcc/config/i386/i386-expand.cc
> > > @@ -9552,9 +9552,20 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
> > >    if (!issetmem)
> > >      srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
> > >
> > > +  bool aligned_dstmem = false;
> > > +  unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
> > > +  bool single_insn_p = count && count <= nunits;
>
> Should the above also consider X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL
> and/or X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL?

MOVE_MAX and STORE_MAX_PIECES already take them into account:

 #define MOVE_MAX \
  ((TARGET_AVX512F \
    && (ix86_move_max == PVW_AVX512 \
        || ix86_store_max == PVW_AVX512)) \
   ? 64 \
   : ((TARGET_AVX \
       && (ix86_move_max >= PVW_AVX256 \
           || ix86_store_max >= PVW_AVX256)) \
      ? 32 \
      : ((TARGET_SSE2 \
          && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
          && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
         ? 16 : UNITS_PER_WORD)))

#define STORE_MAX_PIECES \
  (TARGET_INTER_UNIT_MOVES_TO_VEC \
   ? ((TARGET_AVX512F && ix86_store_max == PVW_AVX512) \
      ? 64 \
      : ((TARGET_AVX \
          && ix86_store_max >= PVW_AVX256) \
          ? 32 \
          : ((TARGET_SSE2 \
              && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
              ? 16 : UNITS_PER_WORD))) \
   : UNITS_PER_WORD)

I am checking it in.

Thanks.

>  Uros.
  

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index dc26b3452cb..b0b9e6da946 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -9552,9 +9552,20 @@  ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
   if (!issetmem)
     srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
 
+  bool aligned_dstmem = false;
+  unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
+  bool single_insn_p = count && count <= nunits;
+  if (single_insn_p)
+    {
+      /* If it can be done with a single instruction, use vector
+	 instruction and don't align destination.  */
+      alg = vector_loop;
+      noalign = true;
+      dynamic_check = -1;
+    }
+
   unroll_factor = 1;
   move_mode = word_mode;
-  int nunits;
   switch (alg)
     {
     case libcall:
@@ -9576,7 +9587,6 @@  ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
       need_zero_guard = true;
       unroll_factor = 4;
       /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes.  */
-      nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
       nunits /= GET_MODE_SIZE (word_mode);
       if (nunits > 1)
 	{
@@ -9629,28 +9639,32 @@  ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
     }
   gcc_assert (desired_align >= 1 && align >= 1);
 
-  /* Misaligned move sequences handle both prologue and epilogue at once.
-     Default code generation results in a smaller code for large alignments
-     and also avoids redundant job when sizes are known precisely.  */
-  misaligned_prologue_used
-    = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
-       && MAX (desired_align, epilogue_size_needed) <= 32
-       && desired_align <= epilogue_size_needed
-       && ((desired_align > align && !align_bytes)
-	   || (!count && epilogue_size_needed > 1)));
-
-  /* Destination is aligned after the misaligned prologue.  */
-  bool aligned_dstmem = misaligned_prologue_used;
-
-  if (noalign && !misaligned_prologue_used)
-    {
-      /* Also use misaligned prologue if alignment isn't needed and
-	 destination isn't aligned.   Since alignment isn't needed,
-	 the destination after prologue won't be aligned.  */
-      aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
-			<= MEM_ALIGN (dst));
-      if (!aligned_dstmem)
-	misaligned_prologue_used = true;
+  if (!single_insn_p)
+    {
+      /* Misaligned move sequences handle both prologue and epilogue
+	 at once.  Default code generation results in a smaller code
+	 for large alignments and also avoids redundant job when sizes
+	 are known precisely.  */
+      misaligned_prologue_used
+	= (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
+	   && MAX (desired_align, epilogue_size_needed) <= 32
+	   && desired_align <= epilogue_size_needed
+	   && ((desired_align > align && !align_bytes)
+	       || (!count && epilogue_size_needed > 1)));
+
+      /* Destination is aligned after the misaligned prologue.  */
+      aligned_dstmem = misaligned_prologue_used;
+
+      if (noalign && !misaligned_prologue_used)
+	{
+	  /* Also use misaligned prologue if alignment isn't needed and
+	     destination isn't aligned.   Since alignment isn't needed,
+	     the destination after prologue won't be aligned.  */
+	  aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
+			    <= MEM_ALIGN (dst));
+	  if (!aligned_dstmem)
+	    misaligned_prologue_used = true;
+	}
     }
 
   /* Do the cheap promotion to allow better CSE across the
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1a.c b/gcc/testsuite/gcc.target/i386/pr121934-1a.c
new file mode 100644
index 00000000000..6b6881367db
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-1a.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */
+
+extern int f();
+int a, b, c, d[3];
+void g() {
+  int h;
+  if (f()) {
+    if (b)
+    i:
+      c > 0;
+    a = 0;
+    for (h = 0; h < 3; h++) {
+      if (a != 1)
+        __builtin_printf("0\n");
+      d[h] = -1;
+    }
+    goto i;
+  }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1b.c b/gcc/testsuite/gcc.target/i386/pr121934-1b.c
new file mode 100644
index 00000000000..47381ec3476
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-1b.c
@@ -0,0 +1,7 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-1a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2a.c b/gcc/testsuite/gcc.target/i386/pr121934-2a.c
new file mode 100644
index 00000000000..49def11aa4e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-2a.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */
+
+extern int f();
+int a, b, c;
+long long int d[3];
+void g() {
+  int h;
+  if (f()) {
+    if (b)
+    i:
+      c > 0;
+    a = 0;
+    for (h = 0; h < 3; h++) {
+      if (a != 1)
+        __builtin_printf("0\n");
+      d[h] = (long long int) -1;
+    }
+    goto i;
+  }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2b.c b/gcc/testsuite/gcc.target/i386/pr121934-2b.c
new file mode 100644
index 00000000000..1c634dfe420
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-2b.c
@@ -0,0 +1,7 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-2a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3a.c b/gcc/testsuite/gcc.target/i386/pr121934-3a.c
new file mode 100644
index 00000000000..0c04b69c0d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-3a.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2" } */
+
+extern int f();
+int a, b, c;
+_BitInt(128) d[3];
+void g() {
+  int h;
+  if (f()) {
+    if (b)
+    i:
+      c > 0;
+    a = 0;
+    for (h = 0; h < 3; h++) {
+      if (a != 1)
+        __builtin_printf("0\n");
+      d[h] = (_BitInt(128)) -1;
+    }
+    goto i;
+  }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3b.c b/gcc/testsuite/gcc.target/i386/pr121934-3b.c
new file mode 100644
index 00000000000..ff4b0831cea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-3b.c
@@ -0,0 +1,7 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-3a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4a.c b/gcc/testsuite/gcc.target/i386/pr121934-4a.c
new file mode 100644
index 00000000000..5aa3e069cff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-4a.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256" } */
+
+extern int f();
+int a, b, c;
+_BitInt(256) d[3];
+void g() {
+  int h;
+  if (f()) {
+    if (b)
+    i:
+      c > 0;
+    a = 0;
+    for (h = 0; h < 3; h++) {
+      if (a != 1)
+        __builtin_printf("0\n");
+      d[h] = (_BitInt(256)) -1;
+    }
+    goto i;
+  }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4b.c b/gcc/testsuite/gcc.target/i386/pr121934-4b.c
new file mode 100644
index 00000000000..5f8241dcad5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-4b.c
@@ -0,0 +1,7 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-4a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5a.c b/gcc/testsuite/gcc.target/i386/pr121934-5a.c
new file mode 100644
index 00000000000..10be0dd4343
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-5a.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512" } */
+
+extern int f();
+int a, b, c;
+_BitInt(512) d[3];
+void g() {
+  int h;
+  if (f()) {
+    if (b)
+    i:
+      c > 0;
+    a = 0;
+    for (h = 0; h < 3; h++) {
+      if (a != 1)
+        __builtin_printf("0\n");
+      d[h] = (_BitInt(512)) -1;
+    }
+    goto i;
+  }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5b.c b/gcc/testsuite/gcc.target/i386/pr121934-5b.c
new file mode 100644
index 00000000000..6a45a8a7a8b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr121934-5b.c
@@ -0,0 +1,7 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-5a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */