x86: Don't align destination for a single instruction
Commit Message
If a single instruction can store or move the whole block of memory, use
a vector instruction and don't align the destination.
gcc/
PR target/121934
* config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): If a
single instruction can store or move the whole block of memory,
use vector instruction and don't align destination.
gcc/testsuite/
PR target/121934
* gcc.target/i386/pr121934-1a.c: New test.
* gcc.target/i386/pr121934-1b.c: Likewise.
* gcc.target/i386/pr121934-2a.c: Likewise.
* gcc.target/i386/pr121934-2b.c: Likewise.
* gcc.target/i386/pr121934-3a.c: Likewise.
* gcc.target/i386/pr121934-3b.c: Likewise.
* gcc.target/i386/pr121934-4a.c: Likewise.
* gcc.target/i386/pr121934-4b.c: Likewise.
* gcc.target/i386/pr121934-5a.c: Likewise.
* gcc.target/i386/pr121934-5b.c: Likewise.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
---
gcc/config/i386/i386-expand.cc | 62 +++++++++++++--------
gcc/testsuite/gcc.target/i386/pr121934-1a.c | 22 ++++++++
gcc/testsuite/gcc.target/i386/pr121934-1b.c | 7 +++
gcc/testsuite/gcc.target/i386/pr121934-2a.c | 23 ++++++++
gcc/testsuite/gcc.target/i386/pr121934-2b.c | 7 +++
gcc/testsuite/gcc.target/i386/pr121934-3a.c | 23 ++++++++
gcc/testsuite/gcc.target/i386/pr121934-3b.c | 7 +++
gcc/testsuite/gcc.target/i386/pr121934-4a.c | 23 ++++++++
gcc/testsuite/gcc.target/i386/pr121934-4b.c | 7 +++
gcc/testsuite/gcc.target/i386/pr121934-5a.c | 23 ++++++++
gcc/testsuite/gcc.target/i386/pr121934-5b.c | 7 +++
11 files changed, 187 insertions(+), 24 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5b.c
Comments
On Sun, Sep 14, 2025 at 9:14 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> If a single instruction can store or move the whole block of memory, use
> vector instruction and don't align destination.
>
> gcc/
>
> PR target/121934
> * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): If a
> single instruction can store or move the whole block of memory,
> use vector instruction and don't align destination.
>
> gcc/testsuite/
>
> PR target/121934
> * gcc.target/i386/pr121934-1a.c: New test.
> * gcc.target/i386/pr121934-1b.c: Likewise.
> * gcc.target/i386/pr121934-2a.c: Likewise.
> * gcc.target/i386/pr121934-2b.c: Likewise.
> * gcc.target/i386/pr121934-3a.c: Likewise.
> * gcc.target/i386/pr121934-3b.c: Likewise.
> * gcc.target/i386/pr121934-4a.c: Likewise.
> * gcc.target/i386/pr121934-4b.c: Likewise.
> * gcc.target/i386/pr121934-5a.c: Likewise.
> * gcc.target/i386/pr121934-5b.c: Likewise.
OK.
Thanks,
Uros.
>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> gcc/config/i386/i386-expand.cc | 62 +++++++++++++--------
> gcc/testsuite/gcc.target/i386/pr121934-1a.c | 22 ++++++++
> gcc/testsuite/gcc.target/i386/pr121934-1b.c | 7 +++
> gcc/testsuite/gcc.target/i386/pr121934-2a.c | 23 ++++++++
> gcc/testsuite/gcc.target/i386/pr121934-2b.c | 7 +++
> gcc/testsuite/gcc.target/i386/pr121934-3a.c | 23 ++++++++
> gcc/testsuite/gcc.target/i386/pr121934-3b.c | 7 +++
> gcc/testsuite/gcc.target/i386/pr121934-4a.c | 23 ++++++++
> gcc/testsuite/gcc.target/i386/pr121934-4b.c | 7 +++
> gcc/testsuite/gcc.target/i386/pr121934-5a.c | 23 ++++++++
> gcc/testsuite/gcc.target/i386/pr121934-5b.c | 7 +++
> 11 files changed, 187 insertions(+), 24 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5b.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index dc26b3452cb..b0b9e6da946 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -9552,9 +9552,20 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
> if (!issetmem)
> srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
>
> + bool aligned_dstmem = false;
> + unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
> + bool single_insn_p = count && count <= nunits;
> + if (single_insn_p)
> + {
> + /* If it can be done with a single instruction, use vector
> + instruction and don't align destination. */
> + alg = vector_loop;
> + noalign = true;
> + dynamic_check = -1;
> + }
> +
> unroll_factor = 1;
> move_mode = word_mode;
> - int nunits;
> switch (alg)
> {
> case libcall:
> @@ -9576,7 +9587,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
> need_zero_guard = true;
> unroll_factor = 4;
> /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes. */
> - nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
> nunits /= GET_MODE_SIZE (word_mode);
> if (nunits > 1)
> {
> @@ -9629,28 +9639,32 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
> }
> gcc_assert (desired_align >= 1 && align >= 1);
>
> - /* Misaligned move sequences handle both prologue and epilogue at once.
> - Default code generation results in a smaller code for large alignments
> - and also avoids redundant job when sizes are known precisely. */
> - misaligned_prologue_used
> - = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
> - && MAX (desired_align, epilogue_size_needed) <= 32
> - && desired_align <= epilogue_size_needed
> - && ((desired_align > align && !align_bytes)
> - || (!count && epilogue_size_needed > 1)));
> -
> - /* Destination is aligned after the misaligned prologue. */
> - bool aligned_dstmem = misaligned_prologue_used;
> -
> - if (noalign && !misaligned_prologue_used)
> - {
> - /* Also use misaligned prologue if alignment isn't needed and
> - destination isn't aligned. Since alignment isn't needed,
> - the destination after prologue won't be aligned. */
> - aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
> - <= MEM_ALIGN (dst));
> - if (!aligned_dstmem)
> - misaligned_prologue_used = true;
> + if (!single_insn_p)
> + {
> + /* Misaligned move sequences handle both prologue and epilogue
> + at once. Default code generation results in a smaller code
> + for large alignments and also avoids redundant job when sizes
> + are known precisely. */
> + misaligned_prologue_used
> + = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
> + && MAX (desired_align, epilogue_size_needed) <= 32
> + && desired_align <= epilogue_size_needed
> + && ((desired_align > align && !align_bytes)
> + || (!count && epilogue_size_needed > 1)));
> +
> + /* Destination is aligned after the misaligned prologue. */
> + aligned_dstmem = misaligned_prologue_used;
> +
> + if (noalign && !misaligned_prologue_used)
> + {
> + /* Also use misaligned prologue if alignment isn't needed and
> + destination isn't aligned. Since alignment isn't needed,
> + the destination after prologue won't be aligned. */
> + aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
> + <= MEM_ALIGN (dst));
> + if (!aligned_dstmem)
> + misaligned_prologue_used = true;
> + }
> }
>
> /* Do the cheap promotion to allow better CSE across the
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1a.c b/gcc/testsuite/gcc.target/i386/pr121934-1a.c
> new file mode 100644
> index 00000000000..6b6881367db
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-1a.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */
> +
> +extern int f();
> +int a, b, c, d[3];
> +void g() {
> + int h;
> + if (f()) {
> + if (b)
> + i:
> + c > 0;
> + a = 0;
> + for (h = 0; h < 3; h++) {
> + if (a != 1)
> + __builtin_printf("0\n");
> + d[h] = -1;
> + }
> + goto i;
> + }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1b.c b/gcc/testsuite/gcc.target/i386/pr121934-1b.c
> new file mode 100644
> index 00000000000..47381ec3476
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-1b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-1a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2a.c b/gcc/testsuite/gcc.target/i386/pr121934-2a.c
> new file mode 100644
> index 00000000000..49def11aa4e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-2a.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */
> +
> +extern int f();
> +int a, b, c;
> +long long int d[3];
> +void g() {
> + int h;
> + if (f()) {
> + if (b)
> + i:
> + c > 0;
> + a = 0;
> + for (h = 0; h < 3; h++) {
> + if (a != 1)
> + __builtin_printf("0\n");
> + d[h] = (long long int) -1;
> + }
> + goto i;
> + }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2b.c b/gcc/testsuite/gcc.target/i386/pr121934-2b.c
> new file mode 100644
> index 00000000000..1c634dfe420
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-2b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-2a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3a.c b/gcc/testsuite/gcc.target/i386/pr121934-3a.c
> new file mode 100644
> index 00000000000..0c04b69c0d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-3a.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2" } */
> +
> +extern int f();
> +int a, b, c;
> +_BitInt(128) d[3];
> +void g() {
> + int h;
> + if (f()) {
> + if (b)
> + i:
> + c > 0;
> + a = 0;
> + for (h = 0; h < 3; h++) {
> + if (a != 1)
> + __builtin_printf("0\n");
> + d[h] = (_BitInt(128)) -1;
> + }
> + goto i;
> + }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3b.c b/gcc/testsuite/gcc.target/i386/pr121934-3b.c
> new file mode 100644
> index 00000000000..ff4b0831cea
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-3b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-3a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4a.c b/gcc/testsuite/gcc.target/i386/pr121934-4a.c
> new file mode 100644
> index 00000000000..5aa3e069cff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-4a.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256" } */
> +
> +extern int f();
> +int a, b, c;
> +_BitInt(256) d[3];
> +void g() {
> + int h;
> + if (f()) {
> + if (b)
> + i:
> + c > 0;
> + a = 0;
> + for (h = 0; h < 3; h++) {
> + if (a != 1)
> + __builtin_printf("0\n");
> + d[h] = (_BitInt(256)) -1;
> + }
> + goto i;
> + }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4b.c b/gcc/testsuite/gcc.target/i386/pr121934-4b.c
> new file mode 100644
> index 00000000000..5f8241dcad5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-4b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-4a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5a.c b/gcc/testsuite/gcc.target/i386/pr121934-5a.c
> new file mode 100644
> index 00000000000..10be0dd4343
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-5a.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512" } */
> +
> +extern int f();
> +int a, b, c;
> +_BitInt(512) d[3];
> +void g() {
> + int h;
> + if (f()) {
> + if (b)
> + i:
> + c > 0;
> + a = 0;
> + for (h = 0; h < 3; h++) {
> + if (a != 1)
> + __builtin_printf("0\n");
> + d[h] = (_BitInt(512)) -1;
> + }
> + goto i;
> + }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5b.c b/gcc/testsuite/gcc.target/i386/pr121934-5b.c
> new file mode 100644
> index 00000000000..6a45a8a7a8b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-5b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-5a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> --
> 2.51.0
>
On Mon, Sep 15, 2025 at 7:57 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Sun, Sep 14, 2025 at 9:14 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > If a single instruction can store or move the whole block of memory, use
> > vector instruction and don't align destination.
> >
> > gcc/
> >
> > PR target/121934
> > * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): If a
> > single instruction can store or move the whole block of memory,
> > use vector instruction and don't align destination.
> >
> > gcc/testsuite/
> >
> > PR target/121934
> > * gcc.target/i386/pr121934-1a.c: New test.
> > * gcc.target/i386/pr121934-1b.c: Likewise.
> > * gcc.target/i386/pr121934-2a.c: Likewise.
> > * gcc.target/i386/pr121934-2b.c: Likewise.
> > * gcc.target/i386/pr121934-3a.c: Likewise.
> > * gcc.target/i386/pr121934-3b.c: Likewise.
> > * gcc.target/i386/pr121934-4a.c: Likewise.
> > * gcc.target/i386/pr121934-4b.c: Likewise.
> > * gcc.target/i386/pr121934-5a.c: Likewise.
> > * gcc.target/i386/pr121934-5b.c: Likewise.
>
> OK.
>
> Thanks,
> Uros.
>
> >
> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> > gcc/config/i386/i386-expand.cc | 62 +++++++++++++--------
> > gcc/testsuite/gcc.target/i386/pr121934-1a.c | 22 ++++++++
> > gcc/testsuite/gcc.target/i386/pr121934-1b.c | 7 +++
> > gcc/testsuite/gcc.target/i386/pr121934-2a.c | 23 ++++++++
> > gcc/testsuite/gcc.target/i386/pr121934-2b.c | 7 +++
> > gcc/testsuite/gcc.target/i386/pr121934-3a.c | 23 ++++++++
> > gcc/testsuite/gcc.target/i386/pr121934-3b.c | 7 +++
> > gcc/testsuite/gcc.target/i386/pr121934-4a.c | 23 ++++++++
> > gcc/testsuite/gcc.target/i386/pr121934-4b.c | 7 +++
> > gcc/testsuite/gcc.target/i386/pr121934-5a.c | 23 ++++++++
> > gcc/testsuite/gcc.target/i386/pr121934-5b.c | 7 +++
> > 11 files changed, 187 insertions(+), 24 deletions(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5b.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > index dc26b3452cb..b0b9e6da946 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -9552,9 +9552,20 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
> > if (!issetmem)
> > srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
> >
> > + bool aligned_dstmem = false;
> > + unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
> > + bool single_insn_p = count && count <= nunits;
Should the above also consider X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL
and/or X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL?
Uros.
On Sun, Sep 14, 2025 at 11:00 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Mon, Sep 15, 2025 at 7:57 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Sun, Sep 14, 2025 at 9:14 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > If a single instruction can store or move the whole block of memory, use
> > > vector instruction and don't align destination.
> > >
> > > gcc/
> > >
> > > PR target/121934
> > > * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): If a
> > > single instruction can store or move the whole block of memory,
> > > use vector instruction and don't align destination.
> > >
> > > gcc/testsuite/
> > >
> > > PR target/121934
> > > * gcc.target/i386/pr121934-1a.c: New test.
> > > * gcc.target/i386/pr121934-1b.c: Likewise.
> > > * gcc.target/i386/pr121934-2a.c: Likewise.
> > > * gcc.target/i386/pr121934-2b.c: Likewise.
> > > * gcc.target/i386/pr121934-3a.c: Likewise.
> > > * gcc.target/i386/pr121934-3b.c: Likewise.
> > > * gcc.target/i386/pr121934-4a.c: Likewise.
> > > * gcc.target/i386/pr121934-4b.c: Likewise.
> > > * gcc.target/i386/pr121934-5a.c: Likewise.
> > > * gcc.target/i386/pr121934-5b.c: Likewise.
> >
> > OK.
> >
> > Thanks,
> > Uros.
> >
> > >
> > > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > > ---
> > > gcc/config/i386/i386-expand.cc | 62 +++++++++++++--------
> > > gcc/testsuite/gcc.target/i386/pr121934-1a.c | 22 ++++++++
> > > gcc/testsuite/gcc.target/i386/pr121934-1b.c | 7 +++
> > > gcc/testsuite/gcc.target/i386/pr121934-2a.c | 23 ++++++++
> > > gcc/testsuite/gcc.target/i386/pr121934-2b.c | 7 +++
> > > gcc/testsuite/gcc.target/i386/pr121934-3a.c | 23 ++++++++
> > > gcc/testsuite/gcc.target/i386/pr121934-3b.c | 7 +++
> > > gcc/testsuite/gcc.target/i386/pr121934-4a.c | 23 ++++++++
> > > gcc/testsuite/gcc.target/i386/pr121934-4b.c | 7 +++
> > > gcc/testsuite/gcc.target/i386/pr121934-5a.c | 23 ++++++++
> > > gcc/testsuite/gcc.target/i386/pr121934-5b.c | 7 +++
> > > 11 files changed, 187 insertions(+), 24 deletions(-)
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1a.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1b.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2a.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2b.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3a.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3b.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4a.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4b.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5a.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5b.c
> > >
> > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > > index dc26b3452cb..b0b9e6da946 100644
> > > --- a/gcc/config/i386/i386-expand.cc
> > > +++ b/gcc/config/i386/i386-expand.cc
> > > @@ -9552,9 +9552,20 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
> > > if (!issetmem)
> > > srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
> > >
> > > + bool aligned_dstmem = false;
> > > + unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
> > > + bool single_insn_p = count && count <= nunits;
>
> Should the above also consider X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL
> and/or X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL?
They are already taken into account by the definitions of MOVE_MAX and STORE_MAX_PIECES:
#define MOVE_MAX \
((TARGET_AVX512F \
&& (ix86_move_max == PVW_AVX512 \
|| ix86_store_max == PVW_AVX512)) \
? 64 \
: ((TARGET_AVX \
&& (ix86_move_max >= PVW_AVX256 \
|| ix86_store_max >= PVW_AVX256)) \
? 32 \
: ((TARGET_SSE2 \
&& TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
&& TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
? 16 : UNITS_PER_WORD)))
#define STORE_MAX_PIECES \
(TARGET_INTER_UNIT_MOVES_TO_VEC \
? ((TARGET_AVX512F && ix86_store_max == PVW_AVX512) \
? 64 \
: ((TARGET_AVX \
&& ix86_store_max >= PVW_AVX256) \
? 32 \
: ((TARGET_SSE2 \
&& TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
? 16 : UNITS_PER_WORD))) \
: UNITS_PER_WORD)
I am checking it in.
Thanks.
> Uros.
@@ -9552,9 +9552,20 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
if (!issetmem)
srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
+ bool aligned_dstmem = false;
+ unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
+ bool single_insn_p = count && count <= nunits;
+ if (single_insn_p)
+ {
+ /* If it can be done with a single instruction, use vector
+ instruction and don't align destination. */
+ alg = vector_loop;
+ noalign = true;
+ dynamic_check = -1;
+ }
+
unroll_factor = 1;
move_mode = word_mode;
- int nunits;
switch (alg)
{
case libcall:
@@ -9576,7 +9587,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
need_zero_guard = true;
unroll_factor = 4;
/* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes. */
- nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
nunits /= GET_MODE_SIZE (word_mode);
if (nunits > 1)
{
@@ -9629,28 +9639,32 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
}
gcc_assert (desired_align >= 1 && align >= 1);
- /* Misaligned move sequences handle both prologue and epilogue at once.
- Default code generation results in a smaller code for large alignments
- and also avoids redundant job when sizes are known precisely. */
- misaligned_prologue_used
- = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
- && MAX (desired_align, epilogue_size_needed) <= 32
- && desired_align <= epilogue_size_needed
- && ((desired_align > align && !align_bytes)
- || (!count && epilogue_size_needed > 1)));
-
- /* Destination is aligned after the misaligned prologue. */
- bool aligned_dstmem = misaligned_prologue_used;
-
- if (noalign && !misaligned_prologue_used)
- {
- /* Also use misaligned prologue if alignment isn't needed and
- destination isn't aligned. Since alignment isn't needed,
- the destination after prologue won't be aligned. */
- aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
- <= MEM_ALIGN (dst));
- if (!aligned_dstmem)
- misaligned_prologue_used = true;
+ if (!single_insn_p)
+ {
+ /* Misaligned move sequences handle both prologue and epilogue
+ at once. Default code generation results in a smaller code
+ for large alignments and also avoids redundant job when sizes
+ are known precisely. */
+ misaligned_prologue_used
+ = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
+ && MAX (desired_align, epilogue_size_needed) <= 32
+ && desired_align <= epilogue_size_needed
+ && ((desired_align > align && !align_bytes)
+ || (!count && epilogue_size_needed > 1)));
+
+ /* Destination is aligned after the misaligned prologue. */
+ aligned_dstmem = misaligned_prologue_used;
+
+ if (noalign && !misaligned_prologue_used)
+ {
+ /* Also use misaligned prologue if alignment isn't needed and
+ destination isn't aligned. Since alignment isn't needed,
+ the destination after prologue won't be aligned. */
+ aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
+ <= MEM_ALIGN (dst));
+ if (!aligned_dstmem)
+ misaligned_prologue_used = true;
+ }
}
/* Do the cheap promotion to allow better CSE across the
new file mode 100644
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */
+
+extern int f();
+int a, b, c, d[3];
+void g() {
+ int h;
+ if (f()) {
+ if (b)
+ i:
+ c > 0;
+ a = 0;
+ for (h = 0; h < 3; h++) {
+ if (a != 1)
+ __builtin_printf("0\n");
+ d[h] = -1;
+ }
+ goto i;
+ }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-1a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
new file mode 100644
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */
+
+extern int f();
+int a, b, c;
+long long int d[3];
+void g() {
+ int h;
+ if (f()) {
+ if (b)
+ i:
+ c > 0;
+ a = 0;
+ for (h = 0; h < 3; h++) {
+ if (a != 1)
+ __builtin_printf("0\n");
+ d[h] = (long long int) -1;
+ }
+ goto i;
+ }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-2a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
new file mode 100644
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2" } */
+
+extern int f();
+int a, b, c;
+_BitInt(128) d[3];
+void g() {
+ int h;
+ if (f()) {
+ if (b)
+ i:
+ c > 0;
+ a = 0;
+ for (h = 0; h < 3; h++) {
+ if (a != 1)
+ __builtin_printf("0\n");
+ d[h] = (_BitInt(128)) -1;
+ }
+ goto i;
+ }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-3a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
new file mode 100644
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256" } */
+
+extern int f();
+int a, b, c;
+_BitInt(256) d[3];
+void g() {
+ int h;
+ if (f()) {
+ if (b)
+ i:
+ c > 0;
+ a = 0;
+ for (h = 0; h < 3; h++) {
+ if (a != 1)
+ __builtin_printf("0\n");
+ d[h] = (_BitInt(256)) -1;
+ }
+ goto i;
+ }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-4a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
new file mode 100644
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512" } */
+
+extern int f();
+int a, b, c;
+_BitInt(512) d[3];
+void g() {
+ int h;
+ if (f()) {
+ if (b)
+ i:
+ c > 0;
+ a = 0;
+ for (h = 0; h < 3; h++) {
+ if (a != 1)
+ __builtin_printf("0\n");
+ d[h] = (_BitInt(512)) -1;
+ }
+ goto i;
+ }
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
+
+#include "pr121934-5a.c"
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
+/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */