[v2,2/7] Alpha: Optimize block moves coming from longword-aligned source
Commit Message
Now that proper alignment determination for block moves is in place,
the case of copying a block of longword-aligned data has become real.
Implement the merging of loaded data from pairs of SImode registers into
single DImode registers so that the data can be written efficiently with
unaligned stores, as suggested by a comment in `alpha_expand_block_move',
and discard that comment. Provide test cases accordingly.
gcc/
* config/alpha/alpha.cc (alpha_expand_block_move): Merge loaded
data from pairs of SImode registers into single DImode registers
when the data is to be written with unaligned stores.
gcc/testsuite/
* gcc.target/alpha/memcpy-si-aligned.c: New file.
* gcc.target/alpha/memcpy-si-unaligned.c: New file.
* gcc.target/alpha/memcpy-si-unaligned-dst.c: New file.
* gcc.target/alpha/memcpy-si-unaligned-src.c: New file.
* gcc.target/alpha/memcpy-si-unaligned-src-bwx.c: New file.
---
No change from v1.
---
gcc/config/alpha/alpha.cc | 45 +++++++--
gcc/testsuite/gcc.target/alpha/memcpy-si-aligned.c | 16 +++
gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-dst.c | 16 +++
gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src-bwx.c | 11 ++
gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src.c | 15 +++
gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned.c | 51 +++++++++++
6 files changed, 146 insertions(+), 8 deletions(-)
gcc-alpha-block-move-si-unaligned.diff
Comments
On 1/6/25 6:03 AM, Maciej W. Rozycki wrote:
> Now that we have proper alignment determination for block moves in place
> the case of copying a block of longword-aligned data has become real, so
> implement the merging of loaded data from pairs of SImode registers into
> single DImode registers for the purpose of using with unaligned stores
> efficiently, as suggested by a comment in `alpha_expand_block_move' and
> discard the comment. Provide test cases accordingly.
>
> gcc/
> * config/alpha/alpha.cc (alpha_expand_block_move): Merge loaded
> data from pairs of SImode registers into single DImode registers
> if to be used with unaligned stores.
>
> gcc/testsuite/
> * gcc.target/alpha/memcpy-si-aligned.c: New file.
> * gcc.target/alpha/memcpy-si-unaligned.c: New file.
> * gcc.target/alpha/memcpy-si-unaligned-dst.c: New file.
> * gcc.target/alpha/memcpy-si-unaligned-src.c: New file.
> * gcc.target/alpha/memcpy-si-unaligned-src-bwx.c: New file.
OK
jeff
On Mon, 6 Jan 2025, Jeff Law wrote:
> > gcc/
> > * config/alpha/alpha.cc (alpha_expand_block_move): Merge loaded
> > data from pairs of SImode registers into single DImode registers
> > if to be used with unaligned stores.
> >
> > gcc/testsuite/
> > * gcc.target/alpha/memcpy-si-aligned.c: New file.
> > * gcc.target/alpha/memcpy-si-unaligned.c: New file.
> > * gcc.target/alpha/memcpy-si-unaligned-dst.c: New file.
> > * gcc.target/alpha/memcpy-si-unaligned-src.c: New file.
> > * gcc.target/alpha/memcpy-si-unaligned-src-bwx.c: New file.
> OK
Applied now, thanks for your review.
Maciej
===================================================================
@@ -3930,14 +3930,44 @@ alpha_expand_block_move (rtx operands[])
{
words = bytes / 4;
- for (i = 0; i < words; ++i)
- data_regs[nregs + i] = gen_reg_rtx (SImode);
+ /* Load an even quantity of SImode data pieces only. */
+ unsigned int hwords = words / 2;
+ for (i = 0; i / 2 < hwords; ++i)
+ {
+ data_regs[nregs + i] = gen_reg_rtx (SImode);
+ emit_move_insn (data_regs[nregs + i],
+ adjust_address (orig_src, SImode, ofs + i * 4));
+ }
- for (i = 0; i < words; ++i)
- emit_move_insn (data_regs[nregs + i],
- adjust_address (orig_src, SImode, ofs + i * 4));
+ /* If we'll be using unaligned stores, merge data from pairs
+ of SImode registers into DImode registers so that we can
+ store it more efficiently via quadword unaligned stores. */
+ unsigned int j;
+ if (dst_align < 32)
+ for (i = 0, j = 0; i < words / 2; ++i, j = i * 2)
+ {
+ rtx hi = expand_simple_binop (DImode, ASHIFT,
+ data_regs[nregs + j + 1],
+ GEN_INT (32), NULL_RTX,
+ 1, OPTAB_WIDEN);
+ data_regs[nregs + i] = expand_simple_binop (DImode, IOR, hi,
+ data_regs[nregs + j],
+ NULL_RTX,
+ 1, OPTAB_WIDEN);
+ }
+ else
+ j = i;
- nregs += words;
+ /* Take care of any remaining odd trailing SImode data piece. */
+ if (j < words)
+ {
+ data_regs[nregs + i] = gen_reg_rtx (SImode);
+ emit_move_insn (data_regs[nregs + i],
+ adjust_address (orig_src, SImode, ofs + j * 4));
+ ++i;
+ }
+
+ nregs += i;
bytes -= words * 4;
ofs += words * 4;
}
@@ -4056,13 +4086,12 @@ alpha_expand_block_move (rtx operands[])
}
/* Due to the above, this won't be aligned. */
- /* ??? If we have more than one of these, consider constructing full
- words in registers and using alpha_expand_unaligned_store_words. */
while (i < nregs && GET_MODE (data_regs[i]) == SImode)
{
alpha_expand_unaligned_store (orig_dst, data_regs[i], 4, ofs);
ofs += 4;
i++;
+ gcc_assert (i == nregs || GET_MODE (data_regs[i]) != SImode);
}
if (dst_align >= 16)
===================================================================
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+unsigned int aligned_src_si[17] = { [0 ... 16] = 0xeaebeced };
+unsigned int aligned_dst_si[17] = { [0 ... 16] = 0xdcdbdad9 };
+
+void
+memcpy_aligned_data_si (void)
+{
+ __builtin_memcpy (aligned_dst_si + 1, aligned_src_si + 1, 60);
+}
+
+/* { dg-final { scan-assembler-times "\\sldl\\s" 15 } } */
+/* { dg-final { scan-assembler-times "\\sstl\\s" 15 } } */
+/* { dg-final { scan-assembler-not "\\s(?:ldq_u|stq_u)\\s" } } */
===================================================================
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+unsigned int unaligned_src_si[17] = { [0 ... 16] = 0xfefdfcfb };
+
+void
+memcpy_unaligned_dst_si (void *dst)
+{
+ __builtin_memcpy (dst, unaligned_src_si + 1, 60);
+}
+
+/* { dg-final { scan-assembler-times "\\sldl\\s" 15 } } */
+/* { dg-final { scan-assembler-times "\\sldq_u\\s" 4 } } */
+/* { dg-final { scan-assembler-times "\\sstq_u\\s" 10 } } */
+/* { dg-final { scan-assembler-not "\\sstl\\s" } } */
===================================================================
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mbwx" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+#include "memcpy-si-unaligned-src.c"
+
+/* { dg-final { scan-assembler-times "\\sldbu\\s" 4 } } */
+/* { dg-final { scan-assembler-times "\\sldq_u\\s" 8 } } */
+/* { dg-final { scan-assembler-times "\\sstb\\s" 4 } } */
+/* { dg-final { scan-assembler-times "\\sstl\\s" 14 } } */
+/* { dg-final { scan-assembler-not "\\s(?:ldl|stq_u)\\s" } } */
===================================================================
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-mno-bwx" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+unsigned int unaligned_dst_si[17] = { [0 ... 16] = 0xc8c9cacb };
+
+void
+memcpy_unaligned_src_si (const void *src)
+{
+ __builtin_memcpy (unaligned_dst_si + 1, src, 60);
+}
+
+/* { dg-final { scan-assembler-times "\\sldq_u\\s" 10 } } */
+/* { dg-final { scan-assembler-times "\\sstl\\s" 15 } } */
+/* { dg-final { scan-assembler-not "\\s(?:ldl|stq_u)\\s" } } */
===================================================================
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-additional-sources memcpy-si-aligned.c } */
+/* { dg-additional-sources memcpy-si-unaligned-src.c } */
+/* { dg-additional-sources memcpy-si-unaligned-dst.c } */
+/* { dg-options "" } */
+
+void memcpy_aligned_data_si (void);
+void memcpy_unaligned_dst_si (void *);
+void memcpy_unaligned_src_si (const void *);
+
+extern unsigned int aligned_src_si[];
+extern unsigned int aligned_dst_si[];
+extern unsigned int unaligned_src_si[];
+extern unsigned int unaligned_dst_si[];
+
+int
+main (void)
+{
+ unsigned int v;
+ int i;
+
+ for (i = 1, v = 0x04030201; i < 16; i++, v += 0x04040404)
+ unaligned_src_si[i] = v;
+ asm ("" : : : "memory");
+ memcpy_unaligned_dst_si (aligned_src_si + 1);
+ asm ("" : : : "memory");
+ memcpy_aligned_data_si ();
+ asm ("" : : : "memory");
+ memcpy_unaligned_src_si (aligned_dst_si + 1);
+ asm ("" : : : "memory");
+ for (i = 1, v = 0x04030201; i < 16; i++, v += 0x04040404)
+ if (unaligned_dst_si[i] != v)
+ return 1;
+ if (unaligned_src_si[0] != 0xfefdfcfb)
+ return 1;
+ if (unaligned_src_si[16] != 0xfefdfcfb)
+ return 1;
+ if (aligned_src_si[0] != 0xeaebeced)
+ return 1;
+ if (aligned_src_si[16] != 0xeaebeced)
+ return 1;
+ if (aligned_dst_si[0] != 0xdcdbdad9)
+ return 1;
+ if (aligned_dst_si[16] != 0xdcdbdad9)
+ return 1;
+ if (unaligned_dst_si[0] != 0xc8c9cacb)
+ return 1;
+ if (unaligned_dst_si[16] != 0xc8c9cacb)
+ return 1;
+ return 0;
+}