[rs6000] Enable overlap memory store for block memory clear
Checks
Context |
Check |
Description |
linaro-tcwg-bot/tcwg_gcc_build--master-arm |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 |
success
|
Testing passed
|
Commit Message
Hi,
This patch enables overlap memory store for block memory clear which
reduces the number of store instructions. The expander calls
widest_fixed_size_mode_for_block_clear to get the mode for looped block
clear and calls smallest_fixed_size_mode_for_block_clear to get the mode
for the last overlapped clear.
Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
regressions. Is it OK for the trunk or next stage 1?
Thanks
Gui Haochen
ChangeLog
rs6000: Enable overlap memory store for block memory clear
gcc/
* config/rs6000/rs6000-string.cc
(widest_fixed_size_mode_for_block_clear): New.
(smallest_fixed_size_mode_for_block_clear): New.
(expand_block_clear): Call widest_fixed_size_mode_for_block_clear to
get the mode for looped memory stores and call
smallest_fixed_size_mode_for_block_clear to get the mode for the last
overlapped memory store.
gcc/testsuite
* gcc.target/powerpc/block-clear-1.c: New.
patch.diff
Comments
Hi,
As now it's stage 1, gently ping this:
https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646478.html
Thanks
Gui Haochen
在 2024/2/26 10:25, HAO CHEN GUI 写道:
> Hi,
> This patch enables overlap memory store for block memory clear which
> reduces the number of store instructions. The expander calls
> widest_fixed_size_mode_for_block_clear to get the mode for looped block
> clear and calls smallest_fixed_size_mode_for_block_clear to get the mode
> for the last overlapped clear.
>
> Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
> regressions. Is it OK for the trunk or next stage 1?
>
> Thanks
> Gui Haochen
>
>
> ChangeLog
> rs6000: Enable overlap memory store for block memory clear
>
> gcc/
> * config/rs6000/rs6000-string.cc
> (widest_fixed_size_mode_for_block_clear): New.
> (smallest_fixed_size_mode_for_block_clear): New.
> (expand_block_clear): Call widest_fixed_size_mode_for_block_clear to
> get the mode for looped memory stores and call
> smallest_fixed_size_mode_for_block_clear to get the mode for the last
> overlapped memory store.
>
> gcc/testsuite
> * gcc.target/powerpc/block-clear-1.c: New.
>
>
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000-string.cc b/gcc/config/rs6000/rs6000-string.cc
> index 133e5382af2..c2a6095a586 100644
> --- a/gcc/config/rs6000/rs6000-string.cc
> +++ b/gcc/config/rs6000/rs6000-string.cc
> @@ -38,6 +38,49 @@
> #include "profile-count.h"
> #include "predict.h"
>
> +/* Return the widest mode whose size is less than or equal to the
> + size. */
> +static fixed_size_mode
> +widest_fixed_size_mode_for_block_clear (unsigned int size, unsigned int align,
> + bool unaligned_vsx_ok)
> +{
> + machine_mode mode;
> +
> + if (TARGET_ALTIVEC
> + && size >= 16
> + && (align >= 128
> + || unaligned_vsx_ok))
> + mode = V4SImode;
> + else if (size >= 8
> + && TARGET_POWERPC64
> + && (align >= 64
> + || !STRICT_ALIGNMENT))
> + mode = DImode;
> + else if (size >= 4
> + && (align >= 32
> + || !STRICT_ALIGNMENT))
> + mode = SImode;
> + else if (size >= 2
> + && (align >= 16
> + || !STRICT_ALIGNMENT))
> + mode = HImode;
> + else
> + mode = QImode;
> +
> + return as_a <fixed_size_mode> (mode);
> +}
> +
> +/* Return the smallest mode whose size is greater than or equal to
> + the size. */
> +static fixed_size_mode
> +smallest_fixed_size_mode_for_block_clear (unsigned int size)
> +{
> + if (size > UNITS_PER_WORD)
> + return as_a <fixed_size_mode> (V4SImode);
> +
> + return smallest_int_mode_for_size (size * BITS_PER_UNIT);
> +}
> +
> /* Expand a block clear operation, and return 1 if successful. Return 0
> if we should let the compiler generate normal code.
>
> @@ -55,7 +98,6 @@ expand_block_clear (rtx operands[])
> HOST_WIDE_INT align;
> HOST_WIDE_INT bytes;
> int offset;
> - int clear_bytes;
> int clear_step;
>
> /* If this is not a fixed size move, just call memcpy */
> @@ -89,62 +131,36 @@ expand_block_clear (rtx operands[])
>
> bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
>
> - for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
> + auto mode = widest_fixed_size_mode_for_block_clear (bytes, align,
> + unaligned_vsx_ok);
> + offset = 0;
> + rtx dest;
> +
> + do
> {
> - machine_mode mode = BLKmode;
> - rtx dest;
> + unsigned int size = GET_MODE_SIZE (mode);
>
> - if (TARGET_ALTIVEC
> - && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
> + while (bytes >= size)
> {
> - clear_bytes = 16;
> - mode = V4SImode;
> - }
> - else if (bytes >= 8 && TARGET_POWERPC64
> - && (align >= 64 || !STRICT_ALIGNMENT))
> - {
> - clear_bytes = 8;
> - mode = DImode;
> - if (offset == 0 && align < 64)
> - {
> - rtx addr;
> + dest = adjust_address (orig_dest, mode, offset);
> + emit_move_insn (dest, CONST0_RTX (mode));
>
> - /* If the address form is reg+offset with offset not a
> - multiple of four, reload into reg indirect form here
> - rather than waiting for reload. This way we get one
> - reload, not one per store. */
> - addr = XEXP (orig_dest, 0);
> - if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
> - && CONST_INT_P (XEXP (addr, 1))
> - && (INTVAL (XEXP (addr, 1)) & 3) != 0)
> - {
> - addr = copy_addr_to_reg (addr);
> - orig_dest = replace_equiv_address (orig_dest, addr);
> - }
> - }
> - }
> - else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
> - { /* move 4 bytes */
> - clear_bytes = 4;
> - mode = SImode;
> - }
> - else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
> - { /* move 2 bytes */
> - clear_bytes = 2;
> - mode = HImode;
> - }
> - else /* move 1 byte at a time */
> - {
> - clear_bytes = 1;
> - mode = QImode;
> + offset += size;
> + bytes -= size;
> }
>
> - dest = adjust_address (orig_dest, mode, offset);
> + if (bytes == 0)
> + return 1;
>
> - emit_move_insn (dest, CONST0_RTX (mode));
> + mode = smallest_fixed_size_mode_for_block_clear (bytes);
> + int gap = GET_MODE_SIZE (mode) - bytes;
> + if (gap > 0)
> + {
> + offset -= gap;
> + bytes += gap;
> + }
> }
> -
> - return 1;
> + while (1);
> }
>
> /* Figure out the correct instructions to generate to load data for
> diff --git a/gcc/testsuite/gcc.target/powerpc/block-clear-1.c b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
> new file mode 100644
> index 00000000000..5e16c44fea3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-not {\mst[hb]\M} } } */
> +
> +/* Verify that memclear takes overlap store. */
> +void* foo (char* s1)
> +{
> + __builtin_memset (s1, 0, 31);
> +}
Hi,
Gently ping it.
https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646478.html
Thanks
Gui Haochen
在 2024/5/8 9:55, HAO CHEN GUI 写道:
> Hi,
> As now it's stage 1, gently ping this:
> https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646478.html
>
> Thanks
> Gui Haochen
>
> 在 2024/2/26 10:25, HAO CHEN GUI 写道:
>> Hi,
>> This patch enables overlap memory store for block memory clear which
>> reduces the number of store instructions. The expander calls
>> widest_fixed_size_mode_for_block_clear to get the mode for looped block
>> clear and calls smallest_fixed_size_mode_for_block_clear to get the mode
>> for the last overlapped clear.
>>
>> Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
>> regressions. Is it OK for the trunk or next stage 1?
>>
>> Thanks
>> Gui Haochen
>>
>>
>> ChangeLog
>> rs6000: Enable overlap memory store for block memory clear
>>
>> gcc/
>> * config/rs6000/rs6000-string.cc
>> (widest_fixed_size_mode_for_block_clear): New.
>> (smallest_fixed_size_mode_for_block_clear): New.
>> (expand_block_clear): Call widest_fixed_size_mode_for_block_clear to
>> get the mode for looped memory stores and call
>> smallest_fixed_size_mode_for_block_clear to get the mode for the last
>> overlapped memory store.
>>
>> gcc/testsuite
>> * gcc.target/powerpc/block-clear-1.c: New.
>>
>>
>> patch.diff
>> diff --git a/gcc/config/rs6000/rs6000-string.cc b/gcc/config/rs6000/rs6000-string.cc
>> index 133e5382af2..c2a6095a586 100644
>> --- a/gcc/config/rs6000/rs6000-string.cc
>> +++ b/gcc/config/rs6000/rs6000-string.cc
>> @@ -38,6 +38,49 @@
>> #include "profile-count.h"
>> #include "predict.h"
>>
>> +/* Return the widest mode whose size is less than or equal to the
>> + size. */
>> +static fixed_size_mode
>> +widest_fixed_size_mode_for_block_clear (unsigned int size, unsigned int align,
>> + bool unaligned_vsx_ok)
>> +{
>> + machine_mode mode;
>> +
>> + if (TARGET_ALTIVEC
>> + && size >= 16
>> + && (align >= 128
>> + || unaligned_vsx_ok))
>> + mode = V4SImode;
>> + else if (size >= 8
>> + && TARGET_POWERPC64
>> + && (align >= 64
>> + || !STRICT_ALIGNMENT))
>> + mode = DImode;
>> + else if (size >= 4
>> + && (align >= 32
>> + || !STRICT_ALIGNMENT))
>> + mode = SImode;
>> + else if (size >= 2
>> + && (align >= 16
>> + || !STRICT_ALIGNMENT))
>> + mode = HImode;
>> + else
>> + mode = QImode;
>> +
>> + return as_a <fixed_size_mode> (mode);
>> +}
>> +
>> +/* Return the smallest mode whose size is greater than or equal to
>> + the size. */
>> +static fixed_size_mode
>> +smallest_fixed_size_mode_for_block_clear (unsigned int size)
>> +{
>> + if (size > UNITS_PER_WORD)
>> + return as_a <fixed_size_mode> (V4SImode);
>> +
>> + return smallest_int_mode_for_size (size * BITS_PER_UNIT);
>> +}
>> +
>> /* Expand a block clear operation, and return 1 if successful. Return 0
>> if we should let the compiler generate normal code.
>>
>> @@ -55,7 +98,6 @@ expand_block_clear (rtx operands[])
>> HOST_WIDE_INT align;
>> HOST_WIDE_INT bytes;
>> int offset;
>> - int clear_bytes;
>> int clear_step;
>>
>> /* If this is not a fixed size move, just call memcpy */
>> @@ -89,62 +131,36 @@ expand_block_clear (rtx operands[])
>>
>> bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
>>
>> - for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
>> + auto mode = widest_fixed_size_mode_for_block_clear (bytes, align,
>> + unaligned_vsx_ok);
>> + offset = 0;
>> + rtx dest;
>> +
>> + do
>> {
>> - machine_mode mode = BLKmode;
>> - rtx dest;
>> + unsigned int size = GET_MODE_SIZE (mode);
>>
>> - if (TARGET_ALTIVEC
>> - && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
>> + while (bytes >= size)
>> {
>> - clear_bytes = 16;
>> - mode = V4SImode;
>> - }
>> - else if (bytes >= 8 && TARGET_POWERPC64
>> - && (align >= 64 || !STRICT_ALIGNMENT))
>> - {
>> - clear_bytes = 8;
>> - mode = DImode;
>> - if (offset == 0 && align < 64)
>> - {
>> - rtx addr;
>> + dest = adjust_address (orig_dest, mode, offset);
>> + emit_move_insn (dest, CONST0_RTX (mode));
>>
>> - /* If the address form is reg+offset with offset not a
>> - multiple of four, reload into reg indirect form here
>> - rather than waiting for reload. This way we get one
>> - reload, not one per store. */
>> - addr = XEXP (orig_dest, 0);
>> - if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
>> - && CONST_INT_P (XEXP (addr, 1))
>> - && (INTVAL (XEXP (addr, 1)) & 3) != 0)
>> - {
>> - addr = copy_addr_to_reg (addr);
>> - orig_dest = replace_equiv_address (orig_dest, addr);
>> - }
>> - }
>> - }
>> - else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
>> - { /* move 4 bytes */
>> - clear_bytes = 4;
>> - mode = SImode;
>> - }
>> - else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
>> - { /* move 2 bytes */
>> - clear_bytes = 2;
>> - mode = HImode;
>> - }
>> - else /* move 1 byte at a time */
>> - {
>> - clear_bytes = 1;
>> - mode = QImode;
>> + offset += size;
>> + bytes -= size;
>> }
>>
>> - dest = adjust_address (orig_dest, mode, offset);
>> + if (bytes == 0)
>> + return 1;
>>
>> - emit_move_insn (dest, CONST0_RTX (mode));
>> + mode = smallest_fixed_size_mode_for_block_clear (bytes);
>> + int gap = GET_MODE_SIZE (mode) - bytes;
>> + if (gap > 0)
>> + {
>> + offset -= gap;
>> + bytes += gap;
>> + }
>> }
>> -
>> - return 1;
>> + while (1);
>> }
>>
>> /* Figure out the correct instructions to generate to load data for
>> diff --git a/gcc/testsuite/gcc.target/powerpc/block-clear-1.c b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
>> new file mode 100644
>> index 00000000000..5e16c44fea3
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
>> @@ -0,0 +1,9 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" } */
>> +/* { dg-final { scan-assembler-not {\mst[hb]\M} } } */
>> +
>> +/* Verify that memclear takes overlap store. */
>> +void* foo (char* s1)
>> +{
>> + __builtin_memset (s1, 0, 31);
>> +}
@@ -38,6 +38,49 @@
#include "profile-count.h"
#include "predict.h"
+/* Return the widest mode whose size is less than or equal to the
+ size. */
+static fixed_size_mode
+widest_fixed_size_mode_for_block_clear (unsigned int size, unsigned int align,
+ bool unaligned_vsx_ok)
+{
+ machine_mode mode;
+
+ if (TARGET_ALTIVEC
+ && size >= 16
+ && (align >= 128
+ || unaligned_vsx_ok))
+ mode = V4SImode;
+ else if (size >= 8
+ && TARGET_POWERPC64
+ && (align >= 64
+ || !STRICT_ALIGNMENT))
+ mode = DImode;
+ else if (size >= 4
+ && (align >= 32
+ || !STRICT_ALIGNMENT))
+ mode = SImode;
+ else if (size >= 2
+ && (align >= 16
+ || !STRICT_ALIGNMENT))
+ mode = HImode;
+ else
+ mode = QImode;
+
+ return as_a <fixed_size_mode> (mode);
+}
+
+/* Return the smallest mode whose size is greater than or equal to
+ the size. */
+static fixed_size_mode
+smallest_fixed_size_mode_for_block_clear (unsigned int size)
+{
+ if (size > UNITS_PER_WORD)
+ return as_a <fixed_size_mode> (V4SImode);
+
+ return smallest_int_mode_for_size (size * BITS_PER_UNIT);
+}
+
/* Expand a block clear operation, and return 1 if successful. Return 0
if we should let the compiler generate normal code.
@@ -55,7 +98,6 @@ expand_block_clear (rtx operands[])
HOST_WIDE_INT align;
HOST_WIDE_INT bytes;
int offset;
- int clear_bytes;
int clear_step;
/* If this is not a fixed size move, just call memcpy */
@@ -89,62 +131,36 @@ expand_block_clear (rtx operands[])
bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
- for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
+ auto mode = widest_fixed_size_mode_for_block_clear (bytes, align,
+ unaligned_vsx_ok);
+ offset = 0;
+ rtx dest;
+
+ do
{
- machine_mode mode = BLKmode;
- rtx dest;
+ unsigned int size = GET_MODE_SIZE (mode);
- if (TARGET_ALTIVEC
- && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
+ while (bytes >= size)
{
- clear_bytes = 16;
- mode = V4SImode;
- }
- else if (bytes >= 8 && TARGET_POWERPC64
- && (align >= 64 || !STRICT_ALIGNMENT))
- {
- clear_bytes = 8;
- mode = DImode;
- if (offset == 0 && align < 64)
- {
- rtx addr;
+ dest = adjust_address (orig_dest, mode, offset);
+ emit_move_insn (dest, CONST0_RTX (mode));
- /* If the address form is reg+offset with offset not a
- multiple of four, reload into reg indirect form here
- rather than waiting for reload. This way we get one
- reload, not one per store. */
- addr = XEXP (orig_dest, 0);
- if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
- && CONST_INT_P (XEXP (addr, 1))
- && (INTVAL (XEXP (addr, 1)) & 3) != 0)
- {
- addr = copy_addr_to_reg (addr);
- orig_dest = replace_equiv_address (orig_dest, addr);
- }
- }
- }
- else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
- { /* move 4 bytes */
- clear_bytes = 4;
- mode = SImode;
- }
- else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
- { /* move 2 bytes */
- clear_bytes = 2;
- mode = HImode;
- }
- else /* move 1 byte at a time */
- {
- clear_bytes = 1;
- mode = QImode;
+ offset += size;
+ bytes -= size;
}
- dest = adjust_address (orig_dest, mode, offset);
+ if (bytes == 0)
+ return 1;
- emit_move_insn (dest, CONST0_RTX (mode));
+ mode = smallest_fixed_size_mode_for_block_clear (bytes);
+ int gap = GET_MODE_SIZE (mode) - bytes;
+ if (gap > 0)
+ {
+ offset -= gap;
+ bytes += gap;
+ }
}
-
- return 1;
+ while (1);
}
/* Figure out the correct instructions to generate to load data for
new file mode 100644
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not {\mst[hb]\M} } } */
+
+/* Verify that memclear takes overlap store. */
+void* foo (char* s1)
+{
+ __builtin_memset (s1, 0, 31);
+}