[v2] AArch64: Fix strict-align cpymem/setmem [PR103100]
Checks
Context                                           Check    Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm        success  Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64    success  Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm        success  Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64    success  Testing passed
Commit Message
v2: Use UINTVAL, rename max_mops_size.
The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 when STRICT_ALIGNMENT is in effect.
Clean up the condition for when to use MOPS.
Passes regress/bootstrap, OK for commit?
gcc/ChangeLog:
PR target/103100
* config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
(setmemdi): Likewise.
* config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
strict-align. Clean up the condition for using MOPS.
(aarch64_expand_setmem): Likewise.
---
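For context, PR103100 is about -mstrict-align: the inline cpymem/setmem expansion could be entered for copies whose known alignment is too low for it to handle correctly under strict alignment. A minimal sketch of the kind of code affected (illustrative only, not the actual PR testcase; think of it being compiled for AArch64 with something like -O2 -mstrict-align):

#include <string.h>

/* The buffer is only 1-byte aligned, so with -mstrict-align the expander
   must not assume 16-byte alignment; after this patch such cases go to the
   MOPS expansion or a library call rather than the inline sequence.  */
struct packed_buf
{
  char bytes[24];
} __attribute__ ((packed, aligned (1)));

void
clear_buf (struct packed_buf *p)
{
  memset (p->bytes, 0, sizeof p->bytes);
}

void
copy_buf (struct packed_buf *dst, const struct packed_buf *src)
{
  memcpy (dst->bytes, src->bytes, sizeof src->bytes);
}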
Comments
ping
---
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
int mode_bits;
rtx dst = operands[0];
rtx src = operands[1];
+ unsigned align = UINTVAL (operands[3]);
rtx base;
machine_mode cur_mode = BLKmode;
+ bool size_p = optimize_function_for_size_p (cfun);
- /* Variable-sized memcpy can go through the MOPS expansion if available. */
- if (!CONST_INT_P (operands[2]))
+ /* Variable-sized or strict-align copies may use the MOPS expansion. */
+ if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
return aarch64_expand_cpymem_mops (operands);
- unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
- /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
- unsigned HOST_WIDE_INT max_copy_size
- = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+ unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
- bool size_p = optimize_function_for_size_p (cfun);
+ /* Try to inline up to 256 bytes. */
+ unsigned max_copy_size = 256;
+ unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
- /* Large constant-sized cpymem should go through MOPS when possible.
- It should be a win even for size optimization in the general case.
- For speed optimization the choice between MOPS and the SIMD sequence
- depends on the size of the copy, rather than number of instructions,
- alignment etc. */
- if (size > max_copy_size)
+ /* Large copies use MOPS when available or a library call. */
+ if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
return aarch64_expand_cpymem_mops (operands);
int copy_bits = 256;
@@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands)
unsigned HOST_WIDE_INT len;
rtx dst = operands[0];
rtx val = operands[2], src;
+ unsigned align = UINTVAL (operands[3]);
rtx base;
machine_mode cur_mode = BLKmode, next_mode;
- /* If we don't have SIMD registers or the size is variable use the MOPS
- inlined sequence if possible. */
- if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+ /* Variable-sized or strict-align memset may use the MOPS expansion. */
+ if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+ || (STRICT_ALIGNMENT && align < 16))
return aarch64_expand_setmem_mops (operands);
bool size_p = optimize_function_for_size_p (cfun);
@@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands)
/* Default the maximum to 256-bytes when considering only libcall vs
SIMD broadcast sequence. */
unsigned max_set_size = 256;
+ unsigned mops_threshold = aarch64_mops_memset_size_threshold;
- len = INTVAL (operands[1]);
- if (len > max_set_size && !TARGET_MOPS)
- return false;
+ len = UINTVAL (operands[1]);
+
+ /* Large memset uses MOPS when available or a library call. */
+ if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+ return aarch64_expand_setmem_mops (operands);
int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
/* The MOPS sequence takes:
@@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands)
the arguments + 1 for the call. */
unsigned libcall_cost = 4;
- /* Upper bound check. For large constant-sized setmem use the MOPS sequence
- when available. */
- if (TARGET_MOPS
- && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
- return aarch64_expand_setmem_mops (operands);
-
/* Attempt a sequence with a vector broadcast followed by stores.
Count the number of operations involved to see if it's worth it
against the alternatives. A simple counter simd_ops on the
@@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands)
simd_ops++;
n -= mode_bits;
- /* Do certain trailing copies as overlapping if it's going to be
- cheaper. i.e. less instructions to do so. For instance doing a 15
- byte copy it's more efficient to do two overlapping 8 byte copies than
- 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
+ /* Emit trailing writes using overlapping unaligned accesses
+ (when !STRICT_ALIGNMENT) - this is smaller and faster. */
if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
{
next_mode = smallest_mode_for_size (n, MODE_INT);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1627,7 +1627,7 @@ (define_expand "cpymemdi"
(match_operand:BLK 1 "memory_operand")
(match_operand:DI 2 "general_operand")
(match_operand:DI 3 "immediate_operand")]
- "!STRICT_ALIGNMENT || TARGET_MOPS"
+ ""
{
if (aarch64_expand_cpymem (operands))
DONE;
@@ -1724,7 +1724,7 @@ (define_expand "setmemdi"
(match_operand:QI 2 "nonmemory_operand")) ;; Value
(use (match_operand:DI 1 "general_operand")) ;; Length
(match_operand 3 "immediate_operand")] ;; Align
- "TARGET_SIMD || TARGET_MOPS"
+ ""
{
if (aarch64_expand_setmem (operands))
DONE;
ping
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> [...]
> - /* Large constant-sized cpymem should go through MOPS when possible.
> - It should be a win even for size optimization in the general case.
> - For speed optimization the choice between MOPS and the SIMD sequence
> - depends on the size of the copy, rather than number of instructions,
> - alignment etc. */
> - if (size > max_copy_size)
> + /* Large copies use MOPS when available or a library call. */
> + if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
> return aarch64_expand_cpymem_mops (operands);
It feels a little unintuitive to be calling aarch64_expand_cpymem_mops
for !TARGET_MOPS, but that's pre-existing, and I can see there are
arguments both ways.
Although !TARGET_SIMD is a niche interest on current trunk, it becomes
important for streaming-compatible mode. So we might want to look
again at the different handling of !TARGET_SIMD in this function (where
we lower the copy size but not the threshold) and aarch64_expand_setmem
(where we bail out early). That's not something for this patch though,
just mentioning it.
The patch is OK with me, but please give Richard E a day to object.
Thanks,
Richard
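A side note on the !TARGET_MOPS point: aarch64_expand_cpymem_mops is expected to decline to emit anything when MOPS isn't available, so the cpymemdi/setmemdi expander FAILs and the middle end falls back to a plain memcpy/memset call. A small stand-alone model of the net decision logic after this patch (names, types and structure are paraphrased for illustration, not actual GCC source):

#include <stdbool.h>

enum copy_strategy { COPY_INLINE, COPY_MOPS, COPY_LIBCALL };

/* Variable-size, under-aligned strict-align and large copies go to MOPS
   when available and otherwise to a library call; only small, sufficiently
   aligned constant sizes are expanded inline.  */
enum copy_strategy
choose_copy_strategy (bool const_size, unsigned long size, unsigned align,
                      bool strict_align, bool have_mops,
                      unsigned long mops_threshold)
{
  if (!const_size || (strict_align && align < 16))
    return have_mops ? COPY_MOPS : COPY_LIBCALL;

  if (size > 256 || (have_mops && size > mops_threshold))
    return have_mops ? COPY_MOPS : COPY_LIBCALL;

  return COPY_INLINE;
}

For example, choose_copy_strategy (true, 32, 1, true, false, 256) comes out as COPY_LIBCALL, which is the behaviour the new strict-align check is intended to guarantee on strict-align targets without MOPS.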
On 29/11/2023 18:09, Richard Sandiford wrote:
> Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> [...]
> The patch is OK with me, but please give Richard E a day to object.
This is fine by me.
R.
OK to backport to GCC13 (it applies cleanly and regress/bootstrap passes)?
Cheers,
Wilco
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> OK to backport to GCC13 (it applies cleanly and regress/bootstrap passes)?
Yes, thanks.
Richard