@@ -766,7 +766,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
tree aarch64_vector_load_decl (tree);
void aarch64_expand_call (rtx, rtx, rtx, bool);
bool aarch64_expand_cpymem_mops (rtx *, bool);
-bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_cpymem (rtx *, bool);
bool aarch64_expand_setmem (rtx *);
bool aarch64_float_const_zero_rtx_p (rtx);
bool aarch64_float_const_rtx_p (rtx);
@@ -25191,48 +25191,35 @@ aarch64_progress_pointer (rtx pointer)
MODE bytes. */
static void
-aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
- machine_mode mode)
+aarch64_copy_one_block (rtx *load, rtx *store, rtx src, rtx dst,
+ int offset, machine_mode mode)
{
/* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory
address copies using V4SImode so that we can use Q registers. */
if (known_eq (GET_MODE_BITSIZE (mode), 256))
{
mode = V4SImode;
+ rtx src1 = adjust_address (src, mode, offset);
+ rtx src2 = adjust_address (src, mode, offset + 16);
+ rtx dst1 = adjust_address (dst, mode, offset);
+ rtx dst2 = adjust_address (dst, mode, offset + 16);
rtx reg1 = gen_reg_rtx (mode);
rtx reg2 = gen_reg_rtx (mode);
- /* "Cast" the pointers to the correct mode. */
- *src = adjust_address (*src, mode, 0);
- *dst = adjust_address (*dst, mode, 0);
- /* Emit the memcpy. */
- emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
- aarch64_progress_pointer (*src)));
- emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
- aarch64_progress_pointer (*dst), reg2));
- /* Move the pointers forward. */
- *src = aarch64_move_pointer (*src, 32);
- *dst = aarch64_move_pointer (*dst, 32);
+ *load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
+ *store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
return;
}
rtx reg = gen_reg_rtx (mode);
-
- /* "Cast" the pointers to the correct mode. */
- *src = adjust_address (*src, mode, 0);
- *dst = adjust_address (*dst, mode, 0);
- /* Emit the memcpy. */
- emit_move_insn (reg, *src);
- emit_move_insn (*dst, reg);
- /* Move the pointers forward. */
- *src = aarch64_progress_pointer (*src);
- *dst = aarch64_progress_pointer (*dst);
+ *load = gen_move_insn (reg, adjust_address (src, mode, offset));
+ *store = gen_move_insn (adjust_address (dst, mode, offset), reg);
}
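
The refactored helper no longer emits anything or advances the pointers: it builds the load and the store for one block at OFFSET and returns them through LOAD and STORE, leaving the emission order to the caller. That freedom is what a correct memmove expansion needs, since every load can then be issued before any store clobbers a potentially overlapping source. A minimal sketch of the new calling pattern (illustrative only; src and dst are assumed to be the MEM rtxes the expander has already set up):

    rtx load, store;
    /* Build, but do not yet emit, a 16-byte copy of the block at offset 32.  */
    aarch64_copy_one_block (&load, &store, src, dst, 32, V4SImode);
    /* The caller decides when and in which order the insns reach the stream.  */
    emit_insn (load);
    emit_insn (store);
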
/* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken
from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove
rather than memcpy. Return true iff we succeeded. */
bool
-aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
{
if (!TARGET_MOPS)
return false;
@@ -25251,12 +25238,12 @@ aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
return true;
}
-/* Expand cpymem, as if from a __builtin_memcpy. Return true if
- we succeed, otherwise return false, indicating that a libcall to
- memcpy should be emitted. */
-
+/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
+ OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
+ if this is a memmove rather than memcpy. Return true if we succeed,
+ otherwise return false, indicating that a libcall should be emitted. */
bool
-aarch64_expand_cpymem (rtx *operands)
+aarch64_expand_cpymem (rtx *operands, bool is_memmove)
{
int mode_bits;
rtx dst = operands[0];
@@ -25268,17 +25255,22 @@ aarch64_expand_cpymem (rtx *operands)
/* Variable-sized or strict-align copies may use the MOPS expansion. */
if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
- return aarch64_expand_cpymem_mops (operands);
+ return aarch64_expand_cpymem_mops (operands, is_memmove);
unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
- /* Try to inline up to 256 bytes. */
- unsigned max_copy_size = 256;
- unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
+ /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
+ unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
+ unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
+ : aarch64_mops_memcpy_size_threshold;
+
+ /* Reduce the maximum size with -Os. */
+ if (size_p)
+ max_copy_size /= 4;
/* Large copies use MOPS when available or a library call. */
if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
- return aarch64_expand_cpymem_mops (operands);
+ return aarch64_expand_cpymem_mops (operands, is_memmove);
int copy_bits = 256;
@@ -25290,23 +25282,20 @@ aarch64_expand_cpymem (rtx *operands)
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
copy_bits = 128;
- /* Emit an inline load+store sequence and count the number of operations
- involved. We use a simple count of just the loads and stores emitted
- rather than rtx_insn count as all the pointer adjustments and reg copying
- in this function will get optimized away later in the pipeline. */
- start_sequence ();
- unsigned nops = 0;
-
base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
base = copy_to_mode_reg (Pmode, XEXP (src, 0));
src = adjust_automodify_address (src, VOIDmode, base, 0);
+ const int max_ops = 40;
+ rtx load[max_ops], store[max_ops];
+
/* Convert size to bits to make the rest of the code simpler. */
int n = size * BITS_PER_UNIT;
+ int nops, offset;
- while (n > 0)
+ for (nops = 0, offset = 0; n > 0; nops++)
{
/* Find the largest mode in which to do the copy in without over reading
or writing. */
@@ -25315,7 +25304,7 @@ aarch64_expand_cpymem (rtx *operands)
if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
cur_mode = mode_iter.require ();
- gcc_assert (cur_mode != BLKmode);
+ gcc_assert (cur_mode != BLKmode && nops < max_ops);
mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
@@ -25323,49 +25312,38 @@ aarch64_expand_cpymem (rtx *operands)
if (mode_bits == 128 && copy_bits == 256)
cur_mode = V4SImode;
- aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
- /* A single block copy is 1 load + 1 store. */
- nops += 2;
+	aarch64_copy_one_block (&load[nops], &store[nops], src, dst, offset,
+				cur_mode);
n -= mode_bits;
+ offset += mode_bits / BITS_PER_UNIT;
- /* Emit trailing copies using overlapping unaligned accesses
- (when !STRICT_ALIGNMENT) - this is smaller and faster. */
- if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
+ /* Emit trailing copies using overlapping unaligned accesses -
+ this is smaller and faster. */
+ if (n > 0 && n < copy_bits / 2)
{
machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
gcc_assert (n_bits <= mode_bits);
- src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
- dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
+ offset -= (n_bits - n) / BITS_PER_UNIT;
n = n_bits;
}
}
- rtx_insn *seq = get_insns ();
- end_sequence ();
- /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
- the constant size into a register. */
- unsigned mops_cost = 3 + 1;
-
- /* If MOPS is available at this point we don't consider the libcall as it's
- not a win even on code size. At this point only consider MOPS if
- optimizing for size. For speed optimizations we will have chosen between
- the two based on copy size already. */
- if (TARGET_MOPS)
- {
- if (size_p && mops_cost < nops)
- return aarch64_expand_cpymem_mops (operands);
- emit_insn (seq);
- return true;
- }
- /* A memcpy libcall in the worst case takes 3 instructions to prepare the
- arguments + 1 for the call. When MOPS is not available and we're
- optimizing for size a libcall may be preferable. */
- unsigned libcall_cost = 4;
- if (size_p && libcall_cost < nops)
- return false;
+  /* Memcpy interleaves loads with stores; memmove emits all loads first. */
+ int i, j, m, inc;
+ inc = is_memmove ? nops : 3;
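+  /* For a memcpy of exactly 4 copies prefer 2 groups of 2 over 3 + 1.  */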
+ if (nops == inc + 1)
+ inc = nops / 2;
+ for (i = 0; i < nops; i += inc)
+ {
+ m = inc;
+ if (i + m > nops)
+ m = nops - i;
- emit_insn (seq);
+ for (j = 0; j < m; j++)
+ emit_insn (load[i + j]);
+ for (j = 0; j < m; j++)
+ emit_insn (store[i + j]);
+ }
return true;
}
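
The final loop replaces the old emit-as-you-go scheme: memcpy emits the collected copies in groups of up to three loads followed by their three stores, while memmove emits every load before any store because the destination may overlap the source. A standalone sketch of the scheduling logic above (illustrative only; printing stands in for emit_insn):

    #include <stdio.h>
    #include <stdbool.h>

    /* Print the order in which NOPS block copies would be emitted.  */
    static void
    emit_order (int nops, bool is_memmove)
    {
      int inc = is_memmove ? nops : 3;
      if (nops == inc + 1)
        inc = nops / 2;
      for (int i = 0; i < nops; i += inc)
        {
          int m = i + inc > nops ? nops - i : inc;
          for (int j = 0; j < m; j++)
            printf ("load %d\n", i + j);
          for (int j = 0; j < m; j++)
            printf ("store %d\n", i + j);
        }
    }

    int
    main (void)
    {
      emit_order (4, false);  /* memcpy: loads 0-1, stores 0-1, loads 2-3, stores 2-3.  */
      emit_order (4, true);   /* memmove: loads 0-3, then stores 0-3.  */
      return 0;
    }
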
@@ -1629,7 +1629,7 @@ (define_expand "cpymemdi"
(match_operand:DI 3 "immediate_operand")]
""
{
- if (aarch64_expand_cpymem (operands))
+ if (aarch64_expand_cpymem (operands, false))
DONE;
FAIL;
}
@@ -1673,17 +1673,9 @@ (define_expand "movmemdi"
(match_operand:BLK 1 "memory_operand")
(match_operand:DI 2 "general_operand")
(match_operand:DI 3 "immediate_operand")]
- "TARGET_MOPS"
+ ""
{
- rtx sz_reg = operands[2];
- /* For constant-sized memmoves check the threshold.
- FIXME: We should add a non-MOPS memmove expansion for smaller,
- constant-sized memmove to avoid going to a libcall. */
- if (CONST_INT_P (sz_reg)
- && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
- FAIL;
-
- if (aarch64_expand_cpymem_mops (operands, true))
+ if (aarch64_expand_cpymem (operands, true))
DONE;
FAIL;
}
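
movmemdi now funnels through the common expander with is_memmove set, so small constant-size memmoves get inline code even without MOPS, which addresses the FIXME in the removed lines. Cases the inline path rejects still fall back as before; for example (illustrative, not part of the patch), a variable-length move without MOPS remains a library call:

    void
    move_var (char *d, const char *s, unsigned long n)
    {
      /* Non-constant size: the expander FAILs without TARGET_MOPS and a
         call to memmove is emitted instead.  */
      __builtin_memmove (d, s, n);
    }
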
@@ -327,7 +327,7 @@ Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
Constant memcpy size in bytes above which to start using MOPS sequence.
-param=aarch64-mops-memmove-size-threshold=
-Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param
+Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(256) Param
Constant memmove size in bytes above which to start using MOPS sequence.
-param=aarch64-mops-memset-size-threshold=
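
Raising the memmove threshold default from 0 to 256 mirrors the memcpy parameter, so with MOPS enabled only moves larger than 256 bytes switch to the MOPS sequence. The value remains tunable per compilation; an illustrative invocation (file name hypothetical):

    gcc -O2 -march=armv8-a+mops --param=aarch64-mops-memmove-size-threshold=512 test.c
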
new file mode 100644
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+void
+copy1 (int *x, int *y)
+{
+ __builtin_memmove (x, y, 12);
+}
+
+void
+copy2 (int *x, int *y)
+{
+ __builtin_memmove (x, y, 128);
+}
+
+void
+copy3 (int *x, int *y)
+{
+ __builtin_memmove (x, y, 255);
+}
+
+/* { dg-final { scan-assembler-not {\tb\tmemmove} } } */
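
The new test only covers sizes below the inline limit; a companion check in the other direction (illustrative only, not part of the patch) would be that an oversized move without MOPS still ends up as a call:

    /* { dg-do compile } */
    /* { dg-options "-O2" } */

    void
    copy_large (int *x, int *y)
    {
      __builtin_memmove (x, y, 1024);
    }

    /* 1024 bytes exceeds the inline limit and MOPS is not enabled here, so a
       (possibly tail) call to memmove is expected to remain.  */
    /* { dg-final { scan-assembler {\tbl?\tmemmove} } } */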