x86-64: Avoid overlap of rep movsb in memmove [BZ #27130]
Commit Message
When copying with "rep movsb", if the distance between source and
destination is [1..63], performance may be very slow on some processors
with AVX or AVX512. Update memmove-vec-unaligned-erms.S for AVX and
AVX512 versions with the distance in RCX:
cmpq $63, %rcx
// Don't use "rep movsb" if RCX <= 63
jbe L(Don't use "rep movsb")
Use "rep movsb"
Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random
and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake shows that its
performance impact is within noise range since "rep movsb" is only used
for data size >= 4KB.
---
.../multiarch/memmove-vec-unaligned-erms.S | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
@@ -56,6 +56,11 @@
# endif
#endif
+/* Avoid overlap of rep movsb only with non-SSE vector.  */
+#ifndef AVOID_OVERLAP_REP_MOVSB
+# define AVOID_OVERLAP_REP_MOVSB (VEC_SIZE > 16)
+#endif
+
#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif
@@ -243,7 +248,21 @@ L(movsb):
cmpq %r9, %rdi
/* Avoid slow backward REP MOVSB. */
jb L(more_8x_vec_backward)
+# if AVOID_OVERLAP_REP_MOVSB
+ movq %rdi, %rcx
+ subq %rsi, %rcx
+ jmp 2f
+# endif
1:
+# if AVOID_OVERLAP_REP_MOVSB
+ movq %rsi, %rcx
+ subq %rdi, %rcx
+2:
+/* Avoid "rep movsb" if RCX, the distance between source and destination,
+ is [1..63]. */
+ cmpq $63, %rcx
+ jbe L(more_2x_vec) /* Avoid "rep movsb" if RCX <= 63. */
+# endif
mov %RDX_LP, %RCX_LP
rep movsb
L(nop):