[v2,2/3] x86/string: Use simpler approach for large memcpy [BZ #32475]

Message ID 20250114210341.599037-2-goldstein.w.n@gmail.com (mailing list archive)
State New
Delegated to: Florian Weimer
Series [v2,1/3] x86/string: Factor out large memmove implementation to separate file

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed

Commit Message

Noah Goldstein Jan. 14, 2025, 9:03 p.m. UTC
The new approach does a simple 4x non-temporal loop (forwards or
backwards to avoid 4k aliasing). This is similar to what we did prior to:

commit 1a8605b6cd257e8a74e29b5b71c057211f5fb847
Author: noah <goldstein.w.n@gmail.com>
Date:   Sat Apr 3 04:12:15 2021 -0400

    x86: Update large memcpy case in memmove-vec-unaligned-erms.S

But with 4k aliasing detection added to avoid a known pathologically
slow case.
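
In C-level terms, the direction choice amounts to the check below.
This is only a sketch for illustration (the function name is made up
here); PAGE_SIZE and the 512-byte slack mirror the constants used in
the assembly:

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096

    /* Sketch only: return true if a forward non-temporal copy from SRC
       to DST risks the pathological 4k-aliasing case, in which case the
       backward loop is used instead.  */
    static bool
    use_backward_nt_copy (const void *dst, const void *src)
    {
      uintptr_t delta = ((uintptr_t) src - (uintptr_t) dst) & (PAGE_SIZE - 1);
      return delta > PAGE_SIZE - 512;
    }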

The multi-page approach yielded 5-15% better performance for the size
ranges covered by bench-memcpy-large (roughly 64KB-32MB) on the tested
platforms, but has some notable drawbacks.

The drawbacks stem from the fact that the multi-page approach is a
significantly less "canonical" form of memcpy and is thus likely to
have less reliably "good" performance on untested platforms (including
future ones) and configurations (e.g., > 2GB copies as in BZ #32475).

Since the multi-page approach has known slow cases (with slowdowns
that far exceed the 15% benefit) and is much more brittle, it seems
prudent to switch to this simpler, more reliable, and more
future-proof implementation.
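
For illustration, the core of the simpler approach is just a plain 4x
unrolled non-temporal store loop. A minimal C sketch with SSE2
intrinsics (assuming dst is 16-byte aligned and len is a multiple of
64 bytes; the real assembly also handles the unaligned head/tail with
regular stores and has a backward variant):

    #include <emmintrin.h>  /* _mm_loadu_si128, _mm_stream_si128, _mm_sfence */
    #include <stddef.h>

    /* Sketch only: 4x unrolled forward copy with non-temporal stores.  */
    static void
    copy_forward_nt_sketch (char *dst, const char *src, size_t len)
    {
      for (size_t i = 0; i < len; i += 4 * sizeof (__m128i))
        {
          __m128i v0 = _mm_loadu_si128 ((const __m128i *) (src + i));
          __m128i v1 = _mm_loadu_si128 ((const __m128i *) (src + i + 16));
          __m128i v2 = _mm_loadu_si128 ((const __m128i *) (src + i + 32));
          __m128i v3 = _mm_loadu_si128 ((const __m128i *) (src + i + 48));
          _mm_stream_si128 ((__m128i *) (dst + i), v0);
          _mm_stream_si128 ((__m128i *) (dst + i + 16), v1);
          _mm_stream_si128 ((__m128i *) (dst + i + 32), v2);
          _mm_stream_si128 ((__m128i *) (dst + i + 48), v3);
        }
      /* Non-temporal stores are weakly ordered; fence before returning,
         as the assembly does with sfence.  */
      _mm_sfence ();
    }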

Tested on x86_64.
---
 sysdeps/x86_64/multiarch/memmove-vec-large.S  | 107 ++++++++++++++++++
 .../multiarch/memmove-vec-unaligned-erms.S    |   2 +-
 2 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/x86_64/multiarch/memmove-vec-large.S
  

Patch

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-large.S b/sysdeps/x86_64/multiarch/memmove-vec-large.S
new file mode 100644
index 0000000000..fa13bd66a0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-vec-large.S
@@ -0,0 +1,107 @@ 
+#ifdef MEMMOVE_LARGE_IMPL
+# error "Multiple large memmove impls included!"
+#endif
+#define MEMMOVE_LARGE_IMPL	1
+
+/* Copies large regions with a 4x unrolled loop of non-temporal
+   stores.  */
+
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+L(large_memcpy_check):
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	jb	L(more_8x_vec_check)
+L(large_memcpy):
+	/* To reach this point, dst > src with overlap is impossible.
+	   What remains to check is src > dst with overlap. rcx already
+	   contains dst - src. Negate rcx to get src - dst. If length >
+	   rcx then there is overlap and a forward copy is best.  */
+	negq	%rcx
+	cmpq	%rcx, %rdx
+	ja	L(more_8x_vec_forward)
+
+	/* We are doing a non-temporal copy with no overlap. Choose forward
+	   or backward copy based on avoiding 4k aliasing. ecx already
+	   contains src - dst. We check if:
+	   (src % 4096) - (dst % 4096) > (4096 - 512)
+	   If true then we risk aliasing.  */
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - 512), %ecx
+	ja	L(large_backward)
+
+	subq	%rdi, %rsi
+
+	/* Store the first VEC.  */
+	VMOVU	%VMM(0), (%rdi)
+
+	/* Store end of buffer minus tail in rdx.  */
+	leaq	(VEC_SIZE * -4)(%rdi, %rdx), %rdx
+
+	/* Align DST.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	incq	%rdi
+	leaq	(%rdi, %rsi), %rcx
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
+L(loop_4x_nt_forward):
+	PREFETCH_ONE_SET (1, (%rcx), VEC_SIZE * 8)
+	/* Copy 4 * VEC a time forward.  */
+	VMOVU	(VEC_SIZE * 0)(%rcx), %VMM(1)
+	VMOVU	(VEC_SIZE * 1)(%rcx), %VMM(2)
+	VMOVU	(VEC_SIZE * 2)(%rcx), %VMM(3)
+	VMOVU	(VEC_SIZE * 3)(%rcx), %VMM(4)
+	subq	$-(VEC_SIZE * 4), %rcx
+	VMOVNT	%VMM(1), (VEC_SIZE * 0)(%rdi)
+	VMOVNT	%VMM(2), (VEC_SIZE * 1)(%rdi)
+	VMOVNT	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVNT	%VMM(4), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(loop_4x_nt_forward)
+	sfence
+
+	VMOVU	(VEC_SIZE * 0)(%rsi, %rdx), %VMM(1)
+	VMOVU	(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
+	VMOVU	%VMM(1), (VEC_SIZE * 0)(%rdx)
+	VMOVU	%VMM(2), (VEC_SIZE * 1)(%rdx)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 10
+L(large_backward):
+	leaq	(VEC_SIZE * -4 - 1)(%rdi, %rdx), %rcx
+	VMOVU	(VEC_SIZE * -1)(%rsi, %rdx), %VMM(5)
+	VMOVU	%VMM(5), (VEC_SIZE * -1)(%rdi, %rdx)
+	andq	$-(VEC_SIZE), %rcx
+	subq	%rdi, %rsi
+	leaq	(%rsi, %rcx), %rdx
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
+L(loop_4x_nt_backward):
+	PREFETCH_ONE_SET (-1, (%rdx), -VEC_SIZE * 8)
+	VMOVU	(VEC_SIZE * 3)(%rdx), %VMM(1)
+	VMOVU	(VEC_SIZE * 2)(%rdx), %VMM(2)
+	VMOVU	(VEC_SIZE * 1)(%rdx), %VMM(3)
+	VMOVU	(VEC_SIZE * 0)(%rdx), %VMM(4)
+	addq	$(VEC_SIZE * -4), %rdx
+	VMOVNT	%VMM(1), (VEC_SIZE * 3)(%rcx)
+	VMOVNT	%VMM(2), (VEC_SIZE * 2)(%rcx)
+	VMOVNT	%VMM(3), (VEC_SIZE * 1)(%rcx)
+	VMOVNT	%VMM(4), (VEC_SIZE * 0)(%rcx)
+	addq	$(VEC_SIZE * -4), %rcx
+	cmpq	%rcx, %rdi
+	jb	L(loop_4x_nt_backward)
+
+	sfence
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
+	VMOVU	(VEC_SIZE * 1)(%rsi, %rdi), %VMM(2)
+	/* We already loaded VMM(0).  */
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 1)(%rdi)
+	VMOVU	%VMM(0), (VEC_SIZE * 0)(%rdi)
+	VZEROUPPER_RETURN
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 70d303687c..7c4765286d 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -127,7 +127,7 @@ 
 #endif
 
 #ifndef MEMMOVE_VEC_LARGE_IMPL
-# define MEMMOVE_VEC_LARGE_IMPL	"memmove-vec-large-page-unrolled.S"
+# define MEMMOVE_VEC_LARGE_IMPL	"memmove-vec-large.S"
 #endif
 
 #ifndef SECTION