[v4,1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
Commit Message
No Bug. This commit updates the large memcpy case (no overlap). The
update is to perform memcpy on either 2 or 4 contiguous pages at
once. This 1) helps to alleviate the effects of false memory aliasing
when destination and source have a close 4k alignment and 2) is, in
most cases and for most DRAM units, a modestly more efficient access
pattern. These changes are a clear performance improvement for
VEC_SIZE=16/32, though the results for VEC_SIZE=64 are more ambiguous.
test-memcpy, test-memccpy, test-mempcpy, test-memmove, and
tst-memmove-overflow all pass.
Signed-off-by: noah <goldstein.w.n@gmail.com>
---
.../multiarch/memmove-vec-unaligned-erms.S | 326 ++++++++++++++----
1 file changed, 258 insertions(+), 68 deletions(-)
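
To make the access pattern described in the commit message concrete, here is a
minimal C sketch (illustration only; CHUNK and the function name are made up
for this note, and the patch itself does this with unaligned vector loads and
non-temporal vector stores). Each inner step copies one chunk from each of two
pages that sit PAGE_SIZE apart, so the store stream alternates between two
4k-distant regions instead of walking one linear range; the 4x path in the
patch does the same thing across four pages.

#include <stddef.h>
#include <string.h>

#define PAGE_SIZE 4096
#define CHUNK 128   /* stand-in for LARGE_LOAD_SIZE */

/* Copy npairs blocks of 2 * PAGE_SIZE bytes, interleaving the two pages
   of each block.  */
static void
copy_2_pages_interleaved (char *dst, const char *src, size_t npairs)
{
  for (size_t i = 0; i < npairs; i++)
    {
      for (size_t off = 0; off < PAGE_SIZE; off += CHUNK)
        {
          memcpy (dst + off, src + off, CHUNK);
          memcpy (dst + PAGE_SIZE + off, src + PAGE_SIZE + off, CHUNK);
        }
      dst += 2 * PAGE_SIZE;
      src += 2 * PAGE_SIZE;
    }
}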
Comments
On Tue, Mar 30, 2021 at 2:38 PM noah <goldstein.w.n@gmail.com> wrote:
>
> No Bug. This commit updates the large memcpy case (no overlap). The
> update is to perform memcpy on either 2 or 4 contiguous pages at
> once. This 1) helps to alleviate the effects of false memory aliasing
> when destination and source have a close 4k alignment and 2) is, in
> most cases and for most DRAM units, a modestly more efficient access
> pattern. These changes are a clear performance improvement for
> VEC_SIZE=16/32, though the results for VEC_SIZE=64 are more ambiguous.
> test-memcpy, test-memccpy, test-mempcpy, test-memmove, and
> tst-memmove-overflow all pass.
>
> Signed-off-by: noah <goldstein.w.n@gmail.com>
> ---
> .../multiarch/memmove-vec-unaligned-erms.S | 326 ++++++++++++++----
> 1 file changed, 258 insertions(+), 68 deletions(-)
1. There are many trailing whitespaces.
2. Replace "jccSPACElabel" with "jccTABlabel".
3. Replace "insnSPACEoperand" with "insnTABoperand" if needed.
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index 897a3d9762..dae3e2bac5 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -35,7 +35,16 @@
> __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
> 7. If size >= __x86_shared_non_temporal_threshold and there is no
> overlap between destination and source, use non-temporal store
> - instead of aligned store. */
> + instead of aligned store copying from either 2 or 4 pages at
> + once.
> + 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
> + and source and destination do not page alias, copy from 2 pages
> + at once using non-temporal stores. Page aliasing in this case is
> + considered true if destination's page alignment - sources' page
> + alignment is less than 8 * VEC_SIZE.
> + 9. If size >= 16 * __x86_shared_non_temporal_threshold or source
> + and destination do page alias copy from 4 pages at once using
> + non-temporal stores. */
>
> #include <sysdep.h>
>
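Restating points 7)-9) above as a hedged C sketch: the names
non_temporal_threshold, page_alias and pages_per_iteration are placeholders
for this illustration, not glibc identifiers, and the alias test is a rough
reading of the comment rather than the exact bit test used later in the patch.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32   /* example value */

/* Placeholder for __x86_shared_non_temporal_threshold.  */
static const size_t non_temporal_threshold = 1024 * 1024;

/* Rough reading of the page-alias rule in point 8: destination and source
   are treated as aliasing when their page offsets are within
   8 * VEC_SIZE of each other.  */
static bool
page_alias (const void *dst, const void *src)
{
  size_t d = (uintptr_t) dst & (PAGE_SIZE - 1);
  size_t s = (uintptr_t) src & (PAGE_SIZE - 1);
  return ((d - s) & (PAGE_SIZE - 1)) < 8 * VEC_SIZE;
}

/* How many pages at a time the non-temporal copy loop should walk.  */
static int
pages_per_iteration (const void *dst, const void *src, size_t size)
{
  if (size >= 16 * non_temporal_threshold || page_alias (dst, src))
    return 4;
  return 2;
}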
> @@ -67,6 +76,35 @@
> # endif
> #endif
>
> +#ifndef PAGE_SIZE
> +# define PAGE_SIZE 4096
> +#endif
> +
> +#if PAGE_SIZE != 4096
> +# error Unsupported PAGE_SIZE
> +#endif
> +
> +#ifndef LOG_PAGE_SIZE
> +# define LOG_PAGE_SIZE 12
> +#endif
> +
> +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
> +# error Invalid LOG_PAGE_SIZE
> +#endif
> +
> +/* Byte per page for large_memcpy inner loop. */
> +#if VEC_SIZE == 64
> +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
> +#else
> +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
> +#endif
> +
> +
> +/* Amount to shift rdx by to compare for memcpy_large_4x. */
> +#ifndef LOG_4X_MEMCPY_THRESH
> +# define LOG_4X_MEMCPY_THRESH 4
> +#endif
> +
> /* Avoid short distance rep movsb only with non-SSE vector. */
> #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
> # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
> @@ -106,6 +144,28 @@
> # error Unsupported PREFETCH_SIZE!
> #endif
>
> +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
> +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
> + VMOVU (offset)base, vec0; \
> + VMOVU ((offset) + VEC_SIZE)base, vec1;
> +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
> + VMOVNT vec0, (offset)base; \
> + VMOVNT vec1, ((offset) + VEC_SIZE)base;
> +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
> +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> + VMOVU (offset)base, vec0; \
> + VMOVU ((offset) + VEC_SIZE)base, vec1; \
> + VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
> + VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
> +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> + VMOVNT vec0, (offset)base; \
> + VMOVNT vec1, ((offset) + VEC_SIZE)base; \
> + VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
> + VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
> +#else
> +# error Invalid LARGE_LOAD_SIZE
> +#endif
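For readers more comfortable with intrinsics, here is a hedged sketch of what
one LOAD_ONE_SET/STORE_ONE_SET pair amounts to for VEC_SIZE == 32
(illustration only; the macros above expand directly to VMOVU/VMOVNT, and the
offsets here are plain byte counts).

#include <immintrin.h>
#include <stddef.h>

/* One 128-byte "set": four unaligned 32-byte loads, then four non-temporal
   32-byte stores.  dst + offset is assumed 32-byte aligned, which the patch
   arranges by cache-aligning the destination first.  Compile with -mavx.
   After the last set, issue _mm_sfence (), matching the sfence in the
   patch.  */
static void
copy_one_set_nt (char *dst, const char *src, size_t offset)
{
  __m256i v0 = _mm256_loadu_si256 ((const __m256i *) (src + offset));
  __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + offset + 32));
  __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + offset + 64));
  __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + offset + 96));
  _mm256_stream_si256 ((__m256i *) (dst + offset), v0);
  _mm256_stream_si256 ((__m256i *) (dst + offset + 32), v1);
  _mm256_stream_si256 ((__m256i *) (dst + offset + 64), v2);
  _mm256_stream_si256 ((__m256i *) (dst + offset + 96), v3);
}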
> +
> #ifndef SECTION
> # error SECTION is not defined!
> #endif
> @@ -393,6 +453,15 @@ L(last_4x_vec):
> VZEROUPPER_RETURN
>
> L(more_8x_vec):
> + /* Check if non-temporal move candidate. */
> +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> + /* Check non-temporal store threshold. */
> + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> + ja L(large_memcpy_2x)
> +#endif
> + /* Entry if rdx is greater than non-temporal threshold but there
> + is overlap. */
> +L(more_8x_vec_check):
> cmpq %rsi, %rdi
> ja L(more_8x_vec_backward)
> /* Source == destination is less common. */
> @@ -419,11 +488,6 @@ L(more_8x_vec):
> subq %r8, %rdi
> /* Adjust length. */
> addq %r8, %rdx
> -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> - /* Check non-temporal store threshold. */
> - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> - ja L(large_forward)
> -#endif
> L(loop_4x_vec_forward):
> /* Copy 4 * VEC a time forward. */
> VMOVU (%rsi), %VEC(0)
> @@ -470,11 +534,6 @@ L(more_8x_vec_backward):
> subq %r8, %r9
> /* Adjust length. */
> subq %r8, %rdx
> -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> - /* Check non-temporal store threshold. */
> - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> - ja L(large_backward)
> -#endif
> L(loop_4x_vec_backward):
> /* Copy 4 * VEC a time backward. */
> VMOVU (%rcx), %VEC(0)
> @@ -500,72 +559,203 @@ L(loop_4x_vec_backward):
> VZEROUPPER_RETURN
>
> #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -L(large_forward):
> +L(large_memcpy_2x):
> + /* Compute absolute value of difference between source and
> + destination. */
> + movq %rdi, %r9
> + subq %rsi, %r9
> + movq %r9, %r8
> + leaq -1(%r9), %rcx
> + sarq $63, %r8
> + xorq %r8, %r9
> + subq %r8, %r9
> /* Don't use non-temporal store if there is overlap between
> - destination and source since destination may be in cache
> - when source is loaded. */
> - leaq (%rdi, %rdx), %r10
> - cmpq %r10, %rsi
> - jb L(loop_4x_vec_forward)
> -L(loop_large_forward):
> - /* Copy 4 * VEC a time forward with non-temporal stores. */
> - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
> + destination and source since destination may be in cache when
> + source is loaded. */
> + cmpq %r9, %rdx
> + ja L(more_8x_vec_check)
> +
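The seven instructions above compute |dst - src| without a branch (and stash
dst - src - 1 in %rcx for the page-alias test further down). A hedged C
equivalent of the overlap check, for illustration only:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Non-temporal stores are only used when the regions cannot overlap,
   i.e. when size <= |dst - src|; otherwise control returns to the regular
   forward/backward loops via L(more_8x_vec_check).  */
static bool
may_use_nt_stores (const void *dst, const void *src, size_t size)
{
  int64_t diff = (int64_t) ((uintptr_t) dst - (uintptr_t) src);
  int64_t sign = diff >> 63;   /* sarq $63: 0 or -1 (arithmetic shift assumed) */
  uint64_t absdiff = (uint64_t) ((diff ^ sign) - sign);   /* xorq; subq */
  return size <= absdiff;
}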
> + /* Cache align destination. First store the first 64 bytes then
> + adjust alignments. */
> + VMOVU (%rsi), %VEC(8)
> +#if VEC_SIZE < 64
> + VMOVU VEC_SIZE(%rsi), %VEC(9)
> +#if VEC_SIZE < 32
> + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
> + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
> +#endif
> +#endif
> + VMOVU %VEC(8), (%rdi)
> +#if VEC_SIZE < 64
> + VMOVU %VEC(9), VEC_SIZE(%rdi)
> +#if VEC_SIZE < 32
> + VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
> + VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
> +#endif
> +#endif
> + /* Adjust source, destination, and size. */
> + MOVQ %rdi, %r8
movq
> + andq $63, %r8
> + /* Get the negative of offset for alignment. */
> + subq $64, %r8
> + /* Adjust source. */
> + subq %r8, %rsi
> + /* Adjust destination which should be aligned now. */
> + subq %r8, %rdi
> + /* Adjust length. */
> + addq %r8, %rdx
> +
> + /* Test if source and destination addresses will alias. If they do
> + the larger pipeline in large_memcpy_4x alleviated the
> + performance drop. */
> + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
> + jz L(large_memcpy_4x)
> +
> + movq %rdx, %r10
> + shrq $LOG_4X_MEMCPY_THRESH, %r10
> + cmp __x86_shared_non_temporal_threshold(%rip), %r10
> + jae L(large_memcpy_4x)
> +
> + /* edx will store remainder size for copying tail. */
> + andl $(PAGE_SIZE * 2 - 1), %edx
> + /* r10 stores outer loop counter. */
> + shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
> + /* Copy 4x VEC at a time from 2 pages. */
> + .p2align 4
If you drop .p2align, will it show up on the glibc benchtest?
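One note on the counter setup a few lines up (hedged restatement, not glibc
code): %r10 already holds size >> LOG_4X_MEMCPY_THRESH from the threshold
comparison, so the extra shift by (LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH
leaves size >> (LOG_PAGE_SIZE + 1), i.e. the number of 2 * PAGE_SIZE blocks
for the outer loop, while the andl keeps the sub-block remainder for the tail:

#include <stddef.h>

#define PAGE_SIZE 4096
#define LOG_PAGE_SIZE 12

/* Split size into whole 2 * PAGE_SIZE blocks for the outer loop and a
   remainder that the tail loop handles after the sfence.  */
static void
split_2x (size_t size, size_t *outer_iters, size_t *tail_bytes)
{
  *outer_iters = size >> (LOG_PAGE_SIZE + 1);   /* size / 8192 */
  *tail_bytes = size & (PAGE_SIZE * 2 - 1);     /* size % 8192 */
}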
> +L(loop_large_memcpy_2x_outer):
> + /* ecx stores inner loop counter. */
> + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> +L(loop_large_memcpy_2x_inner):
> + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
> + /* Load vectors from rsi. */
> + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> + addq $LARGE_LOAD_SIZE, %rsi
> + /* Non-temporal store vectors to rdi. */
> + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> + addq $LARGE_LOAD_SIZE, %rdi
> + decl %ecx
> + jnz L(loop_large_memcpy_2x_inner)
> + addq $PAGE_SIZE, %rdi
> + addq $PAGE_SIZE, %rsi
> + decq %r10
> + jne L(loop_large_memcpy_2x_outer)
> + sfence
> +
> + /* Check if only last 4 loads are needed. */
> + cmpl $(VEC_SIZE * 4), %edx
> + jbe L(large_memcpy_2x_end)
> +
> + /* Handle the last 2 * PAGE_SIZE bytes. Use temporal stores
> + here. The region will fit in cache and it should fit user
> + expectations for the tail of the memcpy region to be hot. */
> +L(loop_large_memcpy_2x_tail):
> + /* Copy 4 * VEC a time forward with temporal stores. */
> + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> VMOVU (%rsi), %VEC(0)
> VMOVU VEC_SIZE(%rsi), %VEC(1)
> VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
> VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
> - addq $PREFETCHED_LOAD_SIZE, %rsi
> - subq $PREFETCHED_LOAD_SIZE, %rdx
> - VMOVNT %VEC(0), (%rdi)
> - VMOVNT %VEC(1), VEC_SIZE(%rdi)
> - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
> - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
> - addq $PREFETCHED_LOAD_SIZE, %rdi
> - cmpq $PREFETCHED_LOAD_SIZE, %rdx
> - ja L(loop_large_forward)
> - sfence
> + addq $(VEC_SIZE * 4), %rsi
> + subl $(VEC_SIZE * 4), %edx
> + VMOVA %VEC(0), (%rdi)
> + VMOVA %VEC(1), VEC_SIZE(%rdi)
> + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
> + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
> + addq $(VEC_SIZE * 4), %rdi
> + cmpl $(VEC_SIZE * 4), %edx
> + ja L(loop_large_memcpy_2x_tail)
> +
> +L(large_memcpy_2x_end):
> /* Store the last 4 * VEC. */
> - VMOVU %VEC(5), (%rcx)
> - VMOVU %VEC(6), -VEC_SIZE(%rcx)
> - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
> - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
> - /* Store the first VEC. */
> - VMOVU %VEC(4), (%r11)
> + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
> +
> + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
> VZEROUPPER_RETURN
>
> -L(large_backward):
> - /* Don't use non-temporal store if there is overlap between
> - destination and source since destination may be in cache
> - when source is loaded. */
> - leaq (%rcx, %rdx), %r10
> - cmpq %r10, %r9
> - jb L(loop_4x_vec_backward)
> -L(loop_large_backward):
> - /* Copy 4 * VEC a time backward with non-temporal stores. */
> - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
> - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
> - VMOVU (%rcx), %VEC(0)
> - VMOVU -VEC_SIZE(%rcx), %VEC(1)
> - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
> - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
> - subq $PREFETCHED_LOAD_SIZE, %rcx
> - subq $PREFETCHED_LOAD_SIZE, %rdx
> - VMOVNT %VEC(0), (%r9)
> - VMOVNT %VEC(1), -VEC_SIZE(%r9)
> - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
> - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
> - subq $PREFETCHED_LOAD_SIZE, %r9
> - cmpq $PREFETCHED_LOAD_SIZE, %rdx
> - ja L(loop_large_backward)
> +L(large_memcpy_4x):
> + movq %rdx, %r10
> + /* edx will store remainder size for copying tail. */
> + andl $(PAGE_SIZE * 4 - 1), %edx
> + /* r10 stores outer loop counter. */
> + shrq $(LOG_PAGE_SIZE + 2), %r10
> + /* Copy 4x VEC at a time from 4 pages. */
> + .p2align 4
If you drop .p2align, will it show up on the glibc benchtest?
> +L(loop_large_memcpy_4x_outer):
> + /* ecx stores inner loop counter. */
> + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> +L(loop_large_memcpy_4x_inner):
> + /* Only one prefetch set per page as doing 4 pages give more time
> + for prefetcher to keep up. */
> + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
> + /* Load vectors from rsi. */
> + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> + LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> + LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> + addq $LARGE_LOAD_SIZE, %rsi
> + /* Non-temporal store vectors to rdi. */
> + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> + STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> + STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> + addq $LARGE_LOAD_SIZE, %rdi
> + decl %ecx
> + jnz L(loop_large_memcpy_4x_inner)
> + addq $(PAGE_SIZE * 3), %rdi
> + addq $(PAGE_SIZE * 3), %rsi
> + decq %r10
> + jne L(loop_large_memcpy_4x_outer)
> sfence
> - /* Store the first 4 * VEC. */
> - VMOVU %VEC(4), (%rdi)
> - VMOVU %VEC(5), VEC_SIZE(%rdi)
> - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
> - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
> - /* Store the last VEC. */
> - VMOVU %VEC(8), (%r11)
> +
> + /* Check if only last 4 loads are needed. */
> + cmpl $(VEC_SIZE * 4), %edx
> + jbe L(large_memcpy_4x_end)
> +
> + /* Handle the last 4 * PAGE_SIZE bytes. */
> +L(loop_large_memcpy_4x_tail):
> + /* Copy 4 * VEC a time forward with temporal stores. */
> + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> + VMOVU (%rsi), %VEC(0)
> + VMOVU VEC_SIZE(%rsi), %VEC(1)
> + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
> + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
> + addq $(VEC_SIZE * 4), %rsi
> + subl $(VEC_SIZE * 4), %edx
> + VMOVA %VEC(0), (%rdi)
> + VMOVA %VEC(1), VEC_SIZE(%rdi)
> + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
> + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
> + addq $(VEC_SIZE * 4), %rdi
> + cmpl $(VEC_SIZE * 4), %edx
> + ja L(loop_large_memcpy_4x_tail)
> +
> +L(large_memcpy_4x_end):
> + /* Store the last 4 * VEC. */
> + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
> +
> + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
> VZEROUPPER_RETURN
> #endif
> END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> --
> 2.29.2
>
On Thu, Apr 1, 2021 at 9:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Mar 30, 2021 at 2:38 PM noah <goldstein.w.n@gmail.com> wrote:
> >
> > No Bug. This commit updates the large memcpy case (no overlap). The
> > update is to perform memcpy on either 2 or 4 contiguous pages at
> > once. This 1) helps to alleviate the effects of false memory aliasing
> > when destination and source have a close 4k alignment and 2) is, in
> > most cases and for most DRAM units, a modestly more efficient access
> > pattern. These changes are a clear performance improvement for
> > VEC_SIZE=16/32, though the results for VEC_SIZE=64 are more ambiguous.
> > test-memcpy, test-memccpy, test-mempcpy, test-memmove, and
> > tst-memmove-overflow all pass.
> >
> > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > ---
> > .../multiarch/memmove-vec-unaligned-erms.S | 326 ++++++++++++++----
> > 1 file changed, 258 insertions(+), 68 deletions(-)
>
> 1. There are many trailing whitespaces.
done.
> 2. Replace "jccSPACElabel" with "jccTABlabel".
done.
> 3. Replace "insnSPACEoperand" with "insnTABoperand" if needed.
done.
>
> > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > index 897a3d9762..dae3e2bac5 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > @@ -35,7 +35,16 @@
> > __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
> > 7. If size >= __x86_shared_non_temporal_threshold and there is no
> > overlap between destination and source, use non-temporal store
> > - instead of aligned store. */
> > + instead of aligned store copying from either 2 or 4 pages at
> > + once.
> > + 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
> > + and source and destination do not page alias, copy from 2 pages
> > + at once using non-temporal stores. Page aliasing in this case is
> > + considered true if destination's page alignment - sources' page
> > + alignment is less than 8 * VEC_SIZE.
> > + 9. If size >= 16 * __x86_shared_non_temporal_threshold or source
> > + and destination do page alias copy from 4 pages at once using
> > + non-temporal stores. */
> >
> > #include <sysdep.h>
> >
> > @@ -67,6 +76,35 @@
> > # endif
> > #endif
> >
> > +#ifndef PAGE_SIZE
> > +# define PAGE_SIZE 4096
> > +#endif
> > +
> > +#if PAGE_SIZE != 4096
> > +# error Unsupported PAGE_SIZE
> > +#endif
> > +
> > +#ifndef LOG_PAGE_SIZE
> > +# define LOG_PAGE_SIZE 12
> > +#endif
> > +
> > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
> > +# error Invalid LOG_PAGE_SIZE
> > +#endif
> > +
> > +/* Byte per page for large_memcpy inner loop. */
> > +#if VEC_SIZE == 64
> > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
> > +#else
> > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
> > +#endif
> > +
> > +
> > +/* Amount to shift rdx by to compare for memcpy_large_4x. */
> > +#ifndef LOG_4X_MEMCPY_THRESH
> > +# define LOG_4X_MEMCPY_THRESH 4
> > +#endif
> > +
> > /* Avoid short distance rep movsb only with non-SSE vector. */
> > #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
> > # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
> > @@ -106,6 +144,28 @@
> > # error Unsupported PREFETCH_SIZE!
> > #endif
> >
> > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
> > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
> > + VMOVU (offset)base, vec0; \
> > + VMOVU ((offset) + VEC_SIZE)base, vec1;
> > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
> > + VMOVNT vec0, (offset)base; \
> > + VMOVNT vec1, ((offset) + VEC_SIZE)base;
> > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
> > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > + VMOVU (offset)base, vec0; \
> > + VMOVU ((offset) + VEC_SIZE)base, vec1; \
> > + VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
> > + VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
> > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > + VMOVNT vec0, (offset)base; \
> > + VMOVNT vec1, ((offset) + VEC_SIZE)base; \
> > + VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
> > + VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
> > +#else
> > +# error Invalid LARGE_LOAD_SIZE
> > +#endif
> > +
> > #ifndef SECTION
> > # error SECTION is not defined!
> > #endif
> > @@ -393,6 +453,15 @@ L(last_4x_vec):
> > VZEROUPPER_RETURN
> >
> > L(more_8x_vec):
> > + /* Check if non-temporal move candidate. */
> > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > + /* Check non-temporal store threshold. */
> > + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > + ja L(large_memcpy_2x)
> > +#endif
> > + /* Entry if rdx is greater than non-temporal threshold but there
> > + is overlap. */
> > +L(more_8x_vec_check):
> > cmpq %rsi, %rdi
> > ja L(more_8x_vec_backward)
> > /* Source == destination is less common. */
> > @@ -419,11 +488,6 @@ L(more_8x_vec):
> > subq %r8, %rdi
> > /* Adjust length. */
> > addq %r8, %rdx
> > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > - /* Check non-temporal store threshold. */
> > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > - ja L(large_forward)
> > -#endif
> > L(loop_4x_vec_forward):
> > /* Copy 4 * VEC a time forward. */
> > VMOVU (%rsi), %VEC(0)
> > @@ -470,11 +534,6 @@ L(more_8x_vec_backward):
> > subq %r8, %r9
> > /* Adjust length. */
> > subq %r8, %rdx
> > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > - /* Check non-temporal store threshold. */
> > - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > - ja L(large_backward)
> > -#endif
> > L(loop_4x_vec_backward):
> > /* Copy 4 * VEC a time backward. */
> > VMOVU (%rcx), %VEC(0)
> > @@ -500,72 +559,203 @@ L(loop_4x_vec_backward):
> > VZEROUPPER_RETURN
> >
> > #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > -L(large_forward):
> > +L(large_memcpy_2x):
> > + /* Compute absolute value of difference between source and
> > + destination. */
> > + movq %rdi, %r9
> > + subq %rsi, %r9
> > + movq %r9, %r8
> > + leaq -1(%r9), %rcx
> > + sarq $63, %r8
> > + xorq %r8, %r9
> > + subq %r8, %r9
> > /* Don't use non-temporal store if there is overlap between
> > - destination and source since destination may be in cache
> > - when source is loaded. */
> > - leaq (%rdi, %rdx), %r10
> > - cmpq %r10, %rsi
> > - jb L(loop_4x_vec_forward)
> > -L(loop_large_forward):
> > - /* Copy 4 * VEC a time forward with non-temporal stores. */
> > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
> > + destination and source since destination may be in cache when
> > + source is loaded. */
> > + cmpq %r9, %rdx
> > + ja L(more_8x_vec_check)
> > +
> > + /* Cache align destination. First store the first 64 bytes then
> > + adjust alignments. */
> > + VMOVU (%rsi), %VEC(8)
> > +#if VEC_SIZE < 64
> > + VMOVU VEC_SIZE(%rsi), %VEC(9)
> > +#if VEC_SIZE < 32
> > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
> > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
> > +#endif
> > +#endif
> > + VMOVU %VEC(8), (%rdi)
> > +#if VEC_SIZE < 64
> > + VMOVU %VEC(9), VEC_SIZE(%rdi)
> > +#if VEC_SIZE < 32
> > + VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
> > + VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
> > +#endif
> > +#endif
> > + /* Adjust source, destination, and size. */
> > + MOVQ %rdi, %r8
>
> movq
done.
>
> > + andq $63, %r8
> > + /* Get the negative of offset for alignment. */
> > + subq $64, %r8
> > + /* Adjust source. */
> > + subq %r8, %rsi
> > + /* Adjust destination which should be aligned now. */
> > + subq %r8, %rdi
> > + /* Adjust length. */
> > + addq %r8, %rdx
> > +
> > + /* Test if source and destination addresses will alias. If they do
> > + the larger pipeline in large_memcpy_4x alleviated the
> > + performance drop. */
> > + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
> > + jz L(large_memcpy_4x)
> > +
> > + movq %rdx, %r10
> > + shrq $LOG_4X_MEMCPY_THRESH, %r10
> > + cmp __x86_shared_non_temporal_threshold(%rip), %r10
> > + jae L(large_memcpy_4x)
> > +
> > + /* edx will store remainder size for copying tail. */
> > + andl $(PAGE_SIZE * 2 - 1), %edx
> > + /* r10 stores outer loop counter. */
> > + shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
> > + /* Copy 4x VEC at a time from 2 pages. */
> > + .p2align 4
>
> If you drop .p2align, will it show up on the glibc benchtest?
No. Dropped it.
>
> > +L(loop_large_memcpy_2x_outer):
> > + /* ecx stores inner loop counter. */
> > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > +L(loop_large_memcpy_2x_inner):
> > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
> > + /* Load vectors from rsi. */
> > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > + addq $LARGE_LOAD_SIZE, %rsi
> > + /* Non-temporal store vectors to rdi. */
> > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > + addq $LARGE_LOAD_SIZE, %rdi
> > + decl %ecx
> > + jnz L(loop_large_memcpy_2x_inner)
> > + addq $PAGE_SIZE, %rdi
> > + addq $PAGE_SIZE, %rsi
> > + decq %r10
> > + jne L(loop_large_memcpy_2x_outer)
> > + sfence
> > +
> > + /* Check if only last 4 loads are needed. */
> > + cmpl $(VEC_SIZE * 4), %edx
> > + jbe L(large_memcpy_2x_end)
> > +
> > + /* Handle the last 2 * PAGE_SIZE bytes. Use temporal stores
> > + here. The region will fit in cache and it should fit user
> > + expectations for the tail of the memcpy region to be hot. */
> > +L(loop_large_memcpy_2x_tail):
> > + /* Copy 4 * VEC a time forward with temporal stores. */
> > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> > VMOVU (%rsi), %VEC(0)
> > VMOVU VEC_SIZE(%rsi), %VEC(1)
> > VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
> > VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
> > - addq $PREFETCHED_LOAD_SIZE, %rsi
> > - subq $PREFETCHED_LOAD_SIZE, %rdx
> > - VMOVNT %VEC(0), (%rdi)
> > - VMOVNT %VEC(1), VEC_SIZE(%rdi)
> > - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
> > - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
> > - addq $PREFETCHED_LOAD_SIZE, %rdi
> > - cmpq $PREFETCHED_LOAD_SIZE, %rdx
> > - ja L(loop_large_forward)
> > - sfence
> > + addq $(VEC_SIZE * 4), %rsi
> > + subl $(VEC_SIZE * 4), %edx
> > + VMOVA %VEC(0), (%rdi)
> > + VMOVA %VEC(1), VEC_SIZE(%rdi)
> > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
> > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
> > + addq $(VEC_SIZE * 4), %rdi
> > + cmpl $(VEC_SIZE * 4), %edx
> > + ja L(loop_large_memcpy_2x_tail)
> > +
> > +L(large_memcpy_2x_end):
> > /* Store the last 4 * VEC. */
> > - VMOVU %VEC(5), (%rcx)
> > - VMOVU %VEC(6), -VEC_SIZE(%rcx)
> > - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
> > - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
> > - /* Store the first VEC. */
> > - VMOVU %VEC(4), (%r11)
> > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > +
> > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
> > VZEROUPPER_RETURN
> >
> > -L(large_backward):
> > - /* Don't use non-temporal store if there is overlap between
> > - destination and source since destination may be in cache
> > - when source is loaded. */
> > - leaq (%rcx, %rdx), %r10
> > - cmpq %r10, %r9
> > - jb L(loop_4x_vec_backward)
> > -L(loop_large_backward):
> > - /* Copy 4 * VEC a time backward with non-temporal stores. */
> > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
> > - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
> > - VMOVU (%rcx), %VEC(0)
> > - VMOVU -VEC_SIZE(%rcx), %VEC(1)
> > - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
> > - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > - subq $PREFETCHED_LOAD_SIZE, %rcx
> > - subq $PREFETCHED_LOAD_SIZE, %rdx
> > - VMOVNT %VEC(0), (%r9)
> > - VMOVNT %VEC(1), -VEC_SIZE(%r9)
> > - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
> > - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
> > - subq $PREFETCHED_LOAD_SIZE, %r9
> > - cmpq $PREFETCHED_LOAD_SIZE, %rdx
> > - ja L(loop_large_backward)
> > +L(large_memcpy_4x):
> > + movq %rdx, %r10
> > + /* edx will store remainder size for copying tail. */
> > + andl $(PAGE_SIZE * 4 - 1), %edx
> > + /* r10 stores outer loop counter. */
> > + shrq $(LOG_PAGE_SIZE + 2), %r10
> > + /* Copy 4x VEC at a time from 4 pages. */
> > + .p2align 4
>
> If you drop .p2align, will it show up on the glibc benchtest?
No. Dropped it.
>
> > +L(loop_large_memcpy_4x_outer):
> > + /* ecx stores inner loop counter. */
> > + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > +L(loop_large_memcpy_4x_inner):
> > + /* Only one prefetch set per page as doing 4 pages give more time
> > + for prefetcher to keep up. */
> > + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> > + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
> > + /* Load vectors from rsi. */
> > + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > + LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > + addq $LARGE_LOAD_SIZE, %rsi
> > + /* Non-temporal store vectors to rdi. */
> > + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > + STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > + STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > + addq $LARGE_LOAD_SIZE, %rdi
> > + decl %ecx
> > + jnz L(loop_large_memcpy_4x_inner)
> > + addq $(PAGE_SIZE * 3), %rdi
> > + addq $(PAGE_SIZE * 3), %rsi
> > + decq %r10
> > + jne L(loop_large_memcpy_4x_outer)
> > sfence
> > - /* Store the first 4 * VEC. */
> > - VMOVU %VEC(4), (%rdi)
> > - VMOVU %VEC(5), VEC_SIZE(%rdi)
> > - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
> > - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
> > - /* Store the last VEC. */
> > - VMOVU %VEC(8), (%r11)
> > +
> > + /* Check if only last 4 loads are needed. */
> > + cmpl $(VEC_SIZE * 4), %edx
> > + jbe L(large_memcpy_4x_end)
> > +
> > + /* Handle the last 4 * PAGE_SIZE bytes. */
> > +L(loop_large_memcpy_4x_tail):
> > + /* Copy 4 * VEC a time forward with temporal stores. */
> > + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> > + VMOVU (%rsi), %VEC(0)
> > + VMOVU VEC_SIZE(%rsi), %VEC(1)
> > + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
> > + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
> > + addq $(VEC_SIZE * 4), %rsi
> > + subl $(VEC_SIZE * 4), %edx
> > + VMOVA %VEC(0), (%rdi)
> > + VMOVA %VEC(1), VEC_SIZE(%rdi)
> > + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
> > + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
> > + addq $(VEC_SIZE * 4), %rdi
> > + cmpl $(VEC_SIZE * 4), %edx
> > + ja L(loop_large_memcpy_4x_tail)
> > +
> > +L(large_memcpy_4x_end):
> > + /* Store the last 4 * VEC. */
> > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > +
> > + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
> > VZEROUPPER_RETURN
> > #endif
> > END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> > --
> > 2.29.2
> >
>
>
> --
> H.J.