[v4,4/5] x86: Optimize memmove-vec-unaligned-erms.S
Checks

dj/TryBot-apply_patch: success (Patch applied to master at the time it was sent)
Commit Message
No bug.

The optimizations are as follows:

1) Always align entry to 64 bytes. This makes behavior more
predictable and makes other frontend optimizations easier.

2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
significant benefits in the case that:
0 < (dst - src) < [256, 512]

3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
improvement and for FSRM [-10%, 25%].

In addition to these primary changes there is general cleanup
throughout to optimize the aligning routines and control flow logic.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/memmove.S | 2 +-
.../memmove-avx-unaligned-erms-rtm.S | 2 +-
.../multiarch/memmove-avx-unaligned-erms.S | 2 +-
.../multiarch/memmove-avx512-unaligned-erms.S | 2 +-
.../multiarch/memmove-evex-unaligned-erms.S | 2 +-
.../multiarch/memmove-vec-unaligned-erms.S | 595 +++++++++++-------
6 files changed, 381 insertions(+), 224 deletions(-)
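
A note on optimization 2 before the diff: L(more_8x_vec) packs the
overlap test and the 4k-aliasing test into a handful of ALU ops. The
following is a minimal C sketch of that decision, assuming 64-bit
pointers; use_forward_copy and its names are illustrative only, the
real logic is the x86-64 assembly in the diff below.

#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096

/* Illustrative model (not glibc code): nonzero means take the
   forward 4x-VEC loop, zero means take the backward loop.  */
static int
use_forward_copy (uintptr_t dst, uintptr_t src, size_t len)
{
  uintptr_t diff = dst - src;

  /* (unsigned) (dst - src) < len: overlap with dst above src, so the
     copy must run backward (the initial cmpq/jb).  */
  if (diff < len)
    return 0;

  /* If diff and diff + len differ in sign bit, src > dst with
     overlap, so correctness requires forward (leaq/xorq/shrq).  */
  uintptr_t forced_forward = (diff ^ (diff + len)) >> 63;

  /* Bits 8..11 of dst - src.  Zero means the distance mod 4k is
     small, so a forward loop would hit 4k false aliasing and the
     backward loop is preferred.  */
  uintptr_t no_alias = diff & (PAGE_SIZE - 256);

  /* The asm adds the two terms and branches once with jz.  */
  return (forced_forward + no_alias) != 0;
}

This is where the commit message's "0 < (dst - src) < [256, 512]" win
comes from: those distances used to take the forward loop and pay the
false-aliasing penalty on every iteration.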
Comments
On Sat, Nov 6, 2021 at 11:33 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug.
>
> The optimizations are as follows:
>
> 1) Always align entry to 64 bytes. This makes behavior more
> predictable and makes other frontend optimizations easier.
>
> 2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
> significant benefits in the case that:
> 0 < (dst - src) < [256, 512]
>
> 3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
> improvement and for FSRM [-10%, 25%].
>
> In addition to these primary changes there is general cleanup
> throughout to optimize the aligning routines and control flow logic.
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> sysdeps/x86_64/memmove.S | 2 +-
> .../memmove-avx-unaligned-erms-rtm.S | 2 +-
> .../multiarch/memmove-avx-unaligned-erms.S | 2 +-
> .../multiarch/memmove-avx512-unaligned-erms.S | 2 +-
> .../multiarch/memmove-evex-unaligned-erms.S | 2 +-
> .../multiarch/memmove-vec-unaligned-erms.S | 595 +++++++++++-------
> 6 files changed, 381 insertions(+), 224 deletions(-)
>
> diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
> index db106a7a1f..b2b3180848 100644
> --- a/sysdeps/x86_64/memmove.S
> +++ b/sysdeps/x86_64/memmove.S
> @@ -25,7 +25,7 @@
> /* Use movups and movaps for smaller code sizes. */
> #define VMOVU movups
> #define VMOVA movaps
> -
> +#define MOV_SIZE 3
> #define SECTION(p) p
>
> #ifdef USE_MULTIARCH
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> index 1ec1962e86..67a55f0c85 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> @@ -4,7 +4,7 @@
> # define VMOVNT vmovntdq
> # define VMOVU vmovdqu
> # define VMOVA vmovdqa
> -
> +# define MOV_SIZE 4
> # define ZERO_UPPER_VEC_REGISTERS_RETURN \
> ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> index e195e93f15..975ae6c051 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> @@ -4,7 +4,7 @@
> # define VMOVNT vmovntdq
> # define VMOVU vmovdqu
> # define VMOVA vmovdqa
> -
> +# define MOV_SIZE 4
> # define SECTION(p) p##.avx
> # define MEMMOVE_SYMBOL(p,s) p##_avx_##s
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> index 848848ab39..0fa7126830 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> @@ -25,7 +25,7 @@
> # define VMOVU vmovdqu64
> # define VMOVA vmovdqa64
> # define VZEROUPPER
> -
> +# define MOV_SIZE 6
> # define SECTION(p) p##.evex512
> # define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> index 0cbce8f944..88715441fe 100644
> --- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> @@ -25,7 +25,7 @@
> # define VMOVU vmovdqu64
> # define VMOVA vmovdqa64
> # define VZEROUPPER
> -
> +# define MOV_SIZE 6
> # define SECTION(p) p##.evex
> # define MEMMOVE_SYMBOL(p,s) p##_evex_##s
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index abde8438d4..7b27cbdda5 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -76,6 +76,25 @@
> # endif
> #endif
>
> +/* Whether to align before movsb. Ultimately we want 64 byte
> + alignment, and it is not worth loading 4x VEC for VEC_SIZE == 16. */
> +#define ALIGN_MOVSB (VEC_SIZE > 16)
> +/* Number of bytes to align movsb to. */
> +#define MOVSB_ALIGN_TO 64
> +
> +#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
> +#define LARGE_MOV_SIZE (MOV_SIZE > 4)
> +
> +#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
> +# error MOV_SIZE Unknown
> +#endif
> +
> +#if LARGE_MOV_SIZE
> +# define SMALL_SIZE_OFFSET (4)
> +#else
> +# define SMALL_SIZE_OFFSET (0)
> +#endif
> +
> #ifndef PAGE_SIZE
> # define PAGE_SIZE 4096
> #endif
> @@ -199,25 +218,21 @@ L(start):
> # endif
> cmp $VEC_SIZE, %RDX_LP
> jb L(less_vec)
> + /* Load regardless. */
> + VMOVU (%rsi), %VEC(0)
> cmp $(VEC_SIZE * 2), %RDX_LP
> ja L(more_2x_vec)
> -#if !defined USE_MULTIARCH || !IS_IN (libc)
> -L(last_2x_vec):
> -#endif
> /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> - VMOVU (%rsi), %VEC(0)
> VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
> VMOVU %VEC(0), (%rdi)
> VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
> -#if !defined USE_MULTIARCH || !IS_IN (libc)
> -L(nop):
> - ret
> +#if !(defined USE_MULTIARCH && IS_IN (libc))
> + ZERO_UPPER_VEC_REGISTERS_RETURN
> #else
> VZEROUPPER_RETURN
> #endif
> #if defined USE_MULTIARCH && IS_IN (libc)
> END (MEMMOVE_SYMBOL (__memmove, unaligned))
> -
> # if VEC_SIZE == 16
> ENTRY (__mempcpy_chk_erms)
> cmp %RDX_LP, %RCX_LP
> @@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
> END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
> # endif
>
> -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> +ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
> movq %rdi, %rax
> L(start_erms):
> # ifdef __ILP32__
> @@ -298,310 +313,448 @@ L(start_erms):
> # endif
> cmp $VEC_SIZE, %RDX_LP
> jb L(less_vec)
> + /* Load regardless. */
> + VMOVU (%rsi), %VEC(0)
> cmp $(VEC_SIZE * 2), %RDX_LP
> ja L(movsb_more_2x_vec)
> -L(last_2x_vec):
> - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> - VMOVU (%rsi), %VEC(0)
> - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
> + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
> + */
> + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)
> VMOVU %VEC(0), (%rdi)
> - VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
> + VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)
> L(return):
> -#if VEC_SIZE > 16
> +# if VEC_SIZE > 16
> ZERO_UPPER_VEC_REGISTERS_RETURN
> -#else
> +# else
> ret
> +# endif
> #endif
>
> -L(movsb):
> - cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> - jae L(more_8x_vec)
> - cmpq %rsi, %rdi
> - jb 1f
> - /* Source == destination is less common. */
> - je L(nop)
> - leaq (%rsi,%rdx), %r9
> - cmpq %r9, %rdi
> - /* Avoid slow backward REP MOVSB. */
> - jb L(more_8x_vec_backward)
> -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> - testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> - jz 3f
> - movq %rdi, %rcx
> - subq %rsi, %rcx
> - jmp 2f
> -# endif
> -1:
> -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> - testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> - jz 3f
> - movq %rsi, %rcx
> - subq %rdi, %rcx
> -2:
> -/* Avoid "rep movsb" if RCX, the distance between source and destination,
> - is N*4GB + [1..63] with N >= 0. */
> - cmpl $63, %ecx
> - jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */
> -3:
> -# endif
> - mov %RDX_LP, %RCX_LP
> - rep movsb
> -L(nop):
> +#if LARGE_MOV_SIZE
> + /* If LARGE_MOV_SIZE this fits in the aligning bytes between the
> + ENTRY block and L(less_vec). */
> + .p2align 4,, 8
> +L(between_4_7):
> + /* From 4 to 7. No branch when size == 4. */
> + movl (%rsi), %ecx
> + movl (%rsi, %rdx), %esi
> + movl %ecx, (%rdi)
> + movl %esi, (%rdi, %rdx)
> ret
> #endif
>
> + .p2align 4
> L(less_vec):
> /* Less than 1 VEC. */
> #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> # error Unsupported VEC_SIZE!
> #endif
> #if VEC_SIZE > 32
> - cmpb $32, %dl
> + cmpl $32, %edx
> jae L(between_32_63)
> #endif
> #if VEC_SIZE > 16
> - cmpb $16, %dl
> + cmpl $16, %edx
> jae L(between_16_31)
> #endif
> - cmpb $8, %dl
> + cmpl $8, %edx
> jae L(between_8_15)
> - cmpb $4, %dl
> +#if SMALL_MOV_SIZE
> + cmpl $4, %edx
> +#else
> + subq $4, %rdx
> +#endif
> jae L(between_4_7)
> - cmpb $1, %dl
> - ja L(between_2_3)
> - jb 1f
> - movzbl (%rsi), %ecx
> + cmpl $(1 - SMALL_SIZE_OFFSET), %edx
> + jl L(copy_0)
> + movb (%rsi), %cl
> + je L(copy_1)
> + movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
> + movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
> +L(copy_1):
> movb %cl, (%rdi)
> -1:
> +L(copy_0):
> ret
> +
> +#if SMALL_MOV_SIZE
> + .p2align 4,, 8
> +L(between_4_7):
> + /* From 4 to 7. No branch when size == 4. */
> + movl -4(%rsi, %rdx), %ecx
> + movl (%rsi), %esi
> + movl %ecx, -4(%rdi, %rdx)
> + movl %esi, (%rdi)
> + ret
> +#endif
> +
> +#if VEC_SIZE > 16
> + /* From 16 to 31. No branch when size == 16. */
> + .p2align 4,, 8
> +L(between_16_31):
> + vmovdqu (%rsi), %xmm0
> + vmovdqu -16(%rsi, %rdx), %xmm1
> + vmovdqu %xmm0, (%rdi)
> + vmovdqu %xmm1, -16(%rdi, %rdx)
> + /* No ymm registers have been touched. */
> + ret
> +#endif
> +
> #if VEC_SIZE > 32
> + .p2align 4,, 10
> L(between_32_63):
> /* From 32 to 63. No branch when size == 32. */
> VMOVU (%rsi), %YMM0
> - VMOVU -32(%rsi,%rdx), %YMM1
> + VMOVU -32(%rsi, %rdx), %YMM1
> VMOVU %YMM0, (%rdi)
> - VMOVU %YMM1, -32(%rdi,%rdx)
> - VZEROUPPER_RETURN
> -#endif
> -#if VEC_SIZE > 16
> - /* From 16 to 31. No branch when size == 16. */
> -L(between_16_31):
> - VMOVU (%rsi), %XMM0
> - VMOVU -16(%rsi,%rdx), %XMM1
> - VMOVU %XMM0, (%rdi)
> - VMOVU %XMM1, -16(%rdi,%rdx)
> + VMOVU %YMM1, -32(%rdi, %rdx)
> VZEROUPPER_RETURN
> #endif
> +
> + .p2align 4,, 10
> L(between_8_15):
> /* From 8 to 15. No branch when size == 8. */
> - movq -8(%rsi,%rdx), %rcx
> + movq -8(%rsi, %rdx), %rcx
> movq (%rsi), %rsi
> - movq %rcx, -8(%rdi,%rdx)
> movq %rsi, (%rdi)
> + movq %rcx, -8(%rdi, %rdx)
> ret
> -L(between_4_7):
> - /* From 4 to 7. No branch when size == 4. */
> - movl -4(%rsi,%rdx), %ecx
> - movl (%rsi), %esi
> - movl %ecx, -4(%rdi,%rdx)
> - movl %esi, (%rdi)
> - ret
> -L(between_2_3):
> - /* From 2 to 3. No branch when size == 2. */
> - movzwl -2(%rsi,%rdx), %ecx
> - movzwl (%rsi), %esi
> - movw %cx, -2(%rdi,%rdx)
> - movw %si, (%rdi)
> - ret
>
> + .p2align 4,, 10
> +L(last_4x_vec):
> + /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
> +
> + /* VEC(0) and VEC(1) have already been loaded. */
> + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2)
> + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
> + VMOVU %VEC(0), (%rdi)
> + VMOVU %VEC(1), VEC_SIZE(%rdi)
> + VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx)
> + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
> + VZEROUPPER_RETURN
> +
> + .p2align 4
> #if defined USE_MULTIARCH && IS_IN (libc)
> L(movsb_more_2x_vec):
> cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
> ja L(movsb)
> #endif
> L(more_2x_vec):
> - /* More than 2 * VEC and there may be overlap between destination
> - and source. */
> + /* More than 2 * VEC and there may be overlap between
> + destination and source. */
> cmpq $(VEC_SIZE * 8), %rdx
> ja L(more_8x_vec)
> + /* Load VEC(1) regardless. VEC(0) has already been loaded. */
> + VMOVU VEC_SIZE(%rsi), %VEC(1)
> cmpq $(VEC_SIZE * 4), %rdx
> jbe L(last_4x_vec)
> - /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
> - VMOVU (%rsi), %VEC(0)
> - VMOVU VEC_SIZE(%rsi), %VEC(1)
> + /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
> VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
> VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
> - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
> - VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
> - VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
> - VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
> + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4)
> + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
> + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
> + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
> VMOVU %VEC(0), (%rdi)
> VMOVU %VEC(1), VEC_SIZE(%rdi)
> VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
> VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
> - VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
> - VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
> - VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
> - VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
> - VZEROUPPER_RETURN
> -L(last_4x_vec):
> - /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
> - VMOVU (%rsi), %VEC(0)
> - VMOVU VEC_SIZE(%rsi), %VEC(1)
> - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
> - VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
> - VMOVU %VEC(0), (%rdi)
> - VMOVU %VEC(1), VEC_SIZE(%rdi)
> - VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
> - VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
> + VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx)
> + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
> VZEROUPPER_RETURN
>
> + .p2align 4,, 4
> L(more_8x_vec):
> + movq %rdi, %rcx
> + subq %rsi, %rcx
> + /* If there is any overlap, go to the backward temporal copy:
> + backward REP MOVSB is slow and we don't want to use NT stores
> + when there is overlap. */
> + cmpq %rdx, %rcx
> + /* L(more_8x_vec_backward_check_nop) checks for src == dst. */
> + jb L(more_8x_vec_backward_check_nop)
> /* Check if non-temporal move candidate. */
> #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> /* Check non-temporal store threshold. */
> - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> ja L(large_memcpy_2x)
> #endif
> - /* Entry if rdx is greater than non-temporal threshold but there
> - is overlap. */
> + /* To reach this point there cannot be overlap with dst > src.
> + So check for overlap with src > dst, in which case correctness
> + requires a forward copy. Otherwise decide between backward and
> + forward copy depending on address aliasing. */
> +
> + /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
> + but less than __x86_shared_non_temporal_threshold. */
> L(more_8x_vec_check):
> - cmpq %rsi, %rdi
> - ja L(more_8x_vec_backward)
> - /* Source == destination is less common. */
> - je L(nop)
> - /* Load the first VEC and last 4 * VEC to support overlapping
> - addresses. */
> - VMOVU (%rsi), %VEC(4)
> + /* rcx contains dst - src. Add back length (rdx). */
> + leaq (%rcx, %rdx), %r8
> + /* If r8 has different sign than rcx then there is overlap so we
> + must do forward copy. */
> + xorq %rcx, %r8
> + /* Isolate just sign bit of r8. */
> + shrq $63, %r8
> + /* Get 4k difference dst - src. */
> + andl $(PAGE_SIZE - 256), %ecx
> + /* If r8 is non-zero we must copy forward for correctness.
> + Otherwise, if ecx is zero the forward loop would suffer 4k
> + false aliasing, so do a backward copy. */
> + addl %r8d, %ecx
> + jz L(more_8x_vec_backward)
> +
> + /* Entered if rdx is above __x86_shared_non_temporal_threshold
> + but there is overlap, or from the short-distance movsb check. */
> +L(more_8x_vec_forward):
> + /* Load first and last 4 * VEC to support overlapping addresses.
> + */
> +
> + /* First vec was already loaded into VEC(0). */
> VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
> VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
> + /* Save beginning of dst. */
> + movq %rdi, %rcx
> + /* Inclusively align dst to VEC_SIZE - 1. */
> + orq $(VEC_SIZE - 1), %rdi
> VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
> VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
> - /* Save start and stop of the destination buffer. */
> - movq %rdi, %r11
> - leaq -VEC_SIZE(%rdi, %rdx), %rcx
> - /* Align destination for aligned stores in the loop. Compute
> - how much destination is misaligned. */
> - movq %rdi, %r8
> - andq $(VEC_SIZE - 1), %r8
> - /* Get the negative of offset for alignment. */
> - subq $VEC_SIZE, %r8
> - /* Adjust source. */
> - subq %r8, %rsi
> - /* Adjust destination which should be aligned now. */
> - subq %r8, %rdi
> - /* Adjust length. */
> - addq %r8, %rdx
>
> - .p2align 4
> + /* Subtract dst from src. Add back after dst aligned. */
> + subq %rcx, %rsi
> + /* Finish aligning dst. */
> + incq %rdi
> + /* Restore src adjusted with new value for aligned dst. */
> + addq %rdi, %rsi
> + /* Store end of buffer minus tail in rdx. */
> + leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx
> +
> + /* Don't use multi-byte nop to align. */
> + .p2align 4,, 11
> L(loop_4x_vec_forward):
> /* Copy 4 * VEC a time forward. */
> - VMOVU (%rsi), %VEC(0)
> - VMOVU VEC_SIZE(%rsi), %VEC(1)
> - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
> - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
> + VMOVU (%rsi), %VEC(1)
> + VMOVU VEC_SIZE(%rsi), %VEC(2)
> + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3)
> + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4)
> subq $-(VEC_SIZE * 4), %rsi
> - addq $-(VEC_SIZE * 4), %rdx
> - VMOVA %VEC(0), (%rdi)
> - VMOVA %VEC(1), VEC_SIZE(%rdi)
> - VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
> - VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
> + VMOVA %VEC(1), (%rdi)
> + VMOVA %VEC(2), VEC_SIZE(%rdi)
> + VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi)
> + VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi)
> subq $-(VEC_SIZE * 4), %rdi
> - cmpq $(VEC_SIZE * 4), %rdx
> + cmpq %rdi, %rdx
> ja L(loop_4x_vec_forward)
> /* Store the last 4 * VEC. */
> - VMOVU %VEC(5), (%rcx)
> - VMOVU %VEC(6), -VEC_SIZE(%rcx)
> - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
> - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
> + VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx)
> + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx)
> + VMOVU %VEC(7), VEC_SIZE(%rdx)
> + VMOVU %VEC(8), (%rdx)
> /* Store the first VEC. */
> - VMOVU %VEC(4), (%r11)
> + VMOVU %VEC(0), (%rcx)
> + /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
> + */
> +L(nop_backward):
> VZEROUPPER_RETURN
>
> + .p2align 4,, 8
> +L(more_8x_vec_backward_check_nop):
> + /* rcx contains dst - src. Test for dst == src to skip all of
> + memmove. */
> + testq %rcx, %rcx
> + jz L(nop_backward)
> L(more_8x_vec_backward):
> /* Load the first 4 * VEC and last VEC to support overlapping
> addresses. */
> - VMOVU (%rsi), %VEC(4)
> +
> + /* First vec was also loaded into VEC(0). */
> VMOVU VEC_SIZE(%rsi), %VEC(5)
> VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
> + /* Beginning of region for 4x backward copy stored in rcx. */
> + leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
> VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
> - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
> - /* Save stop of the destination buffer. */
> - leaq -VEC_SIZE(%rdi, %rdx), %r11
> - /* Align destination end for aligned stores in the loop. Compute
> - how much destination end is misaligned. */
> - leaq -VEC_SIZE(%rsi, %rdx), %rcx
> - movq %r11, %r9
> - movq %r11, %r8
> - andq $(VEC_SIZE - 1), %r8
> - /* Adjust source. */
> - subq %r8, %rcx
> - /* Adjust the end of destination which should be aligned now. */
> - subq %r8, %r9
> - /* Adjust length. */
> - subq %r8, %rdx
> -
> - .p2align 4
> + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8)
> + /* Subtract dst from src. Add back after dst aligned. */
> + subq %rdi, %rsi
> + /* Align dst. */
> + andq $-(VEC_SIZE), %rcx
> + /* Restore src. */
> + addq %rcx, %rsi
> +
> + /* Don't use multi-byte nop to align. */
> + .p2align 4,, 11
> L(loop_4x_vec_backward):
> /* Copy 4 * VEC a time backward. */
> - VMOVU (%rcx), %VEC(0)
> - VMOVU -VEC_SIZE(%rcx), %VEC(1)
> - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
> - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
> - addq $-(VEC_SIZE * 4), %rcx
> - addq $-(VEC_SIZE * 4), %rdx
> - VMOVA %VEC(0), (%r9)
> - VMOVA %VEC(1), -VEC_SIZE(%r9)
> - VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
> - VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
> - addq $-(VEC_SIZE * 4), %r9
> - cmpq $(VEC_SIZE * 4), %rdx
> - ja L(loop_4x_vec_backward)
> + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
> + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
> + VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3)
> + VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4)
> + addq $(VEC_SIZE * -4), %rsi
> + VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx)
> + VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
> + VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx)
> + VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx)
> + addq $(VEC_SIZE * -4), %rcx
> + cmpq %rcx, %rdi
> + jb L(loop_4x_vec_backward)
> /* Store the first 4 * VEC. */
> - VMOVU %VEC(4), (%rdi)
> + VMOVU %VEC(0), (%rdi)
> VMOVU %VEC(5), VEC_SIZE(%rdi)
> VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
> VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
> /* Store the last VEC. */
> - VMOVU %VEC(8), (%r11)
> + VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi)
> + VZEROUPPER_RETURN
> +
> +#if defined USE_MULTIARCH && IS_IN (libc)
> + /* L(skip_short_movsb_check) is only used with ERMS. Not for
> + FSRM. */
> + .p2align 5,, 16
> +# if ALIGN_MOVSB
> +L(skip_short_movsb_check):
> +# if MOVSB_ALIGN_TO > VEC_SIZE
> + VMOVU VEC_SIZE(%rsi), %VEC(1)
> +# endif
> +# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> +# error Unsupported MOVSB_ALIGN_TO
> +# endif
> + /* If the CPU does not have FSRM there are two options for
> + aligning: align src if dst and src 4k alias, otherwise align dst. */
> + testl $(PAGE_SIZE - 512), %ecx
> + jnz L(movsb_align_dst)
> + /* Fall through: dst and src 4k alias. It's better to align src
> + here because the bottleneck will be loads due to the false
> + dependency on dst. */
> +
> + /* rcx already has dst - src. */
> + movq %rcx, %r9
> + /* Add src to len. Subtract back after src aligned. -1 because
> + src is initially aligned to MOVSB_ALIGN_TO - 1. */
> + leaq -1(%rsi, %rdx), %rcx
> + /* Inclusively align src to MOVSB_ALIGN_TO - 1. */
> + orq $(MOVSB_ALIGN_TO - 1), %rsi
> + /* Restore dst and len adjusted with new values for the aligned
> + src. */
> + leaq 1(%rsi, %r9), %rdi
> + subq %rsi, %rcx
> + /* Finish aligning src. */
> + incq %rsi
> +
> + rep movsb
> +
> + VMOVU %VEC(0), (%r8)
> +# if MOVSB_ALIGN_TO > VEC_SIZE
> + VMOVU %VEC(1), VEC_SIZE(%r8)
> +# endif
> VZEROUPPER_RETURN
> +# endif
> +
> + .p2align 4,, 12
> +L(movsb):
> + movq %rdi, %rcx
> + subq %rsi, %rcx
> + /* Go to backwards temporal copy if overlap no matter what as
> + backward REP MOVSB is slow and we don't want to use NT stores if
> + there is overlap. */
> + cmpq %rdx, %rcx
> + /* L(more_8x_vec_backward_check_nop) checks for src == dst. */
> + jb L(more_8x_vec_backward_check_nop)
> +# if ALIGN_MOVSB
> + /* Save dest for storing aligning VECs later. */
> + movq %rdi, %r8
> +# endif
> + /* If above __x86_rep_movsb_stop_threshold it is most likely a
> + candidate for NT moves as well. */
> + cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> + jae L(large_memcpy_2x_check)
> +# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
> + /* Only avoid short movsb if CPU has FSRM. */
> + testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> + jz L(skip_short_movsb_check)
> +# if AVOID_SHORT_DISTANCE_REP_MOVSB
> + /* Avoid "rep movsb" if RCX, the distance between source and
> + destination, is N*4GB + [1..63] with N >= 0. */
> +
> + /* ecx contains dst - src. The earlier backward-copy checks mean
> + the only remaining slow-movsb case, src = dst + [0, 63],
> + corresponds to ecx in [-63, 0]. Use an unsigned comparison
> + against -64 to catch that case. */
> + cmpl $-64, %ecx
> + ja L(more_8x_vec_forward)
> +# endif
> +# endif
> +# if ALIGN_MOVSB
> +# if MOVSB_ALIGN_TO > VEC_SIZE
> + VMOVU VEC_SIZE(%rsi), %VEC(1)
> +# endif
> +# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> +# error Unsupported MOVSB_ALIGN_TO
> +# endif
> + /* Falling through means the CPU has FSRM. In that case
> + exclusively align the destination. */
> +L(movsb_align_dst):
> + /* Subtract dst from src. Add back after dst aligned. */
> + subq %rdi, %rsi
> + /* Exclusively align dst to MOVSB_ALIGN_TO (64). */
> + addq $(MOVSB_ALIGN_TO - 1), %rdi
> + /* Add dst to len. Subtract back after dst aligned. */
> + leaq (%r8, %rdx), %rcx
> + /* Finish aligning dst. */
> + andq $-(MOVSB_ALIGN_TO), %rdi
> + /* Restore src and len adjusted with new values for aligned dst.
> + */
> + addq %rdi, %rsi
> + subq %rdi, %rcx
> +
> + rep movsb
> +
> + /* Store VECs loaded for aligning. */
> + VMOVU %VEC(0), (%r8)
> +# if MOVSB_ALIGN_TO > VEC_SIZE
> + VMOVU %VEC(1), VEC_SIZE(%r8)
> +# endif
> + VZEROUPPER_RETURN
> +# else /* !ALIGN_MOVSB. */
> +L(skip_short_movsb_check):
> + mov %RDX_LP, %RCX_LP
> + rep movsb
> + ret
> +# endif
> +#endif
>
> + .p2align 4,, 10
> #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> - .p2align 4
> +L(large_memcpy_2x_check):
> + cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
> + jb L(more_8x_vec_check)
> L(large_memcpy_2x):
> - /* Compute absolute value of difference between source and
> - destination. */
> - movq %rdi, %r9
> - subq %rsi, %r9
> - movq %r9, %r8
> - leaq -1(%r9), %rcx
> - sarq $63, %r8
> - xorq %r8, %r9
> - subq %r8, %r9
> - /* Don't use non-temporal store if there is overlap between
> - destination and source since destination may be in cache when
> - source is loaded. */
> - cmpq %r9, %rdx
> - ja L(more_8x_vec_check)
> + /* To reach this point, overlap with dst > src is impossible.
> + It remains to check for src > dst with overlap. rcx already
> + contains dst - src; negate rcx to get src - dst. If
> + length > src - dst there is overlap and forward copy is best. */
> + negq %rcx
> + cmpq %rcx, %rdx
> + ja L(more_8x_vec_forward)
>
> /* Cache align destination. First store the first 64 bytes then
> adjust alignments. */
> - VMOVU (%rsi), %VEC(8)
> -#if VEC_SIZE < 64
> - VMOVU VEC_SIZE(%rsi), %VEC(9)
> -#if VEC_SIZE < 32
> - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
> - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
> -#endif
> -#endif
> - VMOVU %VEC(8), (%rdi)
> -#if VEC_SIZE < 64
> - VMOVU %VEC(9), VEC_SIZE(%rdi)
> -#if VEC_SIZE < 32
> - VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
> - VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
> -#endif
> -#endif
> +
> + /* First vec was also loaded into VEC(0). */
> +# if VEC_SIZE < 64
> + VMOVU VEC_SIZE(%rsi), %VEC(1)
> +# if VEC_SIZE < 32
> + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
> + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
> +# endif
> +# endif
> + VMOVU %VEC(0), (%rdi)
> +# if VEC_SIZE < 64
> + VMOVU %VEC(1), VEC_SIZE(%rdi)
> +# if VEC_SIZE < 32
> + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
> + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
> +# endif
> +# endif
> +
> /* Adjust source, destination, and size. */
> movq %rdi, %r8
> andq $63, %r8
> @@ -614,9 +767,13 @@ L(large_memcpy_2x):
> /* Adjust length. */
> addq %r8, %rdx
>
> - /* Test if source and destination addresses will alias. If they do
> - the larger pipeline in large_memcpy_4x alleviated the
> + /* Test if source and destination addresses will alias. If they
> + do the larger pipeline in large_memcpy_4x alleviated the
> performance drop. */
> +
> + /* ecx contains -(dst - src). `not ecx` yields dst - src - 1,
> + which works for testing aliasing. */
> + notl %ecx
> testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
> jz L(large_memcpy_4x)
>
> @@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
> /* ecx stores inner loop counter. */
> movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> L(loop_large_memcpy_4x_inner):
> - /* Only one prefetch set per page as doing 4 pages give more time
> - for prefetcher to keep up. */
> + /* Only one prefetch set per page as doing 4 pages gives more
> + time for the prefetcher to keep up. */
> PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
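
The destination-alignment dance around `rep movsb` that the review
above covers (L(movsb_align_dst)) is easier to follow in C. This is a
hedged sketch, not glibc code: memcpy stands in for `rep movsb` and
for the vector loads/stores, and aligned_movsb/head are made-up
names. It assumes the preconditions of that path (large len, no
overlap that forbids a forward copy).

#include <string.h>
#include <stdint.h>
#include <stddef.h>

#define MOVSB_ALIGN_TO 64

static void
aligned_movsb (char *dst, const char *src, size_t len)
{
  /* The asm loads VEC(0)/VEC(1) from src before aligning; model the
     saved head bytes with a buffer.  */
  char head[MOVSB_ALIGN_TO];
  memcpy (head, src, MOVSB_ALIGN_TO);

  /* Round dst up to a 64-byte boundary, keeping src and the end of
     the buffer fixed (the subq/addq/leaq/andq sequence).  */
  char *dst_al = (char *) (((uintptr_t) dst + (MOVSB_ALIGN_TO - 1))
                           & ~(uintptr_t) (MOVSB_ALIGN_TO - 1));
  const char *src_al = src + (dst_al - dst);
  size_t len_al = len - (size_t) (dst_al - dst);

  memcpy (dst_al, src_al, len_al);   /* `rep movsb` in the asm */

  /* Store the saved head last, just as the asm stores VEC(0)/VEC(1)
     after the rep movsb.  */
  memcpy (dst, head, MOVSB_ALIGN_TO);
}

Whether to align src instead (the L(skip_short_movsb_check) path on
non-FSRM CPUs) hinges on whether dst and src 4k alias, as the
comments in the diff explain.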
On Sat, Nov 6, 2021 at 12:12 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
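
One more small piece worth seeing in C for anyone reviewing the
backport: the 4-to-7-byte case uses two overlapping 4-byte moves
instead of branching on the exact length. A hedged sketch
(copy_4_to_7 is an illustrative name, not a glibc symbol):

#include <string.h>
#include <stdint.h>
#include <stddef.h>

/* Illustrative model of L(between_4_7) (not glibc code): for
   4 <= len <= 7, one 4-byte move from each end covers every byte;
   the moves simply overlap when len < 8.  */
static void
copy_4_to_7 (char *dst, const char *src, size_t len)
{
  uint32_t head, tail;
  memcpy (&head, src, 4);
  memcpy (&tail, src + len - 4, 4);  /* both loads before any store */
  memcpy (dst, &head, 4);
  memcpy (dst + len - 4, &tail, 4);
}

The same ends-overlap trick scales up through the 8-15, 16-31 and
32-63 cases in the diff.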
@@ -25,7 +25,7 @@
/* Use movups and movaps for smaller code sizes. */
#define VMOVU movups
#define VMOVA movaps
-
+#define MOV_SIZE 3
#define SECTION(p) p
#ifdef USE_MULTIARCH
@@ -4,7 +4,7 @@
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-
+# define MOV_SIZE 4
# define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
@@ -4,7 +4,7 @@
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-
+# define MOV_SIZE 4
# define SECTION(p) p##.avx
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s
@@ -25,7 +25,7 @@
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
# define VZEROUPPER
-
+# define MOV_SIZE 6
# define SECTION(p) p##.evex512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
@@ -25,7 +25,7 @@
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
# define VZEROUPPER
-
+# define MOV_SIZE 6
# define SECTION(p) p##.evex
# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
@@ -76,6 +76,25 @@
# endif
#endif
+/* Whether to align before movsb. Ultimately we want 64-byte
+   alignment and it is not worth loading 4x VEC for VEC_SIZE == 16. */
+#define ALIGN_MOVSB (VEC_SIZE > 16)
+/* Number of bytes to align movsb to. */
+#define MOVSB_ALIGN_TO 64
+
+#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
+#define LARGE_MOV_SIZE (MOV_SIZE > 4)
+
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
+# error MOV_SIZE Unknown
+#endif
+
+#if LARGE_MOV_SIZE
+# define SMALL_SIZE_OFFSET (4)
+#else
+# define SMALL_SIZE_OFFSET (0)
+#endif
+
#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif
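
As an aside on how SMALL_SIZE_OFFSET is consumed further down: when
MOV_SIZE is large (6-byte EVEX encodings) the dispatch replaces the
`cmpl $4, %edx` with `subq $4, %rdx`, which performs the size >= 4
test via the flags and rebases the length at the same time, so the
later 0..3 byte thresholds all shift down by 4. A rough C model of
that dispatch (the function name and return codes are purely
illustrative, and the cast models the 32-bit signed compare on an
LP64 target):

    #include <stddef.h>

    /* Hypothetical model, not the patch's code: one subtraction
       drives the >= 4 branch, the < 1 branch, and the == 1 branch
       below it, with every threshold offset by 4.  */
    static int classify (size_t len)
    {
      size_t rebased = len - 4;          /* subq $4, %rdx */
      if (len >= 4)                      /* jae L(between_4_7): no borrow */
        return 47;                       /* 4..7 byte path */
      if ((long) rebased < 1 - 4)        /* cmpl $(1 - SMALL_SIZE_OFFSET) */
        return 0;                        /* len == 0: L(copy_0) */
      if (rebased == (size_t) (1 - 4))   /* je L(copy_1) */
        return 1;                        /* len == 1 */
      return 23;                         /* len 2..3: overlapping word pair */
    }
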
@@ -199,25 +218,21 @@ L(start):
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
+ /* Load regardless. */
+ VMOVU (%rsi), %VEC(0)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
- ret
+#if !(defined USE_MULTIARCH && IS_IN (libc))
+ ZERO_UPPER_VEC_REGISTERS_RETURN
#else
VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
cmp %RDX_LP, %RCX_LP
@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
movq %rdi, %rax
L(start_erms):
# ifdef __ILP32__
@@ -298,310 +313,448 @@ L(start_erms):
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
+ /* Load regardless. */
+ VMOVU (%rsi), %VEC(0)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(movsb_more_2x_vec)
-L(last_2x_vec):
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU (%rsi), %VEC(0)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
+ */
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)
L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
+# else
ret
+# endif
#endif
-L(movsb):
- cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
- jae L(more_8x_vec)
- cmpq %rsi, %rdi
- jb 1f
- /* Source == destination is less common. */
- je L(nop)
- leaq (%rsi,%rdx), %r9
- cmpq %r9, %rdi
- /* Avoid slow backward REP MOVSB. */
- jb L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
- testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
- jz 3f
- movq %rdi, %rcx
- subq %rsi, %rcx
- jmp 2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
- testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
- jz 3f
- movq %rsi, %rcx
- subq %rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
- is N*4GB + [1..63] with N >= 0. */
- cmpl $63, %ecx
- jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */
-3:
-# endif
- mov %RDX_LP, %RCX_LP
- rep movsb
-L(nop):
+#if LARGE_MOV_SIZE
+ /* If LARGE_MOV_SIZE this fits in the aligning bytes between the
+ ENTRY block and L(less_vec). */
+ .p2align 4,, 8
+L(between_4_7):
+ /* From 4 to 7. No branch when size == 4. */
+ movl (%rsi), %ecx
+ movl (%rsi, %rdx), %esi
+ movl %ecx, (%rdi)
+ movl %esi, (%rdi, %rdx)
ret
#endif
+ .p2align 4
L(less_vec):
/* Less than 1 VEC. */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
- cmpb $32, %dl
+ cmpl $32, %edx
jae L(between_32_63)
#endif
#if VEC_SIZE > 16
- cmpb $16, %dl
+ cmpl $16, %edx
jae L(between_16_31)
#endif
- cmpb $8, %dl
+ cmpl $8, %edx
jae L(between_8_15)
- cmpb $4, %dl
+#if SMALL_MOV_SIZE
+ cmpl $4, %edx
+#else
+ subq $4, %rdx
+#endif
jae L(between_4_7)
- cmpb $1, %dl
- ja L(between_2_3)
- jb 1f
- movzbl (%rsi), %ecx
+ cmpl $(1 - SMALL_SIZE_OFFSET), %edx
+ jl L(copy_0)
+ movb (%rsi), %cl
+ je L(copy_1)
+ movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
+ movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
+L(copy_1):
movb %cl, (%rdi)
-1:
+L(copy_0):
ret
+
+#if SMALL_MOV_SIZE
+ .p2align 4,, 8
+L(between_4_7):
+ /* From 4 to 7. No branch when size == 4. */
+ movl -4(%rsi, %rdx), %ecx
+ movl (%rsi), %esi
+ movl %ecx, -4(%rdi, %rdx)
+ movl %esi, (%rdi)
+ ret
+#endif
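
For readers tracing the 0..3 byte tail above, a hedged C rendering
(taking SMALL_SIZE_OFFSET == 0, i.e. the SMALL_MOV_SIZE case; the
function name is illustrative): one byte from the head plus an
overlapping 2-byte copy covers lengths 2 and 3 without a branch.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void copy_0_3 (char *dst, const char *src, size_t len)
    {
      if (len < 1)                        /* cmpl $1, %edx; jl L(copy_0) */
        return;
      unsigned char c = src[0];           /* movb (%rsi), %cl */
      if (len != 1)                       /* je L(copy_1) */
        {
          uint16_t w;
          memcpy (&w, src + len - 2, 2);  /* movzwl -2(%rsi, %rdx), %esi */
          memcpy (dst + len - 2, &w, 2);  /* movw %si, -2(%rdi, %rdx) */
        }
      dst[0] = c;                         /* movb %cl, (%rdi) */
    }

Note that both loads happen before either store, which keeps the
copy correct for overlapping buffers.
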
+
+#if VEC_SIZE > 16
+ /* From 16 to 31. No branch when size == 16. */
+ .p2align 4,, 8
+L(between_16_31):
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -16(%rsi, %rdx), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -16(%rdi, %rdx)
+ /* No ymm registers have been touched. */
+ ret
+#endif
+
#if VEC_SIZE > 32
+ .p2align 4,, 10
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
VMOVU (%rsi), %YMM0
- VMOVU -32(%rsi,%rdx), %YMM1
+ VMOVU -32(%rsi, %rdx), %YMM1
VMOVU %YMM0, (%rdi)
- VMOVU %YMM1, -32(%rdi,%rdx)
- VZEROUPPER_RETURN
-#endif
-#if VEC_SIZE > 16
- /* From 16 to 31. No branch when size == 16. */
-L(between_16_31):
- VMOVU (%rsi), %XMM0
- VMOVU -16(%rsi,%rdx), %XMM1
- VMOVU %XMM0, (%rdi)
- VMOVU %XMM1, -16(%rdi,%rdx)
+ VMOVU %YMM1, -32(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
+
+ .p2align 4,, 10
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
- movq -8(%rsi,%rdx), %rcx
+ movq -8(%rsi, %rdx), %rcx
movq (%rsi), %rsi
- movq %rcx, -8(%rdi,%rdx)
movq %rsi, (%rdi)
+ movq %rcx, -8(%rdi, %rdx)
ret
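
The 8..15 byte case uses the usual overlapping head/tail trick;
roughly, in C (a sketch, not the library's code):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void copy_8_15 (char *dst, const char *src, size_t len)
    {
      uint64_t tail, head;
      memcpy (&tail, src + len - 8, 8);   /* movq -8(%rsi, %rdx), %rcx */
      memcpy (&head, src, 8);             /* movq (%rsi), %rsi */
      memcpy (dst, &head, 8);             /* movq %rsi, (%rdi) */
      memcpy (dst + len - 8, &tail, 8);   /* movq %rcx, -8(%rdi, %rdx) */
    }

The two stores overlap for len < 16, so no branch on the exact
length is needed.
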
-L(between_4_7):
- /* From 4 to 7. No branch when size == 4. */
- movl -4(%rsi,%rdx), %ecx
- movl (%rsi), %esi
- movl %ecx, -4(%rdi,%rdx)
- movl %esi, (%rdi)
- ret
-L(between_2_3):
- /* From 2 to 3. No branch when size == 2. */
- movzwl -2(%rsi,%rdx), %ecx
- movzwl (%rsi), %esi
- movw %cx, -2(%rdi,%rdx)
- movw %si, (%rdi)
- ret
+ .p2align 4,, 10
+L(last_4x_vec):
+ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+
+ /* VEC(0) and VEC(1) have already been loaded. */
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx)
+ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+ VZEROUPPER_RETURN
+
+ .p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
ja L(movsb)
#endif
L(more_2x_vec):
- /* More than 2 * VEC and there may be overlap between destination
- and source. */
+ /* More than 2 * VEC and there may be overlap between
+ destination and source. */
cmpq $(VEC_SIZE * 8), %rdx
ja L(more_8x_vec)
+ /* Load VEC(1) regardless. VEC(0) has already been loaded. */
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
cmpq $(VEC_SIZE * 4), %rdx
jbe L(last_4x_vec)
- /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
+ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
- VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
- VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
- VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
- VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
- VZEROUPPER_RETURN
-L(last_4x_vec):
- /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx)
+ VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+ VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+ VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
VZEROUPPER_RETURN
+ .p2align 4,, 4
L(more_8x_vec):
+ movq %rdi, %rcx
+ subq %rsi, %rcx
+	/* Go to the backward temporal copy if there is overlap, no
+	   matter what, as backward REP MOVSB is slow and we don't want
+	   to use NT stores if there is overlap. */
+ cmpq %rdx, %rcx
+ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */
+ jb L(more_8x_vec_backward_check_nop)
/* Check if non-temporal move candidate. */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
/* Check non-temporal store threshold. */
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
ja L(large_memcpy_2x)
#endif
- /* Entry if rdx is greater than non-temporal threshold but there
- is overlap. */
+	/* To reach this point there cannot be both overlap and dst >
+	   src. So check for overlap with src > dst, in which case
+	   correctness requires a forward copy. Otherwise decide between
+	   backward/forward copy depending on address aliasing. */
+
+ /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
+ but less than __x86_shared_non_temporal_threshold. */
L(more_8x_vec_check):
- cmpq %rsi, %rdi
- ja L(more_8x_vec_backward)
- /* Source == destination is less common. */
- je L(nop)
- /* Load the first VEC and last 4 * VEC to support overlapping
- addresses. */
- VMOVU (%rsi), %VEC(4)
+ /* rcx contains dst - src. Add back length (rdx). */
+ leaq (%rcx, %rdx), %r8
+	/* If r8 has a different sign than rcx then there is overlap, so
+	   we must do a forward copy. */
+ xorq %rcx, %r8
+ /* Isolate just sign bit of r8. */
+ shrq $63, %r8
+ /* Get 4k difference dst - src. */
+ andl $(PAGE_SIZE - 256), %ecx
+	/* If r8 is non-zero we must copy forward for correctness.
+	   Otherwise a zero ecx means dst and src 4k alias, in which
+	   case a backward copy avoids the false aliasing stalls. */
+ addl %r8d, %ecx
+ jz L(more_8x_vec_backward)
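
Putting the forward/backward decision above into rough C (a sketch;
PAGE_SIZE and the function name are stand-ins, not the patch's
code):

    #include <stddef.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096

    static int backward_copy_ok (char *dst, char *src, size_t len)
    {
      int64_t diff = (int64_t) (dst - src);   /* movq %rdi, %rcx; subq %rsi, %rcx */
      int64_t end  = diff + (int64_t) len;    /* leaq (%rcx, %rdx), %r8 */
      /* Signs of diff and diff + len differ only when src > dst and
         the regions overlap: then a forward copy is required.  */
      uint64_t must_forward = (uint64_t) (diff ^ end) >> 63; /* xorq; shrq $63 */
      /* Bits 8..11 of dst - src; zero means dst and src 4k alias.  */
      uint64_t alias_bits = (uint64_t) diff & (PAGE_SIZE - 256); /* andl */
      /* jz L(more_8x_vec_backward): backward only when both are 0.  */
      return (must_forward + alias_bits) == 0;
    }
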
+
+	/* Entered if rdx is greater than
+	   __x86_shared_non_temporal_threshold but there is overlap, or
+	   from the short-distance movsb check. */
+L(more_8x_vec_forward):
+ /* Load first and last 4 * VEC to support overlapping addresses.
+ */
+
+ /* First vec was already loaded into VEC(0). */
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	/* Save beginning of dst. */
+ movq %rdi, %rcx
+ /* Align dst to VEC_SIZE - 1. */
+ orq $(VEC_SIZE - 1), %rdi
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
- /* Save start and stop of the destination buffer. */
- movq %rdi, %r11
- leaq -VEC_SIZE(%rdi, %rdx), %rcx
- /* Align destination for aligned stores in the loop. Compute
- how much destination is misaligned. */
- movq %rdi, %r8
- andq $(VEC_SIZE - 1), %r8
- /* Get the negative of offset for alignment. */
- subq $VEC_SIZE, %r8
- /* Adjust source. */
- subq %r8, %rsi
- /* Adjust destination which should be aligned now. */
- subq %r8, %rdi
- /* Adjust length. */
- addq %r8, %rdx
- .p2align 4
+ /* Subtract dst from src. Add back after dst aligned. */
+ subq %rcx, %rsi
+ /* Finish aligning dst. */
+ incq %rdi
+ /* Restore src adjusted with new value for aligned dst. */
+ addq %rdi, %rsi
+ /* Store end of buffer minus tail in rdx. */
+ leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx
+
+	/* Don't use multi-byte nop to align. */
+ .p2align 4,, 11
L(loop_4x_vec_forward):
/* Copy 4 * VEC a time forward. */
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ VMOVU (%rsi), %VEC(1)
+ VMOVU VEC_SIZE(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4)
subq $-(VEC_SIZE * 4), %rsi
- addq $-(VEC_SIZE * 4), %rdx
- VMOVA %VEC(0), (%rdi)
- VMOVA %VEC(1), VEC_SIZE(%rdi)
- VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
+ VMOVA %VEC(1), (%rdi)
+ VMOVA %VEC(2), VEC_SIZE(%rdi)
+ VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi)
+ VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi)
subq $-(VEC_SIZE * 4), %rdi
- cmpq $(VEC_SIZE * 4), %rdx
+ cmpq %rdi, %rdx
ja L(loop_4x_vec_forward)
/* Store the last 4 * VEC. */
- VMOVU %VEC(5), (%rcx)
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
+ VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx)
+ VMOVU %VEC(7), VEC_SIZE(%rdx)
+ VMOVU %VEC(8), (%rdx)
/* Store the first VEC. */
- VMOVU %VEC(4), (%r11)
+ VMOVU %VEC(0), (%rcx)
+ /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
+ */
+L(nop_backward):
VZEROUPPER_RETURN
+ .p2align 4,, 8
+L(more_8x_vec_backward_check_nop):
+ /* rcx contains dst - src. Test for dst == src to skip all of
+ memmove. */
+ testq %rcx, %rcx
+ jz L(nop_backward)
L(more_8x_vec_backward):
/* Load the first 4 * VEC and last VEC to support overlapping
addresses. */
- VMOVU (%rsi), %VEC(4)
+
+ /* First vec was also loaded into VEC(0). */
VMOVU VEC_SIZE(%rsi), %VEC(5)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
+	/* Beginning of region for 4x backward copy stored in rcx. */
+ leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
- /* Save stop of the destination buffer. */
- leaq -VEC_SIZE(%rdi, %rdx), %r11
- /* Align destination end for aligned stores in the loop. Compute
- how much destination end is misaligned. */
- leaq -VEC_SIZE(%rsi, %rdx), %rcx
- movq %r11, %r9
- movq %r11, %r8
- andq $(VEC_SIZE - 1), %r8
- /* Adjust source. */
- subq %r8, %rcx
- /* Adjust the end of destination which should be aligned now. */
- subq %r8, %r9
- /* Adjust length. */
- subq %r8, %rdx
-
- .p2align 4
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8)
+ /* Subtract dst from src. Add back after dst aligned. */
+ subq %rdi, %rsi
+ /* Align dst. */
+ andq $-(VEC_SIZE), %rcx
+ /* Restore src. */
+ addq %rcx, %rsi
+
+ /* Don't use multi-byte nop to align. */
+ .p2align 4,, 11
L(loop_4x_vec_backward):
/* Copy 4 * VEC a time backward. */
- VMOVU (%rcx), %VEC(0)
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- addq $-(VEC_SIZE * 4), %rcx
- addq $-(VEC_SIZE * 4), %rdx
- VMOVA %VEC(0), (%r9)
- VMOVA %VEC(1), -VEC_SIZE(%r9)
- VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
- addq $-(VEC_SIZE * 4), %r9
- cmpq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec_backward)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3)
+ VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4)
+ addq $(VEC_SIZE * -4), %rsi
+ VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx)
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
+ VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx)
+ VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx)
+ addq $(VEC_SIZE * -4), %rcx
+ cmpq %rcx, %rdi
+ jb L(loop_4x_vec_backward)
/* Store the first 4 * VEC. */
- VMOVU %VEC(4), (%rdi)
+ VMOVU %VEC(0), (%rdi)
VMOVU %VEC(5), VEC_SIZE(%rdi)
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
- VMOVU %VEC(8), (%r11)
+ VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi)
+ VZEROUPPER_RETURN
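
And the mirrored backward path, again as a rough C sketch under the
same caveats (assumes len > 8 * VEC; VEC stands in for VEC_SIZE):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define VEC 32  /* stand-in for VEC_SIZE */

    static void backward_8x (char *dst, const char *src, size_t len)
    {
      char head[4 * VEC], tail[VEC];
      memcpy (head, src, 4 * VEC);                  /* VEC(0), VEC(5)..VEC(7) */
      memcpy (tail, src + len - VEC, VEC);          /* VEC(8) */
      /* leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx; andq $-VEC_SIZE:
         align the last full 4*VEC block strictly below dst + len - 4*VEC. */
      char *ablk = (char *) (((uintptr_t) (dst + len - 4 * VEC - 1))
                             & ~(uintptr_t) (VEC - 1));
      const char *asrc = src + (ablk - dst);        /* subq %rdi, %rsi; addq %rcx, %rsi */
      while (dst < ablk)                            /* cmpq %rcx, %rdi; jb */
        {
          memcpy (ablk, asrc, 4 * VEC);             /* descending aligned stores */
          asrc -= 4 * VEC;
          ablk -= 4 * VEC;
        }
      memcpy (dst, head, 4 * VEC);                  /* first 4 VECs */
      memcpy (dst + len - VEC, tail, VEC);          /* last VEC */
    }
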
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+ /* L(skip_short_movsb_check) is only used with ERMS. Not for
+ FSRM. */
+ .p2align 5,, 16
+# if ALIGN_MOVSB
+L(skip_short_movsb_check):
+# if MOVSB_ALIGN_TO > VEC_SIZE
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+# endif
+# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+# error Unsupported MOVSB_ALIGN_TO
+# endif
+	/* If the CPU does not have FSRM there are two options for
+	   aligning. Align src if dst and src 4k alias. Otherwise align
+	   dst. */
+ testl $(PAGE_SIZE - 512), %ecx
+ jnz L(movsb_align_dst)
+	/* Fall through. dst and src 4k alias. It's better to align src
+	   here because the bottleneck will be loads due to the false
+	   dependency on dst. */
+
+ /* rcx already has dst - src. */
+ movq %rcx, %r9
+ /* Add src to len. Subtract back after src aligned. -1 because
+ src is initially aligned to MOVSB_ALIGN_TO - 1. */
+ leaq -1(%rsi, %rdx), %rcx
+ /* Inclusively align src to MOVSB_ALIGN_TO - 1. */
+ orq $(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned src.
+	   */
+ leaq 1(%rsi, %r9), %rdi
+ subq %rsi, %rcx
+ /* Finish aligning src. */
+ incq %rsi
+
+ rep movsb
+
+ VMOVU %VEC(0), (%r8)
+# if MOVSB_ALIGN_TO > VEC_SIZE
+ VMOVU %VEC(1), VEC_SIZE(%r8)
+# endif
VZEROUPPER_RETURN
+# endif
+
+ .p2align 4,, 12
+L(movsb):
+ movq %rdi, %rcx
+ subq %rsi, %rcx
+	/* Go to the backward temporal copy if there is overlap, no
+	   matter what, as backward REP MOVSB is slow and we don't want
+	   to use NT stores if there is overlap. */
+ cmpq %rdx, %rcx
+ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */
+ jb L(more_8x_vec_backward_check_nop)
+# if ALIGN_MOVSB
+ /* Save dest for storing aligning VECs later. */
+ movq %rdi, %r8
+# endif
+	/* If above __x86_rep_movsb_stop_threshold it is most likely a
+	   candidate for NT moves as well. */
+ cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+ jae L(large_memcpy_2x_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
+ /* Only avoid short movsb if CPU has FSRM. */
+ testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+ jz L(skip_short_movsb_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+ /* Avoid "rep movsb" if RCX, the distance between source and
+ destination, is N*4GB + [1..63] with N >= 0. */
+
+	/* ecx contains dst - src. The early check for the backward-copy
+	   conditions means the only remaining slow-movsb case, src =
+	   dst + [0, 63], has ecx in [-63, 0]. Use an unsigned
+	   comparison with -64 to check for that case. */
+ cmpl $-64, %ecx
+ ja L(more_8x_vec_forward)
+# endif
+# endif
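
The unsigned comparison above, modeled in C (hypothetical helper):
values of dst - src in [-63, -1] wrap to [2^32 - 63, 2^32 - 1], all
strictly above (uint32_t) -64, while larger distances compare below
it.

    #include <stdint.h>

    static int short_distance (uint32_t ecx /* low 32 bits of dst - src */)
    {
      return ecx > (uint32_t) -64;  /* cmpl $-64, %ecx; ja L(more_8x_vec_forward) */
    }
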
+# if ALIGN_MOVSB
+# if MOVSB_ALIGN_TO > VEC_SIZE
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+# endif
+# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+# error Unsupported MOVSB_ALIGN_TO
+# endif
+	/* Falling through means the CPU has FSRM. In that case
+	   exclusively align the destination. */
+L(movsb_align_dst):
+ /* Subtract dst from src. Add back after dst aligned. */
+ subq %rdi, %rsi
+ /* Exclusively align dst to MOVSB_ALIGN_TO (64). */
+ addq $(MOVSB_ALIGN_TO - 1), %rdi
+ /* Add dst to len. Subtract back after dst aligned. */
+ leaq (%r8, %rdx), %rcx
+ /* Finish aligning dst. */
+ andq $-(MOVSB_ALIGN_TO), %rdi
+ /* Restore src and len adjusted with new values for aligned dst.
+ */
+ addq %rdi, %rsi
+ subq %rdi, %rcx
+
+ rep movsb
+
+ /* Store VECs loaded for aligning. */
+ VMOVU %VEC(0), (%r8)
+# if MOVSB_ALIGN_TO > VEC_SIZE
+ VMOVU %VEC(1), VEC_SIZE(%r8)
+# endif
+ VZEROUPPER_RETURN
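
A rough C model of this FSRM alignment dance (memmove stands in for
`rep movsb`; assumes len >= 64 and no overlap, which the earlier
checks guarantee on this path):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void movsb_align_dst (char *dst, const char *src, size_t len)
    {
      char head[64];
      memcpy (head, src, sizeof head);             /* VEC(0), VEC(1) */
      char *orig = dst;                            /* %r8 */
      /* addq $(MOVSB_ALIGN_TO - 1), %rdi; andq $-(MOVSB_ALIGN_TO).  */
      char *adst = (char *) (((uintptr_t) dst + 63) & ~(uintptr_t) 63);
      const char *asrc = src + (adst - dst);       /* subq %rdi, %rsi; addq */
      size_t n = (size_t) (orig + len - adst);     /* leaq; subq on %rcx */
      memmove (adst, asrc, n);                     /* rep movsb */
      memcpy (orig, head, sizeof head);            /* VMOVU %VEC(0/1), (%r8) */
    }

Re-storing the saved head after the movsb is harmless: the bytes it
overlaps with were copied from the same source positions.
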
+# else /* !ALIGN_MOVSB. */
+L(skip_short_movsb_check):
+ mov %RDX_LP, %RCX_LP
+ rep movsb
+ ret
+# endif
+#endif
+ .p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
- .p2align 4
+L(large_memcpy_2x_check):
+ cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
+ jb L(more_8x_vec_check)
L(large_memcpy_2x):
- /* Compute absolute value of difference between source and
- destination. */
- movq %rdi, %r9
- subq %rsi, %r9
- movq %r9, %r8
- leaq -1(%r9), %rcx
- sarq $63, %r8
- xorq %r8, %r9
- subq %r8, %r9
- /* Don't use non-temporal store if there is overlap between
- destination and source since destination may be in cache when
- source is loaded. */
- cmpq %r9, %rdx
- ja L(more_8x_vec_check)
+	/* To reach this point it is impossible to have both dst > src
+	   and overlap. What remains to check is src > dst with overlap.
+	   rcx already contains dst - src. Negate rcx to get src - dst.
+	   If length > rcx then there is overlap and a forward copy is
+	   best. */
+ negq %rcx
+ cmpq %rcx, %rdx
+ ja L(more_8x_vec_forward)
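
In C terms (hypothetical helper; on this path dst < src is already
known):

    #include <stddef.h>

    static int must_copy_forward (const char *dst, const char *src,
                                  size_t len)
    {
      /* negq %rcx turns dst - src into src - dst; then ja on len.  */
      return len > (size_t) (src - dst);
    }
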
/* Cache align destination. First store the first 64 bytes then
adjust alignments. */
- VMOVU (%rsi), %VEC(8)
-#if VEC_SIZE < 64
- VMOVU VEC_SIZE(%rsi), %VEC(9)
-#if VEC_SIZE < 32
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
-#endif
-#endif
- VMOVU %VEC(8), (%rdi)
-#if VEC_SIZE < 64
- VMOVU %VEC(9), VEC_SIZE(%rdi)
-#if VEC_SIZE < 32
- VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
-#endif
-#endif
+
+ /* First vec was also loaded into VEC(0). */
+# if VEC_SIZE < 64
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+# if VEC_SIZE < 32
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+# endif
+# endif
+ VMOVU %VEC(0), (%rdi)
+# if VEC_SIZE < 64
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+# if VEC_SIZE < 32
+ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
+# endif
+# endif
+
/* Adjust source, destination, and size. */
movq %rdi, %r8
andq $63, %r8
@@ -614,9 +767,13 @@ L(large_memcpy_2x):
/* Adjust length. */
addq %r8, %rdx
- /* Test if source and destination addresses will alias. If they do
- the larger pipeline in large_memcpy_4x alleviated the
+ /* Test if source and destination addresses will alias. If they
+	   do the larger pipeline in large_memcpy_4x alleviates the
performance drop. */
+
+	/* ecx contains -(dst - src). not of ecx yields dst - src - 1,
+	   which works for testing aliasing. */
+ notl %ecx
testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
jz L(large_memcpy_4x)
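
The `notl` trick relies on the two's complement identity
~x == -x - 1, so NOT of -(dst - src) yields (dst - src) - 1, which
the comment above notes still works for the aliasing test. A
minimal check (illustrative only):

    #include <assert.h>
    #include <stdint.h>

    int main (void)
    {
      int32_t d = 0x1234;            /* any dst - src */
      assert (~(-d) == d - 1);       /* identity behind `notl %ecx` */
      return 0;
    }
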
@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
/* ecx stores inner loop counter. */
movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
- /* Only one prefetch set per page as doing 4 pages give more time
- for prefetcher to keep up. */
+	/* Only one prefetch set per page as doing 4 pages gives more
+	   time for the prefetcher to keep up. */
PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)