[v4,4/5] x86: Optimize memmove-vec-unaligned-erms.S

Message ID 20211106183322.3129442-4-goldstein.w.n@gmail.com
State Committed
Commit a6b7502ec0c2da89a7437f43171f160d713e39c6
Series [v4,1/5] string: Make tests birdirectional test-memcpy.c

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Noah Goldstein Nov. 6, 2021, 6:33 p.m. UTC
  No bug.

The optimizations are as follows:

1) Always align entry to 64 bytes. This makes behavior more
   predictable and makes other frontend optimizations easier.

2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
   significant benefits in the case that:
        0 < (dst - src) < [256, 512]

3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
   improvement and for FSRM [-10%, 25%].

In addition to these primary changes there is general cleanup
throughout to optimize the aligning routines and control flow logic.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
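
As a reference for optimization (2), the forward/backward decision in the
new L(more_8x_vec) path can be modeled roughly in C as below. This is an
illustrative sketch only (the NT-store path, the rep movsb thresholds and
register details are omitted; the function name is made up for the
example), not code from the patch:

    #include <stdint.h>
    #include <stddef.h>

    #define PAGE_SIZE 4096

    /* Nonzero if the 4x VEC loop should copy forward (rough model of the
       checks in L(more_8x_vec)/L(more_8x_vec_check)).  */
    static int
    copy_forward_p (char *dst, const char *src, size_t len)
    {
      intptr_t diff = (intptr_t) ((uintptr_t) dst - (uintptr_t) src);

      /* dst overlaps the end of src: a backward copy is required.  */
      if ((uintptr_t) diff < len)
        return 0;

      /* src overlaps the end of dst: a forward copy is required for
         correctness.  */
      if (diff < 0 && (uintptr_t) -diff < len)
        return 1;

      /* No overlap.  If (dst - src) mod 4k is in [0, 255] (bits 8-11 all
         zero), forward-copy loads from src would 4k alias with the stores
         just made to dst, so prefer the backward loop.  */
      if ((diff & (PAGE_SIZE - 256)) == 0)
        return 0;

      return 1;
    }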
---
 sysdeps/x86_64/memmove.S                      |   2 +-
 .../memmove-avx-unaligned-erms-rtm.S          |   2 +-
 .../multiarch/memmove-avx-unaligned-erms.S    |   2 +-
 .../multiarch/memmove-avx512-unaligned-erms.S |   2 +-
 .../multiarch/memmove-evex-unaligned-erms.S   |   2 +-
 .../multiarch/memmove-vec-unaligned-erms.S    | 595 +++++++++++-------
 6 files changed, 381 insertions(+), 224 deletions(-)
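
For optimization (3), the destination alignment performed before
`rep movsb` amounts to roughly the pointer arithmetic below. This is a
hedged sketch, assuming non-overlapping buffers and len >= 64, with
memcpy standing in for the vector copies and the rep movsb; the actual
code keeps the head bytes in VEC(0)/VEC(1) and stores them after the
rep movsb:

    #include <stdint.h>
    #include <string.h>

    #define MOVSB_ALIGN_TO 64

    static void
    copy_with_aligned_movsb (char *dst, const char *src, size_t len)
    {
      /* Head block covering everything up to the alignment boundary.  */
      char head[MOVSB_ALIGN_TO];
      memcpy (head, src, MOVSB_ALIGN_TO);

      /* Round dst up to a 64-byte boundary and adjust src/len to match.  */
      char *dst_al = (char *) (((uintptr_t) dst + (MOVSB_ALIGN_TO - 1))
                               & ~(uintptr_t) (MOVSB_ALIGN_TO - 1));
      size_t skip = (size_t) (dst_al - dst);

      /* Stand-in for the aligned rep movsb copying the remainder.  */
      memcpy (dst_al, src + skip, len - skip);

      /* Store the head last so the unaligned start is still covered.  */
      memcpy (dst, head, MOVSB_ALIGN_TO);
    }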
  

Comments

H.J. Lu Nov. 6, 2021, 7:11 p.m. UTC | #1
On Sat, Nov 6, 2021 at 11:33 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug.
>
> The optimizations are as follows:
>
> 1) Always align entry to 64 bytes. This makes behavior more
>    predictable and makes other frontend optimizations easier.
>
> 2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
>    significant benefits in the case that:
>         0 < (dst - src) < [256, 512]
>
> 3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
>    improvement and for FSRM [-10%, 25%].
>
> In addition to these primary changes there is general cleanup
> throughout to optimize the aligning routines and control flow logic.
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> ---
>  sysdeps/x86_64/memmove.S                      |   2 +-
>  .../memmove-avx-unaligned-erms-rtm.S          |   2 +-
>  .../multiarch/memmove-avx-unaligned-erms.S    |   2 +-
>  .../multiarch/memmove-avx512-unaligned-erms.S |   2 +-
>  .../multiarch/memmove-evex-unaligned-erms.S   |   2 +-
>  .../multiarch/memmove-vec-unaligned-erms.S    | 595 +++++++++++-------
>  6 files changed, 381 insertions(+), 224 deletions(-)
>
> diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
> index db106a7a1f..b2b3180848 100644
> --- a/sysdeps/x86_64/memmove.S
> +++ b/sysdeps/x86_64/memmove.S
> @@ -25,7 +25,7 @@
>  /* Use movups and movaps for smaller code sizes.  */
>  #define VMOVU          movups
>  #define VMOVA          movaps
> -
> +#define MOV_SIZE       3
>  #define SECTION(p)             p
>
>  #ifdef USE_MULTIARCH
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> index 1ec1962e86..67a55f0c85 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> @@ -4,7 +4,7 @@
>  # define VMOVNT                vmovntdq
>  # define VMOVU         vmovdqu
>  # define VMOVA         vmovdqa
> -
> +# define MOV_SIZE      4
>  # define ZERO_UPPER_VEC_REGISTERS_RETURN \
>    ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> index e195e93f15..975ae6c051 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> @@ -4,7 +4,7 @@
>  # define VMOVNT                vmovntdq
>  # define VMOVU         vmovdqu
>  # define VMOVA         vmovdqa
> -
> +# define MOV_SIZE      4
>  # define SECTION(p)            p##.avx
>  # define MEMMOVE_SYMBOL(p,s)   p##_avx_##s
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> index 848848ab39..0fa7126830 100644
> --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> @@ -25,7 +25,7 @@
>  # define VMOVU         vmovdqu64
>  # define VMOVA         vmovdqa64
>  # define VZEROUPPER
> -
> +# define MOV_SIZE      6
>  # define SECTION(p)            p##.evex512
>  # define MEMMOVE_SYMBOL(p,s)   p##_avx512_##s
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> index 0cbce8f944..88715441fe 100644
> --- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> @@ -25,7 +25,7 @@
>  # define VMOVU         vmovdqu64
>  # define VMOVA         vmovdqa64
>  # define VZEROUPPER
> -
> +# define MOV_SIZE      6
>  # define SECTION(p)            p##.evex
>  # define MEMMOVE_SYMBOL(p,s)   p##_evex_##s
>
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index abde8438d4..7b27cbdda5 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -76,6 +76,25 @@
>  # endif
>  #endif
>
> +/* Whether to align before movsb. Ultimately we want 64 byte
> +   alignment and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
> +#define ALIGN_MOVSB    (VEC_SIZE > 16)
> +/* Number of bytes to align movsb to.  */
> +#define MOVSB_ALIGN_TO 64
> +
> +#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
> +#define LARGE_MOV_SIZE (MOV_SIZE > 4)
> +
> +#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
> +# error MOV_SIZE Unknown
> +#endif
> +
> +#if LARGE_MOV_SIZE
> +# define SMALL_SIZE_OFFSET     (4)
> +#else
> +# define SMALL_SIZE_OFFSET     (0)
> +#endif
> +
>  #ifndef PAGE_SIZE
>  # define PAGE_SIZE 4096
>  #endif
> @@ -199,25 +218,21 @@ L(start):
>  # endif
>         cmp     $VEC_SIZE, %RDX_LP
>         jb      L(less_vec)
> +       /* Load regardless.  */
> +       VMOVU   (%rsi), %VEC(0)
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(more_2x_vec)
> -#if !defined USE_MULTIARCH || !IS_IN (libc)
> -L(last_2x_vec):
> -#endif
>         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> -       VMOVU   (%rsi), %VEC(0)
>         VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
>         VMOVU   %VEC(0), (%rdi)
>         VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> -#if !defined USE_MULTIARCH || !IS_IN (libc)
> -L(nop):
> -       ret
> +#if !(defined USE_MULTIARCH && IS_IN (libc))
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
>  #else
>         VZEROUPPER_RETURN
>  #endif
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  END (MEMMOVE_SYMBOL (__memmove, unaligned))
> -
>  # if VEC_SIZE == 16
>  ENTRY (__mempcpy_chk_erms)
>         cmp     %RDX_LP, %RCX_LP
> @@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
>  END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
>  # endif
>
> -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> +ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
>         movq    %rdi, %rax
>  L(start_erms):
>  # ifdef __ILP32__
> @@ -298,310 +313,448 @@ L(start_erms):
>  # endif
>         cmp     $VEC_SIZE, %RDX_LP
>         jb      L(less_vec)
> +       /* Load regardless.  */
> +       VMOVU   (%rsi), %VEC(0)
>         cmp     $(VEC_SIZE * 2), %RDX_LP
>         ja      L(movsb_more_2x_vec)
> -L(last_2x_vec):
> -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
> +       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
> +        */
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(1)
>         VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> +       VMOVU   %VEC(1), -VEC_SIZE(%rdi, %rdx)
>  L(return):
> -#if VEC_SIZE > 16
> +# if VEC_SIZE > 16
>         ZERO_UPPER_VEC_REGISTERS_RETURN
> -#else
> +# else
>         ret
> +# endif
>  #endif
>
> -L(movsb):
> -       cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> -       jae     L(more_8x_vec)
> -       cmpq    %rsi, %rdi
> -       jb      1f
> -       /* Source == destination is less common.  */
> -       je      L(nop)
> -       leaq    (%rsi,%rdx), %r9
> -       cmpq    %r9, %rdi
> -       /* Avoid slow backward REP MOVSB.  */
> -       jb      L(more_8x_vec_backward)
> -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> -       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> -       jz      3f
> -       movq    %rdi, %rcx
> -       subq    %rsi, %rcx
> -       jmp     2f
> -# endif
> -1:
> -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> -       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> -       jz      3f
> -       movq    %rsi, %rcx
> -       subq    %rdi, %rcx
> -2:
> -/* Avoid "rep movsb" if RCX, the distance between source and destination,
> -   is N*4GB + [1..63] with N >= 0.  */
> -       cmpl    $63, %ecx
> -       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
> -3:
> -# endif
> -       mov     %RDX_LP, %RCX_LP
> -       rep movsb
> -L(nop):
> +#if LARGE_MOV_SIZE
> +       /* If LARGE_MOV_SIZE this fits in the aligning bytes between the
> +          ENTRY block and L(less_vec).  */
> +       .p2align 4,, 8
> +L(between_4_7):
> +       /* From 4 to 7.  No branch when size == 4.  */
> +       movl    (%rsi), %ecx
> +       movl    (%rsi, %rdx), %esi
> +       movl    %ecx, (%rdi)
> +       movl    %esi, (%rdi, %rdx)
>         ret
>  #endif
>
> +       .p2align 4
>  L(less_vec):
>         /* Less than 1 VEC.  */
>  #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
>  # error Unsupported VEC_SIZE!
>  #endif
>  #if VEC_SIZE > 32
> -       cmpb    $32, %dl
> +       cmpl    $32, %edx
>         jae     L(between_32_63)
>  #endif
>  #if VEC_SIZE > 16
> -       cmpb    $16, %dl
> +       cmpl    $16, %edx
>         jae     L(between_16_31)
>  #endif
> -       cmpb    $8, %dl
> +       cmpl    $8, %edx
>         jae     L(between_8_15)
> -       cmpb    $4, %dl
> +#if SMALL_MOV_SIZE
> +       cmpl    $4, %edx
> +#else
> +       subq    $4, %rdx
> +#endif
>         jae     L(between_4_7)
> -       cmpb    $1, %dl
> -       ja      L(between_2_3)
> -       jb      1f
> -       movzbl  (%rsi), %ecx
> +       cmpl    $(1 - SMALL_SIZE_OFFSET), %edx
> +       jl      L(copy_0)
> +       movb    (%rsi), %cl
> +       je      L(copy_1)
> +       movzwl  (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
> +       movw    %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
> +L(copy_1):
>         movb    %cl, (%rdi)
> -1:
> +L(copy_0):
>         ret
> +
> +#if SMALL_MOV_SIZE
> +       .p2align 4,, 8
> +L(between_4_7):
> +       /* From 4 to 7.  No branch when size == 4.  */
> +       movl    -4(%rsi, %rdx), %ecx
> +       movl    (%rsi), %esi
> +       movl    %ecx, -4(%rdi, %rdx)
> +       movl    %esi, (%rdi)
> +       ret
> +#endif
> +
> +#if VEC_SIZE > 16
> +       /* From 16 to 31.  No branch when size == 16.  */
> +       .p2align 4,, 8
> +L(between_16_31):
> +       vmovdqu (%rsi), %xmm0
> +       vmovdqu -16(%rsi, %rdx), %xmm1
> +       vmovdqu %xmm0, (%rdi)
> +       vmovdqu %xmm1, -16(%rdi, %rdx)
> +       /* No ymm registers have been touched.  */
> +       ret
> +#endif
> +
>  #if VEC_SIZE > 32
> +       .p2align 4,, 10
>  L(between_32_63):
>         /* From 32 to 63.  No branch when size == 32.  */
>         VMOVU   (%rsi), %YMM0
> -       VMOVU   -32(%rsi,%rdx), %YMM1
> +       VMOVU   -32(%rsi, %rdx), %YMM1
>         VMOVU   %YMM0, (%rdi)
> -       VMOVU   %YMM1, -32(%rdi,%rdx)
> -       VZEROUPPER_RETURN
> -#endif
> -#if VEC_SIZE > 16
> -       /* From 16 to 31.  No branch when size == 16.  */
> -L(between_16_31):
> -       VMOVU   (%rsi), %XMM0
> -       VMOVU   -16(%rsi,%rdx), %XMM1
> -       VMOVU   %XMM0, (%rdi)
> -       VMOVU   %XMM1, -16(%rdi,%rdx)
> +       VMOVU   %YMM1, -32(%rdi, %rdx)
>         VZEROUPPER_RETURN
>  #endif
> +
> +       .p2align 4,, 10
>  L(between_8_15):
>         /* From 8 to 15.  No branch when size == 8.  */
> -       movq    -8(%rsi,%rdx), %rcx
> +       movq    -8(%rsi, %rdx), %rcx
>         movq    (%rsi), %rsi
> -       movq    %rcx, -8(%rdi,%rdx)
>         movq    %rsi, (%rdi)
> +       movq    %rcx, -8(%rdi, %rdx)
>         ret
> -L(between_4_7):
> -       /* From 4 to 7.  No branch when size == 4.  */
> -       movl    -4(%rsi,%rdx), %ecx
> -       movl    (%rsi), %esi
> -       movl    %ecx, -4(%rdi,%rdx)
> -       movl    %esi, (%rdi)
> -       ret
> -L(between_2_3):
> -       /* From 2 to 3.  No branch when size == 2.  */
> -       movzwl  -2(%rsi,%rdx), %ecx
> -       movzwl  (%rsi), %esi
> -       movw    %cx, -2(%rdi,%rdx)
> -       movw    %si, (%rdi)
> -       ret
>
> +       .p2align 4,, 10
> +L(last_4x_vec):
> +       /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
> +
> +       /* VEC(0) and VEC(1) have already been loaded.  */
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(2)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
> +       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> +       VMOVU   %VEC(2), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  L(movsb_more_2x_vec):
>         cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
>         ja      L(movsb)
>  #endif
>  L(more_2x_vec):
> -       /* More than 2 * VEC and there may be overlap between destination
> -          and source.  */
> +       /* More than 2 * VEC and there may be overlap between
> +          destination and source.  */
>         cmpq    $(VEC_SIZE * 8), %rdx
>         ja      L(more_8x_vec)
> +       /* Load VEC(1) regardless. VEC(0) has already been loaded.  */
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
>         cmpq    $(VEC_SIZE * 4), %rdx
>         jbe     L(last_4x_vec)
> -       /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +       /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
>         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
>         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(4)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
> -       VMOVU   -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
> -       VMOVU   -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(4)
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
>         VMOVU   %VEC(0), (%rdi)
>         VMOVU   %VEC(1), VEC_SIZE(%rdi)
>         VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
>         VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> -       VMOVU   %VEC(4), -VEC_SIZE(%rdi,%rdx)
> -       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
> -       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
> -       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
> -       VZEROUPPER_RETURN
> -L(last_4x_vec):
> -       /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(2)
> -       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
> -       VMOVU   %VEC(0), (%rdi)
> -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVU   %VEC(2), -VEC_SIZE(%rdi,%rdx)
> -       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
> +       VMOVU   %VEC(4), -VEC_SIZE(%rdi, %rdx)
> +       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> +       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> +       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
>         VZEROUPPER_RETURN
>
> +       .p2align 4,, 4
>  L(more_8x_vec):
> +       movq    %rdi, %rcx
> +       subq    %rsi, %rcx
> +       /* Go to the backward temporal copy whenever there is overlap, as
> +          backward REP MOVSB is slow and we don't want to use NT stores
> +          if there is overlap.  */
> +       cmpq    %rdx, %rcx
> +       /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> +       jb      L(more_8x_vec_backward_check_nop)
>         /* Check if non-temporal move candidate.  */
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
>         /* Check non-temporal store threshold.  */
> -       cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> +       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
>         ja      L(large_memcpy_2x)
>  #endif
> -       /* Entry if rdx is greater than non-temporal threshold but there
> -       is overlap.  */
> +       /* To reach this point overlap with dst > src is impossible. So
> +          check for overlap with src > dst, in which case correctness
> +          requires a forward copy. Otherwise decide between backward and
> +          forward copy depending on address aliasing.  */
> +
> +       /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
> +          but less than __x86_shared_non_temporal_threshold.  */
>  L(more_8x_vec_check):
> -       cmpq    %rsi, %rdi
> -       ja      L(more_8x_vec_backward)
> -       /* Source == destination is less common.  */
> -       je      L(nop)
> -       /* Load the first VEC and last 4 * VEC to support overlapping
> -          addresses.  */
> -       VMOVU   (%rsi), %VEC(4)
> +       /* rcx contains dst - src. Add back length (rdx).  */
> +       leaq    (%rcx, %rdx), %r8
> +       /* If r8 has different sign than rcx then there is overlap so we
> +          must do forward copy.  */
> +       xorq    %rcx, %r8
> +       /* Isolate just sign bit of r8.  */
> +       shrq    $63, %r8
> +       /* Get 4k difference dst - src.  */
> +       andl    $(PAGE_SIZE - 256), %ecx
> +       /* If r8 is non-zero we must do a forward copy for correctness.
> +          Otherwise if ecx is non-zero there is 4k false aliasing so do
> +          a backward copy.  */
> +       addl    %r8d, %ecx
> +       jz      L(more_8x_vec_backward)
> +
> +       /* Entry if rdx is greater than __x86_shared_non_temporal_threshold
> +          but there is overlap, or from the short distance movsb check.  */
> +L(more_8x_vec_forward):
> +       /* Load first and last 4 * VEC to support overlapping addresses.
> +        */
> +
> +       /* First vec was already loaded into VEC(0).  */
>         VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(5)
>         VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
> +       /* Save beginning of dst.  */
> +       movq    %rdi, %rcx
> +       /* Align dst to VEC_SIZE - 1.  */
> +       orq     $(VEC_SIZE - 1), %rdi
>         VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
>         VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
> -       /* Save start and stop of the destination buffer.  */
> -       movq    %rdi, %r11
> -       leaq    -VEC_SIZE(%rdi, %rdx), %rcx
> -       /* Align destination for aligned stores in the loop.  Compute
> -          how much destination is misaligned.  */
> -       movq    %rdi, %r8
> -       andq    $(VEC_SIZE - 1), %r8
> -       /* Get the negative of offset for alignment.  */
> -       subq    $VEC_SIZE, %r8
> -       /* Adjust source.  */
> -       subq    %r8, %rsi
> -       /* Adjust destination which should be aligned now.  */
> -       subq    %r8, %rdi
> -       /* Adjust length.  */
> -       addq    %r8, %rdx
>
> -       .p2align 4
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rcx, %rsi
> +       /* Finish aligning dst.  */
> +       incq    %rdi
> +       /* Restore src adjusted with new value for aligned dst.  */
> +       addq    %rdi, %rsi
> +       /* Store end of buffer minus tail in rdx.  */
> +       leaq    (VEC_SIZE * -4)(%rcx, %rdx), %rdx
> +
> +       /* Don't use multi-byte nop to align.  */
> +       .p2align 4,, 11
>  L(loop_4x_vec_forward):
>         /* Copy 4 * VEC a time forward.  */
> -       VMOVU   (%rsi), %VEC(0)
> -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +       VMOVU   (%rsi), %VEC(1)
> +       VMOVU   VEC_SIZE(%rsi), %VEC(2)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(3)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(4)
>         subq    $-(VEC_SIZE * 4), %rsi
> -       addq    $-(VEC_SIZE * 4), %rdx
> -       VMOVA   %VEC(0), (%rdi)
> -       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> -       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +       VMOVA   %VEC(1), (%rdi)
> +       VMOVA   %VEC(2), VEC_SIZE(%rdi)
> +       VMOVA   %VEC(3), (VEC_SIZE * 2)(%rdi)
> +       VMOVA   %VEC(4), (VEC_SIZE * 3)(%rdi)
>         subq    $-(VEC_SIZE * 4), %rdi
> -       cmpq    $(VEC_SIZE * 4), %rdx
> +       cmpq    %rdi, %rdx
>         ja      L(loop_4x_vec_forward)
>         /* Store the last 4 * VEC.  */
> -       VMOVU   %VEC(5), (%rcx)
> -       VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> -       VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> -       VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> +       VMOVU   %VEC(5), (VEC_SIZE * 3)(%rdx)
> +       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdx)
> +       VMOVU   %VEC(7), VEC_SIZE(%rdx)
> +       VMOVU   %VEC(8), (%rdx)
>         /* Store the first VEC.  */
> -       VMOVU   %VEC(4), (%r11)
> +       VMOVU   %VEC(0), (%rcx)
> +       /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
> +        */
> +L(nop_backward):
>         VZEROUPPER_RETURN
>
> +       .p2align 4,, 8
> +L(more_8x_vec_backward_check_nop):
> +       /* rcx contains dst - src. Test for dst == src to skip all of
> +          memmove.  */
> +       testq   %rcx, %rcx
> +       jz      L(nop_backward)
>  L(more_8x_vec_backward):
>         /* Load the first 4 * VEC and last VEC to support overlapping
>            addresses.  */
> -       VMOVU   (%rsi), %VEC(4)
> +
> +       /* First vec was also loaded into VEC(0).  */
>         VMOVU   VEC_SIZE(%rsi), %VEC(5)
>         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(6)
> +       /* Beginning of region for 4x backward copy stored in rcx.  */
> +       leaq    (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
>         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(7)
> -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(8)
> -       /* Save stop of the destination buffer.  */
> -       leaq    -VEC_SIZE(%rdi, %rdx), %r11
> -       /* Align destination end for aligned stores in the loop.  Compute
> -          how much destination end is misaligned.  */
> -       leaq    -VEC_SIZE(%rsi, %rdx), %rcx
> -       movq    %r11, %r9
> -       movq    %r11, %r8
> -       andq    $(VEC_SIZE - 1), %r8
> -       /* Adjust source.  */
> -       subq    %r8, %rcx
> -       /* Adjust the end of destination which should be aligned now.  */
> -       subq    %r8, %r9
> -       /* Adjust length.  */
> -       subq    %r8, %rdx
> -
> -       .p2align 4
> +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(8)
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rdi, %rsi
> +       /* Align dst.  */
> +       andq    $-(VEC_SIZE), %rcx
> +       /* Restore src.  */
> +       addq    %rcx, %rsi
> +
> +       /* Don't use multi-byte nop to align.  */
> +       .p2align 4,, 11
>  L(loop_4x_vec_backward):
>         /* Copy 4 * VEC a time backward.  */
> -       VMOVU   (%rcx), %VEC(0)
> -       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> -       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> -       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> -       addq    $-(VEC_SIZE * 4), %rcx
> -       addq    $-(VEC_SIZE * 4), %rdx
> -       VMOVA   %VEC(0), (%r9)
> -       VMOVA   %VEC(1), -VEC_SIZE(%r9)
> -       VMOVA   %VEC(2), -(VEC_SIZE * 2)(%r9)
> -       VMOVA   %VEC(3), -(VEC_SIZE * 3)(%r9)
> -       addq    $-(VEC_SIZE * 4), %r9
> -       cmpq    $(VEC_SIZE * 4), %rdx
> -       ja      L(loop_4x_vec_backward)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(1)
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> +       VMOVU   (VEC_SIZE * 1)(%rsi), %VEC(3)
> +       VMOVU   (VEC_SIZE * 0)(%rsi), %VEC(4)
> +       addq    $(VEC_SIZE * -4), %rsi
> +       VMOVA   %VEC(1), (VEC_SIZE * 3)(%rcx)
> +       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rcx)
> +       VMOVA   %VEC(3), (VEC_SIZE * 1)(%rcx)
> +       VMOVA   %VEC(4), (VEC_SIZE * 0)(%rcx)
> +       addq    $(VEC_SIZE * -4), %rcx
> +       cmpq    %rcx, %rdi
> +       jb      L(loop_4x_vec_backward)
>         /* Store the first 4 * VEC.  */
> -       VMOVU   %VEC(4), (%rdi)
> +       VMOVU   %VEC(0), (%rdi)
>         VMOVU   %VEC(5), VEC_SIZE(%rdi)
>         VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
>         VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
>         /* Store the last VEC.  */
> -       VMOVU   %VEC(8), (%r11)
> +       VMOVU   %VEC(8), -VEC_SIZE(%rdx, %rdi)
> +       VZEROUPPER_RETURN
> +
> +#if defined USE_MULTIARCH && IS_IN (libc)
> +       /* L(skip_short_movsb_check) is only used with ERMS. Not for
> +          FSRM.  */
> +       .p2align 5,, 16
> +# if ALIGN_MOVSB
> +L(skip_short_movsb_check):
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +#  endif
> +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> +#   error Unsupported MOVSB_ALIGN_TO
> +#  endif
> +       /* If the CPU does not have FSRM there are two options for aligning.
> +          Align src if dst and src 4k alias. Otherwise align dst.  */
> +       testl   $(PAGE_SIZE - 512), %ecx
> +       jnz     L(movsb_align_dst)
> +       /* Fall through. dst and src 4k alias. It's better to align src
> +          here because the bottleneck will be loads due to the false
> +          dependency on dst.  */
> +
> +       /* rcx already has dst - src.  */
> +       movq    %rcx, %r9
> +       /* Add src to len. Subtract back after src aligned. -1 because
> +          src is initially aligned to MOVSB_ALIGN_TO - 1.  */
> +       leaq    -1(%rsi, %rdx), %rcx
> +       /* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
> +       orq     $(MOVSB_ALIGN_TO - 1), %rsi
> +       /* Restore dst and len adjusted with new values for aligned dst.
> +        */
> +       leaq    1(%rsi, %r9), %rdi
> +       subq    %rsi, %rcx
> +       /* Finish aligning src.  */
> +       incq    %rsi
> +
> +       rep     movsb
> +
> +       VMOVU   %VEC(0), (%r8)
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +#  endif
>         VZEROUPPER_RETURN
> +# endif
> +
> +       .p2align 4,, 12
> +L(movsb):
> +       movq    %rdi, %rcx
> +       subq    %rsi, %rcx
> +       /* Go to the backward temporal copy whenever there is overlap, as
> +          backward REP MOVSB is slow and we don't want to use NT stores
> +          if there is overlap.  */
> +       cmpq    %rdx, %rcx
> +       /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> +       jb      L(more_8x_vec_backward_check_nop)
> +# if ALIGN_MOVSB
> +       /* Save dest for storing aligning VECs later.  */
> +       movq    %rdi, %r8
> +# endif
> +       /* If above __x86_rep_movsb_stop_threshold it is most likely a
> +          candidate for NT moves as well.  */
> +       cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> +       jae     L(large_memcpy_2x_check)
> +# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
> +       /* Only avoid short movsb if CPU has FSRM.  */
> +       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +       jz      L(skip_short_movsb_check)
> +#  if AVOID_SHORT_DISTANCE_REP_MOVSB
> +       /* Avoid "rep movsb" if RCX, the distance between source and
> +          destination, is N*4GB + [1..63] with N >= 0.  */
> +
> +       /* ecx contains dst - src. The early check for backward copy
> +          conditions means the only remaining slow movsb case, src =
> +          dst + [0, 63], has ecx in [-63, 0]. Use an unsigned comparison
> +          with -64 to check for that case.  */
> +       cmpl    $-64, %ecx
> +       ja      L(more_8x_vec_forward)
> +#  endif
> +# endif
> +# if ALIGN_MOVSB
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +#  endif
> +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> +#   error Unsupported MOVSB_ALIGN_TO
> +#  endif
> +       /* Fall through means cpu has FSRM. In that case exclusively
> +          align destination.  */
> +L(movsb_align_dst):
> +       /* Subtract dst from src. Add back after dst aligned.  */
> +       subq    %rdi, %rsi
> +       /* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
> +       addq    $(MOVSB_ALIGN_TO - 1), %rdi
> +       /* Add dst to len. Subtract back after dst aligned.  */
> +       leaq    (%r8, %rdx), %rcx
> +       /* Finish aligning dst.  */
> +       andq    $-(MOVSB_ALIGN_TO), %rdi
> +       /* Restore src and len adjusted with new values for aligned dst.
> +        */
> +       addq    %rdi, %rsi
> +       subq    %rdi, %rcx
> +
> +       rep     movsb
> +
> +       /* Store VECs loaded for aligning.  */
> +       VMOVU   %VEC(0), (%r8)
> +#  if MOVSB_ALIGN_TO > VEC_SIZE
> +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> +#  endif
> +       VZEROUPPER_RETURN
> +# else /* !ALIGN_MOVSB.  */
> +L(skip_short_movsb_check):
> +       mov     %RDX_LP, %RCX_LP
> +       rep     movsb
> +       ret
> +# endif
> +#endif
>
> +       .p2align 4,, 10
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -       .p2align 4
> +L(large_memcpy_2x_check):
> +       cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
> +       jb      L(more_8x_vec_check)
>  L(large_memcpy_2x):
> -       /* Compute absolute value of difference between source and
> -          destination.  */
> -       movq    %rdi, %r9
> -       subq    %rsi, %r9
> -       movq    %r9, %r8
> -       leaq    -1(%r9), %rcx
> -       sarq    $63, %r8
> -       xorq    %r8, %r9
> -       subq    %r8, %r9
> -       /* Don't use non-temporal store if there is overlap between
> -          destination and source since destination may be in cache when
> -          source is loaded.  */
> -       cmpq    %r9, %rdx
> -       ja      L(more_8x_vec_check)
> +       /* To reach this point overlap with dst > src is impossible. What
> +          remains to check is overlap with src > dst. rcx already
> +          contains dst - src. Negate rcx to get src - dst. If
> +          length > rcx then there is overlap and a forward copy is best.  */
> +       negq    %rcx
> +       cmpq    %rcx, %rdx
> +       ja      L(more_8x_vec_forward)
>
>         /* Cache align destination. First store the first 64 bytes then
>            adjust alignments.  */
> -       VMOVU   (%rsi), %VEC(8)
> -#if VEC_SIZE < 64
> -       VMOVU   VEC_SIZE(%rsi), %VEC(9)
> -#if VEC_SIZE < 32
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(10)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(11)
> -#endif
> -#endif
> -       VMOVU   %VEC(8), (%rdi)
> -#if VEC_SIZE < 64
> -       VMOVU   %VEC(9), VEC_SIZE(%rdi)
> -#if VEC_SIZE < 32
> -       VMOVU   %VEC(10), (VEC_SIZE * 2)(%rdi)
> -       VMOVU   %VEC(11), (VEC_SIZE * 3)(%rdi)
> -#endif
> -#endif
> +
> +       /* First vec was also loaded into VEC(0).  */
> +# if VEC_SIZE < 64
> +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> +#  if VEC_SIZE < 32
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> +#  endif
> +# endif
> +       VMOVU   %VEC(0), (%rdi)
> +# if VEC_SIZE < 64
> +       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> +#  if VEC_SIZE < 32
> +       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> +       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> +#  endif
> +# endif
> +
>         /* Adjust source, destination, and size.  */
>         movq    %rdi, %r8
>         andq    $63, %r8
> @@ -614,9 +767,13 @@ L(large_memcpy_2x):
>         /* Adjust length.  */
>         addq    %r8, %rdx
>
> -       /* Test if source and destination addresses will alias. If they do
> -          the larger pipeline in large_memcpy_4x alleviated the
> +       /* Test if source and destination addresses will alias. If they
> +          do the larger pipeline in large_memcpy_4x alleviated the
>            performance drop.  */
> +
> +       /* ecx contains -(dst - src). not ecx will return dst - src - 1
> +          which works for testing aliasing.  */
> +       notl    %ecx
>         testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
>         jz      L(large_memcpy_4x)
>
> @@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
>         /* ecx stores inner loop counter.  */
>         movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
>  L(loop_large_memcpy_4x_inner):
> -       /* Only one prefetch set per page as doing 4 pages give more time
> -          for prefetcher to keep up.  */
> +       /* Only one prefetch set per page as doing 4 pages give more
> +          time for prefetcher to keep up.  */
>         PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
>         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  
Sunil Pandey April 23, 2022, 1:41 a.m. UTC | #2
On Sat, Nov 6, 2021 at 12:12 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Sat, Nov 6, 2021 at 11:33 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > No bug.
> >
> > The optimizations are as follows:
> >
> > 1) Always align entry to 64 bytes. This makes behavior more
> >    predictable and makes other frontend optimizations easier.
> >
> > 2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
> >    significant benefits in the case that:
> >         0 < (dst - src) < [256, 512]
> >
> > 3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
> >    improvement and for FSRM [-10%, 25%].
> >
> > In addition to these primary changes there is general cleanup
> > throughout to optimize the aligning routines and control flow logic.
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> >  sysdeps/x86_64/memmove.S                      |   2 +-
> >  .../memmove-avx-unaligned-erms-rtm.S          |   2 +-
> >  .../multiarch/memmove-avx-unaligned-erms.S    |   2 +-
> >  .../multiarch/memmove-avx512-unaligned-erms.S |   2 +-
> >  .../multiarch/memmove-evex-unaligned-erms.S   |   2 +-
> >  .../multiarch/memmove-vec-unaligned-erms.S    | 595 +++++++++++-------
> >  6 files changed, 381 insertions(+), 224 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
> > index db106a7a1f..b2b3180848 100644
> > --- a/sysdeps/x86_64/memmove.S
> > +++ b/sysdeps/x86_64/memmove.S
> > @@ -25,7 +25,7 @@
> >  /* Use movups and movaps for smaller code sizes.  */
> >  #define VMOVU          movups
> >  #define VMOVA          movaps
> > -
> > +#define MOV_SIZE       3
> >  #define SECTION(p)             p
> >
> >  #ifdef USE_MULTIARCH
> > diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> > index 1ec1962e86..67a55f0c85 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
> > @@ -4,7 +4,7 @@
> >  # define VMOVNT                vmovntdq
> >  # define VMOVU         vmovdqu
> >  # define VMOVA         vmovdqa
> > -
> > +# define MOV_SIZE      4
> >  # define ZERO_UPPER_VEC_REGISTERS_RETURN \
> >    ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> >
> > diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> > index e195e93f15..975ae6c051 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
> > @@ -4,7 +4,7 @@
> >  # define VMOVNT                vmovntdq
> >  # define VMOVU         vmovdqu
> >  # define VMOVA         vmovdqa
> > -
> > +# define MOV_SIZE      4
> >  # define SECTION(p)            p##.avx
> >  # define MEMMOVE_SYMBOL(p,s)   p##_avx_##s
> >
> > diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> > index 848848ab39..0fa7126830 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
> > @@ -25,7 +25,7 @@
> >  # define VMOVU         vmovdqu64
> >  # define VMOVA         vmovdqa64
> >  # define VZEROUPPER
> > -
> > +# define MOV_SIZE      6
> >  # define SECTION(p)            p##.evex512
> >  # define MEMMOVE_SYMBOL(p,s)   p##_avx512_##s
> >
> > diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> > index 0cbce8f944..88715441fe 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
> > @@ -25,7 +25,7 @@
> >  # define VMOVU         vmovdqu64
> >  # define VMOVA         vmovdqa64
> >  # define VZEROUPPER
> > -
> > +# define MOV_SIZE      6
> >  # define SECTION(p)            p##.evex
> >  # define MEMMOVE_SYMBOL(p,s)   p##_evex_##s
> >
> > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > index abde8438d4..7b27cbdda5 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > @@ -76,6 +76,25 @@
> >  # endif
> >  #endif
> >
> > +/* Whether to align before movsb. Ultimately we want 64 byte
> > +   align and not worth it to load 4x VEC for VEC_SIZE == 16.  */
> > +#define ALIGN_MOVSB    (VEC_SIZE > 16)
> > +/* Number of bytes to align movsb to.  */
> > +#define MOVSB_ALIGN_TO 64
> > +
> > +#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
> > +#define LARGE_MOV_SIZE (MOV_SIZE > 4)
> > +
> > +#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
> > +# error MOV_SIZE Unknown
> > +#endif
> > +
> > +#if LARGE_MOV_SIZE
> > +# define SMALL_SIZE_OFFSET     (4)
> > +#else
> > +# define SMALL_SIZE_OFFSET     (0)
> > +#endif
> > +
> >  #ifndef PAGE_SIZE
> >  # define PAGE_SIZE 4096
> >  #endif
> > @@ -199,25 +218,21 @@ L(start):
> >  # endif
> >         cmp     $VEC_SIZE, %RDX_LP
> >         jb      L(less_vec)
> > +       /* Load regardless.  */
> > +       VMOVU   (%rsi), %VEC(0)
> >         cmp     $(VEC_SIZE * 2), %RDX_LP
> >         ja      L(more_2x_vec)
> > -#if !defined USE_MULTIARCH || !IS_IN (libc)
> > -L(last_2x_vec):
> > -#endif
> >         /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > -       VMOVU   (%rsi), %VEC(0)
> >         VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
> >         VMOVU   %VEC(0), (%rdi)
> >         VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> > -#if !defined USE_MULTIARCH || !IS_IN (libc)
> > -L(nop):
> > -       ret
> > +#if !(defined USE_MULTIARCH && IS_IN (libc))
> > +       ZERO_UPPER_VEC_REGISTERS_RETURN
> >  #else
> >         VZEROUPPER_RETURN
> >  #endif
> >  #if defined USE_MULTIARCH && IS_IN (libc)
> >  END (MEMMOVE_SYMBOL (__memmove, unaligned))
> > -
> >  # if VEC_SIZE == 16
> >  ENTRY (__mempcpy_chk_erms)
> >         cmp     %RDX_LP, %RCX_LP
> > @@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
> >  END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
> >  # endif
> >
> > -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> > +ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
> >         movq    %rdi, %rax
> >  L(start_erms):
> >  # ifdef __ILP32__
> > @@ -298,310 +313,448 @@ L(start_erms):
> >  # endif
> >         cmp     $VEC_SIZE, %RDX_LP
> >         jb      L(less_vec)
> > +       /* Load regardless.  */
> > +       VMOVU   (%rsi), %VEC(0)
> >         cmp     $(VEC_SIZE * 2), %RDX_LP
> >         ja      L(movsb_more_2x_vec)
> > -L(last_2x_vec):
> > -       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
> > -       VMOVU   (%rsi), %VEC(0)
> > -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
> > +       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
> > +        */
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(1)
> >         VMOVU   %VEC(0), (%rdi)
> > -       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
> > +       VMOVU   %VEC(1), -VEC_SIZE(%rdi, %rdx)
> >  L(return):
> > -#if VEC_SIZE > 16
> > +# if VEC_SIZE > 16
> >         ZERO_UPPER_VEC_REGISTERS_RETURN
> > -#else
> > +# else
> >         ret
> > +# endif
> >  #endif
> >
> > -L(movsb):
> > -       cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> > -       jae     L(more_8x_vec)
> > -       cmpq    %rsi, %rdi
> > -       jb      1f
> > -       /* Source == destination is less common.  */
> > -       je      L(nop)
> > -       leaq    (%rsi,%rdx), %r9
> > -       cmpq    %r9, %rdi
> > -       /* Avoid slow backward REP MOVSB.  */
> > -       jb      L(more_8x_vec_backward)
> > -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> > -       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> > -       jz      3f
> > -       movq    %rdi, %rcx
> > -       subq    %rsi, %rcx
> > -       jmp     2f
> > -# endif
> > -1:
> > -# if AVOID_SHORT_DISTANCE_REP_MOVSB
> > -       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> > -       jz      3f
> > -       movq    %rsi, %rcx
> > -       subq    %rdi, %rcx
> > -2:
> > -/* Avoid "rep movsb" if RCX, the distance between source and destination,
> > -   is N*4GB + [1..63] with N >= 0.  */
> > -       cmpl    $63, %ecx
> > -       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
> > -3:
> > -# endif
> > -       mov     %RDX_LP, %RCX_LP
> > -       rep movsb
> > -L(nop):
> > +#if LARGE_MOV_SIZE
> > +       /* If LARGE_MOV_SIZE this fits in the aligning bytes between the
> > +          ENTRY block and L(less_vec).  */
> > +       .p2align 4,, 8
> > +L(between_4_7):
> > +       /* From 4 to 7.  No branch when size == 4.  */
> > +       movl    (%rsi), %ecx
> > +       movl    (%rsi, %rdx), %esi
> > +       movl    %ecx, (%rdi)
> > +       movl    %esi, (%rdi, %rdx)
> >         ret
> >  #endif
> >
> > +       .p2align 4
> >  L(less_vec):
> >         /* Less than 1 VEC.  */
> >  #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> >  # error Unsupported VEC_SIZE!
> >  #endif
> >  #if VEC_SIZE > 32
> > -       cmpb    $32, %dl
> > +       cmpl    $32, %edx
> >         jae     L(between_32_63)
> >  #endif
> >  #if VEC_SIZE > 16
> > -       cmpb    $16, %dl
> > +       cmpl    $16, %edx
> >         jae     L(between_16_31)
> >  #endif
> > -       cmpb    $8, %dl
> > +       cmpl    $8, %edx
> >         jae     L(between_8_15)
> > -       cmpb    $4, %dl
> > +#if SMALL_MOV_SIZE
> > +       cmpl    $4, %edx
> > +#else
> > +       subq    $4, %rdx
> > +#endif
> >         jae     L(between_4_7)
> > -       cmpb    $1, %dl
> > -       ja      L(between_2_3)
> > -       jb      1f
> > -       movzbl  (%rsi), %ecx
> > +       cmpl    $(1 - SMALL_SIZE_OFFSET), %edx
> > +       jl      L(copy_0)
> > +       movb    (%rsi), %cl
> > +       je      L(copy_1)
> > +       movzwl  (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
> > +       movw    %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
> > +L(copy_1):
> >         movb    %cl, (%rdi)
> > -1:
> > +L(copy_0):
> >         ret
> > +
> > +#if SMALL_MOV_SIZE
> > +       .p2align 4,, 8
> > +L(between_4_7):
> > +       /* From 4 to 7.  No branch when size == 4.  */
> > +       movl    -4(%rsi, %rdx), %ecx
> > +       movl    (%rsi), %esi
> > +       movl    %ecx, -4(%rdi, %rdx)
> > +       movl    %esi, (%rdi)
> > +       ret
> > +#endif
> > +
> > +#if VEC_SIZE > 16
> > +       /* From 16 to 31.  No branch when size == 16.  */
> > +       .p2align 4,, 8
> > +L(between_16_31):
> > +       vmovdqu (%rsi), %xmm0
> > +       vmovdqu -16(%rsi, %rdx), %xmm1
> > +       vmovdqu %xmm0, (%rdi)
> > +       vmovdqu %xmm1, -16(%rdi, %rdx)
> > +       /* No ymm registers have been touched.  */
> > +       ret
> > +#endif
> > +
> >  #if VEC_SIZE > 32
> > +       .p2align 4,, 10
> >  L(between_32_63):
> >         /* From 32 to 63.  No branch when size == 32.  */
> >         VMOVU   (%rsi), %YMM0
> > -       VMOVU   -32(%rsi,%rdx), %YMM1
> > +       VMOVU   -32(%rsi, %rdx), %YMM1
> >         VMOVU   %YMM0, (%rdi)
> > -       VMOVU   %YMM1, -32(%rdi,%rdx)
> > -       VZEROUPPER_RETURN
> > -#endif
> > -#if VEC_SIZE > 16
> > -       /* From 16 to 31.  No branch when size == 16.  */
> > -L(between_16_31):
> > -       VMOVU   (%rsi), %XMM0
> > -       VMOVU   -16(%rsi,%rdx), %XMM1
> > -       VMOVU   %XMM0, (%rdi)
> > -       VMOVU   %XMM1, -16(%rdi,%rdx)
> > +       VMOVU   %YMM1, -32(%rdi, %rdx)
> >         VZEROUPPER_RETURN
> >  #endif
> > +
> > +       .p2align 4,, 10
> >  L(between_8_15):
> >         /* From 8 to 15.  No branch when size == 8.  */
> > -       movq    -8(%rsi,%rdx), %rcx
> > +       movq    -8(%rsi, %rdx), %rcx
> >         movq    (%rsi), %rsi
> > -       movq    %rcx, -8(%rdi,%rdx)
> >         movq    %rsi, (%rdi)
> > +       movq    %rcx, -8(%rdi, %rdx)
> >         ret
> > -L(between_4_7):
> > -       /* From 4 to 7.  No branch when size == 4.  */
> > -       movl    -4(%rsi,%rdx), %ecx
> > -       movl    (%rsi), %esi
> > -       movl    %ecx, -4(%rdi,%rdx)
> > -       movl    %esi, (%rdi)
> > -       ret
> > -L(between_2_3):
> > -       /* From 2 to 3.  No branch when size == 2.  */
> > -       movzwl  -2(%rsi,%rdx), %ecx
> > -       movzwl  (%rsi), %esi
> > -       movw    %cx, -2(%rdi,%rdx)
> > -       movw    %si, (%rdi)
> > -       ret
> >
> > +       .p2align 4,, 10
> > +L(last_4x_vec):
> > +       /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
> > +
> > +       /* VEC(0) and VEC(1) have already been loaded.  */
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(2)
> > +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
> > +       VMOVU   %VEC(0), (%rdi)
> > +       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> > +       VMOVU   %VEC(2), -VEC_SIZE(%rdi, %rdx)
> > +       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +       VZEROUPPER_RETURN
> > +
> > +       .p2align 4
> >  #if defined USE_MULTIARCH && IS_IN (libc)
> >  L(movsb_more_2x_vec):
> >         cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
> >         ja      L(movsb)
> >  #endif
> >  L(more_2x_vec):
> > -       /* More than 2 * VEC and there may be overlap between destination
> > -          and source.  */
> > +       /* More than 2 * VEC and there may be overlap between
> > +          destination and source.  */
> >         cmpq    $(VEC_SIZE * 8), %rdx
> >         ja      L(more_8x_vec)
> > +       /* Load VEC(1) regardless. VEC(0) has already been loaded.  */
> > +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> >         cmpq    $(VEC_SIZE * 4), %rdx
> >         jbe     L(last_4x_vec)
> > -       /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
> > -       VMOVU   (%rsi), %VEC(0)
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +       /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
> >         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> >         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(4)
> > -       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
> > -       VMOVU   -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
> > -       VMOVU   -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(4)
> > +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
> > +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
> > +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
> >         VMOVU   %VEC(0), (%rdi)
> >         VMOVU   %VEC(1), VEC_SIZE(%rdi)
> >         VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> >         VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > -       VMOVU   %VEC(4), -VEC_SIZE(%rdi,%rdx)
> > -       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
> > -       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
> > -       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
> > -       VZEROUPPER_RETURN
> > -L(last_4x_vec):
> > -       /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
> > -       VMOVU   (%rsi), %VEC(0)
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(2)
> > -       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
> > -       VMOVU   %VEC(0), (%rdi)
> > -       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> > -       VMOVU   %VEC(2), -VEC_SIZE(%rdi,%rdx)
> > -       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
> > +       VMOVU   %VEC(4), -VEC_SIZE(%rdi, %rdx)
> > +       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
> > +       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
> >         VZEROUPPER_RETURN
> >
> > +       .p2align 4,, 4
> >  L(more_8x_vec):
> > +       movq    %rdi, %rcx
> > +       subq    %rsi, %rcx
> > +       /* Go to backwards temporal copy if overlap no matter what as
> > +          backward REP MOVSB is slow and we don't want to use NT stores if
> > +          there is overlap.  */
> > +       cmpq    %rdx, %rcx
> > +       /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> > +       jb      L(more_8x_vec_backward_check_nop)
> >         /* Check if non-temporal move candidate.  */
> >  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> >         /* Check non-temporal store threshold.  */
> > -       cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > +       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> >         ja      L(large_memcpy_2x)
> >  #endif
> > -       /* Entry if rdx is greater than non-temporal threshold but there
> > -       is overlap.  */
> > +       /* To reach this point there cannot be overlap and dst > src. So
> > +          check for overlap and src > dst in which case correctness
> > +          requires forward copy. Otherwise decide between backward/forward
> > +          copy depending on address aliasing.  */
> > +
> > +       /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
> > +          but less than __x86_shared_non_temporal_threshold.  */
> >  L(more_8x_vec_check):
> > -       cmpq    %rsi, %rdi
> > -       ja      L(more_8x_vec_backward)
> > -       /* Source == destination is less common.  */
> > -       je      L(nop)
> > -       /* Load the first VEC and last 4 * VEC to support overlapping
> > -          addresses.  */
> > -       VMOVU   (%rsi), %VEC(4)
> > +       /* rcx contains dst - src. Add back length (rdx).  */
> > +       leaq    (%rcx, %rdx), %r8
> > +       /* If r8 has different sign than rcx then there is overlap so we
> > +          must do forward copy.  */
> > +       xorq    %rcx, %r8
> > +       /* Isolate just sign bit of r8.  */
> > +       shrq    $63, %r8
> > +       /* Get 4k difference dst - src.  */
> > +       andl    $(PAGE_SIZE - 256), %ecx
> > +       /* If r8 is non-zero must do foward for correctness. Otherwise
> > +          if ecx is non-zero there is 4k False Alaising so do backward
> > +          copy.  */
> > +       addl    %r8d, %ecx
> > +       jz      L(more_8x_vec_backward)
> > +
> > +       /* if rdx is greater than __x86_shared_non_temporal_threshold
> > +          but there is overlap, or from short distance movsb.  */
> > +L(more_8x_vec_forward):
> > +       /* Load first and last 4 * VEC to support overlapping addresses.
> > +        */
> > +
> > +       /* First vec was already loaded into VEC(0).  */
> >         VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(5)
> >         VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
> > +       /* Save begining of dst.  */
> > +       movq    %rdi, %rcx
> > +       /* Align dst to VEC_SIZE - 1.  */
> > +       orq     $(VEC_SIZE - 1), %rdi
> >         VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
> >         VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
> > -       /* Save start and stop of the destination buffer.  */
> > -       movq    %rdi, %r11
> > -       leaq    -VEC_SIZE(%rdi, %rdx), %rcx
> > -       /* Align destination for aligned stores in the loop.  Compute
> > -          how much destination is misaligned.  */
> > -       movq    %rdi, %r8
> > -       andq    $(VEC_SIZE - 1), %r8
> > -       /* Get the negative of offset for alignment.  */
> > -       subq    $VEC_SIZE, %r8
> > -       /* Adjust source.  */
> > -       subq    %r8, %rsi
> > -       /* Adjust destination which should be aligned now.  */
> > -       subq    %r8, %rdi
> > -       /* Adjust length.  */
> > -       addq    %r8, %rdx
> >
> > -       .p2align 4
> > +       /* Subtract dst from src. Add back after dst aligned.  */
> > +       subq    %rcx, %rsi
> > +       /* Finish aligning dst.  */
> > +       incq    %rdi
> > +       /* Restore src adjusted with new value for aligned dst.  */
> > +       addq    %rdi, %rsi
> > +       /* Store end of buffer minus tail in rdx.  */
> > +       leaq    (VEC_SIZE * -4)(%rcx, %rdx), %rdx
> > +
> > +       /* Dont use multi-byte nop to align.  */
> > +       .p2align 4,, 11
> >  L(loop_4x_vec_forward):
> >         /* Copy 4 * VEC a time forward.  */
> > -       VMOVU   (%rsi), %VEC(0)
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > +       VMOVU   (%rsi), %VEC(1)
> > +       VMOVU   VEC_SIZE(%rsi), %VEC(2)
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(3)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(4)
> >         subq    $-(VEC_SIZE * 4), %rsi
> > -       addq    $-(VEC_SIZE * 4), %rdx
> > -       VMOVA   %VEC(0), (%rdi)
> > -       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > -       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > -       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > +       VMOVA   %VEC(1), (%rdi)
> > +       VMOVA   %VEC(2), VEC_SIZE(%rdi)
> > +       VMOVA   %VEC(3), (VEC_SIZE * 2)(%rdi)
> > +       VMOVA   %VEC(4), (VEC_SIZE * 3)(%rdi)
> >         subq    $-(VEC_SIZE * 4), %rdi
> > -       cmpq    $(VEC_SIZE * 4), %rdx
> > +       cmpq    %rdi, %rdx
> >         ja      L(loop_4x_vec_forward)
> >         /* Store the last 4 * VEC.  */
> > -       VMOVU   %VEC(5), (%rcx)
> > -       VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> > -       VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> > -       VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> > +       VMOVU   %VEC(5), (VEC_SIZE * 3)(%rdx)
> > +       VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdx)
> > +       VMOVU   %VEC(7), VEC_SIZE(%rdx)
> > +       VMOVU   %VEC(8), (%rdx)
> >         /* Store the first VEC.  */
> > -       VMOVU   %VEC(4), (%r11)
> > +       VMOVU   %VEC(0), (%rcx)
> > +       /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
> > +        */
> > +L(nop_backward):
> >         VZEROUPPER_RETURN
> >
> > +       .p2align 4,, 8
> > +L(more_8x_vec_backward_check_nop):
> > +       /* rcx contains dst - src. Test for dst == src to skip all of
> > +          memmove.  */
> > +       testq   %rcx, %rcx
> > +       jz      L(nop_backward)
> >  L(more_8x_vec_backward):
> >         /* Load the first 4 * VEC and last VEC to support overlapping
> >            addresses.  */
> > -       VMOVU   (%rsi), %VEC(4)
> > +
> > +       /* First vec was also loaded into VEC(0).  */
> >         VMOVU   VEC_SIZE(%rsi), %VEC(5)
> >         VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(6)
> > +       /* Begining of region for 4x backward copy stored in rcx.  */
> > +       leaq    (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
> >         VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(7)
> > -       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(8)
> > -       /* Save stop of the destination buffer.  */
> > -       leaq    -VEC_SIZE(%rdi, %rdx), %r11
> > -       /* Align destination end for aligned stores in the loop.  Compute
> > -          how much destination end is misaligned.  */
> > -       leaq    -VEC_SIZE(%rsi, %rdx), %rcx
> > -       movq    %r11, %r9
> > -       movq    %r11, %r8
> > -       andq    $(VEC_SIZE - 1), %r8
> > -       /* Adjust source.  */
> > -       subq    %r8, %rcx
> > -       /* Adjust the end of destination which should be aligned now.  */
> > -       subq    %r8, %r9
> > -       /* Adjust length.  */
> > -       subq    %r8, %rdx
> > -
> > -       .p2align 4
> > +       VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(8)
> > +       /* Subtract dst from src. Add back after dst aligned.  */
> > +       subq    %rdi, %rsi
> > +       /* Align dst.  */
> > +       andq    $-(VEC_SIZE), %rcx
> > +       /* Restore src.  */
> > +       addq    %rcx, %rsi
> > +
> > +       /* Don't use multi-byte nop to align.  */
> > +       .p2align 4,, 11
> >  L(loop_4x_vec_backward):
> >         /* Copy 4 * VEC a time backward.  */
> > -       VMOVU   (%rcx), %VEC(0)
> > -       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> > -       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> > -       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > -       addq    $-(VEC_SIZE * 4), %rcx
> > -       addq    $-(VEC_SIZE * 4), %rdx
> > -       VMOVA   %VEC(0), (%r9)
> > -       VMOVA   %VEC(1), -VEC_SIZE(%r9)
> > -       VMOVA   %VEC(2), -(VEC_SIZE * 2)(%r9)
> > -       VMOVA   %VEC(3), -(VEC_SIZE * 3)(%r9)
> > -       addq    $-(VEC_SIZE * 4), %r9
> > -       cmpq    $(VEC_SIZE * 4), %rdx
> > -       ja      L(loop_4x_vec_backward)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(1)
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > +       VMOVU   (VEC_SIZE * 1)(%rsi), %VEC(3)
> > +       VMOVU   (VEC_SIZE * 0)(%rsi), %VEC(4)
> > +       addq    $(VEC_SIZE * -4), %rsi
> > +       VMOVA   %VEC(1), (VEC_SIZE * 3)(%rcx)
> > +       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rcx)
> > +       VMOVA   %VEC(3), (VEC_SIZE * 1)(%rcx)
> > +       VMOVA   %VEC(4), (VEC_SIZE * 0)(%rcx)
> > +       addq    $(VEC_SIZE * -4), %rcx
> > +       cmpq    %rcx, %rdi
> > +       jb      L(loop_4x_vec_backward)
> >         /* Store the first 4 * VEC.  */
> > -       VMOVU   %VEC(4), (%rdi)
> > +       VMOVU   %VEC(0), (%rdi)
> >         VMOVU   %VEC(5), VEC_SIZE(%rdi)
> >         VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> >         VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> >         /* Store the last VEC.  */
> > -       VMOVU   %VEC(8), (%r11)
> > +       VMOVU   %VEC(8), -VEC_SIZE(%rdx, %rdi)
> > +       VZEROUPPER_RETURN
> > +
> > +#if defined USE_MULTIARCH && IS_IN (libc)
> > +       /* L(skip_short_movsb_check) is only used with ERMS. Not for
> > +          FSRM.  */
> > +       .p2align 5,, 16
> > +# if ALIGN_MOVSB
> > +L(skip_short_movsb_check):
> > +#  if MOVSB_ALIGN_TO > VEC_SIZE
> > +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +#  endif
> > +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> > +#   error Unsupported MOVSB_ALIGN_TO
> > +#  endif
> > +       /* If CPU does not have FSRM two options for aligning. Align src
> > +          if dst and src 4k alias. Otherwise align dst.  */
> > +       testl   $(PAGE_SIZE - 512), %ecx
> > +       jnz     L(movsb_align_dst)
> > +       /* Fall through. dst and src 4k alias. It's better to align src
> > +          here because the bottleneck will be loads due to the false
> > +          dependency on dst.  */
> > +
> > +       /* rcx already has dst - src.  */
> > +       movq    %rcx, %r9
> > +       /* Add src to len. Subtract back after src aligned. -1 because
> > +          src is initially aligned to MOVSB_ALIGN_TO - 1.  */
> > +       leaq    -1(%rsi, %rdx), %rcx
> > +       /* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
> > +       orq     $(MOVSB_ALIGN_TO - 1), %rsi
> > +       /* Restore dst and len adjusted with new values for aligned src.
> > +        */
> > +       leaq    1(%rsi, %r9), %rdi
> > +       subq    %rsi, %rcx
> > +       /* Finish aligning src.  */
> > +       incq    %rsi
> > +
> > +       rep     movsb
> > +
> > +       VMOVU   %VEC(0), (%r8)
> > +#  if MOVSB_ALIGN_TO > VEC_SIZE
> > +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> > +#  endif
> >         VZEROUPPER_RETURN
> > +# endif
> > +
> > +       .p2align 4,, 12
> > +L(movsb):
> > +       movq    %rdi, %rcx
> > +       subq    %rsi, %rcx
> > +       /* Go to the backward temporal copy whenever there is overlap:
> > +          backward REP MOVSB is slow and we don't want to use NT stores
> > +          when the buffers overlap.  */
> > +       cmpq    %rdx, %rcx
> > +       /* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
> > +       jb      L(more_8x_vec_backward_check_nop)
> > +# if ALIGN_MOVSB
> > +       /* Save dest for storing aligning VECs later.  */
> > +       movq    %rdi, %r8
> > +# endif
> > +       /* If above __x86_rep_movsb_stop_threshold it is most likely a
> > +          candidate for NT moves as well.  */
> > +       cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
> > +       jae     L(large_memcpy_2x_check)
> > +# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
> > +       /* Only avoid short movsb if CPU has FSRM.  */
> > +       testl   $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> > +       jz      L(skip_short_movsb_check)
> > +#  if AVOID_SHORT_DISTANCE_REP_MOVSB
> > +       /* Avoid "rep movsb" if RCX, the distance between source and
> > +          destination, is N*4GB + [1..63] with N >= 0.  */
> > +
> > +       /* ecx contains dst - src. Early check for backward copy
> > +          conditions means only case of slow movsb with src = dst + [0,
> > +          63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
> > +          for that case.  */
> > +       cmpl    $-64, %ecx
> > +       ja      L(more_8x_vec_forward)
> > +#  endif
> > +# endif
> > +# if ALIGN_MOVSB
> > +#  if MOVSB_ALIGN_TO > VEC_SIZE
> > +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +#  endif
> > +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
> > +#   error Unsupported MOVSB_ALIGN_TO
> > +#  endif
> > +       /* Fall through means cpu has FSRM. In that case exclusively
> > +          align destination.  */
> > +L(movsb_align_dst):
> > +       /* Subtract dst from src. Add back after dst aligned.  */
> > +       subq    %rdi, %rsi
> > +       /* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
> > +       addq    $(MOVSB_ALIGN_TO - 1), %rdi
> > +       /* Add dst to len. Subtract back after dst aligned.  */
> > +       leaq    (%r8, %rdx), %rcx
> > +       /* Finish aligning dst.  */
> > +       andq    $-(MOVSB_ALIGN_TO), %rdi
> > +       /* Restore src and len adjusted with new values for aligned dst.
> > +        */
> > +       addq    %rdi, %rsi
> > +       subq    %rdi, %rcx
> > +
> > +       rep     movsb
> > +
> > +       /* Store VECs loaded for aligning.  */
> > +       VMOVU   %VEC(0), (%r8)
> > +#  if MOVSB_ALIGN_TO > VEC_SIZE
> > +       VMOVU   %VEC(1), VEC_SIZE(%r8)
> > +#  endif
> > +       VZEROUPPER_RETURN
> > +# else /* !ALIGN_MOVSB.  */
> > +L(skip_short_movsb_check):
> > +       mov     %RDX_LP, %RCX_LP
> > +       rep     movsb
> > +       ret
> > +# endif
> > +#endif
> >
> > +       .p2align 4,, 10
> >  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > -       .p2align 4
> > +L(large_memcpy_2x_check):
> > +       cmp     __x86_rep_movsb_threshold(%rip), %RDX_LP
> > +       jb      L(more_8x_vec_check)
> >  L(large_memcpy_2x):
> > -       /* Compute absolute value of difference between source and
> > -          destination.  */
> > -       movq    %rdi, %r9
> > -       subq    %rsi, %r9
> > -       movq    %r9, %r8
> > -       leaq    -1(%r9), %rcx
> > -       sarq    $63, %r8
> > -       xorq    %r8, %r9
> > -       subq    %r8, %r9
> > -       /* Don't use non-temporal store if there is overlap between
> > -          destination and source since destination may be in cache when
> > -          source is loaded.  */
> > -       cmpq    %r9, %rdx
> > -       ja      L(more_8x_vec_check)
> > +       /* To reach this point it is impossible for dst > src and
> > +          overlap. Remaining to check is src > dst and overlap. rcx
> > +          already contains dst - src. Negate rcx to get src - dst. If
> > +          length > rcx then there is overlap and forward copy is best.  */
> > +       negq    %rcx
> > +       cmpq    %rcx, %rdx
> > +       ja      L(more_8x_vec_forward)
> >
> >         /* Cache align destination. First store the first 64 bytes then
> >            adjust alignments.  */
> > -       VMOVU   (%rsi), %VEC(8)
> > -#if VEC_SIZE < 64
> > -       VMOVU   VEC_SIZE(%rsi), %VEC(9)
> > -#if VEC_SIZE < 32
> > -       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(10)
> > -       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(11)
> > -#endif
> > -#endif
> > -       VMOVU   %VEC(8), (%rdi)
> > -#if VEC_SIZE < 64
> > -       VMOVU   %VEC(9), VEC_SIZE(%rdi)
> > -#if VEC_SIZE < 32
> > -       VMOVU   %VEC(10), (VEC_SIZE * 2)(%rdi)
> > -       VMOVU   %VEC(11), (VEC_SIZE * 3)(%rdi)
> > -#endif
> > -#endif
> > +
> > +       /* First vec was also loaded into VEC(0).  */
> > +# if VEC_SIZE < 64
> > +       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +#  if VEC_SIZE < 32
> > +       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > +       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > +#  endif
> > +# endif
> > +       VMOVU   %VEC(0), (%rdi)
> > +# if VEC_SIZE < 64
> > +       VMOVU   %VEC(1), VEC_SIZE(%rdi)
> > +#  if VEC_SIZE < 32
> > +       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > +       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > +#  endif
> > +# endif
> > +
> >         /* Adjust source, destination, and size.  */
> >         movq    %rdi, %r8
> >         andq    $63, %r8
> > @@ -614,9 +767,13 @@ L(large_memcpy_2x):
> >         /* Adjust length.  */
> >         addq    %r8, %rdx
> >
> > -       /* Test if source and destination addresses will alias. If they do
> > -          the larger pipeline in large_memcpy_4x alleviated the
> > +       /* Test if source and destination addresses will alias. If they
> > +          do the larger pipeline in large_memcpy_4x alleviates the
> >            performance drop.  */
> > +
> > +       /* ecx contains -(dst - src). not ecx will return dst - src - 1
> > +          which works for testing aliasing.  */
> > +       notl    %ecx
> >         testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
> >         jz      L(large_memcpy_4x)
> >
> > @@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
> >         /* ecx stores inner loop counter.  */
> >         movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> >  L(loop_large_memcpy_4x_inner):
> > -       /* Only one prefetch set per page as doing 4 pages give more time
> > -          for prefetcher to keep up.  */
> > +       /* Only one prefetch set per page as doing 4 pages gives more
> > +          time for the prefetcher to keep up.  */
> >         PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> >         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> >         PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
  

Patch

diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
index db106a7a1f..b2b3180848 100644
--- a/sysdeps/x86_64/memmove.S
+++ b/sysdeps/x86_64/memmove.S
@@ -25,7 +25,7 @@ 
 /* Use movups and movaps for smaller code sizes.  */
 #define VMOVU		movups
 #define VMOVA		movaps
-
+#define MOV_SIZE	3
 #define SECTION(p)		p
 
 #ifdef USE_MULTIARCH
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 1ec1962e86..67a55f0c85 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -4,7 +4,7 @@ 
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index e195e93f15..975ae6c051 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -4,7 +4,7 @@ 
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define SECTION(p)		p##.avx
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 848848ab39..0fa7126830 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -25,7 +25,7 @@ 
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
-
+# define MOV_SIZE	6
 # define SECTION(p)		p##.evex512
 # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 0cbce8f944..88715441fe 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -25,7 +25,7 @@ 
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
-
+# define MOV_SIZE	6
 # define SECTION(p)		p##.evex
 # define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index abde8438d4..7b27cbdda5 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -76,6 +76,25 @@ 
 # endif
 #endif
 
+/* Whether to align before movsb. Ultimately we want 64-byte
+   alignment and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
+#define ALIGN_MOVSB	(VEC_SIZE > 16)
+/* Number of bytes to align movsb to.  */
+#define MOVSB_ALIGN_TO	64
+
+#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
+#define LARGE_MOV_SIZE	(MOV_SIZE > 4)
+
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
+# error MOV_SIZE Unknown
+#endif
+
+#if LARGE_MOV_SIZE
+# define SMALL_SIZE_OFFSET	(4)
+#else
+# define SMALL_SIZE_OFFSET	(0)
+#endif
+
 #ifndef PAGE_SIZE
 # define PAGE_SIZE 4096
 #endif
@@ -199,25 +218,21 @@  L(start):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %VEC(0)
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-	ret
+#if !(defined USE_MULTIARCH && IS_IN (libc))
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
 	VZEROUPPER_RETURN
 #endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
 # if VEC_SIZE == 16
 ENTRY (__mempcpy_chk_erms)
 	cmp	%RDX_LP, %RCX_LP
@@ -289,7 +304,7 @@  ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
 	movq	%rdi, %rax
 L(start_erms):
 # ifdef __ILP32__
@@ -298,310 +313,448 @@  L(start_erms):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
-L(last_2x_vec):
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
+# else
 	ret
+# endif
 #endif
 
-L(movsb):
-	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
-	jae	L(more_8x_vec)
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %r9
-	cmpq	%r9, %rdi
-	/* Avoid slow backward REP MOVSB.  */
-	jb	L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rdi, %rcx
-	subq	%rsi, %rcx
-	jmp	2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rsi, %rcx
-	subq	%rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
-   is N*4GB + [1..63] with N >= 0.  */
-	cmpl	$63, %ecx
-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
-3:
-# endif
-	mov	%RDX_LP, %RCX_LP
-	rep movsb
-L(nop):
+#if LARGE_MOV_SIZE
+	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
+	   ENTRY block and L(less_vec).  */
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	(%rsi), %ecx
+	movl	(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, (%rdi, %rdx)
 	ret
 #endif
 
+	.p2align 4
 L(less_vec):
 	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 # error Unsupported VEC_SIZE!
 #endif
 #if VEC_SIZE > 32
-	cmpb	$32, %dl
+	cmpl	$32, %edx
 	jae	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
-	cmpb	$16, %dl
+	cmpl	$16, %edx
 	jae	L(between_16_31)
 #endif
-	cmpb	$8, %dl
+	cmpl	$8, %edx
 	jae	L(between_8_15)
-	cmpb	$4, %dl
+#if SMALL_MOV_SIZE
+	cmpl	$4, %edx
+#else
+	subq	$4, %rdx
+#endif
 	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
-	movzbl	(%rsi), %ecx
+	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
+	jl	L(copy_0)
+	movb	(%rsi), %cl
+	je	L(copy_1)
+	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
+	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
+L(copy_1):
 	movb	%cl, (%rdi)
-1:
+L(copy_0):
 	ret
+
+#if SMALL_MOV_SIZE
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	-4(%rsi, %rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%ecx, -4(%rdi, %rdx)
+	movl	%esi, (%rdi)
+	ret
+#endif
+
+#if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+	.p2align 4,, 8
+L(between_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-16(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -16(%rdi, %rdx)
+	/* No ymm registers have been touched.  */
+	ret
+#endif
+
 #if VEC_SIZE > 32
+	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
 	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi,%rdx), %YMM1
+	VMOVU	-32(%rsi, %rdx), %YMM1
 	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi,%rdx)
-	VZEROUPPER_RETURN
-#endif
-#if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
-L(between_16_31):
-	VMOVU	(%rsi), %XMM0
-	VMOVU	-16(%rsi,%rdx), %XMM1
-	VMOVU	%XMM0, (%rdi)
-	VMOVU	%XMM1, -16(%rdi,%rdx)
+	VMOVU	%YMM1, -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
+
+	.p2align 4,, 10
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	-8(%rsi,%rdx), %rcx
+	movq	-8(%rsi, %rdx), %rcx
 	movq	(%rsi), %rsi
-	movq	%rcx, -8(%rdi,%rdx)
 	movq	%rsi, (%rdi)
+	movq	%rcx, -8(%rdi, %rdx)
 	ret
-L(between_4_7):
-	/* From 4 to 7.  No branch when size == 4.  */
-	movl	-4(%rsi,%rdx), %ecx
-	movl	(%rsi), %esi
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%esi, (%rdi)
-	ret
-L(between_2_3):
-	/* From 2 to 3.  No branch when size == 2.  */
-	movzwl	-2(%rsi,%rdx), %ecx
-	movzwl	(%rsi), %esi
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%si, (%rdi)
-	ret
 
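The L(less_vec) and between_* paths above all use the same branch-free idea: load one chunk anchored at the start and one chunk anchored at the end (the two may overlap), then store both.  A rough C model of the 4-to-7 byte case, purely illustrative (the helper name is invented; the real code keeps everything in registers):

#include <stdint.h>
#include <string.h>

/* Sketch of the branch-free copy for sizes in [4, 8): 4 bytes from the
   start plus 4 bytes ending at src + len cover the whole range, and for
   len == 4 the two stores simply coincide.  Doing both loads before
   either store also keeps the copy correct when source and destination
   overlap, which is what memmove requires.  */
static void
copy_4_to_7 (unsigned char *dst, const unsigned char *src, size_t len)
{
  uint32_t head, tail;
  memcpy (&head, src, 4);            /* bytes [0, 4)          */
  memcpy (&tail, src + len - 4, 4);  /* bytes [len - 4, len)  */
  memcpy (dst, &head, 4);
  memcpy (dst + len - 4, &tail, 4);
}

The 8-15, 16-31 and 32-63 cases are the same pattern with wider loads, and L(last_4x_vec) below scales it up to two vectors from each end.
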
+	.p2align 4,, 10
+L(last_4x_vec):
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
+
+	/* VEC(0) and VEC(1) have already been loaded.  */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4
 #if defined USE_MULTIARCH && IS_IN (libc)
 L(movsb_more_2x_vec):
 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
 	ja	L(movsb)
 #endif
 L(more_2x_vec):
-	/* More than 2 * VEC and there may be overlap between destination
-	   and source.  */
+	/* More than 2 * VEC and there may be overlap between
+	   destination and source.  */
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
+	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
-	VZEROUPPER_RETURN
-L(last_4x_vec):
-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 4
 L(more_8x_vec):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to the backward temporal copy whenever there is overlap:
+	   backward REP MOVSB is slow and we don't want to use NT stores
+	   when the buffers overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
 	/* Check if non-temporal move candidate.  */
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	ja	L(large_memcpy_2x)
 #endif
-	/* Entry if rdx is greater than non-temporal threshold but there
-       is overlap.  */
+	/* To reach this point there cannot be overlap and dst > src. So
+	   check for overlap and src > dst in which case correctness
+	   requires forward copy. Otherwise decide between backward/forward
+	   copy depending on address aliasing.  */
+
+	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
+	   but less than __x86_shared_non_temporal_threshold.  */
 L(more_8x_vec_check):
-	cmpq	%rsi, %rdi
-	ja	L(more_8x_vec_backward)
-	/* Source == destination is less common.  */
-	je	L(nop)
-	/* Load the first VEC and last 4 * VEC to support overlapping
-	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+	/* rcx contains dst - src. Add back length (rdx).  */
+	leaq	(%rcx, %rdx), %r8
+	/* If r8 has different sign than rcx then there is overlap so we
+	   must do forward copy.  */
+	xorq	%rcx, %r8
+	/* Isolate just sign bit of r8.  */
+	shrq	$63, %r8
+	/* Get 4k difference dst - src.  */
+	andl	$(PAGE_SIZE - 256), %ecx
+	/* If r8 is non-zero we must copy forward for correctness.
+	   Otherwise, if ecx is zero, dst and src 4k alias, which is bad
+	   for the forward loop, so do the backward copy.  */
+	addl	%r8d, %ecx
+	jz	L(more_8x_vec_backward)
+
+	/* Entered if rdx is greater than __x86_shared_non_temporal_threshold
+	   but there is overlap, or from the short distance movsb check.  */
+L(more_8x_vec_forward):
+	/* Load first and last 4 * VEC to support overlapping addresses.
+	 */
+
+	/* First vec was already loaded into VEC(0).  */
 	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	/* Save beginning of dst.  */
+	movq	%rdi, %rcx
+	/* Align dst to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer.  */
-	movq	%rdi, %r11
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop.  Compute
-	   how much destination is misaligned.  */
-	movq	%rdi, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %r8
-	/* Adjust source.  */
-	subq	%r8, %rsi
-	/* Adjust destination which should be aligned now.  */
-	subq	%r8, %rdi
-	/* Adjust length.  */
-	addq	%r8, %rdx
 
-	.p2align 4
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rcx, %rsi
+	/* Finish aligning dst.  */
+	incq	%rdi
+	/* Restore src adjusted with new value for aligned dst.  */
+	addq	%rdi, %rsi
+	/* Store end of buffer minus tail in rdx.  */
+	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
 	subq	$-(VEC_SIZE * 4), %rsi
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VEC(1), (%rdi)
+	VMOVA	%VEC(2), VEC_SIZE(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VEC(7), VEC_SIZE(%rdx)
+	VMOVU	%VEC(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	%VEC(0), (%rcx)
+	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
+	 */
+L(nop_backward):
 	VZEROUPPER_RETURN
 
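For reference, the forward/backward decision made in L(more_8x_vec) and L(more_8x_vec_check) above can be modelled in C roughly as follows (a sketch only, assuming 64-bit pointers; the function and variable names are invented):

#include <stddef.h>
#include <stdint.h>

#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif

/* Forward copy is mandatory when the buffers overlap with src > dst;
   otherwise the backward loop is preferred when dst - src falls in the
   4k false-aliasing window, where the forward loop's loads would
   falsely depend on its earlier stores.  Assumes the overlapping
   dst > src case was already diverted to the backward path, as the
   first check in L(more_8x_vec) does.  */
static int
prefer_forward_copy (const char *dst, const char *src, size_t len)
{
  uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;  /* rcx            */
  uintptr_t end = diff + len;                          /* r8 = rcx + rdx */
  /* Sign bit set iff diff and diff + len have different signs, i.e.
     src > dst but src < dst + len: overlap, must copy forward.  */
  uintptr_t must_forward = (diff ^ end) >> 63;
  /* Zero iff (dst - src) % PAGE_SIZE < 256, the 4k-aliasing case.  */
  uintptr_t no_alias = diff & (PAGE_SIZE - 256);
  /* addl %r8d, %ecx; jz L(more_8x_vec_backward)  */
  return (must_forward + no_alias) != 0;
}

The fall-through order in the assembly makes the forward loop the default, so the backward loop is only taken when it is both safe and profitable.
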
+	.p2align 4,, 8
+L(more_8x_vec_backward_check_nop):
+	/* rcx contains dst - src. Test for dst == src to skip all of
+	   memmove.  */
+	testq	%rcx, %rcx
+	jz	L(nop_backward)
 L(more_8x_vec_backward):
 	/* Load the first 4 * VEC and last VEC to support overlapping
 	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+
+	/* First vec was also loaded into VEC(0).  */
 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	/* Beginning of region for 4x backward copy stored in rcx.  */
+	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop.  Compute
-	   how much destination end is misaligned.  */
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	movq	%r11, %r9
-	movq	%r11, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Adjust source.  */
-	subq	%r8, %rcx
-	/* Adjust the end of destination which should be aligned now.  */
-	subq	%r8, %r9
-	/* Adjust length.  */
-	subq	%r8, %rdx
-
-	.p2align 4
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Align dst.  */
+	andq	$-(VEC_SIZE), %rcx
+	/* Restore src.  */
+	addq	%rcx, %rsi
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	addq	$-(VEC_SIZE * 4), %rcx
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%r9)
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	addq	$-(VEC_SIZE * 4), %r9
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_backward)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	addq	$(VEC_SIZE * -4), %rsi
+	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	addq	$(VEC_SIZE * -4), %rcx
+	cmpq	%rcx, %rdi
+	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VZEROUPPER_RETURN
+
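
The cursor setup at the top of L(more_8x_vec_backward) above boils down to the following C (an illustrative sketch; names are invented and the unaligned head/tail stores from VEC(0) and VEC(5)..VEC(8) are omitted):

#include <stddef.h>
#include <stdint.h>

#define VEC_SIZE 32   /* assumed: the AVX build */

/* rcx = (dst + len - 4 * VEC_SIZE - 1) & -VEC_SIZE picks the aligned
   start of the highest 4 * VEC_SIZE block that ends below dst + len.
   Keeping src as dst + delta means the same aligned value also yields
   the matching (unaligned) load cursor.  */
static void
setup_backward_cursors (char *dst, const char *src, size_t len,
                        char **store_cur, const char **load_cur)
{
  uintptr_t delta = (uintptr_t) src - (uintptr_t) dst;    /* src - dst */
  uintptr_t cur = ((uintptr_t) dst + len - 4 * VEC_SIZE - 1)
                  & ~(uintptr_t) (VEC_SIZE - 1);
  *store_cur = (char *) cur;                              /* aligned   */
  *load_cur = (const char *) (cur + delta);               /* unaligned */
}

The loop then copies 4 * VEC_SIZE bytes per iteration, stepping both cursors down together until the store cursor drops to or below dst.
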
+#if defined USE_MULTIARCH && IS_IN (libc)
+	/* L(skip_short_movsb_check) is only used with ERMS. Not for
+	   FSRM.  */
+	.p2align 5,, 16
+# if ALIGN_MOVSB
+L(skip_short_movsb_check):
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* If CPU does not have FSRM two options for aligning. Align src
+	   if dst and src 4k alias. Otherwise align dst.  */
+	testl	$(PAGE_SIZE - 512), %ecx
+	jnz	L(movsb_align_dst)
+	/* Fall through. dst and src 4k alias. It's better to align src
+	   here because the bottleneck will be loads due to the false
+	   dependency on dst.  */
+
+	/* rcx already has dst - src.  */
+	movq	%rcx, %r9
+	/* Add src to len. Subtract back after src aligned. -1 because
+	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-1(%rsi, %rdx), %rcx
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned src.
+	 */
+	leaq	1(%rsi, %r9), %rdi
+	subq	%rsi, %rcx
+	/* Finish aligning src.  */
+	incq	%rsi
+
+	rep	movsb
+
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
 	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 12
+L(movsb):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to the backward temporal copy whenever there is overlap:
+	   backward REP MOVSB is slow and we don't want to use NT stores
+	   when the buffers overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+# if ALIGN_MOVSB
+	/* Save dest for storing aligning VECs later.  */
+	movq	%rdi, %r8
+# endif
+	/* If above __x86_rep_movsb_stop_threshold it is most likely a
+	   candidate for NT moves as well.  */
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	jae	L(large_memcpy_2x_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
+	/* Only avoid short movsb if CPU has FSRM.  */
+	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	L(skip_short_movsb_check)
+#  if AVOID_SHORT_DISTANCE_REP_MOVSB
+	/* Avoid "rep movsb" if RCX, the distance between source and
+	   destination, is N*4GB + [1..63] with N >= 0.  */
+
+	/* ecx contains dst - src. Early check for backward copy
+	   conditions means only case of slow movsb with src = dst + [0,
+	   63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
+	   for that case.  */
+	cmpl	$-64, %ecx
+	ja	L(more_8x_vec_forward)
+#  endif
+# endif
+# if ALIGN_MOVSB
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* Fall through means cpu has FSRM. In that case exclusively
+	   align destination.  */
+L(movsb_align_dst):
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
+	addq	$(MOVSB_ALIGN_TO - 1), %rdi
+	/* Add dst to len. Subtract back after dst aligned.  */
+	leaq	(%r8, %rdx), %rcx
+	/* Finish aligning dst.  */
+	andq	$-(MOVSB_ALIGN_TO), %rdi
+	/* Restore src and len adjusted with new values for aligned dst.
+	 */
+	addq	%rdi, %rsi
+	subq	%rdi, %rcx
+
+	rep	movsb
+
+	/* Store VECs loaded for aligning.  */
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# else	/* !ALIGN_MOVSB.  */
+L(skip_short_movsb_check):
+	mov	%RDX_LP, %RCX_LP
+	rep	movsb
+	ret
+# endif
+#endif
 
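A rough C model of the ALIGN_MOVSB logic above in the non-FSRM (ERMS) case, i.e. the choice between L(movsb_align_dst) and the src-aligning path in L(skip_short_movsb_check) (illustrative only: it assumes len is at least MOVSB_ALIGN_TO, which holds once __x86_rep_movsb_threshold has been crossed, and plain memcpy stands in for rep movsb):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define MOVSB_ALIGN_TO 64
#define PAGE_SIZE      4096

/* Copy the first MOVSB_ALIGN_TO bytes up front (the VEC(0)/VEC(1)
   stores), then round one side up to a 64-byte boundary before the
   rep movsb.  Without FSRM, dst is aligned by default; when dst and
   src 4k alias, src is aligned instead, since the false dependency on
   the destination makes the loads the bottleneck.  */
static void
aligned_rep_movsb (char *dst, const char *src, size_t len)
{
  memcpy (dst, src, MOVSB_ALIGN_TO);
  uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;
  size_t skip;
  if (diff & (PAGE_SIZE - 512))
    /* No 4k aliasing: align dst.  */
    skip = -(uintptr_t) dst & (MOVSB_ALIGN_TO - 1);
  else
    /* 4k aliasing: align src.  */
    skip = -(uintptr_t) src & (MOVSB_ALIGN_TO - 1);
  memcpy (dst + skip, src + skip, len - skip);  /* rep movsb */
}

The real code stores the two head vectors after the rep movsb rather than before it; the order does not matter here because overlapping inputs were already routed to the backward path.
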
+	.p2align 4,, 10
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	.p2align 4
+L(large_memcpy_2x_check):
+	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
+	jb	L(more_8x_vec_check)
 L(large_memcpy_2x):
-	/* Compute absolute value of difference between source and
-	   destination.  */
-	movq	%rdi, %r9
-	subq	%rsi, %r9
-	movq	%r9, %r8
-	leaq	-1(%r9), %rcx
-	sarq	$63, %r8
-	xorq	%r8, %r9
-	subq	%r8, %r9
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache when
-	   source is loaded.  */
-	cmpq	%r9, %rdx
-	ja	L(more_8x_vec_check)
+	/* To reach this point it is impossible for dst > src and
+	   overlap. Remaining to check is src > dst and overlap. rcx
+	   already contains dst - src. Negate rcx to get src - dst. If
+	   length > rcx then there is overlap and forward copy is best.  */
+	negq	%rcx
+	cmpq	%rcx, %rdx
+	ja	L(more_8x_vec_forward)
 
 	/* Cache align destination. First store the first 64 bytes then
 	   adjust alignments.  */
-	VMOVU	(%rsi), %VEC(8)
-#if VEC_SIZE < 64
-	VMOVU	VEC_SIZE(%rsi), %VEC(9)
-#if VEC_SIZE < 32
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
-#endif
-#endif
-	VMOVU	%VEC(8), (%rdi)
-#if VEC_SIZE < 64
-	VMOVU	%VEC(9), VEC_SIZE(%rdi)
-#if VEC_SIZE < 32
-	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
-#endif
-#endif
+
+	/* First vec was also loaded into VEC(0).  */
+# if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+#  endif
+# endif
+	VMOVU	%VEC(0), (%rdi)
+# if VEC_SIZE < 64
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+#  if VEC_SIZE < 32
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+#  endif
+# endif
+
 	/* Adjust source, destination, and size.  */
 	movq	%rdi, %r8
 	andq	$63, %r8
@@ -614,9 +767,13 @@  L(large_memcpy_2x):
 	/* Adjust length.  */
 	addq	%r8, %rdx
 
-	/* Test if source and destination addresses will alias. If they do
-	   the larger pipeline in large_memcpy_4x alleviated the
+	/* Test if source and destination addresses will alias. If they
+	   do the larger pipeline in large_memcpy_4x alleviates the
 	   performance drop.  */
+
+	/* ecx contains -(dst - src). not ecx will return dst - src - 1
+	   which works for testing aliasing.  */
+	notl	%ecx
 	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
 	jz	L(large_memcpy_4x)
 
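The notl above relies on the identity ~(-d) == d - 1.  A tiny self-contained check of the trick (illustrative; 0xF00 corresponds to PAGE_SIZE - VEC_SIZE * 8 for the 32-byte-vector build):

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* d stands for dst - src (nonzero on this path); c is what %ecx
     holds after the earlier negq, i.e. -(dst - src).  */
  uint32_t d = 0x12340040;
  uint32_t c = (uint32_t) -d;
  /* ~c == d - 1; using d - 1 instead of d merely shifts the aliasing
     window by one byte, which is fine for this heuristic.  */
  printf ("%d\n", ~c == d - 1);               /* prints 1 */
  printf ("%d\n", ((d - 1) & 0xF00) == 0);    /* in the 4k-aliasing window? */
  return 0;
}
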
@@ -704,8 +861,8 @@  L(loop_large_memcpy_4x_outer):
 	/* ecx stores inner loop counter.  */
 	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 L(loop_large_memcpy_4x_inner):
-	/* Only one prefetch set per page as doing 4 pages give more time
-	   for prefetcher to keep up.  */
+	/* Only one prefetch set per page as doing 4 pages gives more
+	   time for the prefetcher to keep up.  */
 	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)