diff mbox

[v2,AArch64] Improve integer memcpy

Message ID AM5PR0801MB20353EB926BF1E3BBFD3471C83EA0@AM5PR0801MB2035.eurprd08.prod.outlook.com
State New
Headers show

Commit Message

Wilco Dijkstra Feb. 26, 2020, 4:18 p.m. UTC
Version 2 fixes white space and uses ENTRY_ALIGN rather than ENTRY:

Further optimize integer memcpy.  Small cases now include copies up
to 32 bytes.  64-128 byte copies are split into two cases to improve
performance of 64-96 byte copies.  Comments have been rewritten.
The attached graph shows how the new memcpy (memcpy_new) performs
against the current generic memcpy and the previous version (memcpy.S
before commit b9f145df85).

Passes GLIBC tests.

--

Comments

Adhemerval Zanella March 10, 2020, 6:46 p.m. UTC | #1
On 26/02/2020 13:18, Wilco Dijkstra wrote:
> 
> Version 2 fixes white space and uses ENTRY_ALIGN rather than ENTRY:
> 
> Further optimize integer memcpy.  Small cases now include copies up
> to 32 bytes.  64-128 byte copies are split into two cases to improve
> performance of 64-96 byte copies.  Comments have been rewritten.
> The attached graph shows how the new memcpy (memcpy_new) performs
> against the current generic memcpy and the previous version (memcpy.S
> before commit b9f145df85).

I wonder if the optimization for sizes up to 128 yields same gain
for the other chip memcpy implementation (thunderx, thunderx2, and
falkor).  The main differences seems to be how each chip handles
large copies, with thundex and falkor doing 64 bytes per loop, 
while thunderx2 does either 128 bytes (when source and dest are 
aligned) or 64 for unaligned inputs (it also does not issue 
unaligned access, doing aligned load plus merge using a jump table).

So it seems that I don't see a straightforward way to unify the
implementations, maybe adding a common shared code for sizes
less than 128 bytes.

One question is if doing operation for large sizes using 
ldp/stp might yield some gains (as thunderx2 does, at least
for aligned case), or if the cost of checking and using some 
specific cases does not pay of.

The patch LGTM, thanks.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>

> 
> Passes GLIBC tests.
> 
> --
> diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
> index ff720c800ed0ca3afac03d19ba02f67817b3422e..d31f7bb38eaf91692fb90f1c313b5e276fdf975b 100644
> --- a/sysdeps/aarch64/memcpy.S
> +++ b/sysdeps/aarch64/memcpy.S
> @@ -33,11 +33,11 @@
>  #define A_l x6
>  #define A_lw w6
>  #define A_h x7
> -#define A_hw w7
>  #define B_l x8
>  #define B_lw w8
>  #define B_h x9
>  #define C_l x10
> +#define C_lw w10
>  #define C_h x11
>  #define D_l x12
>  #define D_h x13
> @@ -51,16 +51,6 @@
>  #define H_h srcend
>  #define tmp1 x14
>  
> -/* Copies are split into 3 main cases: small copies of up to 32 bytes,
> -   medium copies of 33..128 bytes which are fully unrolled. Large copies
> -   of more than 128 bytes align the destination and use an unrolled loop
> -   processing 64 bytes per iteration.
> -   In order to share code with memmove, small and medium copies read all
> -   data before writing, allowing any kind of overlap. So small, medium
> -   and large backwards memmoves are handled by falling through into memcpy.
> -   Overlapping large forward memmoves use a loop that copies backwards.
> -*/
> -
>  #ifndef MEMMOVE
>  # define MEMMOVE memmove
>  #endif
> @@ -68,128 +58,124 @@
>  # define MEMCPY memcpy
>  #endif
>  
> -ENTRY_ALIGN (MEMMOVE, 6)
> +/* This implementation supports both memcpy and memmove and shares most code.
> +   It uses unaligned accesses and branchless sequences to keep the code small,

I think this line is too long.

> +   simple and improve performance.
>  
> - DELOUSE (0)
> - DELOUSE (1)
> - DELOUSE (2)
> +   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
> +   copies of up to 128 bytes, and large copies.  The overhead of the overlap
> +   check in memmove is negligible since it is only required for large copies.
>  
> - sub tmp1, dstin, src
> - cmp count, 128
> - ccmp tmp1, count, 2, hi
> - b.lo L(move_long)
> -
> - /* Common case falls through into memcpy.  */
> -END (MEMMOVE)
> -libc_hidden_builtin_def (MEMMOVE)
> -ENTRY (MEMCPY)
> +   Large copies use a software pipelined loop processing 64 bytes per iteration.
> +   The destination pointer is 16-byte aligned to minimize unaligned accesses.
> +   The loop tail is handled by always copying 64 bytes from the end.
> +*/

Ok, so it now uses a similar strategy ThunderX/Falkor memcpy (Falkor
limits the copy to one register due a hardware prefetcher limitation).

>  
> +ENTRY_ALIGN (MEMCPY, 6)
>   DELOUSE (0)
>   DELOUSE (1)
>   DELOUSE (2)
>  
> - prfm PLDL1KEEP, [src]

I think it should mention that no prefetch is done.

>   add srcend, src, count
>   add dstend, dstin, count
> - cmp count, 32
> - b.ls L(copy32)
>   cmp count, 128
>   b.hi L(copy_long)
> + cmp count, 32
> + b.hi L(copy32_128)
>  
> - /* Medium copies: 33..128 bytes.  */
> + /* Small copies: 0..32 bytes.  */
> + cmp count, 16
> + b.lo L(copy16)
>   ldp A_l, A_h, [src]
> - ldp B_l, B_h, [src, 16]
> - ldp C_l, C_h, [srcend, -32]
>   ldp D_l, D_h, [srcend, -16]
> - cmp count, 64
> - b.hi L(copy128)
>   stp A_l, A_h, [dstin]
> - stp B_l, B_h, [dstin, 16]
> - stp C_l, C_h, [dstend, -32]
>   stp D_l, D_h, [dstend, -16]
>   ret
>  

Ok.

> - .p2align 4
> - /* Small copies: 0..32 bytes.  */
> -L(copy32):
> - /* 16-32 bytes.  */
> - cmp count, 16
> - b.lo 1f
> - ldp A_l, A_h, [src]
> - ldp B_l, B_h, [srcend, -16]
> - stp A_l, A_h, [dstin]
> - stp B_l, B_h, [dstend, -16]
> - ret
> - .p2align 4
> -1:
> - /* 8-15 bytes.  */
> - tbz count, 3, 1f
> + /* Copy 8-15 bytes.  */
> +L(copy16):
> + tbz count, 3, L(copy8)
>   ldr A_l, [src]
>   ldr A_h, [srcend, -8]
>   str A_l, [dstin]
>   str A_h, [dstend, -8]
>   ret

Ok.

> - .p2align 4
> -1:
> - /* 4-7 bytes.  */
> - tbz count, 2, 1f
> +
> + .p2align 3
> + /* Copy 4-7 bytes.  */
> +L(copy8):
> + tbz count, 2, L(copy4)
>   ldr A_lw, [src]
> - ldr A_hw, [srcend, -4]
> + ldr B_lw, [srcend, -4]
>   str A_lw, [dstin]
> - str A_hw, [dstend, -4]
> + str B_lw, [dstend, -4]
>   ret
>  

Ok.

> - /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
> -   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
> -1:
> - cbz count, 2f
> + /* Copy 0..3 bytes using a branchless sequence.  */
> +L(copy4):
> + cbz count, L(copy0)
>   lsr tmp1, count, 1
>   ldrb A_lw, [src]
> - ldrb A_hw, [srcend, -1]
> + ldrb C_lw, [srcend, -1]
>   ldrb B_lw, [src, tmp1]
>   strb A_lw, [dstin]
>   strb B_lw, [dstin, tmp1]
> - strb A_hw, [dstend, -1]
> -2: ret
> + strb C_lw, [dstend, -1]
> +L(copy0):
> + ret

Ok.

>  
>   .p2align 4
> - /* Copy 65..128 bytes.  Copy 64 bytes from the start and
> -   64 bytes from the end.  */
> + /* Medium copies: 33..128 bytes.  */
> +L(copy32_128):
> + ldp A_l, A_h, [src]
> + ldp B_l, B_h, [src, 16]
> + ldp C_l, C_h, [srcend, -32]
> + ldp D_l, D_h, [srcend, -16]
> + cmp count, 64
> + b.hi L(copy128)
> + stp A_l, A_h, [dstin]
> + stp B_l, B_h, [dstin, 16]
> + stp C_l, C_h, [dstend, -32]
> + stp D_l, D_h, [dstend, -16]
> + ret

Ok.

> +
> + .p2align 4
> + /* Copy 65..128 bytes.  */
>  L(copy128):
>   ldp E_l, E_h, [src, 32]
>   ldp F_l, F_h, [src, 48]
> + cmp count, 96
> + b.ls L(copy96)
>   ldp G_l, G_h, [srcend, -64]
>   ldp H_l, H_h, [srcend, -48]
> + stp G_l, G_h, [dstend, -64]
> + stp H_l, H_h, [dstend, -48]
> +L(copy96):
>   stp A_l, A_h, [dstin]
>   stp B_l, B_h, [dstin, 16]
>   stp E_l, E_h, [dstin, 32]
>   stp F_l, F_h, [dstin, 48]
> - stp G_l, G_h, [dstend, -64]
> - stp H_l, H_h, [dstend, -48]
>   stp C_l, C_h, [dstend, -32]
>   stp D_l, D_h, [dstend, -16]
>   ret

Ok.

>  
> - /* Align DST to 16 byte alignment so that we don't cross cache line
> -   boundaries on both loads and stores.  There are at least 128 bytes
> -   to copy, so copy 16 bytes unaligned and then align.  The loop
> -   copies 64 bytes per iteration and prefetches one iteration ahead.  */
> -
>   .p2align 4
> + /* Copy more than 128 bytes.  */
>  L(copy_long):
> + /* Copy 16 bytes and then align dst to 16-byte alignment.  */
> + ldp D_l, D_h, [src]
>   and tmp1, dstin, 15
>   bic dst, dstin, 15
> - ldp D_l, D_h, [src]
>   sub src, src, tmp1
> - add count, count, tmp1      /* Count is now 16 too large.  */
> + add count, count, tmp1 /* Count is now 16 too large.  */
>   ldp A_l, A_h, [src, 16]
>   stp D_l, D_h, [dstin]
>   ldp B_l, B_h, [src, 32]
>   ldp C_l, C_h, [src, 48]
>   ldp D_l, D_h, [src, 64]!
> -
>   subs count, count, 128 + 16 /* Test and readjust count.  */
> - b.ls L(last64)
> + b.ls L(copy64_from_end)
> +

Ok.

>  L(loop64):
>   stp A_l, A_h, [dst, 16]
>   ldp A_l, A_h, [src, 16]
> @@ -202,10 +188,8 @@ L(loop64):
>   subs count, count, 64
>   b.hi L(loop64)
>  
> - /* Write the last full set of 64 bytes.  The remainder is at most 64
> -   bytes, so it is safe to always copy 64 bytes from the end even if
> -   there is just 1 byte left.  */
> -L(last64):
> + /* Write the last iteration and copy 64 bytes from the end.  */
> +L(copy64_from_end):
>   ldp E_l, E_h, [srcend, -64]
>   stp A_l, A_h, [dst, 16]
>   ldp A_l, A_h, [srcend, -48]
> @@ -220,20 +204,42 @@ L(last64):
>   stp C_l, C_h, [dstend, -16]
>   ret

Ok.

>  
> - .p2align 4
> -L(move_long):
> - cbz tmp1, 3f
> +END (MEMCPY)
> +libc_hidden_builtin_def (MEMCPY)
> +
> +ENTRY_ALIGN (MEMMOVE, 4)
> + DELOUSE (0)
> + DELOUSE (1)
> + DELOUSE (2)
>  
>   add srcend, src, count
>   add dstend, dstin, count
> + cmp count, 128
> + b.hi L(move_long)
> + cmp count, 32
> + b.hi L(copy32_128)

Ok.

>  
> - /* Align dstend to 16 byte alignment so that we don't cross cache line
> -   boundaries on both loads and stores.  There are at least 128 bytes
> -   to copy, so copy 16 bytes unaligned and then align.  The loop
> -   copies 64 bytes per iteration and prefetches one iteration ahead.  */
> + /* Small copies: 0..32 bytes.  */
> + cmp count, 16
> + b.lo L(copy16)
> + ldp A_l, A_h, [src]
> + ldp D_l, D_h, [srcend, -16]
> + stp A_l, A_h, [dstin]
> + stp D_l, D_h, [dstend, -16]
> + ret

Ok.

>  
> - and tmp1, dstend, 15
> + .p2align 4
> +L(move_long):
> + /* Only use backward copy if there is an overlap.  */
> + sub tmp1, dstin, src
> + cbz tmp1, L(copy0)
> + cmp tmp1, count
> + b.hs L(copy_long)
> +
> + /* Large backwards copy for overlapping copies.
> +   Copy 16 bytes and then align dst to 16-byte alignment.  */
>   ldp D_l, D_h, [srcend, -16]
> + and tmp1, dstend, 15
>   sub srcend, srcend, tmp1
>   sub count, count, tmp1
>   ldp A_l, A_h, [srcend, -16]
> @@ -243,10 +249,9 @@ L(move_long):
>   ldp D_l, D_h, [srcend, -64]!
>   sub dstend, dstend, tmp1
>   subs count, count, 128
> - b.ls 2f
> + b.ls L(copy64_from_start)

Ok.

>  
> - nop
> -1:
> +L(loop64_backwards):
>   stp A_l, A_h, [dstend, -16]
>   ldp A_l, A_h, [srcend, -16]
>   stp B_l, B_h, [dstend, -32]
> @@ -256,12 +261,10 @@ L(move_long):
>   stp D_l, D_h, [dstend, -64]!
>   ldp D_l, D_h, [srcend, -64]!
>   subs count, count, 64
> - b.hi 1b
> + b.hi L(loop64_backwards)

Ok.

>  
> - /* Write the last full set of 64 bytes.  The remainder is at most 64
> -   bytes, so it is safe to always copy 64 bytes from the start even if
> -   there is just 1 byte left.  */
> -2:
> + /* Write the last iteration and copy 64 bytes from the start.  */
> +L(copy64_from_start):
>   ldp G_l, G_h, [src, 48]
>   stp A_l, A_h, [dstend, -16]
>   ldp A_l, A_h, [src, 32]
> @@ -274,7 +277,7 @@ L(move_long):
>   stp A_l, A_h, [dstin, 32]
>   stp B_l, B_h, [dstin, 16]
>   stp C_l, C_h, [dstin]
> -3: ret
> + ret
>  
> -END (MEMCPY)
> -libc_hidden_builtin_def (MEMCPY)
> +END (MEMMOVE)
> +libc_hidden_builtin_def (MEMMOVE)
> 

Ok.
Wilco Dijkstra March 11, 2020, 4:32 p.m. UTC | #2
Hi Adhemerval,

> I wonder if the optimization for sizes up to 128 yields same gain
> for the other chip memcpy implementation (thunderx, thunderx2, and
> falkor).  

Most definitely - the new memcpy is 15-20% faster than __memcpy_thunderx2
on TX2.

> The main differences seems to be how each chip handles
> large copies, with thundex and falkor doing 64 bytes per loop,
> while thunderx2 does either 128 bytes (when source and dest are
> aligned) or 64 for unaligned inputs (it also does not issue
> unaligned access, doing aligned load plus merge using a jump table).

Yes that jump table is insane at 1KB of code... It may seem great in
microbenchmarks but it falls apart in the real world.

> So it seems that I don't see a straightforward way to unify the
> implementations, maybe adding a common shared code for sizes
> less than 128 bytes.

Yes we could share the code for small cases across implementations.
I was thinking about having an ifunc for large copies so we could
statically link a common routine to handle small copies and avoid
PLT overheads in 99% of cases.

> One question is if doing operation for large sizes using
> ldp/stp might yield some gains (as thunderx2 does, at least
> for aligned case), or if the cost of checking and using some
> specific cases does not pay of.

You mean LDP/STP of SIMD registers? There is some gain for those on
modern cores.

> +   Large copies use a software pipelined loop processing 64 bytes per iteration.
> +   The destination pointer is 16-byte aligned to minimize unaligned accesses.
> +   The loop tail is handled by always copying 64 bytes from the end.
> +*/

> Ok, so it now uses a similar strategy ThunderX/Falkor memcpy (Falkor
> limits the copy to one register due a hardware prefetcher limitation).

Well this is what it always did. It's faster on in-order cores and supports
overlapping copies (unlike the Falkor memcpy).

I'll fix up the long lines before commit.

Cheers,
Wilco
Adhemerval Zanella March 13, 2020, 4:43 p.m. UTC | #3
On 11/03/2020 13:32, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>> I wonder if the optimization for sizes up to 128 yields same gain
>> for the other chip memcpy implementation (thunderx, thunderx2, and
>> falkor).  
> 
> Most definitely - the new memcpy is 15-20% faster than __memcpy_thunderx2
> on TX2.

OK, what I would like to avoid is keep maintaining subpar architecture
implementations once generic implementation improves.  

So, for ThunderX the only optimization its implementation uses iis
prefetch for sizes larger then 32KB. Is it really paying off?
Could it switch to generic implementation as well?

For ThundeX2, it uses Q registers and 128 bytes loops for aligned
loops and the jump table for unaligned.  Is the jump table still
a gain for ThunderX2?  Also, it might an option to have a generic
memcpy that uses Q register with a larger window (so ThunderX and
newer core might prefer it instead of generic one).

> 
>> The main differences seems to be how each chip handles
>> large copies, with thundex and falkor doing 64 bytes per loop,
>> while thunderx2 does either 128 bytes (when source and dest are
>> aligned) or 64 for unaligned inputs (it also does not issue
>> unaligned access, doing aligned load plus merge using a jump table).
> 
> Yes that jump table is insane at 1KB of code... It may seem great in
> microbenchmarks but it falls apart in the real world.
> 
>> So it seems that I don't see a straightforward way to unify the
>> implementations, maybe adding a common shared code for sizes
>> less than 128 bytes.
> 
> Yes we could share the code for small cases across implementations.
> I was thinking about having an ifunc for large copies so we could
> statically link a common routine to handle small copies and avoid
> PLT overheads in 99% of cases.
> 
>> One question is if doing operation for large sizes using
>> ldp/stp might yield some gains (as thunderx2 does, at least
>> for aligned case), or if the cost of checking and using some
>> specific cases does not pay of.
> 
> You mean LDP/STP of SIMD registers? There is some gain for those on
> modern cores.
> 
>> +   Large copies use a software pipelined loop processing 64 bytes per iteration.
>> +   The destination pointer is 16-byte aligned to minimize unaligned accesses.
>> +   The loop tail is handled by always copying 64 bytes from the end.
>> +*/
> 
>> Ok, so it now uses a similar strategy ThunderX/Falkor memcpy (Falkor
>> limits the copy to one register due a hardware prefetcher limitation).
> 
> Well this is what it always did. It's faster on in-order cores and supports
> overlapping copies (unlike the Falkor memcpy).
> 
> I'll fix up the long lines before commit.
> 
> Cheers,
> Wilco
>
diff mbox

Patch

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index ff720c800ed0ca3afac03d19ba02f67817b3422e..d31f7bb38eaf91692fb90f1c313b5e276fdf975b 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -33,11 +33,11 @@ 
 #define A_l x6
 #define A_lw w6
 #define A_h x7
-#define A_hw w7
 #define B_l x8
 #define B_lw w8
 #define B_h x9
 #define C_l x10
+#define C_lw w10
 #define C_h x11
 #define D_l x12
 #define D_h x13
@@ -51,16 +51,6 @@ 
 #define H_h srcend
 #define tmp1 x14
 
-/* Copies are split into 3 main cases: small copies of up to 32 bytes,
-   medium copies of 33..128 bytes which are fully unrolled. Large copies
-   of more than 128 bytes align the destination and use an unrolled loop
-   processing 64 bytes per iteration.
-   In order to share code with memmove, small and medium copies read all
-   data before writing, allowing any kind of overlap. So small, medium
-   and large backwards memmoves are handled by falling through into memcpy.
-   Overlapping large forward memmoves use a loop that copies backwards.
-*/
-
 #ifndef MEMMOVE
 # define MEMMOVE memmove
 #endif
@@ -68,128 +58,124 @@ 
 # define MEMCPY memcpy
 #endif
 
-ENTRY_ALIGN (MEMMOVE, 6)
+/* This implementation supports both memcpy and memmove and shares most code.
+   It uses unaligned accesses and branchless sequences to keep the code small,
+   simple and improve performance.
 
- DELOUSE (0)
- DELOUSE (1)
- DELOUSE (2)
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check in memmove is negligible since it is only required for large copies.
 
- sub tmp1, dstin, src
- cmp count, 128
- ccmp tmp1, count, 2, hi
- b.lo L(move_long)
-
- /* Common case falls through into memcpy.  */
-END (MEMMOVE)
-libc_hidden_builtin_def (MEMMOVE)
-ENTRY (MEMCPY)
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
 
+ENTRY_ALIGN (MEMCPY, 6)
  DELOUSE (0)
  DELOUSE (1)
  DELOUSE (2)
 
- prfm PLDL1KEEP, [src]
  add srcend, src, count
  add dstend, dstin, count
- cmp count, 32
- b.ls L(copy32)
  cmp count, 128
  b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
 
- /* Medium copies: 33..128 bytes.  */
+ /* Small copies: 0..32 bytes.  */
+ cmp count, 16
+ b.lo L(copy16)
  ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
  ldp D_l, D_h, [srcend, -16]
- cmp count, 64
- b.hi L(copy128)
  stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
  stp D_l, D_h, [dstend, -16]
  ret
 
- .p2align 4
- /* Small copies: 0..32 bytes.  */
-L(copy32):
- /* 16-32 bytes.  */
- cmp count, 16
- b.lo 1f
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstend, -16]
- ret
- .p2align 4
-1:
- /* 8-15 bytes.  */
- tbz count, 3, 1f
+ /* Copy 8-15 bytes.  */
+L(copy16):
+ tbz count, 3, L(copy8)
  ldr A_l, [src]
  ldr A_h, [srcend, -8]
  str A_l, [dstin]
  str A_h, [dstend, -8]
  ret
- .p2align 4
-1:
- /* 4-7 bytes.  */
- tbz count, 2, 1f
+
+ .p2align 3
+ /* Copy 4-7 bytes.  */
+L(copy8):
+ tbz count, 2, L(copy4)
  ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
+ ldr B_lw, [srcend, -4]
  str A_lw, [dstin]
- str A_hw, [dstend, -4]
+ str B_lw, [dstend, -4]
  ret
 
- /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-1:
- cbz count, 2f
+ /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+ cbz count, L(copy0)
  lsr tmp1, count, 1
  ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
+ ldrb C_lw, [srcend, -1]
  ldrb B_lw, [src, tmp1]
  strb A_lw, [dstin]
  strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
 
  .p2align 4
- /* Copy 65..128 bytes.  Copy 64 bytes from the start and
-   64 bytes from the end.  */
+ /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes.  */
 L(copy128):
  ldp E_l, E_h, [src, 32]
  ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
  ldp G_l, G_h, [srcend, -64]
  ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
  stp A_l, A_h, [dstin]
  stp B_l, B_h, [dstin, 16]
  stp E_l, E_h, [dstin, 32]
  stp F_l, F_h, [dstin, 48]
- stp G_l, G_h, [dstend, -64]
- stp H_l, H_h, [dstend, -48]
  stp C_l, C_h, [dstend, -32]
  stp D_l, D_h, [dstend, -16]
  ret
 
- /* Align DST to 16 byte alignment so that we don't cross cache line
-   boundaries on both loads and stores.  There are at least 128 bytes
-   to copy, so copy 16 bytes unaligned and then align.  The loop
-   copies 64 bytes per iteration and prefetches one iteration ahead.  */
-
  .p2align 4
+ /* Copy more than 128 bytes.  */
 L(copy_long):
+ /* Copy 16 bytes and then align dst to 16-byte alignment.  */
+ ldp D_l, D_h, [src]
  and tmp1, dstin, 15
  bic dst, dstin, 15
- ldp D_l, D_h, [src]
  sub src, src, tmp1
- add count, count, tmp1      /* Count is now 16 too large.  */
+ add count, count, tmp1 /* Count is now 16 too large.  */
  ldp A_l, A_h, [src, 16]
  stp D_l, D_h, [dstin]
  ldp B_l, B_h, [src, 32]
  ldp C_l, C_h, [src, 48]
  ldp D_l, D_h, [src, 64]!
-
  subs count, count, 128 + 16 /* Test and readjust count.  */
- b.ls L(last64)
+ b.ls L(copy64_from_end)
+
 L(loop64):
  stp A_l, A_h, [dst, 16]
  ldp A_l, A_h, [src, 16]
@@ -202,10 +188,8 @@  L(loop64):
  subs count, count, 64
  b.hi L(loop64)
 
- /* Write the last full set of 64 bytes.  The remainder is at most 64
-   bytes, so it is safe to always copy 64 bytes from the end even if
-   there is just 1 byte left.  */
-L(last64):
+ /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
  ldp E_l, E_h, [srcend, -64]
  stp A_l, A_h, [dst, 16]
  ldp A_l, A_h, [srcend, -48]
@@ -220,20 +204,42 @@  L(last64):
  stp C_l, C_h, [dstend, -16]
  ret
 
- .p2align 4
-L(move_long):
- cbz tmp1, 3f
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+ENTRY_ALIGN (MEMMOVE, 4)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
 
  add srcend, src, count
  add dstend, dstin, count
+ cmp count, 128
+ b.hi L(move_long)
+ cmp count, 32
+ b.hi L(copy32_128)
 
- /* Align dstend to 16 byte alignment so that we don't cross cache line
-   boundaries on both loads and stores.  There are at least 128 bytes
-   to copy, so copy 16 bytes unaligned and then align.  The loop
-   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+ /* Small copies: 0..32 bytes.  */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
 
- and tmp1, dstend, 15
+ .p2align 4
+L(move_long):
+ /* Only use backward copy if there is an overlap.  */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.hs L(copy_long)
+
+ /* Large backwards copy for overlapping copies.
+   Copy 16 bytes and then align dst to 16-byte alignment.  */
  ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
  sub srcend, srcend, tmp1
  sub count, count, tmp1
  ldp A_l, A_h, [srcend, -16]
@@ -243,10 +249,9 @@  L(move_long):
  ldp D_l, D_h, [srcend, -64]!
  sub dstend, dstend, tmp1
  subs count, count, 128
- b.ls 2f
+ b.ls L(copy64_from_start)
 
- nop
-1:
+L(loop64_backwards):
  stp A_l, A_h, [dstend, -16]
  ldp A_l, A_h, [srcend, -16]
  stp B_l, B_h, [dstend, -32]
@@ -256,12 +261,10 @@  L(move_long):
  stp D_l, D_h, [dstend, -64]!
  ldp D_l, D_h, [srcend, -64]!
  subs count, count, 64
- b.hi 1b
+ b.hi L(loop64_backwards)
 
- /* Write the last full set of 64 bytes.  The remainder is at most 64
-   bytes, so it is safe to always copy 64 bytes from the start even if
-   there is just 1 byte left.  */
-2:
+ /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
  ldp G_l, G_h, [src, 48]
  stp A_l, A_h, [dstend, -16]
  ldp A_l, A_h, [src, 32]
@@ -274,7 +277,7 @@  L(move_long):
  stp A_l, A_h, [dstin, 32]
  stp B_l, B_h, [dstin, 16]
  stp C_l, C_h, [dstin]
-3: ret
+ ret
 
-END (MEMCPY)
-libc_hidden_builtin_def (MEMCPY)
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)