@@ -352,7 +352,7 @@ L(bytes_32_to_48):
/* Small copies: 0..16 bytes. */
L(memcopy16):
cmp count, 8
- b.lo L(bytes_0_to_8)
+ b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
add dstend, dstin, count
@@ -361,8 +361,8 @@ L(memcopy16):
ret
.p2align 4
-L(bytes_0_to_8):
- tbz count, 2, L(bytes_0_to_3)
+1:
+ tbz count, 2, 1f
ldr A_lw, [src]
ldr A_hw, [srcend, -4]
add dstend, dstin, count
@@ -372,8 +372,8 @@ L(bytes_0_to_8):
/* Copy 0..3 bytes. Use a branchless sequence that copies the same
byte 3 times if count==1, or the 2nd byte twice if count==2. */
-L(bytes_0_to_3):
- cbz count, L(end)
+1:
+ cbz count, 1f
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb A_hw, [srcend, -1]
@@ -382,7 +382,8 @@ L(bytes_0_to_3):
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb A_hw, [dstend, -1]
-L(end): ret
+1:
+ ret
.p2align 4
@@ -404,7 +405,7 @@ L(memcpy_copy96):
after tmp [0..15] gets added to it,
count now is <bytes-left-to-load>+48 */
cmp count, 80
- b.gt L(copy96_medium)
+ b.gt 1f
ldr D_q, [src, 32]
stp B_q, C_q, [dst, 16]
str E_q, [dstend, -16]
@@ -412,17 +413,17 @@ L(memcpy_copy96):
ret
.p2align 4
-L(copy96_medium):
+1:
ldp D_q, A_q, [src, 32]
str B_q, [dst, 16]
cmp count, 96
- b.gt L(copy96_large)
+ b.gt 1f
str E_q, [dstend, -16]
stp C_q, D_q, [dst, 32]
str A_q, [dst, 64]
ret
-L(copy96_large):
+1:
ldr F_q, [src, 64]
stp C_q, D_q, [dst, 32]
str E_q, [dstend, -16]
@@ -557,17 +558,9 @@ L(ext_size_ ## shft):;\
ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\
subs count, count, 32;\
- b.ge 2f;\
+ b.lt 2f;\
1:;\
stp A_q, B_q, [dst], #32;\
- ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
- ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- stp H_q, I_q, [dst], #16;\
- add dst, dst, tmp1;\
- str G_q, [dst], #16;\
- b L(copy_long_check32);\
-2:;\
- stp A_q, B_q, [dst], #32;\
prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
ldp D_q, J_q, [src], #32;\
ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
@@ -579,8 +572,15 @@ L(ext_size_ ## shft):;\
ext B_v.16b, D_v.16b, J_v.16b, 16-shft;\
mov E_v.16b, J_v.16b;\
subs count, count, 64;\
- b.ge 2b;\
- b 1b;\
+ b.ge 1b;\
+2:;\
+ stp A_q, B_q, [dst], #32;\
+ ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+ ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+ stp H_q, I_q, [dst], #16;\
+ add dst, dst, tmp1;\
+ str G_q, [dst], #16;\
+ b L(copy_long_check32);\
EXT_CHUNK(1)
EXT_CHUNK(2)