@@ -38,7 +38,6 @@
#define dest_ptr x7
#define src_ptr x8
#define vector_length x9
-#define cl_remainder x10 // CACHE_LINE_SIZE remainder
#if HAVE_AARCH64_SVE_ASM
# if IS_IN (libc)
@@ -47,14 +46,6 @@
.arch armv8.2-a+sve
- .macro dc_zva times
- dc zva, tmp1
- add tmp1, tmp1, CACHE_LINE_SIZE
- .if \times-1
- dc_zva "(\times-1)"
- .endif
- .endm
-
.macro ld1b_unroll8
ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
@@ -106,69 +97,49 @@
.macro shortcut_for_small_size exit
// if rest <= vector_length * 2
-	whilelo	p0.b, xzr, n
+	whilelo	p0.b, xzr, n
whilelo p1.b, vector_length, n
-	b.last	1f
ld1b z0.b, p0/z, [src, #0, mul vl]
ld1b z1.b, p1/z, [src, #1, mul vl]
+	b.last	1f  // taken when n > vector_length * 2 (z0/z1 loads above are predicated, so harmless)
st1b z0.b, p0, [dest, #0, mul vl]
st1b z1.b, p1, [dest, #1, mul vl]
ret
+
1: // if rest > vector_length * 8
cmp n, vector_length, lsl 3 // vector_length * 8
b.hi \exit
+
// if rest <= vector_length * 4
lsl tmp1, vector_length, 1 // vector_length * 2
-	whilelo	p2.b, tmp1, n
-	incb	tmp1
-	whilelo	p3.b, tmp1, n
-	b.last	1f
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	sub	n, n, tmp1  // n -= vector_length * 2; p2/p3 can then start from offset 0
+	whilelo	p2.b, xzr, n
+	whilelo	p3.b, vector_length, n
ld1b z2.b, p2/z, [src, #2, mul vl]
ld1b z3.b, p3/z, [src, #3, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	st1b	z2.b, p2, [dest, #2, mul vl]
-	st1b	z3.b, p3, [dest, #3, mul vl]
-	ret
-1:	// if rest <= vector_length * 8
-	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, n
-	incb	tmp1
-	whilelo	p5.b, tmp1, n
b.last 1f
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	ld1b	z2.b, p2/z, [src, #2, mul vl]
-	ld1b	z3.b, p3/z, [src, #3, mul vl]
-	ld1b	z4.b, p4/z, [src, #4, mul vl]
-	ld1b	z5.b, p5/z, [src, #5, mul vl]
st1b z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z1.b, p0, [dest, #1, mul vl]  // p1 == p0 (all-true) here since n > vector_length * 2 on this path
st1b z2.b, p2, [dest, #2, mul vl]
st1b z3.b, p3, [dest, #3, mul vl]
-	st1b	z4.b, p4, [dest, #4, mul vl]
-	st1b	z5.b, p5, [dest, #5, mul vl]
ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	incb	tmp1	// vector_length * 5
-	incb	tmp1	// vector_length * 6
-	whilelo	p6.b, tmp1, n
-	incb	tmp1
-	whilelo	p7.b, tmp1, n
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	ld1b	z2.b, p2/z, [src, #2, mul vl]
-	ld1b	z3.b, p3/z, [src, #3, mul vl]
+
+1: // if rest <= vector_length * 8
+	sub	n, n, tmp1  // n -= vector_length * 2 again (n is now rest beyond 4 vectors)
+	add	tmp2, tmp1, vector_length  // vector_length * 3
+	whilelo	p4.b, xzr, n
+	whilelo	p5.b, vector_length, n
+	whilelo	p6.b, tmp1, n
+	whilelo	p7.b, tmp2, n
+
ld1b z4.b, p4/z, [src, #4, mul vl]
ld1b z5.b, p5/z, [src, #5, mul vl]
ld1b z6.b, p6/z, [src, #6, mul vl]
ld1b z7.b, p7/z, [src, #7, mul vl]
st1b z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	st1b	z2.b, p2, [dest, #2, mul vl]
-	st1b	z3.b, p3, [dest, #3, mul vl]
+	st1b	z1.b, p0, [dest, #1, mul vl]  // p1/p2/p3 are all-true here (n > vector_length * 4), so p0 substitutes for all three
+	st1b	z2.b, p0, [dest, #2, mul vl]
+	st1b	z3.b, p0, [dest, #3, mul vl]
st1b z4.b, p4, [dest, #4, mul vl]
st1b z5.b, p5, [dest, #5, mul vl]
st1b z6.b, p6, [dest, #6, mul vl]
@@ -182,8 +153,8 @@ ENTRY (MEMCPY)
PTR_ARG (1)
SIZE_ARG (2)
-L(memcpy):
cntb vector_length
+L(memmove_small):  // entered from MEMMOVE when n <= vector_length * 8 (all loads precede stores below, so overlap is safe)
// shortcut for less than vector_length * 8
// gives a free ptrue to p0.b for n >= vector_length
shortcut_for_small_size L(vl_agnostic)
@@ -201,135 +172,107 @@ L(vl_agnostic): // VL Agnostic
L(unroll8): // unrolling and software pipeline
lsl tmp1, vector_length, 3 // vector_length * 8
-	.p2align 3
-	cmp	rest, tmp1
-	b.cc	L(last)
+	sub	rest, rest, tmp1  // NOTE(review): the removed cmp/b.cc implies callers now guarantee rest >= vector_length * 8 — verify
ld1b_unroll8
add src_ptr, src_ptr, tmp1
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
+	subs	rest, rest, tmp1  // sub + cmp fused into subs
b.cc 2f
-	.p2align 3
+	.p2align 4
1: stld1b_unroll8
add dest_ptr, dest_ptr, tmp1
add src_ptr, src_ptr, tmp1
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.ge	1b
+	subs	rest, rest, tmp1
+	b.hs	1b  // unsigned (was signed b.ge): loop while pre-subtract rest >= vector_length * 8
2: st1b_unroll8
add dest_ptr, dest_ptr, tmp1
+	add	rest, rest, tmp1  // undo the final over-subtraction from subs
.p2align 3
L(last):
-	whilelo	p0.b, xzr, rest
+	whilelo	p0.b, xzr, rest
whilelo p1.b, vector_length, rest
-	b.last	1f
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p2.b, tmp1, rest
-	incb	tmp1
-	whilelo	p3.b, tmp1, rest
-	b.last	1f
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
-	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
-	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, rest
-	incb	tmp1
-	whilelo	p5.b, tmp1, rest
-	incb	tmp1
-	whilelo	p6.b, tmp1, rest
-	incb	tmp1
-	whilelo	p7.b, tmp1, rest
ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
ld1b z1.b, p1/z, [src_ptr, #1, mul vl]
+	b.nlast	1f  // rest <= vector_length * 2: store z0/z1 only
+
+	lsl	tmp1, vector_length, 1	// vector_length * 2
+	sub	rest, rest, tmp1
+	whilelo	p2.b, xzr, rest
+	whilelo	p3.b, vector_length, rest
ld1b z2.b, p2/z, [src_ptr, #2, mul vl]
ld1b z3.b, p3/z, [src_ptr, #3, mul vl]
+	b.nlast	2f  // rest <= vector_length * 4: skip z4-z7
+
+	sub	rest, rest, tmp1
+	add	tmp2, tmp1, vector_length	// vector_length * 3
+	whilelo	p4.b, xzr, rest
+	whilelo	p5.b, vector_length, rest
+	whilelo	p6.b, tmp1, rest
+	whilelo	p7.b, tmp2, rest
+
ld1b z4.b, p4/z, [src_ptr, #4, mul vl]
ld1b z5.b, p5/z, [src_ptr, #5, mul vl]
ld1b z6.b, p6/z, [src_ptr, #6, mul vl]
ld1b z7.b, p7/z, [src_ptr, #7, mul vl]
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
-	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
st1b z4.b, p4, [dest_ptr, #4, mul vl]
st1b z5.b, p5, [dest_ptr, #5, mul vl]
st1b z6.b, p6, [dest_ptr, #6, mul vl]
st1b z7.b, p7, [dest_ptr, #7, mul vl]
+2:	st1b	z2.b, p2, [dest_ptr, #2, mul vl]  // fall-through tails share the z2/z3 and z0/z1 stores
+	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
+1:	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
ret
L(L2):
// align dest address at CACHE_LINE_SIZE byte boundary
-	mov	tmp1, CACHE_LINE_SIZE
-	ands	tmp2, dest_ptr, CACHE_LINE_SIZE - 1
-	// if cl_remainder == 0
-	b.eq	L(L2_dc_zva)
-	sub	cl_remainder, tmp1, tmp2
-	// process remainder until the first CACHE_LINE_SIZE boundary
-	whilelo	p1.b, xzr, cl_remainder	// keep p0.b all true
-	whilelo	p2.b, vector_length, cl_remainder
-	b.last	1f
-	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
-	b	2f
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p3.b, tmp1, cl_remainder
-	incb	tmp1
-	whilelo	p4.b, tmp1, cl_remainder
-	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
-	ld1b	z3.b, p3/z, [src_ptr, #2, mul vl]
-	ld1b	z4.b, p4/z, [src_ptr, #3, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
-	st1b	z3.b, p3, [dest_ptr, #2, mul vl]
-	st1b	z4.b, p4, [dest_ptr, #3, mul vl]
-2:	add	dest_ptr, dest_ptr, cl_remainder
-	add	src_ptr, src_ptr, cl_remainder
-	sub	rest, rest, cl_remainder
+	and	tmp1, dest_ptr, CACHE_LINE_SIZE - 1
+	sub	tmp1, tmp1, CACHE_LINE_SIZE  // tmp1 = -(bytes to the next cache-line boundary)
+	ld1b	z1.b, p0/z, [src_ptr, #0, mul vl]  // NOTE(review): copies 4 full vectors with p0 all-true; assumes vector_length * 4 >= CACHE_LINE_SIZE — confirm for the target CPU
+	ld1b	z2.b, p0/z, [src_ptr, #1, mul vl]
+	ld1b	z3.b, p0/z, [src_ptr, #2, mul vl]
+	ld1b	z4.b, p0/z, [src_ptr, #3, mul vl]
+	st1b	z1.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z2.b, p0, [dest_ptr, #1, mul vl]
+	st1b	z3.b, p0, [dest_ptr, #2, mul vl]
+	st1b	z4.b, p0, [dest_ptr, #3, mul vl]
+	sub	dest_ptr, dest_ptr, tmp1  // advance to the boundary (tmp1 is negative)
+	sub	src_ptr, src_ptr, tmp1
+	add	rest, rest, tmp1  // rest shrinks by the bytes just consumed
L(L2_dc_zva):
-	// zero fill
-	and	tmp1, dest, 0xffffffffffffff
-	and	tmp2, src, 0xffffffffffffff
-	subs	tmp1, tmp1, tmp2	// diff
-	b.ge	1f
-	neg	tmp1, tmp1
-1:	mov	tmp3, ZF_DIST + CACHE_LINE_SIZE * 2
-	cmp	tmp1, tmp3
+	// check for overlap
+	sub	tmp1, src_ptr, dest_ptr
+	and	tmp1, tmp1, 0xffffffffffffff	// clear tag bits
+	mov	tmp2, ZF_DIST
+	cmp	tmp1, tmp2  // skip dc zva if src lies within ZF_DIST above dest (zeroing would clobber unread source)
b.lo L(unroll8)
+
+	// zero fill loop
mov tmp1, dest_ptr
-	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
+	mov	tmp3, ZF_DIST / CACHE_LINE_SIZE  // number of cache lines to pre-zero ahead of the store stream
+1:	dc	zva, tmp1  // zero one cache line
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	subs	tmp3, tmp3, 1
+	b.ne	1b
+
+	mov	tmp3, ZF_DIST + CACHE_LINE_SIZE * 2  // loop-exit threshold for the cmp below
// unroll
-	ld1b_unroll8	// this line has to be after "b.lo L(unroll8)"
-	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
-	sub	rest, rest, CACHE_LINE_SIZE * 2
-	mov	tmp1, ZF_DIST
-	.p2align 3
-1:	stld1b_unroll4a
-	add	tmp2, dest_ptr, tmp1	// dest_ptr + ZF_DIST
-	dc	zva, tmp2
+	ld1b_unroll8
+	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+	sub	rest, rest, CACHE_LINE_SIZE * 2
+	.p2align 4
+2:	stld1b_unroll4a
+	add	tmp1, dest_ptr, tmp2	// dest_ptr + ZF_DIST (tmp2 still holds ZF_DIST from above)
+	dc	zva, tmp1
stld1b_unroll4b
-	add	tmp2, tmp2, CACHE_LINE_SIZE
-	dc	zva, tmp2
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1
add dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
add src_ptr, src_ptr, CACHE_LINE_SIZE * 2
sub rest, rest, CACHE_LINE_SIZE * 2
-	cmp	rest, tmp3	// ZF_DIST + CACHE_LINE_SIZE * 2
-	b.ge	1b
+	cmp	rest, tmp3
+	b.hs	2b  // unsigned loop condition (was signed b.ge)
st1b_unroll8
add dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
b L(unroll8)
@@ -338,68 +281,50 @@ END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)
-ENTRY (MEMMOVE)
+ENTRY_ALIGN (MEMMOVE, 4)  // 16-byte aligned entry
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
-	// remove tag address
-	// dest has to be immutable because it is the return value
-	// src has to be immutable because it is used in L(bwd_last)
-	and	tmp2, dest, 0xffffffffffffff	// save dest_notag into tmp2
-	and	tmp3, src, 0xffffffffffffff	// save src_notag intp tmp3
-	cmp	n, 0
-	ccmp	tmp2, tmp3, 4, ne
-	b.ne	1f
-	ret
-1:	cntb	vector_length
-	// shortcut for less than vector_length * 8
-	// gives a free ptrue to p0.b for n >= vector_length
-	// tmp2 and tmp3 should not be used in this macro to keep
-	// notag addresses
-	shortcut_for_small_size L(dispatch)
-	// end of shortcut
-
-L(dispatch):
-	// tmp2 = dest_notag, tmp3 = src_notag
-	// diff = dest_notag - src_notag
-	sub	tmp1, tmp2, tmp3
-	// if diff <= 0 || diff >= n then memcpy
-	cmp	tmp1, 0
-	ccmp	tmp1, n, 2, gt
-	b.cs	L(vl_agnostic)
-
-L(bwd_start):
-	mov	rest, n
-	add	dest_ptr, dest, n	// dest_end
-	add	src_ptr, src, n	// src_end
+	cntb	vector_length
+	// diff = dest - src
+	sub	tmp1, dest, src
+	ands	tmp1, tmp1, 0xffffffffffffff	// clear tag bits
+	b.eq	L(full_overlap)  // dest == src: nothing to move
-L(bwd_unroll8):	// unrolling and software pipeline
-	lsl	tmp1, vector_length, 3	// vector_length * 8
-	.p2align 3
-	cmp	rest, tmp1
-	b.cc	L(bwd_last)
-	sub	src_ptr, src_ptr, tmp1
+	cmp	n, vector_length, lsl 3	// vector_length * 8
+	b.ls	L(memmove_small)  // small sizes load everything before storing, so any overlap is safe
+
+	ptrue	p0.b  // all-true predicate, as the shortcut would have produced
+	// if diff < 0 || diff >= n then memcpy
+	cmp	tmp1, n
+	b.hs	L(vl_agnostic)  // unsigned: a negative (masked) diff compares high, so the forward copy is taken
+
+	// unrolling and software pipeline
+	lsl	tmp1, vector_length, 3	// vector_length * 8
+	add	dest_ptr, dest, n	// dest_end
+	sub	rest, n, tmp1
+	add	src_ptr, src, rest	// src_end - vector_length * 8 (start of the topmost 8-vector block)
ld1b_unroll8
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
+	subs	rest, rest, tmp1  // sub + cmp fused into subs
b.cc 2f
-	.p2align 3
+	.p2align 4
1: sub src_ptr, src_ptr, tmp1
sub dest_ptr, dest_ptr, tmp1
stld1b_unroll8
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.ge	1b
+	subs	rest, rest, tmp1
+	b.hs	1b  // unsigned: loop while pre-subtract rest >= vector_length * 8
2: sub dest_ptr, dest_ptr, tmp1
st1b_unroll8
-
-L(bwd_last):
+	add	rest, rest, tmp1  // undo the extra subtraction from subs; remaining low bytes go through L(last)
mov dest_ptr, dest
mov src_ptr, src
b L(last)
+L(full_overlap):
+	ret
+
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
# endif /* IS_IN (libc) */