From 4b59f2c1dd07df6e5c2f48a085f5dac079fdfcbd Mon Sep 17 00:00:00 2001
From: Alex Butler <alex.butler@arm.com>
Date: Wed, 13 May 2020 23:38:05 +0100
Subject: [PATCH] aarch64: add MTE compatible memrchr
---
sysdeps/aarch64/memrchr.S | 199 +++++++++++++++++++---------------------------
1 file changed, 83 insertions(+), 116 deletions(-)
@@ -22,144 +22,111 @@
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
/* Arguments and results. */
#define srcin x0
#define chrin w1
#define cntin x2
-
-#define seek_dstin x11
-#define seek_dst x12
#define result x0
#define src x3
-#define tmp x4
-#define wtmp2 w5
-#define synd x6
-#define soff x9
-#define cntrem x10
+#define cntrem x4
+#define synd x5
+#define shift x6
+#define tmp x7
+#define wtmp w7
+#define end x8
+#define endm1 x9
#define vrepchr v0
-#define vdata1 v1
-#define vdata2 v2
-
-#define vhas_chr1 v3
-#define vhas_chr2 v4
-#define vrepmask v5
-#define vend v6
+#define qdata q1
+#define vdata v1
+#define vhas_chr v2
+#define vrepmask v3
+#define vend v4
+#define dend d4
/*
- * Core algorithm:
- *
- * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
- * per byte. For each tuple, bit 0 is set if the relevant byte matched the
- * requested character and bit 1 is not used (faster than using a 32bit
- * syndrome). Since the bits in the syndrome reflect exactly the order in which
- * things occur in the original string, counting trailing zeros allows to
- * identify exactly which byte has matched.
- */
+ Core algorithm:
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
ENTRY (__memrchr)
- /* Do not dereference srcin if no bytes to compare. */
- cbz cntin, L(zero_length)
- /*
- * Magic constant 0x40100401 allows us to identify which lane matches
- * the requested byte.
- */
- mov wtmp2, #0x0401
- movk wtmp2, #0x4010, lsl #16
+ DELOUSE (0)
+ DELOUSE (2)
+ add end, srcin, cntin
+ sub endm1, end, 1
+ bic src, endm1, 15
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
- /* Work with aligned 32-byte chunks */
- add seek_dstin, cntin, srcin
- add tmp, seek_dstin, #31
- bic seek_dst, tmp, #31
- dup vrepmask.4s, wtmp2
- ands soff, seek_dstin, #31
- mov tmp, #32
- sub soff,tmp,soff
- and cntrem, cntin, #31
- b.eq L(loop)
-
- /*
- * Input string is not 32-byte aligned. We calculate the syndrome
- * value for the aligned 32 bytes block containing the first bytes
- * and mask the irrelevant part.
- */
-
- sub seek_dst, seek_dst, #32
- ld1 {vdata1.16b, vdata2.16b}, [seek_dst]
- sub tmp, soff, #32
- adds cntin, cntin, tmp
- cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
- addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
- addp vend.16b, vend.16b, vend.16b /* 128->64 */
- mov synd, vend.2d[0]
- /* Clear the (32-soff)*2 upper bits */
- lsl tmp, soff, #1
- lsl synd, synd, tmp
- lsr synd, synd, tmp
- /* The first block can also be the last */
- b.ls L(masklast)
- /* Have we found something already? */
- cbnz synd, L(tail)
-
-L(loop):
- sub seek_dst, seek_dst, #32
- ld1 {vdata1.16b, vdata2.16b}, [seek_dst]
- subs cntin, cntin, #32
- cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- /* If we're out of data we finish regardless of the result */
- b.ls L(end)
- /* Use a fast check for the termination condition */
- orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
- addp vend.2d, vend.2d, vend.2d
- mov synd, vend.2d[0]
- /* We're not out of data, loop if we haven't found the character */
- cbz synd, L(loop)
+ mov wtmp, 0xf00f
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ neg shift, end, lsl 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsl synd, synd, shift
+ cbz synd, L(start_loop)
+ clz synd, synd
+ sub result, endm1, synd, lsr 2
+ cmp cntin, synd, lsr 2
+ csel result, result, xzr, hi
+ ret
+
+L(start_loop):
+ sub tmp, end, src
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 4
+L(loop32):
+ ldr qdata, [src, -16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+
+L(loop32_2):
+ ldr qdata, [src, -16]!
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
L(end):
- /* Termination condition found, let's calculate the syndrome value */
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
- addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
- addp vend.16b, vend.16b, vend.16b /* 128->64 */
- mov synd, vend.2d[0]
- /* Only do the clear for the last possible block */
- b.hi L(tail)
-
-L(masklast):
- /* Clear the (32 - ((cntrem + (32-soff)) % 32)) * 2 lower bits */
- add tmp, cntrem, soff
- and tmp, tmp, #31
- sub tmp, tmp, #32
- neg tmp, tmp, lsl #1
- lsr synd, synd, tmp
- lsl synd, synd, tmp
-
-L(tail):
- /* Compensate the last post-increment*/
- add seek_dst, seek_dst, #32
- /* Check that we have found a character */
- cmp synd, #0
- /* And count the leading zeros */
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+
+ add tmp, src, 15
+#ifdef __AARCH64EB__
+ rbit synd, synd
+#endif
clz synd, synd
- /* Compute the potential result */
- sub result, seek_dst, synd, lsr #1
- sub result, result, #1
- /* Select result or NULL */
- csel result, xzr, result, eq
+ sub tmp, tmp, synd, lsr 2
+ cmp tmp, srcin
+ csel result, tmp, xzr, hs
ret
-L(zero_length):
- mov result, #0
+L(nomatch):
+ mov result, 0
ret
+
END (__memrchr)
weak_alias (__memrchr, memrchr)
libc_hidden_builtin_def (memrchr)
--
2.7.4