aarch64: MTE compatible strlen

Message ID gkrpnag8ohk.fsf@arm.com
State Committed
Commit a365ac45b7b51dbd9dc65629203cc2a9603420bb
Series aarch64: MTE compatible strlen

Commit Message

Andrea Corallo June 3, 2020, 9:53 a.m. UTC
  Hi all,

I'd like to submit this patch introducing an Arm MTE compatible strlen
implementation.
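
For context: MTE tags memory in 16-byte granules, so a compatible
strlen must never load across a granule the string does not extend
into.  The new code therefore aligns the pointer down to 16 bytes and
inspects whole granules only.  As a rough scalar model of that idea
(a sketch only -- the real code uses Advanced SIMD, and the helper
name here is hypothetical):

#include <stddef.h>
#include <stdint.h>

/* Scalar model of the MTE-safe strategy: align down to a 16-byte
   granule, skip the bytes before the string start, then scan whole
   granules.  No load ever touches a granule the string is not in.  */
static size_t
mte_safe_strlen_model (const char *s)
{
  const char *chunk = (const char *) ((uintptr_t) s & ~(uintptr_t) 15);
  size_t skip = (size_t) (s - chunk);   /* bytes before the string */

  for (;;)
    {
      for (size_t i = skip; i < 16; i++)
        if (chunk[i] == '\0')
          return (size_t) (chunk + i - s);
      chunk += 16;
      skip = 0;         /* only the first granule can be partial */
    }
}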

Below is a performance comparison from the strlen benchmark, run on
Cortex-A72, Cortex-A53 and Neoverse N1.

| length | alignment | perf-uplift A72 | perf-uplift A53 | perf-uplift N1 |
|--------+-----------+-----------------+-----------------+----------------|
|      1 |         1 |           1.00x |           0.96x |      1.13x |
|      1 |         0 |           2.15x |           0.96x |      1.00x |
|      2 |         2 |           1.16x |           0.95x |      1.09x |
|      2 |         0 |           1.17x |           0.93x |      1.00x |
|      3 |         3 |           1.30x |           0.95x |      1.09x |
|      3 |         0 |           1.32x |           0.96x |      1.00x |
|      4 |         4 |           1.14x |           0.87x |      0.99x |
|      4 |         0 |           1.14x |           0.96x |      1.00x |
|      5 |         5 |           1.15x |           0.89x |      1.09x |
|      5 |         0 |           1.19x |           0.96x |      1.00x |
|      6 |         6 |           1.14x |           0.96x |      1.39x |
|      6 |         0 |           1.14x |           0.95x |      1.00x |
|      7 |         7 |           1.03x |           0.90x |      1.09x |
|      7 |         0 |           1.14x |           0.95x |      1.27x |
|      4 |         0 |           1.15x |           0.87x |      1.00x |
|      4 |         7 |           1.15x |           0.96x |      1.10x |
|      4 |         2 |           1.27x |           0.95x |      1.39x |
|      2 |         2 |           1.14x |           0.96x |      1.09x |
|      8 |         0 |           1.15x |           0.96x |      1.00x |
|      8 |         7 |           1.14x |           0.96x |      1.09x |
|      8 |         3 |           1.17x |           0.96x |      1.39x |
|      5 |         3 |           1.14x |           0.96x |      1.39x |
|     16 |         0 |           1.15x |           0.83x |      1.48x |
|     16 |         7 |           1.14x |           0.80x |      1.43x |
|     16 |         4 |           1.15x |           0.83x |      1.48x |
|     10 |         4 |           1.15x |           0.96x |      1.27x |
|     32 |         0 |           1.04x |           0.88x |      1.16x |
|     32 |         7 |           1.02x |           0.84x |      1.19x |
|     32 |         5 |           1.04x |           0.84x |      1.23x |
|     21 |         5 |           1.14x |           0.83x |      1.60x |
|     64 |         0 |           1.17x |           0.80x |      1.75x |
|     64 |         7 |           1.17x |           0.77x |      1.83x |
|     64 |         6 |           1.17x |           0.77x |      1.57x |
|     42 |         6 |           1.00x |           0.80x |      1.42x |
|    128 |         0 |           0.96x |           0.68x |      1.80x |
|    128 |         7 |           0.96x |           0.66x |      1.85x |
|    128 |         7 |           0.96x |           0.67x |      1.86x |
|     85 |         7 |           1.05x |           0.75x |      1.87x |
|    256 |         0 |           0.98x |           0.69x |      1.88x |
|    256 |         7 |           0.98x |           0.68x |      1.92x |
|    256 |         8 |           0.99x |           0.69x |      1.88x |
|    170 |         8 |           0.96x |           0.72x |      1.86x |
|    512 |         0 |           0.99x |           0.65x |      1.90x |
|    512 |         7 |           0.98x |           0.65x |      1.92x |
|    512 |         9 |           0.99x |           0.65x |      1.92x |
|    341 |         9 |           0.98x |           0.68x |      1.99x |
|   1024 |         0 |           0.99x |           0.63x |      1.90x |
|   1024 |         7 |           0.99x |           0.62x |      1.92x |
|   1024 |        10 |           0.99x |           0.62x |      1.92x |
|    682 |        10 |           0.99x |           0.64x |      1.96x |
|   2048 |         0 |           0.99x |           0.61x |      1.92x |
|   2048 |         7 |           1.01x |           0.61x |      1.93x |
|   2048 |        11 |           1.00x |           0.61x |      1.95x |
|   1365 |        11 |           1.00x |           0.62x |      1.94x |
|   4096 |         0 |           1.00x |           0.61x |      1.93x |
|   4096 |         7 |           1.00x |           0.61x |      1.94x |
|   4096 |        12 |           1.00x |           0.61x |      1.95x |
|   2730 |        12 |           1.00x |           0.61x |      1.94x |

This patch passes the glibc test suite.

Regards

  Andrea

8< --- 8< --- 8<
Introduce an Arm MTE compatible strlen implementation.


Benchmarked on Cortex-A72, Cortex-A53 and Neoverse N1, it shows no
performance regressions.

Co-authored-by: Wilco Dijkstra <wilco.dijkstra@arm.com>

Patch

diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index e01fab7c2a..e314fffed6 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -20,205 +20,78 @@ 
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
  */
 
 #ifndef STRLEN
 # define STRLEN __strlen
 #endif
 
-/* To test the page crossing code path more thoroughly, compile with
-   -DTEST_PAGE_CROSS - this will force all calls through the slower
-   entry path.  This option is not intended for production use.  */
-
-/* Arguments and results.  */
 #define srcin		x0
-#define len		x0
+#define result		x0
 
-/* Locals and temporaries.  */
 #define src		x1
-#define data1		x2
-#define data2		x3
-#define has_nul1	x4
-#define has_nul2	x5
-#define tmp1		x4
-#define tmp2		x5
-#define tmp3		x6
-#define tmp4		x7
-#define zeroones	x8
-
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word. A faster check
-	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
-	   false hits for characters 129..255.	*/
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
-#ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 16
-#else
-# define MIN_PAGE_SIZE 4096
-#endif
-
-	/* Since strings are short on average, we check the first 16 bytes
-	   of the string for a NUL character.  In order to do an unaligned ldp
-	   safely we have to do a page cross check first.  If there is a NUL
-	   byte we calculate the length from the 2 8-byte words using
-	   conditional select to reduce branch mispredictions (it is unlikely
-	   strlen will be repeatedly called on strings with the same length).
-
-	   If the string is longer than 16 bytes, we align src so don't need
-	   further page cross checks, and process 32 bytes per iteration
-	   using the fast NUL check.  If we encounter non-ASCII characters,
-	   fallback to a second loop using the full NUL check.
-
-	   If the page cross check fails, we read 16 bytes from an aligned
-	   address, remove any characters before the string, and continue
-	   in the main loop using aligned loads.  Since strings crossing a
-	   page in the first 16 bytes are rare (probability of
-	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
-	   AArch64 systems have a minimum page size of 4k.  We don't bother
-	   checking for larger page sizes - the cost of setting up the correct
-	   page size is just not worth the extra gain from a small reduction in
-	   the cases taking the slow path.  Note that we only care about
-	   whether the first fetch, which may be misaligned, crosses a page
-	   boundary.  */
-
-ENTRY_ALIGN (STRLEN, 6)
+#define	synd		x2
+#define tmp		x3
+#define wtmp		w3
+#define shift		x4
+
+#define data		q0
+#define vdata		v0
+#define vhas_nul	v1
+#define vrepmask	v2
+#define vend		v3
+#define dend		d3
+
+/* Core algorithm:
+
+   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+   set likewise for odd bytes so that adjacent bytes can be merged. Since the
+   bits in the syndrome reflect the order in which things occur in the original
+   string, counting trailing zeros identifies exactly which byte matched.  */
+
+ENTRY (STRLEN)
 	DELOUSE (0)
 	DELOUSE (1)
-	and	tmp1, srcin, MIN_PAGE_SIZE - 1
-	mov	zeroones, REP8_01
-	cmp	tmp1, MIN_PAGE_SIZE - 16
-	b.gt	L(page_cross)
-	ldp	data1, data2, [srcin]
-#ifdef __AARCH64EB__
-	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul1/2 directly.
-	   Since we expect strings to be small and early-exit,
-	   byte-swap the data now so has_null1/2 will be correct.  */
-	rev	data1, data1
-	rev	data2, data2
-#endif
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	ccmp	has_nul2, 0, 0, eq
-	beq	L(main_loop_entry)
-
-	/* Enter with C = has_nul1 == 0.  */
-	csel	has_nul1, has_nul1, has_nul2, cc
-	mov	len, 8
-	rev	has_nul1, has_nul1
-	clz	tmp1, has_nul1
-	csel	len, xzr, len, cc
-	add	len, len, tmp1, lsr 3
+	bic	src, srcin, 15
+	mov	wtmp, 0xf00f
+	ld1	{vdata.16b}, [src]
+	dup	vrepmask.8h, wtmp
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	lsl	shift, srcin, 2
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	fmov	synd, dend
+	lsr	synd, synd, shift
+	cbz	synd, L(loop)
+
+	rbit	synd, synd
+	clz	result, synd
+	lsr	result, result, 2
 	ret
 
-	/* The inner loop processes 32 bytes per iteration and uses the fast
-	   NUL check.  If we encounter non-ASCII characters, use a second
-	   loop with the accurate NUL check.  */
-	.p2align 4
-L(main_loop_entry):
-	bic	src, srcin, 15
-	sub	src, src, 16
-L(main_loop):
-	ldp	data1, data2, [src, 32]!
-L(page_cross_entry):
-	sub	tmp1, data1, zeroones
-	sub	tmp3, data2, zeroones
-	orr	tmp2, tmp1, tmp3
-	tst	tmp2, zeroones, lsl 7
-	bne	1f
-	ldp	data1, data2, [src, 16]
-	sub	tmp1, data1, zeroones
-	sub	tmp3, data2, zeroones
-	orr	tmp2, tmp1, tmp3
-	tst	tmp2, zeroones, lsl 7
-	beq	L(main_loop)
-	add	src, src, 16
-1:
-	/* The fast check failed, so do the slower, accurate NUL check.	 */
-	orr	tmp2, data1, REP8_7f
-	orr	tmp4, data2, REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	ccmp	has_nul2, 0, 0, eq
-	beq	L(nonascii_loop)
-
-	/* Enter with C = has_nul1 == 0.  */
-L(tail):
-#ifdef __AARCH64EB__
-	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul1/2 directly.  The
-	   easiest way to get the correct byte is to byte-swap the data
-	   and calculate the syndrome a second time.  */
-	csel	data1, data1, data2, cc
-	rev	data1, data1
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	bic	has_nul1, tmp1, tmp2
-#else
-	csel	has_nul1, has_nul1, has_nul2, cc
+	.p2align 5
+L(loop):
+	ldr	data, [src, 16]!
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbz	synd, L(loop)
+
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	sub	result, src, srcin
+	fmov	synd, dend
+#ifndef __AARCH64EB__
+	rbit	synd, synd
 #endif
-	sub	len, src, srcin
-	rev	has_nul1, has_nul1
-	add	tmp2, len, 8
-	clz	tmp1, has_nul1
-	csel	len, len, tmp2, cc
-	add	len, len, tmp1, lsr 3
+	clz	tmp, synd
+	add	result, result, tmp, lsr 2
 	ret
 
-L(nonascii_loop):
-	ldp	data1, data2, [src, 16]!
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	ccmp	has_nul2, 0, 0, eq
-	bne	L(tail)
-	ldp	data1, data2, [src, 16]!
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	ccmp	has_nul2, 0, 0, eq
-	beq	L(nonascii_loop)
-	b	L(tail)
-
-	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
-	   srcin to 0x7f, so we ignore any NUL bytes before the string.
-	   Then continue in the aligned loop.  */
-L(page_cross):
-	bic	src, srcin, 15
-	ldp	data1, data2, [src]
-	lsl	tmp1, srcin, 3
-	mov	tmp4, -1
-#ifdef __AARCH64EB__
-	/* Big-endian.	Early bytes are at MSB.	 */
-	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
-#endif
-	orr	tmp1, tmp1, REP8_80
-	orn	data1, data1, tmp1
-	orn	tmp2, data2, tmp1
-	tst	srcin, 8
-	csel	data1, data1, tmp4, eq
-	csel	data2, data2, tmp2, eq
-	b	L(page_cross_entry)
 END (STRLEN)
 weak_alias (STRLEN, strlen)
 libc_hidden_builtin_def (strlen)
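
For reference, here is a scalar C model of the syndrome trick the
"Core algorithm" comment above describes (a sketch with hypothetical
names, not glibc code).  In the assembly, CMEQ sets each byte of
vhas_nul to 0xff where the input byte is NUL, the AND with the
repeated 0xf00f mask keeps the low nibble of even bytes and the high
nibble of odd bytes, and ADDP merges adjacent bytes, yielding 4
syndrome bits per input byte in a 64-bit value.  The model assumes
little-endian, matching the rbit path:

#include <stdint.h>
#include <stdio.h>

/* Build the 64-bit syndrome for one 16-byte chunk and recover the
   index of the first NUL.  RBIT + CLZ in the assembly is a count of
   trailing zeros, modelled here with __builtin_ctzll.  */
static int
nul_index_from_syndrome (const unsigned char chunk[16])
{
  uint64_t syndrome = 0;
  for (int i = 0; i < 16; i++)
    if (chunk[i] == 0)
      syndrome |= (uint64_t) 0xf << (4 * i);    /* 4 bits per byte */

  if (syndrome == 0)
    return -1;                  /* no NUL: the loop takes another chunk */
  return __builtin_ctzll (syndrome) >> 2;
}

int
main (void)
{
  unsigned char chunk[16] = "abcdef";   /* NUL at index 6 */
  printf ("%d\n", nul_index_from_syndrome (chunk));     /* prints 6 */
  return 0;
}

On the entry path the syndrome is additionally shifted right by the
value computed with "lsl shift, srcin, 2"; since a register LSR uses
only the low 6 bits of the shift amount, this discards the
4 * (srcin & 15) syndrome bits belonging to the bytes of the aligned
granule that precede the string start.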