[16/16] aarch64: Optimize __libc_mtag_tag_zero_region

Message ID 8d06a578d4431a8bfb2f424ec60fa442041a1173.1614874816.git.szabolcs.nagy@arm.com
State Committed
Commit 1dc17ea8f8492d618a91f0b7b3f1e7fd089889d1
Headers
Series memory tagging improvements |

Commit Message

Szabolcs Nagy March 4, 2021, 4:34 p.m. UTC
  This is a target hook for memory tagging; the original was a naive
implementation.  It uses the same algorithm as __libc_mtag_tag_region,
but with instructions that also zero the memory.  This was not
benchmarked on a real CPU, but it is expected to be faster than the naive
implementation.
---
 sysdeps/aarch64/__mtag_tag_zero_region.S | 96 ++++++++++++++++++++----
 1 file changed, 80 insertions(+), 16 deletions(-)
  

Patch

diff --git a/sysdeps/aarch64/__mtag_tag_zero_region.S b/sysdeps/aarch64/__mtag_tag_zero_region.S
index 74d398bba5..7d955fbd0c 100644
--- a/sysdeps/aarch64/__mtag_tag_zero_region.S
+++ b/sysdeps/aarch64/__mtag_tag_zero_region.S
@@ -20,30 +20,94 @@ 
 
 #ifdef USE_MTAG
 
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MTE, LP64 ABI.
+ *
+ * Interface contract:
+ * Address is 16-byte aligned and size is a multiple of 16.
+ * Returns the passed pointer.
+ * The memory region may remain untagged if tagging is not enabled.
+ */
 	.arch armv8.5-a
 	.arch_extension memtag
 
-/* NB, only supported on variants with 64-bit pointers.  */
+#define dstin	x0
+#define count	x1
+#define dst	x2
+#define dstend	x3
+#define tmp	x4
+#define zva_val	x4
 
-/* FIXME: This is a minimal implementation.  We could do much better than
-   this for large values of COUNT.  */
+ENTRY (__libc_mtag_tag_zero_region)
+	PTR_ARG (0)
+	SIZE_ARG (1)

-#define dstin x0
-#define count x1
-#define dst   x2
+	add	dstend, dstin, count	/* dstend = one past the end of the region.  */

-ENTRY(__libc_mtag_tag_zero_region)
+	cmp	count, 96
+	b.hi	L(set_long)		/* More than 96 bytes: use a loop.  */

-	mov	dst, dstin
-L(loop):
-	stzg	dst, [dst], #16
-	subs	count, count, 16
-	bne	L(loop)
-#if 0
-	/* This is not currently needed, since for now we are only called
-	   to tag memory that is taggable.  */
-	ldg	dstin, [dstin] // Recover the tag created (might be untagged).
+	tbnz	count, 6, L(set96)	/* Bit 6 set and count <= 96: 64, 80 or 96.  */
+
+	/* Set 0, 16, 32, or 48 bytes.  */
+	lsr	tmp, count, 5		/* tmp = 0 for 16 bytes, 1 for 32 or 48.  */
+	add	tmp, dstin, tmp, lsl 4	/* tmp = dstin + 0 or dstin + 16.  */
+	cbz     count, L(end)		/* Nothing to store for an empty region.  */
+	stzg	dstin, [dstin]		/* First 16 bytes.  */
+	stzg	dstin, [tmp]		/* First or second 16 bytes (may overlap).  */
+	stzg	dstin, [dstend, -16]	/* Last 16 bytes (may overlap).  */
+L(end):
+	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	stz2g	dstin, [dstin]
+	stz2g	dstin, [dstin, 32]
+	stz2g	dstin, [dstend, -32]	/* Overlaps the stores above if count < 96.  */
+	ret
+
+	.p2align 4
+	/* Size is > 96 bytes.  */
+L(set_long):
+	cmp	count, 160
+	b.lo	L(no_zva)		/* Below 160 bytes the ZVA path is not used.  */
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31	/* Keep the low five bits of DCZID_EL0.  */
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)
 #endif
+	stz2g	dstin, [dstin]		/* First 64 bytes, done before aligning dst.  */
+	stz2g	dstin, [dstin, 32]
+	bic	dst, dstin, 63		/* dst = dstin rounded down to 64 bytes.  */
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	.p2align 4
+L(zva_loop):
+	add	dst, dst, 64		/* Advance to the next 64-byte aligned block.  */
+	dc	gzva, dst
+	subs	count, count, 64
+	b.hi	L(zva_loop)
+	stz2g	dstin, [dstend, -64]	/* Last 64 bytes; may overlap the loop's work.  */
+	stz2g	dstin, [dstend, -32]
 	ret
+
+L(no_zva):
+	sub	dst, dstin, 32		/* Dst is biased by -32.  */
+	sub	count, count, 64	/* Adjust count for loop.  */
+L(no_zva_loop):
+	stz2g	dstin, [dst, 32]	/* First 32 bytes of this 64-byte step.  */
+	stz2g	dstin, [dst, 64]!	/* Second 32 bytes; dst += 64 (writeback).  */
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)
+	stz2g	dstin, [dstend, -64]	/* Last 64 bytes; may overlap the loop's work.  */
+	stz2g	dstin, [dstend, -32]
+	ret
+
 END (__libc_mtag_tag_zero_region)
 #endif /* USE_MTAG */