aarch64: optimize _dl_tlsdesc_dynamic fast path

Message ID 59F9A544.9080709@arm.com
State New, archived
Headers

Commit Message

Szabolcs Nagy Nov. 1, 2017, 10:43 a.m. UTC
  This patch will go on top of the lazy tlsdesc removal patch set.
  

Comments

Szabolcs Nagy Nov. 3, 2017, 2:56 p.m. UTC | #1
On 01/11/17 10:43, Szabolcs Nagy wrote:
> This patch will go on top of the lazy tlsdesc removal patch set.
> 

Committed the lazy tlsdesc patch set and this optimization.
  

Patch

From 9f713143d817fdf60233ecbc8104d6e9d028342a Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <szabolcs.nagy@arm.com>
Date: Tue, 24 Oct 2017 17:49:14 +0100
Subject: [PATCH] aarch64: optimize _dl_tlsdesc_dynamic fast path

Remove some load/store instructions from the dynamic tlsdesc resolver
fast path.  This gives around 20% faster TLS access in dlopened shared
libraries (assuming glibc has run out of static TLS space).

2017-10-25  Szabolcs Nagy  <szabolcs.nagy@arm.com>

	* sysdeps/aarch64/dl-tlsdesc.S (_dl_tlsdesc_dynamic): Optimize.
---
 sysdeps/aarch64/dl-tlsdesc.S | 105 +++++++++++++++++++++----------------------
 1 file changed, 51 insertions(+), 54 deletions(-)

diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index 70550c7ce0..1d2008cbf2 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -142,23 +142,17 @@  _dl_tlsdesc_undefweak:
 	cfi_startproc
 	.align 2
 _dl_tlsdesc_dynamic:
-# define NSAVEXREGPAIRS 2
-	stp	x29, x30, [sp,#-(32+16*NSAVEXREGPAIRS)]!
-	cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS)
-	cfi_rel_offset (x29, 0)
-	cfi_rel_offset (x30, 8)
-	mov	x29, sp
 	DELOUSE (0)
 
 	/* Save just enough registers to support fast path, if we fall
 	   into slow path we will save additional registers.  */
-
-	stp	x1,  x2, [sp, #32+16*0]
-	stp	x3,  x4, [sp, #32+16*1]
-	cfi_rel_offset (x1, 32)
-	cfi_rel_offset (x2, 32+8)
-	cfi_rel_offset (x3, 32+16)
-	cfi_rel_offset (x4, 32+24)
+	stp	x1,  x2, [sp, #-32]!
+	stp	x3,  x4, [sp, #16]
+	cfi_adjust_cfa_offset (32)
+	cfi_rel_offset (x1, 0)
+	cfi_rel_offset (x2, 8)
+	cfi_rel_offset (x3, 16)
+	cfi_rel_offset (x4, 24)
 
 	mrs	x4, tpidr_el0
 	ldr	PTR_REG (1), [x0,#TLSDESC_ARG]
@@ -167,23 +161,18 @@  _dl_tlsdesc_dynamic:
 	ldr	PTR_REG (2), [x0,#DTV_COUNTER]
 	cmp	PTR_REG (3), PTR_REG (2)
 	b.hi	2f
-	ldr	PTR_REG (2), [x1,#TLSDESC_MODID]
+	/* Load r2 = td->tlsinfo.ti_module and r3 = td->tlsinfo.ti_offset.  */
+	ldp	PTR_REG (2), PTR_REG (3), [x1,#TLSDESC_MODID]
 	add	PTR_REG (0), PTR_REG (0), PTR_REG (2), lsl #(PTR_LOG_SIZE + 1)
 	ldr	PTR_REG (0), [x0] /* Load val member of DTV entry.  */
 	cmp	PTR_REG (0), #TLS_DTV_UNALLOCATED
 	b.eq	2f
-	ldr	PTR_REG (1), [x1,#TLSDESC_MODOFF]
-	add	PTR_REG (0), PTR_REG (0), PTR_REG (1)
-	sub	PTR_REG (0), PTR_REG (0), PTR_REG (4)
+	sub	PTR_REG (3), PTR_REG (3), PTR_REG (4)
+	add	PTR_REG (0), PTR_REG (0), PTR_REG (3)
 1:
-	ldp	 x1,  x2, [sp, #32+16*0]
-	ldp	 x3,  x4, [sp, #32+16*1]
-
-	ldp	x29, x30, [sp], #(32+16*NSAVEXREGPAIRS)
-	cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS)
-	cfi_restore (x29)
-	cfi_restore (x30)
-# undef NSAVEXREGPAIRS
+	ldp	 x3,  x4, [sp, #16]
+	ldp	 x1,  x2, [sp], #32
+	cfi_adjust_cfa_offset (-32)
 	RET
 2:
 	/* This is the slow path. We need to call __tls_get_addr() which
@@ -191,29 +180,33 @@  _dl_tlsdesc_dynamic:
 	   callee will trash.  */
 
 	/* Save the remaining registers that we must treat as caller save.  */
-# define NSAVEXREGPAIRS 7
-	stp	 x5,  x6, [sp, #-16*NSAVEXREGPAIRS]!
+# define NSAVEXREGPAIRS 8
+	stp	x29, x30, [sp,#-16*NSAVEXREGPAIRS]!
 	cfi_adjust_cfa_offset (16*NSAVEXREGPAIRS)
-	stp	 x7,  x8, [sp, #16*1]
-	stp	 x9, x10, [sp, #16*2]
-	stp	x11, x12, [sp, #16*3]
-	stp	x13, x14, [sp, #16*4]
-	stp	x15, x16, [sp, #16*5]
-	stp	x17, x18, [sp, #16*6]
-	cfi_rel_offset (x5, 0)
-	cfi_rel_offset (x6, 8)
-	cfi_rel_offset (x7, 16)
-	cfi_rel_offset (x8, 16+8)
-	cfi_rel_offset (x9, 16*2)
-	cfi_rel_offset (x10, 16*2+8)
-	cfi_rel_offset (x11, 16*3)
-	cfi_rel_offset (x12, 16*3+8)
-	cfi_rel_offset (x13, 16*4)
-	cfi_rel_offset (x14, 16*4+8)
-	cfi_rel_offset (x15, 16*5)
-	cfi_rel_offset (x16, 16*5+8)
-	cfi_rel_offset (x17, 16*6)
-	cfi_rel_offset (x18, 16*6+8)
+	cfi_rel_offset (x29, 0)
+	cfi_rel_offset (x30, 8)
+	mov	x29, sp
+	stp	 x5,  x6, [sp, #16*1]
+	stp	 x7,  x8, [sp, #16*2]
+	stp	 x9, x10, [sp, #16*3]
+	stp	x11, x12, [sp, #16*4]
+	stp	x13, x14, [sp, #16*5]
+	stp	x15, x16, [sp, #16*6]
+	stp	x17, x18, [sp, #16*7]
+	cfi_rel_offset (x5, 16*1)
+	cfi_rel_offset (x6, 16*1+8)
+	cfi_rel_offset (x7, 16*2)
+	cfi_rel_offset (x8, 16*2+8)
+	cfi_rel_offset (x9, 16*3)
+	cfi_rel_offset (x10, 16*3+8)
+	cfi_rel_offset (x11, 16*4)
+	cfi_rel_offset (x12, 16*4+8)
+	cfi_rel_offset (x13, 16*5)
+	cfi_rel_offset (x14, 16*5+8)
+	cfi_rel_offset (x15, 16*6)
+	cfi_rel_offset (x16, 16*6+8)
+	cfi_rel_offset (x17, 16*7)
+	cfi_rel_offset (x18, 16*7+8)
 
 	SAVE_Q_REGISTERS
 
@@ -225,14 +218,18 @@  _dl_tlsdesc_dynamic:
 
 	RESTORE_Q_REGISTERS
 
-	ldp	 x7,  x8, [sp, #16*1]
-	ldp	 x9, x10, [sp, #16*2]
-	ldp	x11, x12, [sp, #16*3]
-	ldp	x13, x14, [sp, #16*4]
-	ldp	x15, x16, [sp, #16*5]
-	ldp	x17, x18, [sp, #16*6]
-	ldp	 x5,  x6, [sp], #16*NSAVEXREGPAIRS
+	ldp	 x5,  x6, [sp, #16*1]
+	ldp	 x7,  x8, [sp, #16*2]
+	ldp	 x9, x10, [sp, #16*3]
+	ldp	x11, x12, [sp, #16*4]
+	ldp	x13, x14, [sp, #16*5]
+	ldp	x15, x16, [sp, #16*6]
+	ldp	x17, x18, [sp, #16*7]
+
+	ldp	x29, x30, [sp], #16*NSAVEXREGPAIRS
 	cfi_adjust_cfa_offset (-16*NSAVEXREGPAIRS)
+	cfi_restore (x29)
+	cfi_restore (x30)
 	b	1b
 	cfi_endproc
 	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
-- 
2.11.0