Patchwork [2/2] aarch64: Remove barriers from TLS descriptor functions

login
register
mail settings
Submitter Szabolcs Nagy
Date Oct. 6, 2017, 2:24 p.m.
Message ID <59D7920A.6050205@arm.com>
Download mbox | patch
Permalink /patch/23379/
State New
Headers show

Comments

Szabolcs Nagy - Oct. 6, 2017, 2:24 p.m.

Patch

From 8a98ef1313d620a0c3fbda4dffcb9df9e47adba0 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <szabolcs.nagy@arm.com>
Date: Wed, 27 Sep 2017 18:14:21 +0100
Subject: [PATCH 2/2] aarch64: Remove barriers from TLS descriptor functions

Remove the ldar synchronization and most of the code related to lazy
TLSDESC initialization.

2017-09-29  Szabolcs Nagy  <szabolcs.nagy@arm.com>

	* elf/tlsdeschtab.h (_dl_tls_resolve_early_return_p): Mark unused.
	(_dl_tlsdesc_wake_up_held_fixups): Likewise.
	* sysdeps/aarch64/dl-machine.h (elf_machine_runtime_setup): Remove
	DT_TLSDESC_GOT initialization.
	* sysdeps/aarch64/dl-tlsdesc.S (_dl_tlsdesc_return_lazy): Remove.
	(_dl_tlsdesc_resolve_rela): Likewise.
	(_dl_tlsdesc_resolve_hold): Likewise.
	(_dl_tlsdesc_undefweak): Remove ldar.
	(_dl_tlsdesc_dynamic): Likewise.
	* sysdeps/aarch64/dl-tlsdesc.h (_dl_tlsdesc_return_lazy): Remove.
	(_dl_tlsdesc_resolve_rela): Likewise.
	(_dl_tlsdesc_resolve_hold): Likewise.
	* sysdeps/aarch64/tlsdesc.c (_dl_tlsdesc_resolve_rela_fixup): Remove.
	(_dl_tlsdesc_resolve_hold_fixup): Likewise.
	(_dl_tlsdesc_resolve_hold): Define as 0 before including
	tlsdeschtab.h.
	Remove the link.h, elf/dynamic-link.h and atomic.h includes.
---
 elf/tlsdeschtab.h            |   2 +
 sysdeps/aarch64/dl-machine.h |   4 -
 sysdeps/aarch64/dl-tlsdesc.S | 203 -------------------------------------------
 sysdeps/aarch64/dl-tlsdesc.h |   9 --
 sysdeps/aarch64/tlsdesc.c    | 127 +--------------------------
 5 files changed, 3 insertions(+), 342 deletions(-)

diff --git a/elf/tlsdeschtab.h b/elf/tlsdeschtab.h
index ad3001dac5..879631897c 100644
--- a/elf/tlsdeschtab.h
+++ b/elf/tlsdeschtab.h
@@ -137,6 +137,7 @@  _dl_make_tlsdesc_dynamic (struct link_map *map, size_t ti_offset)
    avoid introducing such dependencies.  */
 
 static int
+__attribute__ ((unused))
 _dl_tlsdesc_resolve_early_return_p (struct tlsdesc volatile *td, void *caller)
 {
   if (caller != atomic_load_relaxed (&td->entry))
@@ -155,6 +156,7 @@  _dl_tlsdesc_resolve_early_return_p (struct tlsdesc volatile *td, void *caller)
 }
 
 static void
+__attribute__ ((unused))
 _dl_tlsdesc_wake_up_held_fixups (void)
 {
   __rtld_lock_unlock_recursive (GL(dl_load_lock));
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 9bd48752e5..89d0fe87db 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -131,10 +131,6 @@  elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	}
     }
 
-  if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy)
-    *(ElfW(Addr)*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_GOT)]) + l->l_addr)
-      = (ElfW(Addr)) &_dl_tlsdesc_resolve_rela;
-
   return lazy;
 }
 
diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index 3b2da62f39..70550c7ce0 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -80,30 +80,6 @@  _dl_tlsdesc_return:
 	cfi_endproc
 	.size	_dl_tlsdesc_return, .-_dl_tlsdesc_return
 
-	/* Same as _dl_tlsdesc_return but with synchronization for
-	   lazy relocation.
-	   Prototype:
-	   _dl_tlsdesc_return_lazy (tlsdesc *) ;
-	 */
-	.hidden _dl_tlsdesc_return_lazy
-	.global	_dl_tlsdesc_return_lazy
-	.type	_dl_tlsdesc_return_lazy,%function
-	cfi_startproc
-	.align 2
-_dl_tlsdesc_return_lazy:
-	/* The ldar here happens after the load from [x0] at the call site
-	   (that is generated by the compiler as part of the TLS access ABI),
-	   so it reads the same value (this function is the final value of
-	   td->entry) and thus it synchronizes with the release store to
-	   td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
-	   from [x0,#PTR_SIZE] here happens after the initialization of td->arg. */
-	DELOUSE (0)
-	ldar	PTR_REG (zr), [x0]
-	ldr	PTR_REG (0), [x0, #PTR_SIZE]
-	RET
-	cfi_endproc
-	.size	_dl_tlsdesc_return_lazy, .-_dl_tlsdesc_return_lazy
-
 	/* Handler for undefined weak TLS symbols.
 	   Prototype:
 	   _dl_tlsdesc_undefweak (tlsdesc *);
@@ -121,14 +97,7 @@  _dl_tlsdesc_return_lazy:
 _dl_tlsdesc_undefweak:
 	str	x1, [sp, #-16]!
 	cfi_adjust_cfa_offset (16)
-	/* The ldar here happens after the load from [x0] at the call site
-	   (that is generated by the compiler as part of the TLS access ABI),
-	   so it reads the same value (this function is the final value of
-	   td->entry) and thus it synchronizes with the release store to
-	   td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
-	   from [x0,#8] here happens after the initialization of td->arg.  */
 	DELOUSE (0)
-	ldar	PTR_REG (zr), [x0]
 	ldr	PTR_REG (0), [x0, #PTR_SIZE]
 	mrs	x1, tpidr_el0
 	sub	PTR_REG (0), PTR_REG (0), PTR_REG (1)
@@ -192,13 +161,6 @@  _dl_tlsdesc_dynamic:
 	cfi_rel_offset (x4, 32+24)
 
 	mrs	x4, tpidr_el0
-	/* The ldar here happens after the load from [x0] at the call site
-	   (that is generated by the compiler as part of the TLS access ABI),
-	   so it reads the same value (this function is the final value of
-	   td->entry) and thus it synchronizes with the release store to
-	   td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
-	   from [x0,#PTR_SIZE] here happens after the initialization of td->arg.  */
-	ldar	PTR_REG (zr), [x0]
 	ldr	PTR_REG (1), [x0,#TLSDESC_ARG]
 	ldr	PTR_REG (0), [x4,#TCBHEAD_DTV]
 	ldr	PTR_REG (3), [x1,#TLSDESC_GEN_COUNT]
@@ -276,168 +238,3 @@  _dl_tlsdesc_dynamic:
 	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
 # undef NSAVEXREGPAIRS
 #endif
-
-	/* This function is a wrapper for a lazy resolver for TLS_DESC
-	   RELA relocations.
-	   When the actual resolver returns, it will have adjusted the
-	   TLS descriptor such that we can tail-call it for it to return
-	   the TP offset of the symbol.  */
-
-	.hidden _dl_tlsdesc_resolve_rela
-	.global	_dl_tlsdesc_resolve_rela
-	.type	_dl_tlsdesc_resolve_rela,%function
-	cfi_startproc
-	.align 2
-_dl_tlsdesc_resolve_rela:
-#define	NSAVEXREGPAIRS 9
-	/* The tlsdesc PLT entry pushes x2 and x3 to the stack.  */
-	cfi_adjust_cfa_offset (16)
-	cfi_rel_offset (x2, 0)
-	cfi_rel_offset (x3, 8)
-	stp	x29, x30, [sp, #-(32+16*NSAVEXREGPAIRS)]!
-	cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS)
-	cfi_rel_offset (x29, 0)
-	cfi_rel_offset (x30, 8)
-	mov	x29, sp
-	stp	 x1,  x4, [sp, #32+16*0]
-	stp	 x5,  x6, [sp, #32+16*1]
-	stp	 x7,  x8, [sp, #32+16*2]
-	stp	 x9, x10, [sp, #32+16*3]
-	stp	x11, x12, [sp, #32+16*4]
-	stp	x13, x14, [sp, #32+16*5]
-	stp	x15, x16, [sp, #32+16*6]
-	stp	x17, x18, [sp, #32+16*7]
-	str	x0,       [sp, #32+16*8]
-	cfi_rel_offset (x1, 32)
-	cfi_rel_offset (x4, 32+8)
-	cfi_rel_offset (x5, 32+16)
-	cfi_rel_offset (x6, 32+16+8)
-	cfi_rel_offset (x7, 32+16*2)
-	cfi_rel_offset (x8, 32+16*2+8)
-	cfi_rel_offset (x9, 32+16*3)
-	cfi_rel_offset (x10, 32+16*3+8)
-	cfi_rel_offset (x11, 32+16*4)
-	cfi_rel_offset (x12, 32+16*4+8)
-	cfi_rel_offset (x13, 32+16*5)
-	cfi_rel_offset (x14, 32+16*5+8)
-	cfi_rel_offset (x15, 32+16*6)
-	cfi_rel_offset (x16, 32+16*6+8)
-	cfi_rel_offset (x17, 32+16*7)
-	cfi_rel_offset (x18, 32+16*7+8)
-	cfi_rel_offset (x0, 32+16*8)
-
-	SAVE_Q_REGISTERS
-
-	DELOUSE (3)
-	ldr	PTR_REG (1), [x3, #PTR_SIZE]
-	bl	_dl_tlsdesc_resolve_rela_fixup
-
-	RESTORE_Q_REGISTERS
-
-	ldr	x0, [sp, #32+16*8]
-	DELOUSE (0)
-	ldr	PTR_REG (1), [x0]
-	blr	x1
-
-	ldp	 x1,  x4, [sp, #32+16*0]
-	ldp	 x5,  x6, [sp, #32+16*1]
-	ldp	 x7,  x8, [sp, #32+16*2]
-	ldp	 x9, x10, [sp, #32+16*3]
-	ldp	x11, x12, [sp, #32+16*4]
-	ldp	x13, x14, [sp, #32+16*5]
-	ldp	x15, x16, [sp, #32+16*6]
-	ldp	x17, x18, [sp, #32+16*7]
-	ldp	x29, x30, [sp], #(32+16*NSAVEXREGPAIRS)
-	cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS)
-	cfi_restore (x29)
-	cfi_restore (x30)
-	ldp	x2, x3, [sp], #16
-	cfi_adjust_cfa_offset (-16)
-	RET
-#undef NSAVEXREGPAIRS
-	cfi_endproc
-	.size	_dl_tlsdesc_resolve_rela, .-_dl_tlsdesc_resolve_rela
-
-	/* This function is a placeholder for lazy resolving of TLS
-	relocations.  Once some thread starts resolving a TLS
-	relocation, it sets up the TLS descriptor to use this
-	resolver, such that other threads that would attempt to
-	resolve it concurrently may skip the call to the original lazy
-	resolver and go straight to a condition wait.
-
-	When the actual resolver returns, it will have adjusted the
-	TLS descriptor such that we can tail-call it for it to return
-	the TP offset of the symbol.  */
-
-	.hidden _dl_tlsdesc_resolve_hold
-	.global	_dl_tlsdesc_resolve_hold
-	.type	_dl_tlsdesc_resolve_hold,%function
-	cfi_startproc
-	.align 2
-_dl_tlsdesc_resolve_hold:
-#define	NSAVEXREGPAIRS 10
-1:
-	stp	x29, x30, [sp, #-(32+16*NSAVEXREGPAIRS)]!
-	cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS)
-	cfi_rel_offset (x29, 0)
-	cfi_rel_offset (x30, 8)
-	mov	x29, sp
-	stp	 x1,  x2, [sp, #32+16*0]
-	stp	 x3,  x4, [sp, #32+16*1]
-	stp	 x5,  x6, [sp, #32+16*2]
-	stp	 x7,  x8, [sp, #32+16*3]
-	stp	 x9, x10, [sp, #32+16*4]
-	stp	x11, x12, [sp, #32+16*5]
-	stp	x13, x14, [sp, #32+16*6]
-	stp	x15, x16, [sp, #32+16*7]
-	stp	x17, x18, [sp, #32+16*8]
-	str	x0,       [sp, #32+16*9]
-	cfi_rel_offset (x1, 32)
-	cfi_rel_offset (x2, 32+8)
-	cfi_rel_offset (x3, 32+16)
-	cfi_rel_offset (x4, 32+16+8)
-	cfi_rel_offset (x5, 32+16*2)
-	cfi_rel_offset (x6, 32+16*2+8)
-	cfi_rel_offset (x7, 32+16*3)
-	cfi_rel_offset (x8, 32+16*3+8)
-	cfi_rel_offset (x9, 32+16*4)
-	cfi_rel_offset (x10, 32+16*4+8)
-	cfi_rel_offset (x11, 32+16*5)
-	cfi_rel_offset (x12, 32+16*5+8)
-	cfi_rel_offset (x13, 32+16*6)
-	cfi_rel_offset (x14, 32+16*6+8)
-	cfi_rel_offset (x15, 32+16*7)
-	cfi_rel_offset (x16, 32+16*7+8)
-	cfi_rel_offset (x17, 32+16*8)
-	cfi_rel_offset (x18, 32+16*8+8)
-	cfi_rel_offset (x0, 32+16*9)
-
-	SAVE_Q_REGISTERS
-
-	adr	x1, 1b
-	bl	_dl_tlsdesc_resolve_hold_fixup
-
-	RESTORE_Q_REGISTERS
-
-	ldr	x0, [sp, #32+16*9]
-	DELOUSE (0)
-	ldr	PTR_REG (1), [x0]
-	blr	x1
-
-	ldp	 x1,  x2, [sp, #32+16*0]
-	ldp	 x3,  x4, [sp, #32+16*1]
-	ldp	 x5,  x6, [sp, #32+16*2]
-	ldp	 x7,  x8, [sp, #32+16*3]
-	ldp	 x9, x10, [sp, #32+16*4]
-	ldp	x11, x12, [sp, #32+16*5]
-	ldp	x13, x14, [sp, #32+16*6]
-	ldp	x15, x16, [sp, #32+16*7]
-	ldp	x17, x18, [sp, #32+16*8]
-	ldp	x29, x30, [sp], #(32+16*NSAVEXREGPAIRS)
-	cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS)
-	cfi_restore (x29)
-	cfi_restore (x30)
-	RET
-	cfi_endproc
-	.size	_dl_tlsdesc_resolve_hold, .-_dl_tlsdesc_resolve_hold
-#undef NSAVEXREGPAIRS
diff --git a/sysdeps/aarch64/dl-tlsdesc.h b/sysdeps/aarch64/dl-tlsdesc.h
index 66ec0dea7f..25b5b64e6e 100644
--- a/sysdeps/aarch64/dl-tlsdesc.h
+++ b/sysdeps/aarch64/dl-tlsdesc.h
@@ -46,17 +46,8 @@  extern ptrdiff_t attribute_hidden
 _dl_tlsdesc_return (struct tlsdesc *);
 
 extern ptrdiff_t attribute_hidden
-_dl_tlsdesc_return_lazy (struct tlsdesc *);
-
-extern ptrdiff_t attribute_hidden
 _dl_tlsdesc_undefweak (struct tlsdesc *);
 
-extern ptrdiff_t attribute_hidden
-_dl_tlsdesc_resolve_rela (struct tlsdesc *);
-
-extern ptrdiff_t attribute_hidden
-_dl_tlsdesc_resolve_hold (struct tlsdesc *);
-
 # ifdef SHARED
 extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t);
 
diff --git a/sysdeps/aarch64/tlsdesc.c b/sysdeps/aarch64/tlsdesc.c
index 4c8a5a81d1..bd1356fa8e 100644
--- a/sysdeps/aarch64/tlsdesc.c
+++ b/sysdeps/aarch64/tlsdesc.c
@@ -18,137 +18,12 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <link.h>
 #include <ldsodefs.h>
-#include <elf/dynamic-link.h>
 #include <tls.h>
 #include <dl-tlsdesc.h>
 #include <dl-unmap-segments.h>
+#define _dl_tlsdesc_resolve_hold 0
 #include <tlsdeschtab.h>
-#include <atomic.h>
-
-/* The following functions take an entry_check_offset argument.  It's
-   computed by the caller as an offset between its entry point and the
-   call site, such that by adding the built-in return address that is
-   implicitly passed to the function with this offset, we can easily
-   obtain the caller's entry point to compare with the entry point
-   given in the TLS descriptor.  If it's changed, we want to return
-   immediately.  */
-
-/* This function is used to lazily resolve TLS_DESC RELA relocations.
-   The argument location is used to hold a pointer to the relocation.  */
-
-void
-attribute_hidden
-_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc *td, struct link_map *l)
-{
-  const ElfW(Rela) *reloc = atomic_load_relaxed (&td->arg);
-
-  /* After GL(dl_load_lock) is grabbed only one caller can see td->entry in
-     initial state in _dl_tlsdesc_resolve_early_return_p, other concurrent
-     callers will return and retry calling td->entry.  The updated td->entry
-     synchronizes with the single writer so all read accesses here can use
-     relaxed order.  */
-  if (_dl_tlsdesc_resolve_early_return_p
-      (td, (void*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_PLT)]) + l->l_addr)))
-    return;
-
-  /* The code below was borrowed from _dl_fixup(),
-     except for checking for STB_LOCAL.  */
-  const ElfW(Sym) *const symtab
-    = (const void *) D_PTR (l, l_info[DT_SYMTAB]);
-  const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]);
-  const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)];
-  lookup_t result;
-
-   /* Look up the target symbol.  If the normal lookup rules are not
-      used don't look in the global scope.  */
-  if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL
-      && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0)
-    {
-      const struct r_found_version *version = NULL;
-
-      if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
-	{
-	  const ElfW(Half) *vernum =
-	    (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]);
-	  ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff;
-	  version = &l->l_versions[ndx];
-	  if (version->hash == 0)
-	    version = NULL;
-	}
-
-      result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym,
-				    l->l_scope, version, ELF_RTYPE_CLASS_PLT,
-				    DL_LOOKUP_ADD_DEPENDENCY, NULL);
-    }
-  else
-    {
-      /* We already found the symbol.  The module (and therefore its load
-	 address) is also known.  */
-      result = l;
-    }
-
-  if (!sym)
-    {
-      atomic_store_relaxed (&td->arg, (void *) reloc->r_addend);
-      /* This release store synchronizes with the ldar acquire load
-	 instruction in _dl_tlsdesc_undefweak.  */
-      atomic_store_release (&td->entry, _dl_tlsdesc_undefweak);
-    }
-  else
-    {
-#  ifndef SHARED
-      CHECK_STATIC_TLS (l, result);
-#  else
-      if (!TRY_STATIC_TLS (l, result))
-	{
-	  void *p = _dl_make_tlsdesc_dynamic (result, sym->st_value
-					      + reloc->r_addend);
-	  atomic_store_relaxed (&td->arg, p);
-	  /* This release store synchronizes with the ldar acquire load
-	     instruction in _dl_tlsdesc_dynamic.  */
-	  atomic_store_release (&td->entry, _dl_tlsdesc_dynamic);
-	}
-      else
-#  endif
-	{
-	  void *p = (void*) (sym->st_value + result->l_tls_offset
-			     + reloc->r_addend);
-	  atomic_store_relaxed (&td->arg, p);
-	  /* This release store synchronizes with the ldar acquire load
-	     instruction in _dl_tlsdesc_return_lazy.  */
-	  atomic_store_release (&td->entry, _dl_tlsdesc_return_lazy);
-	}
-    }
-
-  _dl_tlsdesc_wake_up_held_fixups ();
-}
-
-/* This function is used to avoid busy waiting for other threads to
-   complete the lazy relocation.  Once another thread wins the race to
-   relocate a TLS descriptor, it sets the descriptor up such that this
-   function is called to wait until the resolver releases the
-   lock.  */
-
-void
-attribute_hidden
-_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc *td, void *caller)
-{
-  /* Maybe we're lucky and can return early.  */
-  if (caller != atomic_load_relaxed (&td->entry))
-    return;
-
-  /* Locking here will stop execution until the running resolver runs
-     _dl_tlsdesc_wake_up_held_fixups(), releasing the lock.
-
-     FIXME: We'd be better off waiting on a condition variable, such
-     that we didn't have to hold the lock throughout the relocation
-     processing.  */
-  __rtld_lock_lock_recursive (GL(dl_load_lock));
-  __rtld_lock_unlock_recursive (GL(dl_load_lock));
-}
-
 
 /* Unmap the dynamic object, but also release its TLS descriptor table
    if there is one.  */
-- 
2.11.0