Patchwork [BZ,18572,ARM] Fix lazy TLSDESC race on arm

login
register
mail settings
Submitter Szabolcs Nagy
Date Nov. 12, 2015, 11:31 a.m.
Message ID <564478A2.6020901@arm.com>
Download mbox | patch
Permalink /patch/9653/
State New
Headers show

Comments

Szabolcs Nagy - Nov. 12, 2015, 11:31 a.m.
Lazily initialized TLS descriptors have a race on ARM, so
concurrent TLS access can cause crashes with -mtls-dialect=gnu2.

See the fix and description for AArch64:
https://sourceware.org/ml/libc-alpha/2015-06/msg00496.html

Differences compared to AArch64:

- Various versions of ARM have different synchronization methods, so
   I defined a DMB() macro that selects the right memory barrier at
   compile-time (since compiler builtins are not available in asm
   and it seems glibc does not do runtime dispatch).

- the tls descriptor is (arg,f) on arm instead of (f,arg), which makes
   the "load f again with acquire mo" solution of AArch64 less
   attractive. (LDA on AArch32 only works with a 0 offset, and with
   an extra ADD the armv8 case would need more special casing with
   questionable benefits).

- the argument is different (symbol table index instead of a
   pointer to the relocation entry) so symbol lookup is different
   and there is no simple way to have common tlsdesc.c across
   different archs.

- _dl_tlsdesc_undefweak does not access the argument so it needs
   no synchronization.

The C code uses atomic_store_release for lazy initialization and
the asm uses DMB in the resolvers: every tls access will go
through an extra barrier (which must have a performance impact).

Tested on armv7-a.

2015-11-12  Szabolcs Nagy  <szabolcs.nagy@arm.com>

	[BZ #18572]
	* sysdeps/arm/sysdep.h (DMB): Define.
	* sysdeps/arm/dl-tlsdesc.h (_dl_tlsdesc_return_lazy): Declare.
	* sysdeps/arm/dl-tlsdesc.S (_dl_tlsdesc_return_lazy): Define.
	(_dl_tlsdesc_dynamic): Guarantee TLSDESC entry and argument load-load
	ordering using DMB.
	* sysdeps/arm/tlsdesc.c (_dl_tlsdesc_lazy_resolver_fixup): Use relaxed
	atomics instead of volatile and synchronize using release store.
	(_dl_tlsdesc_resolve_hold_fixup): Use relaxed atomics instead of
	volatile.
Rich Felker - Nov. 14, 2015, 5:20 p.m.
On Thu, Nov 12, 2015 at 11:31:46AM +0000, Szabolcs Nagy wrote:
> Lazily initialized TLS descriptors have a race on ARM, so
> concurrent TLS access can cause crashes with -mtls-dialect=gnu2.
> 
> See the fix and description for AArch64:
> https://sourceware.org/ml/libc-alpha/2015-06/msg00496.html
> 
> Differences compared to AArch64:
> 
> - Various versions of ARM have different synchronization methods,
>   defined a DMB() macro that selects the right memory barrier at
>   compile-time (since compiler builtins are not available in asm
>   and it seems glibc does not do runtime dispatch).
> 
> - tls descriptor is (arg,f) on arm instead of (f,arg), this makes
>   the "load f again with acquire mo" solution of AArch64 less
>   attractive. (LDA on AArch32 only works with 0 offset and with
>   an extra ADD the armv8 case would need more special casing with
>   questionable benefits).
> 
> - the argument is different (symbol table index instead of a
>   pointer to the relocation entry) so symbol lookup is different
>   and there is no simple way to have common tlsdesc.c across
>   different archs.
> 
> - _dl_tlsdesc_undefweak does not access the argument so it needs
>   no synchronization.
> 
> The C code uses atomic_store_release for lazy initialization and
> the asm uses DMB in the resolvers: every tls access will go
> through an extra barrier (which must have a performance impact).

I think the right solution is to deprecate and eventually remove lazy
tlsdesc resolvers, but we could start by having a macro that sysdep
files can define on affected archs to disable lazy resolution. Putting
a barrier in every tlsdesc access obviously makes it much worse than
non-tlsdesc GD model and defeats the whole purpose of tlsdesc, no?

Rich

Patch

diff --git a/sysdeps/arm/dl-tlsdesc.S b/sysdeps/arm/dl-tlsdesc.S
index 33a2695..1d28cae 100644
--- a/sysdeps/arm/dl-tlsdesc.S
+++ b/sysdeps/arm/dl-tlsdesc.S
@@ -39,6 +39,25 @@  _dl_tlsdesc_return:
 	cfi_endproc
 	.size	_dl_tlsdesc_return, .-_dl_tlsdesc_return
 
+	.hidden _dl_tlsdesc_return_lazy
+	.global	_dl_tlsdesc_return_lazy
+	.type	_dl_tlsdesc_return_lazy,#function
+	cfi_startproc
+	eabi_fnstart
+	.align 2
+_dl_tlsdesc_return_lazy:
+	/* This DMB synchronizes the load of td->entry ([r0,#4])
+	   at the call site with the release store to it in
+	   _dl_tlsdesc_lazy_resolver_fixup so the load from [r0]
+	   here happens after the initialization of td->argument.  */
+	DMB ()
+	sfi_breg r0, \
+	ldr	r0, [\B]
+	BX	(lr)
+	eabi_fnend
+	cfi_endproc
+	.size	_dl_tlsdesc_return_lazy, .-_dl_tlsdesc_return_lazy
+
 	.hidden _dl_tlsdesc_undefweak
 	.global	_dl_tlsdesc_undefweak
 	.type	_dl_tlsdesc_undefweak,#function
@@ -92,6 +111,11 @@  _dl_tlsdesc_dynamic:
 	cfi_rel_offset (r3,4)
 	cfi_rel_offset (r4,8)
 	cfi_rel_offset (lr,12)
+	/* This DMB synchronizes the load of td->entry ([r0,#4])
+	   at the call site with the release store to it in
+	   _dl_tlsdesc_lazy_resolver_fixup so the load from [r0]
+	   here happens after the initialization of td->argument.  */
+	DMB ()
 	sfi_breg r0, \
 	ldr	r1, [\B] /* td */
 	GET_TLS (lr)
diff --git a/sysdeps/arm/dl-tlsdesc.h b/sysdeps/arm/dl-tlsdesc.h
index e17b0bd..26fdb62 100644
--- a/sysdeps/arm/dl-tlsdesc.h
+++ b/sysdeps/arm/dl-tlsdesc.h
@@ -48,6 +48,7 @@  struct tlsdesc_dynamic_arg
 
 extern ptrdiff_t attribute_hidden
   _dl_tlsdesc_return(struct tlsdesc *),
+  _dl_tlsdesc_return_lazy(struct tlsdesc *),
   _dl_tlsdesc_undefweak(struct tlsdesc *),
   _dl_tlsdesc_resolve_hold(struct tlsdesc *),
   _dl_tlsdesc_lazy_resolver(struct tlsdesc *);
diff --git a/sysdeps/arm/sysdep.h b/sysdeps/arm/sysdep.h
index 9bbd009..c767c56 100644
--- a/sysdeps/arm/sysdep.h
+++ b/sysdeps/arm/sysdep.h
@@ -277,6 +277,15 @@ 
 	cfi_restore_state
 # endif /* ARCH_HAS_HARD_TP */
 
+/* Memory barrier for TLSDESC resolvers.  */
+#if __ARM_ARCH >= 7
+# define DMB()	dmb ish
+#elif __ARM_ARCH >= 6
+# define DMB()	mcr p15, 0, r0, c7, c10, 5
+#else
+# define DMB()	bl __sync_synchronize
+#endif
+
 # ifndef ARM_SFI_MACROS
 # define ARM_SFI_MACROS 1
 /* This assembly macro is prepended to any load/store instruction,
diff --git a/sysdeps/arm/tlsdesc.c b/sysdeps/arm/tlsdesc.c
index ca941b8..27101e6 100644
--- a/sysdeps/arm/tlsdesc.c
+++ b/sysdeps/arm/tlsdesc.c
@@ -23,6 +23,7 @@ 
 #include <dl-tlsdesc.h>
 #include <dl-unmap-segments.h>
 #include <tlsdeschtab.h>
+#include <atomic.h>
 
 /* This function is used to lazily resolve TLS_DESC REL relocations
    Besides the TLS descriptor itself, we get the module's got address
@@ -30,22 +31,26 @@ 
 
 void
 attribute_hidden
-_dl_tlsdesc_lazy_resolver_fixup (struct tlsdesc volatile *td,
-				 Elf32_Addr *got)
+_dl_tlsdesc_lazy_resolver_fixup (struct tlsdesc *td, Elf32_Addr *got)
 {
   struct link_map *l = (struct link_map *)got[1];
   lookup_t result;
-  unsigned long value;
+  unsigned long value = atomic_load_relaxed (&td->argument.value);
 
+  /* After GL(dl_load_lock) is grabbed only one caller can see td->entry in
+     initial state in _dl_tlsdesc_resolve_early_return_p, other concurrent
+     callers will return and retry calling td->entry.  The updated td->entry
+     synchronizes with the single writer so all read accesses here can use
+     relaxed order.  */
   if (_dl_tlsdesc_resolve_early_return_p
       (td, (void*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_PLT)]) + l->l_addr)))
     return;
 
-  if (td->argument.value & 0x80000000)
+  if (value & 0x80000000)
     {
       /* A global symbol, this is the symbol index.  */
       /* The code below was borrowed from _dl_fixup().  */
-      const Elf_Symndx symndx = td->argument.value ^ 0x80000000;
+      const Elf_Symndx symndx = value ^ 0x80000000;
       const ElfW(Sym) *const symtab
 	= (const void *) D_PTR (l, l_info[DT_SYMTAB]);
       const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]);
@@ -76,7 +81,9 @@  _dl_tlsdesc_lazy_resolver_fixup (struct tlsdesc volatile *td,
 	    value = sym->st_value;
 	  else
 	    {
-	      td->entry = _dl_tlsdesc_undefweak;
+	      /* _dl_tlsdesc_undefweak does not access td->argument
+		 so this store can be relaxed.  */
+	      atomic_store_relaxed (&td->entry, _dl_tlsdesc_undefweak);
 	      goto done;
 	    }
 	}
@@ -92,7 +99,6 @@  _dl_tlsdesc_lazy_resolver_fixup (struct tlsdesc volatile *td,
     {
       /* A local symbol, this is the offset within our tls section.
 	 */
-      value = td->argument.value;
       result = l;
     }
 
@@ -101,14 +107,19 @@  _dl_tlsdesc_lazy_resolver_fixup (struct tlsdesc volatile *td,
 #else
   if (!TRY_STATIC_TLS (l, result))
     {
-      td->argument.pointer = _dl_make_tlsdesc_dynamic (result, value);
-      td->entry = _dl_tlsdesc_dynamic;
+      void *p = _dl_make_tlsdesc_dynamic (result, value);
+      atomic_store_relaxed (&td->argument.pointer, p);
+      /* This release store synchronizes with the DMB in
+	 _dl_tlsdesc_dynamic (relying on the arm memory model).  */
+      atomic_store_release (&td->entry, _dl_tlsdesc_dynamic);
     }
   else
 #endif
     {
-      td->argument.value = value + result->l_tls_offset;
-      td->entry = _dl_tlsdesc_return;
+      atomic_store_relaxed (&td->argument.value, value + result->l_tls_offset);
+      /* This release store synchronizes with the DMB in
+	 _dl_tlsdesc_return_lazy (relying on the arm memory model).  */
+      atomic_store_release (&td->entry, _dl_tlsdesc_return_lazy);
     }
 
  done:
@@ -123,11 +134,10 @@  _dl_tlsdesc_lazy_resolver_fixup (struct tlsdesc volatile *td,
 
 void
 attribute_hidden
-_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc volatile *td,
-				void *caller)
+_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc *td, void *caller)
 {
   /* Maybe we're lucky and can return early.  */
-  if (caller != td->entry)
+  if (caller != atomic_load_relaxed (&td->entry))
     return;
 
   /* Locking here will stop execution until the running resolver runs