diff mbox

[BZ,18034,AArch64] Lazy TLSDESC relocation data race fix

Message ID 556C3301.1070007@arm.com
State Superseded
Headers show

Commit Message

Szabolcs Nagy June 1, 2015, 10:25 a.m. UTC
On 27/05/15 14:02, Torvald Riegel wrote:
> On Wed, 2015-05-27 at 12:27 +0100, Szabolcs Nagy wrote:
>> On 26/05/15 21:37, Torvald Riegel wrote:
>>> This should have relaxed atomic accesses for all things concurrently
>>> accessed.  As a result, you can drop the volatile qualification I

removed volatile from aarch64 tlsdesc.c, but kept it in
elf/tlsdeschtab.h for now.

>>> You should also document why the relaxed MO load in
>>> _dl_tlsdesc_resolve_early_return_p is sufficient (see the other part of

i added a comment to the _dl_tlsdesc_resolve_early_return_p
call in aarch64 tlsdesc.c about the retry loop.

attached the updated patch:

- The c code now follows the glibc concurrency guidelines
(using relaxed atomics instead of volatile and release store
instead of barrier on the write side).
- Fixed the atomics in elf/tlsdeschtab.h too.
- Updated the comments.

the write is side now compiled to: (stlr has store-release semantics)

 158:   91002261        add     x1, x19, #0x8
 15c:   f9000020        str     x0, [x1]
 160:   90000000        adrp    x0, 0 <_dl_tlsdesc_return_lazy>
                        160: R_AARCH64_ADR_PREL_PG_HI21 _dl_tlsdesc_return_lazy
 164:   91000000        add     x0, x0, #0x0
                        164: R_AARCH64_ADD_ABS_LO12_NC  _dl_tlsdesc_return_lazy
 168:   c89ffe60        stlr    x0, [x19]

the read side is: (ldar has load-acquire semantics)

0000000000000008 <_dl_tlsdesc_return_lazy>:
   8:   c8dffc1f        ldar    xzr, [x0]
   c:   f9400400        ldr     x0, [x0,#8]
  10:   d65f03c0        ret

Changelog:

2015-06-01  Szabolcs Nagy  <szabolcs.nagy@arm.com>

	[BZ #18034]
	* sysdeps/aarch64/dl-tlsdesc.h (_dl_tlsdesc_return_lazy): Declare.
	* sysdeps/aarch64/dl-tlsdesc.S (_dl_tlsdesc_return_lazy): Define.
	(_dl_tlsdesc_undefweak): Guarantee TLSDESC entry and argument load-load
	ordering using ldar.
	(_dl_tlsdesc_dynamic): Likewise.
	(_dl_tlsdesc_return_lazy): Likewise.
	* sysdeps/aarch64/tlsdesc.c (_dl_tlsdesc_resolve_rela_fixup): Use
	relaxed atomics instead of volatile and synchronize with release store.
	(_dl_tlsdesc_resolve_hold_fixup): Use relaxed atomics instead of
	volatile.
	* elf/tlsdeschtab.h (_dl_tlsdesc_resolve_early_return_p): Likewise.

Comments

Torvald Riegel June 3, 2015, 10:40 a.m. UTC | #1
On Mon, 2015-06-01 at 11:25 +0100, Szabolcs Nagy wrote:
> i added a comment to the _dl_tlsdesc_resolve_early_return_p
> call in aarch64 tlsdesc.c about the retry loop.

That's good, but it would have been better if you could have briefly
pointed out that this relates to mo_relaxed loads in
_dl_tlsdesc_resolve_early_return_p.  And/or added a comment there saying
that the mo_relaxed loads are fine because of this retry loop in the
caller(s).

> diff --git a/sysdeps/aarch64/tlsdesc.c b/sysdeps/aarch64/tlsdesc.c
> index 4821f8c..d2f1cd5 100644
> --- a/sysdeps/aarch64/tlsdesc.c
> +++ b/sysdeps/aarch64/tlsdesc.c
> @@ -39,11 +40,12 @@
>  
>  void
>  attribute_hidden
> -_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td,
> -                               struct link_map *l)
> +_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc *td, struct link_map
> *l)
>  {
> -  const ElfW(Rela) *reloc = td->arg;
> +  const ElfW(Rela) *reloc = atomic_load_relaxed (&td->arg);

Good change.  Can you add a brief comment saying why the mo_relaxed load
is sufficient?  IIRC, this is because of the acquire loads done by the
caller.

OK with those changes.

Thanks!
diff mbox

Patch

diff --git a/elf/tlsdeschtab.h b/elf/tlsdeschtab.h
index d13b4e5..fb0eb88 100644
--- a/elf/tlsdeschtab.h
+++ b/elf/tlsdeschtab.h
@@ -20,6 +20,8 @@ 
 #ifndef TLSDESCHTAB_H
 # define TLSDESCHTAB_H 1
 
+#include <atomic.h>
+
 # ifdef SHARED
 
 #  include <inline-hashtab.h>
@@ -138,17 +140,17 @@  _dl_make_tlsdesc_dynamic (struct link_map *map, size_t ti_offset)
 static int
 _dl_tlsdesc_resolve_early_return_p (struct tlsdesc volatile *td, void *caller)
 {
-  if (caller != td->entry)
+  if (caller != atomic_load_relaxed (&td->entry))
     return 1;
 
   __rtld_lock_lock_recursive (GL(dl_load_lock));
-  if (caller != td->entry)
+  if (caller != atomic_load_relaxed (&td->entry))
     {
       __rtld_lock_unlock_recursive (GL(dl_load_lock));
       return 1;
     }
 
-  td->entry = _dl_tlsdesc_resolve_hold;
+  atomic_store_relaxed (&td->entry, _dl_tlsdesc_resolve_hold);
 
   return 0;
 }
diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index be9b9b3..c7adf79 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -79,6 +79,29 @@  _dl_tlsdesc_return:
 	cfi_endproc
 	.size	_dl_tlsdesc_return, .-_dl_tlsdesc_return
 
+	/* Same as _dl_tlsdesc_return but with synchronization for
+	   lazy relocation.
+	   Prototype:
+	   _dl_tlsdesc_return_lazy (tlsdesc *) ;
+	 */
+	.hidden _dl_tlsdesc_return_lazy
+	.global	_dl_tlsdesc_return_lazy
+	.type	_dl_tlsdesc_return_lazy,%function
+	cfi_startproc
+	.align 2
+_dl_tlsdesc_return_lazy:
+	/* The ldar here happens after the load from [x0] at the call site
+	   (that is generated by the compiler as part of the TLS access ABI),
+	   so it reads the same value (this function is the final value of
+	   td->entry) and thus it synchronizes with the release store to
+	   td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
+	   from [x0,#8] here happens after the initialization of td->arg.  */
+	ldar	xzr, [x0]
+	ldr	x0, [x0, #8]
+	RET
+	cfi_endproc
+	.size	_dl_tlsdesc_return_lazy, .-_dl_tlsdesc_return_lazy
+
 	/* Handler for undefined weak TLS symbols.
 	   Prototype:
 	   _dl_tlsdesc_undefweak (tlsdesc *);
@@ -96,6 +119,13 @@  _dl_tlsdesc_return:
 _dl_tlsdesc_undefweak:
 	str	x1, [sp, #-16]!
 	cfi_adjust_cfa_offset(16)
+	/* The ldar here happens after the load from [x0] at the call site
+	   (that is generated by the compiler as part of the TLS access ABI),
+	   so it reads the same value (this function is the final value of
+	   td->entry) and thus it synchronizes with the release store to
+	   td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
+	   from [x0,#8] here happens after the initialization of td->arg.  */
+	ldar	xzr, [x0]
 	ldr	x0, [x0, #8]
 	mrs	x1, tpidr_el0
 	sub	x0, x0, x1
@@ -152,6 +182,13 @@  _dl_tlsdesc_dynamic:
 	stp	x3,  x4, [sp, #32+16*1]
 
 	mrs	x4, tpidr_el0
+	/* The ldar here happens after the load from [x0] at the call site
+	   (that is generated by the compiler as part of the TLS access ABI),
+	   so it reads the same value (this function is the final value of
+	   td->entry) and thus it synchronizes with the release store to
+	   td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
+	   from [x0,#8] here happens after the initialization of td->arg.  */
+	ldar	xzr, [x0]
 	ldr	x1, [x0,#8]
 	ldr	x0, [x4]
 	ldr	x3, [x1,#16]
diff --git a/sysdeps/aarch64/dl-tlsdesc.h b/sysdeps/aarch64/dl-tlsdesc.h
index 7a1285e..e6c0078 100644
--- a/sysdeps/aarch64/dl-tlsdesc.h
+++ b/sysdeps/aarch64/dl-tlsdesc.h
@@ -46,6 +46,9 @@  extern ptrdiff_t attribute_hidden
 _dl_tlsdesc_return (struct tlsdesc *);
 
 extern ptrdiff_t attribute_hidden
+_dl_tlsdesc_return_lazy (struct tlsdesc *);
+
+extern ptrdiff_t attribute_hidden
 _dl_tlsdesc_undefweak (struct tlsdesc *);
 
 extern ptrdiff_t attribute_hidden
diff --git a/sysdeps/aarch64/tlsdesc.c b/sysdeps/aarch64/tlsdesc.c
index 4821f8c..d2f1cd5 100644
--- a/sysdeps/aarch64/tlsdesc.c
+++ b/sysdeps/aarch64/tlsdesc.c
@@ -25,6 +25,7 @@ 
 #include <dl-tlsdesc.h>
 #include <dl-unmap-segments.h>
 #include <tlsdeschtab.h>
+#include <atomic.h>
 
 /* The following functions take an entry_check_offset argument.  It's
    computed by the caller as an offset between its entry point and the
@@ -39,11 +40,12 @@ 
 
 void
 attribute_hidden
-_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td,
-				struct link_map *l)
+_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc *td, struct link_map *l)
 {
-  const ElfW(Rela) *reloc = td->arg;
+  const ElfW(Rela) *reloc = atomic_load_relaxed (&td->arg);
 
+  /* Return and thus retry calling td->entry if it changes before
+     GL(dl_load_lock) is grabbed.  */
   if (_dl_tlsdesc_resolve_early_return_p
       (td, (void*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_PLT)]) + l->l_addr)))
     return;
@@ -86,8 +88,10 @@  _dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td,
 
   if (!sym)
     {
-      td->arg = (void*) reloc->r_addend;
-      td->entry = _dl_tlsdesc_undefweak;
+      atomic_store_relaxed (&td->arg, (void *) reloc->r_addend);
+      /* This release store synchronizes with the ldar acquire load
+	 instruction in _dl_tlsdesc_undefweak.  */
+      atomic_store_release (&td->entry, _dl_tlsdesc_undefweak);
     }
   else
     {
@@ -96,16 +100,22 @@  _dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td,
 #  else
       if (!TRY_STATIC_TLS (l, result))
 	{
-	  td->arg = _dl_make_tlsdesc_dynamic (result, sym->st_value
+	  void *p = _dl_make_tlsdesc_dynamic (result, sym->st_value
 					      + reloc->r_addend);
-	  td->entry = _dl_tlsdesc_dynamic;
+	  atomic_store_relaxed (&td->arg, p);
+	  /* This release store synchronizes with the ldar acquire load
+	     instruction in _dl_tlsdesc_dynamic.  */
+	  atomic_store_release (&td->entry, _dl_tlsdesc_dynamic);
 	}
       else
 #  endif
 	{
-	  td->arg = (void*) (sym->st_value + result->l_tls_offset
+	  void *p = (void*) (sym->st_value + result->l_tls_offset
 			     + reloc->r_addend);
-	  td->entry = _dl_tlsdesc_return;
+	  atomic_store_relaxed (&td->arg, p);
+	  /* This release store synchronizes with the ldar acquire load
+	     instruction in _dl_tlsdesc_return_lazy.  */
+	  atomic_store_release (&td->entry, _dl_tlsdesc_return_lazy);
 	}
     }
 
@@ -120,11 +130,10 @@  _dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td,
 
 void
 attribute_hidden
-_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc volatile *td,
-				void *caller)
+_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc *td, void *caller)
 {
   /* Maybe we're lucky and can return early.  */
-  if (caller != td->entry)
+  if (caller != atomic_load_relaxed (&td->entry))
     return;
 
   /* Locking here will stop execution until the running resolver runs