[v2,06/14] elf: Use relaxed atomics for racy accesses [BZ #19329]

Message ID 10fb15a36b3f6bc3e5ca62cda081c86512f47d32.1618301209.git.szabolcs.nagy@arm.com
State Committed
Delegated to: Adhemerval Zanella Netto
Headers
Series Dynamic TLS related data race fixes |

Commit Message

Szabolcs Nagy April 13, 2021, 8:19 a.m. UTC
  This is a follow up patch to the fix for bug 19329.  This adds
relaxed MO atomics to accesses that are racy, but relaxed MO is
enough.

--
v2:
- handle x86_64 dl-tls.c too
---
 elf/dl-close.c          | 20 +++++++++++++-------
 elf/dl-open.c           |  5 ++++-
 elf/dl-tls.c            | 31 +++++++++++++++++++++++--------
 sysdeps/x86_64/dl-tls.c |  3 ++-
 4 files changed, 42 insertions(+), 17 deletions(-)
  

Comments

Adhemerval Zanella April 15, 2021, 6:21 p.m. UTC | #1
On 13/04/2021 05:19, Szabolcs Nagy via Libc-alpha wrote:
> This is a follow up patch to the fix for bug 19329.  This adds
> relaxed MO atomics to accesses that are racy, but relaxed MO is
> enough.

Could you expand a bit on why relaxed MO should suffice?

Patch looks good, just a small request below.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> 
> --
> v2:
> - handle x86_64 dl-tls.c too
> ---
>  elf/dl-close.c          | 20 +++++++++++++-------
>  elf/dl-open.c           |  5 ++++-
>  elf/dl-tls.c            | 31 +++++++++++++++++++++++--------
>  sysdeps/x86_64/dl-tls.c |  3 ++-
>  4 files changed, 42 insertions(+), 17 deletions(-)
> 
> diff --git a/elf/dl-close.c b/elf/dl-close.c
> index c51becd06b..3720e47dd1 100644
> --- a/elf/dl-close.c
> +++ b/elf/dl-close.c
> @@ -79,9 +79,10 @@ remove_slotinfo (size_t idx, struct dtv_slotinfo_list *listp, size_t disp,
>  	{
>  	  assert (old_map->l_tls_modid == idx);
>  
> -	  /* Mark the entry as unused. */
> -	  listp->slotinfo[idx - disp].gen = GL(dl_tls_generation) + 1;
> -	  listp->slotinfo[idx - disp].map = NULL;
> +	  /* Mark the entry as unused.  These can be read concurrently.  */
> +	  atomic_store_relaxed (&listp->slotinfo[idx - disp].gen,
> +				GL(dl_tls_generation) + 1);
> +	  atomic_store_relaxed (&listp->slotinfo[idx - disp].map, NULL);
>  	}
>  
>        /* If this is not the last currently used entry no need to look

Ok.

> @@ -96,8 +97,8 @@ remove_slotinfo (size_t idx, struct dtv_slotinfo_list *listp, size_t disp,
>  
>        if (listp->slotinfo[idx - disp].map != NULL)
>  	{
> -	  /* Found a new last used index.  */
> -	  GL(dl_tls_max_dtv_idx) = idx;
> +	  /* Found a new last used index.  This can be read concurrently.  */
> +	  atomic_store_relaxed (&GL(dl_tls_max_dtv_idx), idx);
>  	  return true;
>  	}
>      }

Ok.

> @@ -571,7 +572,9 @@ _dl_close_worker (struct link_map *map, bool force)
>  					GL(dl_tls_dtv_slotinfo_list), 0,
>  					imap->l_init_called))
>  		/* All dynamically loaded modules with TLS are unloaded.  */
> -		GL(dl_tls_max_dtv_idx) = GL(dl_tls_static_nelem);
> +		/* Can be read concurrently.  */
> +		atomic_store_relaxed (&GL(dl_tls_max_dtv_idx),
> +				      GL(dl_tls_static_nelem));
>  
>  	      if (imap->l_tls_offset != NO_TLS_OFFSET
>  		  && imap->l_tls_offset != FORCED_DYNAMIC_TLS_OFFSET)

Ok.

> @@ -769,8 +772,11 @@ _dl_close_worker (struct link_map *map, bool force)
>    /* If we removed any object which uses TLS bump the generation counter.  */
>    if (any_tls)
>      {
> -      if (__glibc_unlikely (++GL(dl_tls_generation) == 0))
> +      size_t newgen = GL(dl_tls_generation) + 1;
> +      if (__glibc_unlikely (newgen == 0))
>  	_dl_fatal_printf ("TLS generation counter wrapped!  Please report as described in "REPORT_BUGS_TO".\n");
> +      /* Can be read concurrently.  */
> +      atomic_store_relaxed (&GL(dl_tls_generation), newgen);
>  
>        if (tls_free_end == GL(dl_tls_static_used))
>  	GL(dl_tls_static_used) = tls_free_start;

Ok.

> diff --git a/elf/dl-open.c b/elf/dl-open.c
> index ab7aaa345e..83b8e96a5c 100644
> --- a/elf/dl-open.c
> +++ b/elf/dl-open.c
> @@ -395,9 +395,12 @@ update_tls_slotinfo (struct link_map *new)
>  	}
>      }
>  
> -  if (__builtin_expect (++GL(dl_tls_generation) == 0, 0))
> +  size_t newgen = GL(dl_tls_generation) + 1;
> +  if (__builtin_expect (newgen == 0, 0))
>      _dl_fatal_printf (N_("\

Use __glibc_unlikely since you are modifying it.

>  TLS generation counter wrapped!  Please report this."));
> +  /* Can be read concurrently.  */
> +  atomic_store_relaxed (&GL(dl_tls_generation), newgen);
>  
>    /* We need a second pass for static tls data, because
>       _dl_update_slotinfo must not be run while calls to

Ok.

> diff --git a/elf/dl-tls.c b/elf/dl-tls.c
> index 33c06782b1..c4466bd9fc 100644
> --- a/elf/dl-tls.c
> +++ b/elf/dl-tls.c
> @@ -175,7 +175,9 @@ _dl_next_tls_modid (void)
>        /* No gaps, allocate a new entry.  */
>      nogaps:
>  
> -      result = ++GL(dl_tls_max_dtv_idx);
> +      result = GL(dl_tls_max_dtv_idx) + 1;
> +      /* Can be read concurrently.  */
> +      atomic_store_relaxed (&GL(dl_tls_max_dtv_idx), result);
>      }
>  
>    return result;

Ok.

> @@ -359,10 +361,12 @@ allocate_dtv (void *result)
>    dtv_t *dtv;
>    size_t dtv_length;
>  
> +  /* Relaxed MO, because the dtv size is later rechecked, not relied on.  */
> +  size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
>    /* We allocate a few more elements in the dtv than are needed for the
>       initial set of modules.  This should avoid in most cases expansions
>       of the dtv.  */
> -  dtv_length = GL(dl_tls_max_dtv_idx) + DTV_SURPLUS;
> +  dtv_length = max_modid + DTV_SURPLUS;
>    dtv = calloc (dtv_length + 2, sizeof (dtv_t));
>    if (dtv != NULL)
>      {

Ok.

> @@ -767,7 +771,7 @@ _dl_update_slotinfo (unsigned long int req_modid)
>  	      if (modid > max_modid)
>  		break;
>  
> -	      size_t gen = listp->slotinfo[cnt].gen;
> +	      size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen);
>  
>  	      if (gen > new_gen)
>  		/* Not relevant.  */

Ok.

> @@ -779,7 +783,8 @@ _dl_update_slotinfo (unsigned long int req_modid)
>  		continue;
>  
>  	      /* If there is no map this means the entry is empty.  */
> -	      struct link_map *map = listp->slotinfo[cnt].map;
> +	      struct link_map *map
> +		= atomic_load_relaxed (&listp->slotinfo[cnt].map);
>  	      /* Check whether the current dtv array is large enough.  */
>  	      if (dtv[-1].counter < modid)
>  		{

OK.

> @@ -923,7 +928,12 @@ __tls_get_addr (GET_ADDR_ARGS)
>  {
>    dtv_t *dtv = THREAD_DTV ();
>  
> -  if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation)))
> +  /* Update is needed if dtv[0].counter < the generation of the accessed
> +     module.  The global generation counter is used here as it is easier
> +     to check.  Synchronization for the relaxed MO access is guaranteed
> +     by user code, see CONCURRENCY NOTES in _dl_update_slotinfo.  */
> +  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
> +  if (__glibc_unlikely (dtv[0].counter != gen))
>      return update_get_addr (GET_ADDR_PARAM);
>  
>    void *p = dtv[GET_ADDR_MODULE].pointer.val;

Ok.

> @@ -946,7 +956,10 @@ _dl_tls_get_addr_soft (struct link_map *l)
>      return NULL;
>  
>    dtv_t *dtv = THREAD_DTV ();
> -  if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation)))
> +  /* This may be called without holding the GL(dl_load_lock).  Reading
> +     arbitrary gen value is fine since this is best effort code.  */
> +  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
> +  if (__glibc_unlikely (dtv[0].counter != gen))
>      {
>        /* This thread's DTV is not completely current,
>  	 but it might already cover this module.  */

Ok.

> @@ -1032,7 +1045,9 @@ cannot create TLS data structures"));
>    /* Add the information into the slotinfo data structure.  */
>    if (do_add)
>      {
> -      listp->slotinfo[idx].map = l;
> -      listp->slotinfo[idx].gen = GL(dl_tls_generation) + 1;
> +      /* Can be read concurrently.  See _dl_update_slotinfo.  */
> +      atomic_store_relaxed (&listp->slotinfo[idx].map, l);
> +      atomic_store_relaxed (&listp->slotinfo[idx].gen,
> +			    GL(dl_tls_generation) + 1);
>      }
>  }

Ok.

> diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
> index 6595f6615b..24ef560b71 100644
> --- a/sysdeps/x86_64/dl-tls.c
> +++ b/sysdeps/x86_64/dl-tls.c
> @@ -40,7 +40,8 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
>  {
>    dtv_t *dtv = THREAD_DTV ();
>  
> -  if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation)))
> +  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
> +  if (__glibc_unlikely (dtv[0].counter != gen))
>      return update_get_addr (GET_ADDR_PARAM);
>  
>    return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
> 

Ok.

x86_64 also accesses dl_tls_generation in sysdeps/x86_64/tls_get_addr.S,
but afaik the default memory ordering for x86_64 already guarantees
relaxed MO.
  
Szabolcs Nagy April 16, 2021, 9:12 a.m. UTC | #2
The 04/15/2021 15:21, Adhemerval Zanella wrote:
> On 13/04/2021 05:19, Szabolcs Nagy via Libc-alpha wrote:
> > This is a follow up patch to the fix for bug 19329.  This adds
> > relaxed MO atomics to accesses that are racy, but relaxed MO is
> > enough.
> 
> Could you extend a bit why relaxed MO should be suffice?

is it ok to change the commit message to:

This is a follow up patch to the fix for bug 19329.  This adds relaxed
MO atomics to accesses that are racy, but relaxed MO is enough.

The racy accesses all follow the pattern that the write is behind the
dlopen lock, but a read can happen concurrently (e.g. during tls access)
without holding the lock.  For slotinfo entries the read value only
matters if it reads from a synchronized write in dlopen or dlclose,
otherwise the related dtv entry is not valid to access so it is fine to
leave it in inconsistent state. Same for GL(dl_tls_max_dtv_idx) and
GL(dl_tls_generation), but there we rely on the read value being larger
than the last written value that was synchronized.

> 
> Patch looks good, just a small request below.
> 
> Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

thanks.
  
Carlos O'Donell May 11, 2021, 2:56 a.m. UTC | #3
On 4/16/21 5:12 AM, Szabolcs Nagy via Libc-alpha wrote:
> The 04/15/2021 15:21, Adhemerval Zanella wrote:
>> On 13/04/2021 05:19, Szabolcs Nagy via Libc-alpha wrote:
>>> This is a follow up patch to the fix for bug 19329.  This adds
>>> relaxed MO atomics to accesses that are racy, but relaxed MO is
>>> enough.
>>
>> Could you extend a bit why relaxed MO should be suffice?
> 
> is it ok to change the commit message to:
> 
> This is a follow up patch to the fix for bug 19329.  This adds relaxed
> MO atomics to accesses that are racy, but relaxed MO is enough.

Suggest:

This is a follow up patch to the fix for bug 19329.  This adds relaxed
MO atomics to accesses that were previously data races but are now
race conditions, and where relaxed MO is sufficient.
 
> The racy accesses all follow the pattern that the write is behind the

s/racy accesses/race conditions/g

> dlopen lock, but a read can happen concurrently (e.g. during tls access)
> without holding the lock.  For slotinfo entries the read value only
> matters if it reads from a synchronized write in dlopen or dlclose,
> otherwise the related dtv entry is not valid to access so it is fine to
> leave it in inconsistent state. Same for GL(dl_tls_max_dtv_idx) and

s/it in/it in an/g

s/Same/The same applies for/g

> GL(dl_tls_generation), but there we rely on the read value being larger
> than the last written value that was synchronized.

Do you mean to imply that the synchronized writes all increase the generation
counter, and so any out of order reads rely on the value to be increasing?

Suggested:
The same applies for GL(dl_tls_max_dtv_idx) and GL(dl_tls_generation), but
there the algorithm relies on the fact that the read of the last synchronized
write is an increasing value.
  
Szabolcs Nagy May 11, 2021, 9:31 a.m. UTC | #4
The 05/10/2021 22:56, Carlos O'Donell wrote:
> On 4/16/21 5:12 AM, Szabolcs Nagy via Libc-alpha wrote:
> > The 04/15/2021 15:21, Adhemerval Zanella wrote:
> >> On 13/04/2021 05:19, Szabolcs Nagy via Libc-alpha wrote:
> >>> This is a follow up patch to the fix for bug 19329.  This adds
> >>> relaxed MO atomics to accesses that are racy, but relaxed MO is
> >>> enough.
> >>
> >> Could you extend a bit why relaxed MO should be suffice?
> > 
> > is it ok to change the commit message to:
> > 
> > This is a follow up patch to the fix for bug 19329.  This adds relaxed
> > MO atomics to accesses that are racy, but relaxed MO is enough.
> 
> Suggest:
> 
> This is a follow up patch to the fix for bug 19329.  This adds relaxed
> MO atomics to accesses that were previously data races but are now
> race conditions, and where relaxed MO is sufficient.
>  
> > The racy accesses all follow the pattern that the write is behind the
> 
> s/racy accesses/race conditions/g
> 
> > dlopen lock, but a read can happen concurrently (e.g. during tls access)
> > without holding the lock.  For slotinfo entries the read value only
> > matters if it reads from a synchronized write in dlopen or dlclose,
> > otherwise the related dtv entry is not valid to access so it is fine to
> > leave it in inconsistent state. Same for GL(dl_tls_max_dtv_idx) and
> 
> s/it in/it in an/g
> 
> s/Same/The same applies for/g
> 
> > GL(dl_tls_generation), but there we rely on the read value being larger
> > than the last written value that was synchronized.
> 
> Do you mean to imply that the synchronized writes all increase the generation
> counter, and so any out of order reads rely on the value to be increasing?

yes, if the current thread is synchronized with a dlopen and reads
GL(dl_tls_generation) with relaxed MO then either the gen of the
dlopened module is read or a larger value. So we can use the relaxed MO
value to see if the dtv needs update at tls access.

(This is the difficult bit in fixing bug 19924: we can use relaxed
MO because we update the dtv up to the gen of the module. slotinfo
entries with larger gen are ignored, but if we want to update up to
the global gen then we need additional synchronization.)

> Suggested:
> The same applies for GL(dl_tls_max_dtv_idx) and GL(dl_tls_generation), but
> there the algorithm relies on the fact that the read of the last synchronized
> write is an increasing value.

Thanks for the review, i attached the fixed patch i plan to commit
later today if there are no further comments.
  
Szabolcs Nagy May 11, 2021, 4:19 p.m. UTC | #5
The 05/11/2021 10:31, Szabolcs Nagy via Libc-alpha wrote:
> Thanks for the review, i attached the fixed patch i plan to commit
> later today if there are no further comments.

committed as f4f8f4d4e0f92488431b268c8cd9555730b9afe9

and committed the other reviewed patches from the series too.

> From 2a65e592a753310334c1ded2841215fb9c6024f8 Mon Sep 17 00:00:00 2001
> From: Szabolcs Nagy <szabolcs.nagy@arm.com>
> Date: Wed, 30 Dec 2020 19:19:37 +0000
> Subject: [PATCH] elf: Use relaxed atomics for racy accesses [BZ #19329]
> 
> This is a follow up patch to the fix for bug 19329.  This adds relaxed
> MO atomics to accesses that were previously data races but are now
> race conditions, and where relaxed MO is sufficient.
> 
> The race conditions all follow the pattern that the write is behind the
> dlopen lock, but a read can happen concurrently (e.g. during tls access)
> without holding the lock.  For slotinfo entries the read value only
> matters if it reads from a synchronized write in dlopen or dlclose,
> otherwise the related dtv entry is not valid to access so it is fine
> to leave it in an inconsistent state.  The same applies for
> GL(dl_tls_max_dtv_idx) and GL(dl_tls_generation), but there the
> algorithm relies on the fact that the read of the last synchronized
> write is an increasing value.
> 
> Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
...
  
Carlos O'Donell May 12, 2021, 8:33 p.m. UTC | #6
On 5/11/21 5:31 AM, Szabolcs Nagy wrote:
> The 05/10/2021 22:56, Carlos O'Donell wrote:
>> On 4/16/21 5:12 AM, Szabolcs Nagy via Libc-alpha wrote:
>>> The 04/15/2021 15:21, Adhemerval Zanella wrote:
>>>> On 13/04/2021 05:19, Szabolcs Nagy via Libc-alpha wrote:
>>>>> This is a follow up patch to the fix for bug 19329.  This adds
>>>>> relaxed MO atomics to accesses that are racy, but relaxed MO is
>>>>> enough.
>>>>
>>>> Could you extend a bit why relaxed MO should be suffice?
>>>
>>> is it ok to change the commit message to:
>>>
>>> This is a follow up patch to the fix for bug 19329.  This adds relaxed
>>> MO atomics to accesses that are racy, but relaxed MO is enough.
>>
>> Suggest:
>>
>> This is a follow up patch to the fix for bug 19329.  This adds relaxed
>> MO atomics to accesses that were previously data races but are now
>> race conditions, and where relaxed MO is sufficient.
>>  
>>> The racy accesses all follow the pattern that the write is behind the
>>
>> s/racy accesses/race conditions/g
>>
>>> dlopen lock, but a read can happen concurrently (e.g. during tls access)
>>> without holding the lock.  For slotinfo entries the read value only
>>> matters if it reads from a synchronized write in dlopen or dlclose,
>>> otherwise the related dtv entry is not valid to access so it is fine to
>>> leave it in inconsistent state. Same for GL(dl_tls_max_dtv_idx) and
>>
>> s/it in/it in an/g
>>
>> s/Same/The same applies for/g
>>
>>> GL(dl_tls_generation), but there we rely on the read value being larger
>>> than the last written value that was synchronized.
>>
>> Do you mean to imply that the synchronized writes all increase the generation
>> counter, and so any out of order reads rely on the value to be increasing?
> 
> yes, if the current thread is synchronized with a dlopen and reads
> GL(dl_tls_generation) with relaxed MO then either the gen of the
> dlopened module is read or a larger value. So we can use relaxed MO
> value to see if the dtv needs update at tls access.
> 
> (This is the difficult bit in fixing bug 19924: we can use relaxed
> MO because we update the dtv upto the gen of the module. slotinfo
> entries with larger gen are ignored, but if we want to update upto
> the global gen then we need additional synchronization.)
> 
>> Suggested:
>> The same applies for GL(dl_tls_max_dtv_idx) and GL(dl_tls_generation), but
>> there the algorithm relies on the fact that the read of the last synchronized
>> write is an increasing value.
> 
> Thanks for the review, i attached the fixed patch i plan to commit
> later today if there are no further comments.
 
Thanks. I saw you committed this and it looked good to me!

Thanks for working through these issues!
  

Patch

diff --git a/elf/dl-close.c b/elf/dl-close.c
index c51becd06b..3720e47dd1 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -79,9 +79,10 @@  remove_slotinfo (size_t idx, struct dtv_slotinfo_list *listp, size_t disp,
 	{
 	  assert (old_map->l_tls_modid == idx);
 
-	  /* Mark the entry as unused. */
-	  listp->slotinfo[idx - disp].gen = GL(dl_tls_generation) + 1;
-	  listp->slotinfo[idx - disp].map = NULL;
+	  /* Mark the entry as unused.  These can be read concurrently.  */
+	  atomic_store_relaxed (&listp->slotinfo[idx - disp].gen,
+				GL(dl_tls_generation) + 1);
+	  atomic_store_relaxed (&listp->slotinfo[idx - disp].map, NULL);
 	}
 
       /* If this is not the last currently used entry no need to look
@@ -96,8 +97,8 @@  remove_slotinfo (size_t idx, struct dtv_slotinfo_list *listp, size_t disp,
 
       if (listp->slotinfo[idx - disp].map != NULL)
 	{
-	  /* Found a new last used index.  */
-	  GL(dl_tls_max_dtv_idx) = idx;
+	  /* Found a new last used index.  This can be read concurrently.  */
+	  atomic_store_relaxed (&GL(dl_tls_max_dtv_idx), idx);
 	  return true;
 	}
     }
@@ -571,7 +572,9 @@  _dl_close_worker (struct link_map *map, bool force)
 					GL(dl_tls_dtv_slotinfo_list), 0,
 					imap->l_init_called))
 		/* All dynamically loaded modules with TLS are unloaded.  */
-		GL(dl_tls_max_dtv_idx) = GL(dl_tls_static_nelem);
+		/* Can be read concurrently.  */
+		atomic_store_relaxed (&GL(dl_tls_max_dtv_idx),
+				      GL(dl_tls_static_nelem));
 
 	      if (imap->l_tls_offset != NO_TLS_OFFSET
 		  && imap->l_tls_offset != FORCED_DYNAMIC_TLS_OFFSET)
@@ -769,8 +772,11 @@  _dl_close_worker (struct link_map *map, bool force)
   /* If we removed any object which uses TLS bump the generation counter.  */
   if (any_tls)
     {
-      if (__glibc_unlikely (++GL(dl_tls_generation) == 0))
+      size_t newgen = GL(dl_tls_generation) + 1;
+      if (__glibc_unlikely (newgen == 0))
 	_dl_fatal_printf ("TLS generation counter wrapped!  Please report as described in "REPORT_BUGS_TO".\n");
+      /* Can be read concurrently.  */
+      atomic_store_relaxed (&GL(dl_tls_generation), newgen);
 
       if (tls_free_end == GL(dl_tls_static_used))
 	GL(dl_tls_static_used) = tls_free_start;
diff --git a/elf/dl-open.c b/elf/dl-open.c
index ab7aaa345e..83b8e96a5c 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -395,9 +395,12 @@  update_tls_slotinfo (struct link_map *new)
 	}
     }
 
-  if (__builtin_expect (++GL(dl_tls_generation) == 0, 0))
+  size_t newgen = GL(dl_tls_generation) + 1;
+  if (__builtin_expect (newgen == 0, 0))
     _dl_fatal_printf (N_("\
 TLS generation counter wrapped!  Please report this."));
+  /* Can be read concurrently.  */
+  atomic_store_relaxed (&GL(dl_tls_generation), newgen);
 
   /* We need a second pass for static tls data, because
      _dl_update_slotinfo must not be run while calls to
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 33c06782b1..c4466bd9fc 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -175,7 +175,9 @@  _dl_next_tls_modid (void)
       /* No gaps, allocate a new entry.  */
     nogaps:
 
-      result = ++GL(dl_tls_max_dtv_idx);
+      result = GL(dl_tls_max_dtv_idx) + 1;
+      /* Can be read concurrently.  */
+      atomic_store_relaxed (&GL(dl_tls_max_dtv_idx), result);
     }
 
   return result;
@@ -359,10 +361,12 @@  allocate_dtv (void *result)
   dtv_t *dtv;
   size_t dtv_length;
 
+  /* Relaxed MO, because the dtv size is later rechecked, not relied on.  */
+  size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
   /* We allocate a few more elements in the dtv than are needed for the
      initial set of modules.  This should avoid in most cases expansions
      of the dtv.  */
-  dtv_length = GL(dl_tls_max_dtv_idx) + DTV_SURPLUS;
+  dtv_length = max_modid + DTV_SURPLUS;
   dtv = calloc (dtv_length + 2, sizeof (dtv_t));
   if (dtv != NULL)
     {
@@ -767,7 +771,7 @@  _dl_update_slotinfo (unsigned long int req_modid)
 	      if (modid > max_modid)
 		break;
 
-	      size_t gen = listp->slotinfo[cnt].gen;
+	      size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen);
 
 	      if (gen > new_gen)
 		/* Not relevant.  */
@@ -779,7 +783,8 @@  _dl_update_slotinfo (unsigned long int req_modid)
 		continue;
 
 	      /* If there is no map this means the entry is empty.  */
-	      struct link_map *map = listp->slotinfo[cnt].map;
+	      struct link_map *map
+		= atomic_load_relaxed (&listp->slotinfo[cnt].map);
 	      /* Check whether the current dtv array is large enough.  */
 	      if (dtv[-1].counter < modid)
 		{
@@ -923,7 +928,12 @@  __tls_get_addr (GET_ADDR_ARGS)
 {
   dtv_t *dtv = THREAD_DTV ();
 
-  if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation)))
+  /* Update is needed if dtv[0].counter < the generation of the accessed
+     module.  The global generation counter is used here as it is easier
+     to check.  Synchronization for the relaxed MO access is guaranteed
+     by user code, see CONCURRENCY NOTES in _dl_update_slotinfo.  */
+  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+  if (__glibc_unlikely (dtv[0].counter != gen))
     return update_get_addr (GET_ADDR_PARAM);
 
   void *p = dtv[GET_ADDR_MODULE].pointer.val;
@@ -946,7 +956,10 @@  _dl_tls_get_addr_soft (struct link_map *l)
     return NULL;
 
   dtv_t *dtv = THREAD_DTV ();
-  if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation)))
+  /* This may be called without holding the GL(dl_load_lock).  Reading
+     arbitrary gen value is fine since this is best effort code.  */
+  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+  if (__glibc_unlikely (dtv[0].counter != gen))
     {
       /* This thread's DTV is not completely current,
 	 but it might already cover this module.  */
@@ -1032,7 +1045,9 @@  cannot create TLS data structures"));
   /* Add the information into the slotinfo data structure.  */
   if (do_add)
     {
-      listp->slotinfo[idx].map = l;
-      listp->slotinfo[idx].gen = GL(dl_tls_generation) + 1;
+      /* Can be read concurrently.  See _dl_update_slotinfo.  */
+      atomic_store_relaxed (&listp->slotinfo[idx].map, l);
+      atomic_store_relaxed (&listp->slotinfo[idx].gen,
+			    GL(dl_tls_generation) + 1);
     }
 }
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
index 6595f6615b..24ef560b71 100644
--- a/sysdeps/x86_64/dl-tls.c
+++ b/sysdeps/x86_64/dl-tls.c
@@ -40,7 +40,8 @@  __tls_get_addr_slow (GET_ADDR_ARGS)
 {
   dtv_t *dtv = THREAD_DTV ();
 
-  if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation)))
+  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+  if (__glibc_unlikely (dtv[0].counter != gen))
     return update_get_addr (GET_ADDR_PARAM);
 
   return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);