[v2,14/14] RFC elf: Fix slow tls access after dlopen [BZ #19924]

Message ID b116855de71098ef7dd2875dd3237f8f3ecc12c2.1618301209.git.szabolcs.nagy@arm.com
State Superseded
Delegated to: Adhemerval Zanella Netto
Headers
Series Dynamic TLS related data race fixes |

Commit Message

Szabolcs Nagy April 13, 2021, 8:21 a.m. UTC
  In short: __tls_get_addr checks the global generation counter,
_dl_update_slotinfo updates up to the generation of the accessed
module. If the global generation is newer than geneneration of the
module then __tls_get_addr keeps hitting the slow path that updates
the dtv.

Possible approaches i can see:

1. update to global generation instead of module,
2. check the module generation in the fast path.

This patch is 1.: it needs additional sync (load acquire) so the
slotinfo list is up to date with the observed global generation.

Approach 2. would require walking the slotinfo list at all times.
I don't know how to make that fast with many modules.

Note: in the x86_64 version of dl-tls.c the generation is only loaded
once, since relaxed mo is not faster than acquire mo load.

I have not benchmarked this yet.
---
 elf/dl-close.c             |  2 +-
 elf/dl-open.c              |  8 ++++----
 elf/dl-reloc.c             |  5 ++---
 elf/dl-tls.c               | 28 ++++++++++++----------------
 sysdeps/generic/ldsodefs.h |  3 ++-
 sysdeps/x86_64/dl-tls.c    |  4 ++--
 6 files changed, 23 insertions(+), 27 deletions(-)
  

Comments

Carlos O'Donell Sept. 16, 2022, 9:54 a.m. UTC | #1
On Tue, Apr 13, 2021 at 09:21:58AM +0100, Szabolcs Nagy via Libc-alpha wrote:
> In short: __tls_get_addr checks the global generation counter,
> _dl_update_slotinfo updates up to the generation of the accessed
> module. If the global generation is newer than geneneration of the
> module then __tls_get_addr keeps hitting the slow path that updates
> the dtv.
> 
> Possible approaches i can see:
> 
> 1. update to global generation instead of module,
> 2. check the module generation in the fast path.
> 
> This patch is 1.: it needs additional sync (load acquire) so the
> slotinfo list is up to date with the observed global generation.
> 
> Approach 2. would require walking the slotinfo list at all times.
> I don't know how to make that fast with many modules.
> 
> Note: in the x86_64 version of dl-tls.c the generation is only loaded
> once, since relaxed mo is not faster than acquire mo load.
> 
> I have not benchmarked this yet.

Szabolcs,

Could you please repost this as a distinct patch? I'd like to track
this as a possible solution that we can discuss and improve. It's the
only patch that we haven't applied from this series.

Cheers,
Carlos.
  

Patch

diff --git a/elf/dl-close.c b/elf/dl-close.c
index 9f31532f41..45f8a7fe31 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -780,7 +780,7 @@  _dl_close_worker (struct link_map *map, bool force)
       if (__glibc_unlikely (newgen == 0))
 	_dl_fatal_printf ("TLS generation counter wrapped!  Please report as described in "REPORT_BUGS_TO".\n");
       /* Can be read concurrently.  */
-      atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+      atomic_store_release (&GL(dl_tls_generation), newgen);
 
       if (tls_free_end == GL(dl_tls_static_used))
 	GL(dl_tls_static_used) = tls_free_start;
diff --git a/elf/dl-open.c b/elf/dl-open.c
index 661f26977e..5b9816e4e8 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -400,7 +400,7 @@  update_tls_slotinfo (struct link_map *new)
     _dl_fatal_printf (N_("\
 TLS generation counter wrapped!  Please report this."));
   /* Can be read concurrently.  */
-  atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+  atomic_store_release (&GL(dl_tls_generation), newgen);
 
   /* We need a second pass for static tls data, because
      _dl_update_slotinfo must not be run while calls to
@@ -417,8 +417,8 @@  TLS generation counter wrapped!  Please report this."));
 	     now, but we can delay updating the DTV.  */
 	  imap->l_need_tls_init = 0;
 #ifdef SHARED
-	  /* Update the slot information data for at least the
-	     generation of the DSO we are allocating data for.  */
+	  /* Update the slot information data for the current
+	     generation.  */
 
 	  /* FIXME: This can terminate the process on memory
 	     allocation failure.  It is not possible to raise
@@ -426,7 +426,7 @@  TLS generation counter wrapped!  Please report this."));
 	     _dl_update_slotinfo would have to be split into two
 	     operations, similar to resize_scopes and update_scopes
 	     above.  This is related to bug 16134.  */
-	  _dl_update_slotinfo (imap->l_tls_modid);
+	  _dl_update_slotinfo (imap->l_tls_modid, newgen);
 #endif
 
 	  GL(dl_init_static_tls) (imap);
diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c
index c2df26deea..427669d769 100644
--- a/elf/dl-reloc.c
+++ b/elf/dl-reloc.c
@@ -111,11 +111,10 @@  _dl_try_allocate_static_tls (struct link_map *map, bool optional)
   if (map->l_real->l_relocated)
     {
 #ifdef SHARED
+// TODO: it is not clear why we need to update the DTV here, add comment
       if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation),
 			    0))
-	/* Update the slot information data for at least the generation of
-	   the DSO we are allocating data for.  */
-	(void) _dl_update_slotinfo (map->l_tls_modid);
+	(void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation));
 #endif
 
       GL(dl_init_static_tls) (map);
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index b0257185e9..b51a4f3a19 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -701,7 +701,7 @@  allocate_and_init (struct link_map *map)
 
 
 struct link_map *
-_dl_update_slotinfo (unsigned long int req_modid)
+_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
 {
   struct link_map *the_map = NULL;
   dtv_t *dtv = THREAD_DTV ();
@@ -718,19 +718,12 @@  _dl_update_slotinfo (unsigned long int req_modid)
      code and therefore add to the slotinfo list.  This is a problem
      since we must not pick up any information about incomplete work.
      The solution to this is to ignore all dtv slots which were
-     created after the one we are currently interested.  We know that
-     dynamic loading for this module is completed and this is the last
-     load operation we know finished.  */
-  unsigned long int idx = req_modid;
+     created after the generation we are interested in.  We know that
+     dynamic loading for this generation is completed and this is the
+     last load operation we know finished.  */
   struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
 
-  while (idx >= listp->len)
-    {
-      idx -= listp->len;
-      listp = listp->next;
-    }
-
-  if (dtv[0].counter < listp->slotinfo[idx].gen)
+  if (dtv[0].counter < new_gen)
     {
       /* CONCURRENCY NOTES:
 
@@ -751,7 +744,6 @@  _dl_update_slotinfo (unsigned long int req_modid)
 	 other entries are racy.  However updating a non-relevant dtv
 	 entry does not affect correctness.  For a relevant module m,
 	 max_modid >= modid of m.  */
-      size_t new_gen = listp->slotinfo[idx].gen;
       size_t total = 0;
       size_t max_modid  = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
       assert (max_modid >= req_modid);
@@ -894,9 +886,9 @@  tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map)
 
 static struct link_map *
 __attribute_noinline__
-update_get_addr (GET_ADDR_ARGS)
+update_get_addr (GET_ADDR_ARGS, size_t gen)
 {
-  struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE);
+  struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen);
   dtv_t *dtv = THREAD_DTV ();
 
   void *p = dtv[GET_ADDR_MODULE].pointer.val;
@@ -931,7 +923,11 @@  __tls_get_addr (GET_ADDR_ARGS)
      by user code, see CONCURRENCY NOTES in _dl_update_slotinfo.  */
   size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
   if (__glibc_unlikely (dtv[0].counter != gen))
-    return update_get_addr (GET_ADDR_PARAM);
+    {
+// TODO: needs comment update if we rely on consistent generation with slotinfo
+      gen = atomic_load_acquire (&GL(dl_tls_generation));
+      return update_get_addr (GET_ADDR_PARAM, gen);
+    }
 
   void *p = dtv[GET_ADDR_MODULE].pointer.val;
 
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index ea3f7a69d0..614463f016 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1224,7 +1224,8 @@  extern void _dl_add_to_slotinfo (struct link_map *l, bool do_add)
 
 /* Update slot information data for at least the generation of the
    module with the given index.  */
-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid)
+extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
+					     size_t gen)
      attribute_hidden;
 
 /* Look up the module's TLS block as for __tls_get_addr,
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
index 24ef560b71..4ded8dd6b9 100644
--- a/sysdeps/x86_64/dl-tls.c
+++ b/sysdeps/x86_64/dl-tls.c
@@ -40,9 +40,9 @@  __tls_get_addr_slow (GET_ADDR_ARGS)
 {
   dtv_t *dtv = THREAD_DTV ();
 
-  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+  size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
   if (__glibc_unlikely (dtv[0].counter != gen))
-    return update_get_addr (GET_ADDR_PARAM);
+    return update_get_addr (GET_ADDR_PARAM, gen);
 
   return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
 }