[v2] elf: Fix slow tls access after dlopen [BZ #19924]

Message ID 20221019111439.63501-1-szabolcs.nagy@arm.com
State Superseded
Headers
Series [v2] elf: Fix slow tls access after dlopen [BZ #19924] |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Szabolcs Nagy Oct. 19, 2022, 11:14 a.m. UTC
  In short: __tls_get_addr checks the global generation counter and if
the current dtv is older then _dl_update_slotinfo updates dtv up to the
generation of the accessed module. So if the global generation is newer
than geneneration of the module then __tls_get_addr keeps hitting the
slow dtv update path. The dtv update path includes a number of checks
to see if any update is needed and this already causes measurable tls
access slow down after dlopen.

Possible approaches i can see:

1. update dtv to the global generation instead of module,
2. bail out faster if dtv is up to date for the module.

This patch is 1.: It needs additional sync (load acquire) so the
slotinfo list is up to date with the observed global generation.
(before the patch we could rely on that the tls access was already
synced with the dlopen of the module that is accessed.)

Approach 2. Would require walking the slotinfo list to see if dtv is
up to date to the module generation. This should be quick if less than
TLS_SLOTINFO_SURPLUS modules are dlopened, but can take long with
arbitrary many dlopens.

Note: in the x86_64 version of dl-tls.c the generation is only loaded
once, since relaxed mo is not faster than acquire mo load.

I have not benchmarked this yet.

TODO: some comments need further update.

---
v2: rebased and updated the commit message a bit. still RFC quality.
---
 elf/dl-close.c             |  2 +-
 elf/dl-open.c              |  8 ++++----
 elf/dl-reloc.c             |  5 ++---
 elf/dl-tls.c               | 28 ++++++++++++----------------
 sysdeps/generic/ldsodefs.h |  3 ++-
 sysdeps/x86_64/dl-tls.c    |  4 ++--
 6 files changed, 23 insertions(+), 27 deletions(-)
  

Patch

diff --git a/elf/dl-close.c b/elf/dl-close.c
index bcd6e206e9..8e81cee8e7 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -743,7 +743,7 @@  _dl_close_worker (struct link_map *map, bool force)
       if (__glibc_unlikely (newgen == 0))
 	_dl_fatal_printf ("TLS generation counter wrapped!  Please report as described in "REPORT_BUGS_TO".\n");
       /* Can be read concurrently.  */
-      atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+      atomic_store_release (&GL(dl_tls_generation), newgen);
 
       if (tls_free_end == GL(dl_tls_static_used))
 	GL(dl_tls_static_used) = tls_free_start;
diff --git a/elf/dl-open.c b/elf/dl-open.c
index e7db5e9642..734a0bee4e 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -405,7 +405,7 @@  update_tls_slotinfo (struct link_map *new)
     _dl_fatal_printf (N_("\
 TLS generation counter wrapped!  Please report this."));
   /* Can be read concurrently.  */
-  atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+  atomic_store_release (&GL(dl_tls_generation), newgen);
 
   /* We need a second pass for static tls data, because
      _dl_update_slotinfo must not be run while calls to
@@ -422,8 +422,8 @@  TLS generation counter wrapped!  Please report this."));
 	     now, but we can delay updating the DTV.  */
 	  imap->l_need_tls_init = 0;
 #ifdef SHARED
-	  /* Update the slot information data for at least the
-	     generation of the DSO we are allocating data for.  */
+	  /* Update the slot information data for the current
+	     generation.  */
 
 	  /* FIXME: This can terminate the process on memory
 	     allocation failure.  It is not possible to raise
@@ -431,7 +431,7 @@  TLS generation counter wrapped!  Please report this."));
 	     _dl_update_slotinfo would have to be split into two
 	     operations, similar to resize_scopes and update_scopes
 	     above.  This is related to bug 16134.  */
-	  _dl_update_slotinfo (imap->l_tls_modid);
+	  _dl_update_slotinfo (imap->l_tls_modid, newgen);
 #endif
 
 	  dl_init_static_tls (imap);
diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c
index 756bf950f6..e391ed47b0 100644
--- a/elf/dl-reloc.c
+++ b/elf/dl-reloc.c
@@ -112,11 +112,10 @@  _dl_try_allocate_static_tls (struct link_map *map, bool optional)
   if (map->l_real->l_relocated)
     {
 #ifdef SHARED
+// TODO: it is not clear why we need to update the DTV here, add comment
       if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation),
 			    0))
-	/* Update the slot information data for at least the generation of
-	   the DSO we are allocating data for.  */
-	(void) _dl_update_slotinfo (map->l_tls_modid);
+	(void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation));
 #endif
 
       dl_init_static_tls (map);
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 093cdddb7e..3d1ae00b61 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -721,7 +721,7 @@  allocate_and_init (struct link_map *map)
 
 
 struct link_map *
-_dl_update_slotinfo (unsigned long int req_modid)
+_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
 {
   struct link_map *the_map = NULL;
   dtv_t *dtv = THREAD_DTV ();
@@ -738,19 +738,12 @@  _dl_update_slotinfo (unsigned long int req_modid)
      code and therefore add to the slotinfo list.  This is a problem
      since we must not pick up any information about incomplete work.
      The solution to this is to ignore all dtv slots which were
-     created after the one we are currently interested.  We know that
-     dynamic loading for this module is completed and this is the last
-     load operation we know finished.  */
-  unsigned long int idx = req_modid;
+     created after the generation we are interested in.  We know that
+     dynamic loading for this generation is completed and this is the
+     last load operation we know finished.  */
   struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
 
-  while (idx >= listp->len)
-    {
-      idx -= listp->len;
-      listp = listp->next;
-    }
-
-  if (dtv[0].counter < listp->slotinfo[idx].gen)
+  if (dtv[0].counter < new_gen)
     {
       /* CONCURRENCY NOTES:
 
@@ -771,7 +764,6 @@  _dl_update_slotinfo (unsigned long int req_modid)
 	 other entries are racy.  However updating a non-relevant dtv
 	 entry does not affect correctness.  For a relevant module m,
 	 max_modid >= modid of m.  */
-      size_t new_gen = listp->slotinfo[idx].gen;
       size_t total = 0;
       size_t max_modid  = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
       assert (max_modid >= req_modid);
@@ -914,9 +906,9 @@  tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map)
 
 static struct link_map *
 __attribute_noinline__
-update_get_addr (GET_ADDR_ARGS)
+update_get_addr (GET_ADDR_ARGS, size_t gen)
 {
-  struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE);
+  struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen);
   dtv_t *dtv = THREAD_DTV ();
 
   void *p = dtv[GET_ADDR_MODULE].pointer.val;
@@ -951,7 +943,11 @@  __tls_get_addr (GET_ADDR_ARGS)
      by user code, see CONCURRENCY NOTES in _dl_update_slotinfo.  */
   size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
   if (__glibc_unlikely (dtv[0].counter != gen))
-    return update_get_addr (GET_ADDR_PARAM);
+    {
+// TODO: needs comment update if we rely on consistent generation with slotinfo
+      gen = atomic_load_acquire (&GL(dl_tls_generation));
+      return update_get_addr (GET_ADDR_PARAM, gen);
+    }
 
   void *p = dtv[GET_ADDR_MODULE].pointer.val;
 
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 6b256b8388..8ba8e75c3d 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1259,7 +1259,8 @@  extern void _dl_add_to_slotinfo (struct link_map *l, bool do_add)
 
 /* Update slot information data for at least the generation of the
    module with the given index.  */
-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid)
+extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
+					     size_t gen)
      attribute_hidden;
 
 /* Look up the module's TLS block as for __tls_get_addr,
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
index 24a6e643b0..f74b20dd8d 100644
--- a/sysdeps/x86_64/dl-tls.c
+++ b/sysdeps/x86_64/dl-tls.c
@@ -40,9 +40,9 @@  __tls_get_addr_slow (GET_ADDR_ARGS)
 {
   dtv_t *dtv = THREAD_DTV ();
 
-  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+  size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
   if (__glibc_unlikely (dtv[0].counter != gen))
-    return update_get_addr (GET_ADDR_PARAM);
+    return update_get_addr (GET_ADDR_PARAM, gen);
 
   return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
 }