initialize fde objects lazily

Message ID 7d18f085-ae46-138d-4f04-df5857b7b014@in.tum.de
State New
Headers
Series initialize fde objects lazily |

Commit Message

Thomas Neumann Dec. 9, 2022, 5:34 p.m. UTC
  When registering an unwind frame with __register_frame_info_bases
we currently initialize that fde object eagerly. This has the
advantage that it is immutable afterwards and we can safely
access it from multiple threads, but it has the disadvantage
that we pay the initialization cost even if the application
never throws an exception.

This commit changes the logic to initialize the objects lazily.
The objects themselves are inserted into the b-tree when
registering the frame, but the sorted fde_vector is
not constructed yet. Only on the first time that an
exception tries to pass through the registered code the
object is initialized. We detect this with double-checked locking:
first a relaxed atomic load of the sorted bit, and then a re-check
under a mutex when the object was not initialized yet.

Note that the check must implicitly be safe concerning a concurrent
frame deregistration, as trying to deregister a frame that is
on the unwinding path of a concurrent exception is inherently racy.

libgcc/ChangeLog:
         * unwind-dw2-fde.c: Initialize fde object lazily when
         the first exception tries to pass through.
---
  libgcc/unwind-dw2-fde.c | 52 ++++++++++++++++++++++++++++++++---------
  1 file changed, 41 insertions(+), 11 deletions(-)
  

Comments

Tamar Christina Dec. 15, 2022, 4:11 p.m. UTC | #1
Thanks Thomas!

We've tested it and this brings the startup time back into a reasonable amount!

We'd quite like to see this get into GCC 13.

Regards,
Tamar

> -----Original Message-----
> From: Thomas Neumann <thomas.neumann@in.tum.de>
> Sent: Friday, December 9, 2022 5:34 PM
> To: gcc-patches@gcc.gnu.org
> Cc: H.J. Lu <hjl.tools@gmail.com>; Jakub Jelinek <jakub@redhat.com>;
> Tamar Christina <Tamar.Christina@arm.com>; Jason Merrill
> <jason@redhat.com>; Jonathan Wakely <jwakely.gcc@gmail.com>; Florian
> Weimer <fweimer@redhat.com>
> Subject: [PATCH] initialize fde objects lazily
> 
> When registering an unwind frame with __register_frame_info_bases we
> currently initialize that fde object eagerly. This has the advantage that it is
> immutable afterwards and we can safely access it from multiple threads, but
> it has the disadvantage that we pay the initialization cost even if the
> application never throws an exception.
> 
> This commit changes the logic to initialize the objects lazily.
> The objects themselves are inserted into the b-tree when registering the
> frame, but the sorted fde_vector is not constructed yet. Only on the first
> time that an exception tries to pass through the registered code the object is
> initialized. We notice that with a double checking, first doing a relaxed load of
> the sorted bit and then re-checking under a mutex when the object was not
> initialized yet.
> 
> Note that the check must implicitly be safe concerning a concurrent frame
> deregistration, as trying to deregister a frame that is on the unwinding path
> of a concurrent exception is inherently racy.
> 
> libgcc/ChangeLog:
>          * unwind-dw2-fde.c: Initialize fde object lazily when
>          the first exception tries to pass through.
> ---
>   libgcc/unwind-dw2-fde.c | 52 ++++++++++++++++++++++++++++++++-----
> ----
>   1 file changed, 41 insertions(+), 11 deletions(-)
> 
> diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c index
> 3c0cc654ec0..6f69c20ff4b 100644
> --- a/libgcc/unwind-dw2-fde.c
> +++ b/libgcc/unwind-dw2-fde.c
> @@ -63,8 +63,6 @@ release_registered_frames (void)
> 
>   static void
>   get_pc_range (const struct object *ob, uintptr_type *range); -static void -
> init_object (struct object *ob);
> 
>   #else
>   /* Without fast path frame deregistration must always succeed.  */ @@ -
> 76,6 +74,7 @@ static const int in_shutdown = 0;
>      by decreasing value of pc_begin.  */
>   static struct object *unseen_objects;
>   static struct object *seen_objects;
> +#endif
> 
>   #ifdef __GTHREAD_MUTEX_INIT
>   static __gthread_mutex_t object_mutex = __GTHREAD_MUTEX_INIT; @@
> -103,7 +102,6 @@ init_object_mutex_once (void)
>   static __gthread_mutex_t object_mutex;
>   #endif
>   #endif
> -#endif
> 
>   /* Called from crtbegin.o to register the unwind info for an object.  */
> 
> @@ -126,10 +124,7 @@ __register_frame_info_bases (const void *begin,
> struct object *ob,
>   #endif
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> -  // Initialize eagerly to avoid locking later
> -  init_object (ob);
> -
> -  // And register the frame
> +  // Register the frame in the b-tree
>     uintptr_type range[2];
>     get_pc_range (ob, range);
>     btree_insert (&registered_frames, range[0], range[1] - range[0], ob); @@
> -180,10 +175,7 @@ __register_frame_info_table_bases (void *begin, struct
> object *ob,
>     ob->s.b.encoding = DW_EH_PE_omit;
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> -  // Initialize eagerly to avoid locking later
> -  init_object (ob);
> -
> -  // And register the frame
> +  // Register the frame in the b-tree
>     uintptr_type range[2];
>     get_pc_range (ob, range);
>     btree_insert (&registered_frames, range[0], range[1] - range[0], ob); @@
> -892,7 +884,15 @@ init_object (struct object* ob)
>     accu.linear->orig_data = ob->u.single;
>     ob->u.sort = accu.linear;
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +  // We must update the sorted bit with an atomic operation
> +  struct object tmp;
> +  tmp.s.b = ob->s.b;
> +  tmp.s.b.sorted = 1;
> +  __atomic_store (&(ob->s.b), &(tmp.s.b), __ATOMIC_SEQ_CST); #else
>     ob->s.b.sorted = 1;
> +#endif
>   }
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> @@ -1130,6 +1130,21 @@ search_object (struct object* ob, void *pc)
>       }
>   }
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +
> +// Check if the object was already initialized static inline bool
> +is_object_initialized (struct object *ob) {
> +  // We have to use relaxed atomics for the read, which
> +  // is a bit involved as we read from a bitfield
> +  struct object tmp;
> +  __atomic_load (&(ob->s.b), &(tmp.s.b), __ATOMIC_RELAXED);
> +  return tmp.s.b.sorted;
> +}
> +
> +#endif
> +
>   const fde *
>   _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
>   {
> @@ -1141,6 +1156,21 @@ _Unwind_Find_FDE (void *pc, struct
> dwarf_eh_bases *bases)
>     if (!ob)
>       return NULL;
> 
> +  // Initialize the object lazily
> +  if (!is_object_initialized (ob))
> +    {
> +      // Check again under mutex
> +      init_object_mutex_once ();
> +      __gthread_mutex_lock (&object_mutex);
> +
> +      if (!ob->s.b.sorted)
> +	{
> +	  init_object (ob);
> +	}
> +
> +      __gthread_mutex_unlock (&object_mutex);
> +    }
> +
>     f = search_object (ob, pc);
>   #else
> 
> --
> 2.37.2
  
Jason Merrill Dec. 16, 2022, 5:25 p.m. UTC | #2
On 12/9/22 12:34, Thomas Neumann wrote:
> When registering an unwind frame with __register_frame_info_bases
> we currently initialize that fde object eagerly. This has the
> advantage that it is immutable afterwards and we can safely
> access it from multiple threads, but it has the disadvantage
> that we pay the initialization cost even if the application
> never throws an exception.
> 
> This commit changes the logic to initialize the objects lazily.
> The objects themselves are inserted into the b-tree when
> registering the frame, but the sorted fde_vector is
> not constructed yet. Only on the first time that an
> exception tries to pass through the registered code the
> object is initialized. We notice that with a double checking,
> first doing a relaxed load of the sorted bit and then re-checking
> under a mutex when the object was not initialized yet.
> 
> Note that the check must implicitly be safe concerning a concurrent
> frame deregistration, as trying to deregister a frame that is
> on the unwinding path of a concurrent exception is inherently racy.

OK, thanks.

> libgcc/ChangeLog:
>          * unwind-dw2-fde.c: Initialize fde object lazily when
>          the first exception tries to pass through.
> ---
>   libgcc/unwind-dw2-fde.c | 52 ++++++++++++++++++++++++++++++++---------
>   1 file changed, 41 insertions(+), 11 deletions(-)
> 
> diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
> index 3c0cc654ec0..6f69c20ff4b 100644
> --- a/libgcc/unwind-dw2-fde.c
> +++ b/libgcc/unwind-dw2-fde.c
> @@ -63,8 +63,6 @@ release_registered_frames (void)
> 
>   static void
>   get_pc_range (const struct object *ob, uintptr_type *range);
> -static void
> -init_object (struct object *ob);
> 
>   #else
>   /* Without fast path frame deregistration must always succeed.  */
> @@ -76,6 +74,7 @@ static const int in_shutdown = 0;
>      by decreasing value of pc_begin.  */
>   static struct object *unseen_objects;
>   static struct object *seen_objects;
> +#endif
> 
>   #ifdef __GTHREAD_MUTEX_INIT
>   static __gthread_mutex_t object_mutex = __GTHREAD_MUTEX_INIT;
> @@ -103,7 +102,6 @@ init_object_mutex_once (void)
>   static __gthread_mutex_t object_mutex;
>   #endif
>   #endif
> -#endif
> 
>   /* Called from crtbegin.o to register the unwind info for an object.  */
> 
> @@ -126,10 +124,7 @@ __register_frame_info_bases (const void *begin, 
> struct object *ob,
>   #endif
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> -  // Initialize eagerly to avoid locking later
> -  init_object (ob);
> -
> -  // And register the frame
> +  // Register the frame in the b-tree
>     uintptr_type range[2];
>     get_pc_range (ob, range);
>     btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
> @@ -180,10 +175,7 @@ __register_frame_info_table_bases (void *begin, 
> struct object *ob,
>     ob->s.b.encoding = DW_EH_PE_omit;
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> -  // Initialize eagerly to avoid locking later
> -  init_object (ob);
> -
> -  // And register the frame
> +  // Register the frame in the b-tree
>     uintptr_type range[2];
>     get_pc_range (ob, range);
>     btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
> @@ -892,7 +884,15 @@ init_object (struct object* ob)
>     accu.linear->orig_data = ob->u.single;
>     ob->u.sort = accu.linear;
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +  // We must update the sorted bit with an atomic operation
> +  struct object tmp;
> +  tmp.s.b = ob->s.b;
> +  tmp.s.b.sorted = 1;
> +  __atomic_store (&(ob->s.b), &(tmp.s.b), __ATOMIC_SEQ_CST);
> +#else
>     ob->s.b.sorted = 1;
> +#endif
>   }
> 
>   #ifdef ATOMIC_FDE_FAST_PATH
> @@ -1130,6 +1130,21 @@ search_object (struct object* ob, void *pc)
>       }
>   }
> 
> +#ifdef ATOMIC_FDE_FAST_PATH
> +
> +// Check if the object was already initialized
> +static inline bool
> +is_object_initialized (struct object *ob)
> +{
> +  // We have to use relaxed atomics for the read, which
> +  // is a bit involved as we read from a bitfield
> +  struct object tmp;
> +  __atomic_load (&(ob->s.b), &(tmp.s.b), __ATOMIC_RELAXED);
> +  return tmp.s.b.sorted;
> +}
> +
> +#endif
> +
>   const fde *
>   _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
>   {
> @@ -1141,6 +1156,21 @@ _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases 
> *bases)
>     if (!ob)
>       return NULL;
> 
> +  // Initialize the object lazily
> +  if (!is_object_initialized (ob))
> +    {
> +      // Check again under mutex
> +      init_object_mutex_once ();
> +      __gthread_mutex_lock (&object_mutex);
> +
> +      if (!ob->s.b.sorted)
> +    {
> +      init_object (ob);
> +    }
> +
> +      __gthread_mutex_unlock (&object_mutex);
> +    }
> +
>     f = search_object (ob, pc);
>   #else
>
  

Patch

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 3c0cc654ec0..6f69c20ff4b 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -63,8 +63,6 @@  release_registered_frames (void)
  
  static void
  get_pc_range (const struct object *ob, uintptr_type *range);
-static void
-init_object (struct object *ob);
  
  #else
  /* Without fast path frame deregistration must always succeed.  */
@@ -76,6 +74,7 @@  static const int in_shutdown = 0;
     by decreasing value of pc_begin.  */
  static struct object *unseen_objects;
  static struct object *seen_objects;
+#endif
  
  #ifdef __GTHREAD_MUTEX_INIT
  static __gthread_mutex_t object_mutex = __GTHREAD_MUTEX_INIT;
@@ -103,7 +102,6 @@  init_object_mutex_once (void)
  static __gthread_mutex_t object_mutex;
  #endif
  #endif
-#endif
  
  /* Called from crtbegin.o to register the unwind info for an object.  */
  
@@ -126,10 +124,7 @@  __register_frame_info_bases (const void *begin, struct object *ob,
  #endif
  
  #ifdef ATOMIC_FDE_FAST_PATH
-  // Initialize eagerly to avoid locking later
-  init_object (ob);
-
-  // And register the frame
+  // Register the frame in the b-tree
    uintptr_type range[2];
    get_pc_range (ob, range);
    btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
@@ -180,10 +175,7 @@  __register_frame_info_table_bases (void *begin, struct object *ob,
    ob->s.b.encoding = DW_EH_PE_omit;
  
  #ifdef ATOMIC_FDE_FAST_PATH
-  // Initialize eagerly to avoid locking later
-  init_object (ob);
-
-  // And register the frame
+  // Register the frame in the b-tree
    uintptr_type range[2];
    get_pc_range (ob, range);
    btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
@@ -892,7 +884,15 @@  init_object (struct object* ob)
    accu.linear->orig_data = ob->u.single;
    ob->u.sort = accu.linear;
  
+#ifdef ATOMIC_FDE_FAST_PATH
+  // We must update the sorted bit with an atomic operation
+  struct object tmp;
+  tmp.s.b = ob->s.b;
+  tmp.s.b.sorted = 1;
+  __atomic_store (&(ob->s.b), &(tmp.s.b), __ATOMIC_SEQ_CST);
+#else
    ob->s.b.sorted = 1;
+#endif
  }
  
  #ifdef ATOMIC_FDE_FAST_PATH
@@ -1130,6 +1130,21 @@  search_object (struct object* ob, void *pc)
      }
  }
  
+#ifdef ATOMIC_FDE_FAST_PATH
+
+// Check if the object was already initialized
+static inline bool
+is_object_initialized (struct object *ob)
+{
+  // We have to use relaxed atomics for the read, which
+  // is a bit involved as we read from a bitfield
+  struct object tmp;
+  __atomic_load (&(ob->s.b), &(tmp.s.b), __ATOMIC_RELAXED);
+  return tmp.s.b.sorted;
+}
+
+#endif
+
  const fde *
  _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
  {
@@ -1141,6 +1156,21 @@  _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
    if (!ob)
      return NULL;
  
+  // Initialize the object lazily
+  if (!is_object_initialized (ob))
+    {
+      // Check again under mutex
+      init_object_mutex_once ();
+      __gthread_mutex_lock (&object_mutex);
+
+      if (!ob->s.b.sorted)
+	{
+	  init_object (ob);
+	}
+
+      __gthread_mutex_unlock (&object_mutex);
+    }
+
    f = search_object (ob, pc);
  #else