[v14,7/9] nptl: Move the rseq area to the 'extra TLS' block

Message ID 20241121190924.837446-8-mjeanson@efficios.com
State Under Review
Delegated to: Florian Weimer
Headers
Series Add rseq extensible ABI support |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Michael Jeanson Nov. 21, 2024, 7:08 p.m. UTC
  Move the rseq area to the newly added 'extra TLS' block. This is the
last step in adding support for the rseq extended ABI. The size of the
rseq area is now dynamic and depends on the rseq features reported by
the kernel through the ELF auxiliary vector. This will allow
applications to use rseq features past the 32 bytes of the original rseq
ABI as they become available in future kernels.

Signed-off-by: Michael Jeanson <mjeanson@efficios.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
Changes since v12:
- Style nits, missing spaces before '(' in function macro/calls
- Add comment and flexible array member to 'struct rseq_area'
- Comment and style nits in tst-rseq-disable.c
Changes since v11:
- __rseq_size is now set directly in _dl_parse_auxv; it is set to 0 when
  the main thread registration fails or rseq is disabled by tunable
---
 nptl/pthread_create.c                         |  2 +-
 sysdeps/nptl/dl-tls_init_tp.c                 |  9 --
 sysdeps/unix/sysv/linux/Makefile              | 10 +++
 sysdeps/unix/sysv/linux/dl-parse_auxv.h       | 12 +--
 sysdeps/unix/sysv/linux/rseq-internal.h       | 49 ++++++++---
 sysdeps/unix/sysv/linux/sched_getcpu.c        |  3 +-
 .../unix/sysv/linux/tst-rseq-disable-static.c |  1 +
 sysdeps/unix/sysv/linux/tst-rseq-disable.c    | 64 +++++++++++---
 .../unix/sysv/linux/tst-rseq-nptl-static.c    |  1 +
 sysdeps/unix/sysv/linux/tst-rseq-static.c     |  1 +
 sysdeps/unix/sysv/linux/tst-rseq.c            | 84 +++++++++++++++----
 sysdeps/unix/sysv/linux/tst-rseq.h            |  2 +-
 12 files changed, 180 insertions(+), 58 deletions(-)
 create mode 100644 sysdeps/unix/sysv/linux/tst-rseq-disable-static.c
 create mode 100644 sysdeps/unix/sysv/linux/tst-rseq-nptl-static.c
 create mode 100644 sysdeps/unix/sysv/linux/tst-rseq-static.c
  

Patch

diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index ef3ec33290..dd11a9fa46 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -696,7 +696,7 @@  __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 
   /* Inherit rseq registration state.  Without seccomp filters, rseq
      registration will either always fail or always succeed.  */
-  if ((int) THREAD_GETMEM_VOLATILE (self, rseq_area.cpu_id) >= 0)
+  if ((int) RSEQ_GETMEM_VOLATILE (cpu_id) >= 0)
     pd->flags |= ATTR_FLAG_DO_RSEQ;
 
   /* Initialize the field for the ID of the thread which is waiting
diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c
index 801b993000..7265e8768d 100644
--- a/sysdeps/nptl/dl-tls_init_tp.c
+++ b/sysdeps/nptl/dl-tls_init_tp.c
@@ -109,15 +109,6 @@  __tls_init_tp (void)
     bool do_rseq = TUNABLE_GET (rseq, int, NULL);
     if (!rseq_register_current_thread (pd, do_rseq))
       _rseq_size = 0;
-
-#ifdef RSEQ_SIG
-    /* This should be a compile-time constant, but the current
-       infrastructure makes it difficult to determine its value.  Not
-       all targets support __thread_pointer, so set __rseq_offset only
-       if the rseq registration may have happened because RSEQ_SIG is
-       defined.  */
-    _rseq_offset = (char *) &pd->rseq_area - (char *) __thread_pointer ();
-#endif
   }
 
   /* Set initial thread's stack block from 0 up to __libc_stack_end.
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 527c7a5ae8..ea0e712011 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -267,6 +267,11 @@  tests-internal += \
   tst-rseq-disable \
   # tests-internal
 
+tests-static += \
+  tst-rseq-disable-static \
+  tst-rseq-static \
+  # tests-static
+
 tests-time64 += \
   tst-adjtimex-time64 \
   tst-clock_adjtime-time64 \
@@ -410,6 +415,7 @@  $(objpfx)tst-sched-consts.out: ../sysdeps/unix/sysv/linux/tst-sched-consts.py
 $(objpfx)tst-sched-consts.out: $(sysdeps-linux-python-deps)
 
 tst-rseq-disable-ENV = GLIBC_TUNABLES=glibc.pthread.rseq=0
+tst-rseq-disable-static-ENV = GLIBC_TUNABLES=glibc.pthread.rseq=0
 
 endif # $(subdir) == misc
 
@@ -675,4 +681,8 @@  tests += \
 tests-internal += \
   tst-rseq-nptl \
   # tests-internal
+
+tests-static += \
+  tst-rseq-nptl-static \
+  # tests-static
 endif
diff --git a/sysdeps/unix/sysv/linux/dl-parse_auxv.h b/sysdeps/unix/sysv/linux/dl-parse_auxv.h
index 5a0ef0ae2a..8657cf7b2f 100644
--- a/sysdeps/unix/sysv/linux/dl-parse_auxv.h
+++ b/sysdeps/unix/sysv/linux/dl-parse_auxv.h
@@ -61,15 +61,9 @@  void _dl_parse_auxv (ElfW(auxv_t) *av, dl_parse_auxv_t auxv_values)
 #endif
 
   /* Get the rseq feature size, with a minimum of RSEQ_AREA_SIZE_INITIAL_USED
-     (20) for kernels that don't have AT_RSEQ_FEATURE_SIZE.  Limit the feature
-     size to RSEQ_AREA_SIZE_MAX_USED (28) which fits the rseq area in 'struct
-     pthread' and represents the maximum feature size of currently released
-     kernels.  Since no kernels currently cross the 32 bytes of the original
-     ABI, the semantics of a feature size of 32 or more are still undetermined.
-     */
-  _rseq_size = MIN (MAX (auxv_values[AT_RSEQ_FEATURE_SIZE],
-                         RSEQ_AREA_SIZE_INITIAL_USED),
-		    RSEQ_AREA_SIZE_MAX_USED);
+     (20) for kernels that don't have AT_RSEQ_FEATURE_SIZE.  */
+  _rseq_size = MAX (auxv_values[AT_RSEQ_FEATURE_SIZE],
+                    RSEQ_AREA_SIZE_INITIAL_USED);
   _rseq_align = MAX (auxv_values[AT_RSEQ_ALIGN], RSEQ_MIN_ALIGN);
 
   DL_PLATFORM_AUXV
diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h
index 8e4a292ffc..eaa065e27b 100644
--- a/sysdeps/unix/sysv/linux/rseq-internal.h
+++ b/sysdeps/unix/sysv/linux/rseq-internal.h
@@ -26,6 +26,30 @@ 
 #include <sys/rseq.h>
 #include <ldsodefs.h>
 #include <thread_pointer.h>
+#include <rseq-access.h>
+
+/* rseq area registered with the kernel.  Use a custom definition here to
+   isolate from the system provided header which could lack some fields of the
+   Extended ABI.
+
+   This is only used to get the field offsets and sizes, it should never be
+   used for direct object allocations.
+
+   Access to fields of the Extended ABI beyond the 20 bytes of the original ABI
+   (after 'flags') must be gated by a check of the feature size.  */
+struct rseq_area
+{
+  /* Original ABI.  */
+  uint32_t cpu_id_start;
+  uint32_t cpu_id;
+  uint64_t rseq_cs;
+  uint32_t flags;
+  /* Extended ABI.  */
+  uint32_t node_id;
+  uint32_t mm_cid;
+  /* Flexible array member to discourage direct object allocations.  */
+  char end[];
+};
 
 /* Minimum size of the rseq area allocation required by the syscall.  The
    actually used rseq feature size may be less (20 bytes initially).  */
@@ -47,10 +71,12 @@  extern size_t _rseq_align attribute_hidden;
 
 /* Size of the active features in the rseq area.
    Populated from the auxiliary vector with a minimum of '20'.
+   Set to '0' on registration failure of the main thread.
    In .data.relro but not yet write-protected.  */
 extern unsigned int _rseq_size attribute_hidden;
 
-/* Offset from the thread pointer to the rseq area.
+/* Offset from the thread pointer to the rseq area, always set to allow
+   checking the registration status by reading the 'cpu_id' field.
    In .data.relro but not yet write-protected.  */
 extern ptrdiff_t _rseq_offset attribute_hidden;
 
@@ -75,34 +101,35 @@  rseq_register_current_thread (struct pthread *self, bool do_rseq)
     {
       unsigned int size =  __rseq_size;
 
+      /* The feature size can be smaller than the minimum rseq area size of 32
+         bytes accepted by the syscall, if this is the case, bump the size of
+         the registration to the minimum. The 'extra TLS' block is always at
+         least 32 bytes. */
       if (size < RSEQ_AREA_SIZE_INITIAL)
-        /* The initial implementation used only 20 bytes out of 32,
-           but still expected size 32.  */
         size = RSEQ_AREA_SIZE_INITIAL;
 
       /* Initialize the rseq fields that are read by the kernel on
          registration, there is no guarantee that struct pthread is
          cleared on all architectures.  */
-      THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_UNINITIALIZED);
-      THREAD_SETMEM (self, rseq_area.cpu_id_start, 0);
-      THREAD_SETMEM (self, rseq_area.rseq_cs, 0);
-      THREAD_SETMEM (self, rseq_area.flags, 0);
+      RSEQ_SETMEM (cpu_id, RSEQ_CPU_ID_UNINITIALIZED);
+      RSEQ_SETMEM (cpu_id_start, 0);
+      RSEQ_SETMEM (rseq_cs, 0);
+      RSEQ_SETMEM (flags, 0);
 
-      int ret = INTERNAL_SYSCALL_CALL (rseq, &self->rseq_area,
-                                       size, 0, RSEQ_SIG);
+      int ret = INTERNAL_SYSCALL_CALL (rseq, RSEQ_SELF (), size, 0, RSEQ_SIG);
       if (!INTERNAL_SYSCALL_ERROR_P (ret))
         return true;
     }
   /* When rseq is disabled by tunables or the registration fails, inform
      userspace by setting 'cpu_id' to RSEQ_CPU_ID_REGISTRATION_FAILED.  */
-  THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED);
+  RSEQ_SETMEM (cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED);
   return false;
 }
 #else /* RSEQ_SIG */
 static inline bool
 rseq_register_current_thread (struct pthread *self, bool do_rseq)
 {
-  THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED);
+  RSEQ_SETMEM (cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED);
   return false;
 }
 #endif /* RSEQ_SIG */
diff --git a/sysdeps/unix/sysv/linux/sched_getcpu.c b/sysdeps/unix/sysv/linux/sched_getcpu.c
index 72a3360550..18a7817148 100644
--- a/sysdeps/unix/sysv/linux/sched_getcpu.c
+++ b/sysdeps/unix/sysv/linux/sched_getcpu.c
@@ -19,6 +19,7 @@ 
 #include <sched.h>
 #include <sysdep.h>
 #include <sysdep-vdso.h>
+#include <rseq-internal.h>
 
 static int
 vsyscall_sched_getcpu (void)
@@ -36,6 +37,6 @@  vsyscall_sched_getcpu (void)
 int
 sched_getcpu (void)
 {
-  int cpu_id = THREAD_GETMEM_VOLATILE (THREAD_SELF, rseq_area.cpu_id);
+  int cpu_id = RSEQ_GETMEM_VOLATILE (cpu_id);
   return __glibc_likely (cpu_id >= 0) ? cpu_id : vsyscall_sched_getcpu ();
 }
diff --git a/sysdeps/unix/sysv/linux/tst-rseq-disable-static.c b/sysdeps/unix/sysv/linux/tst-rseq-disable-static.c
new file mode 100644
index 0000000000..2687d13d3d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-rseq-disable-static.c
@@ -0,0 +1 @@ 
+#include "tst-rseq-disable.c"
diff --git a/sysdeps/unix/sysv/linux/tst-rseq-disable.c b/sysdeps/unix/sysv/linux/tst-rseq-disable.c
index bbc655bec4..31b3444e11 100644
--- a/sysdeps/unix/sysv/linux/tst-rseq-disable.c
+++ b/sysdeps/unix/sysv/linux/tst-rseq-disable.c
@@ -26,32 +26,69 @@ 
 #include <unistd.h>
 
 #ifdef RSEQ_SIG
+# include <sys/auxv.h>
+# include "tst-rseq.h"
+
+/* Used to test private registration with the rseq system call because glibc
+   rseq is disabled.  */
+static __thread struct rseq local_rseq = {
+  .cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED,
+};
 
 /* Check that rseq can be registered and has not been taken by glibc.  */
 static void
 check_rseq_disabled (void)
 {
-  struct pthread *pd = THREAD_SELF;
+  struct rseq *rseq_abi = (struct rseq *) ((char *) __thread_pointer () +
+		           __rseq_offset);
+
+#if TLS_TCB_AT_TP
+  /* The rseq area block should come before the thread pointer and be at least
+     32 bytes. */
+  TEST_VERIFY (__rseq_offset <= -RSEQ_AREA_SIZE_INITIAL);
+#elif TLS_DTV_AT_TP
+  /* The rseq area block should come after the thread pointer. */
+  TEST_VERIFY (__rseq_offset >= 0);
+#else
+# error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
+#endif
 
+  /* __rseq_flags is unused and should always be '0'.  */
   TEST_COMPARE (__rseq_flags, 0);
-  TEST_VERIFY ((char *) __thread_pointer () + __rseq_offset
-               == (char *) &pd->rseq_area);
+
+  /* When rseq is not registered, __rseq_size should always be '0'.  */
   TEST_COMPARE (__rseq_size, 0);
-  TEST_COMPARE ((int) pd->rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED);
 
-  int ret = syscall (__NR_rseq, &pd->rseq_area, sizeof (pd->rseq_area),
-                     0, RSEQ_SIG);
+  /* When rseq is not registered, the 'cpu_id' field should be set to
+     RSEQ_CPU_ID_REGISTRATION_FAILED.  */
+  TEST_COMPARE ((int) rseq_abi->cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED);
+
+  /* Test a rseq registration which should succeed since the internal
+     registration is disabled.  */
+  int ret = syscall (__NR_rseq, &local_rseq, RSEQ_AREA_SIZE_INITIAL, 0, RSEQ_SIG);
   if (ret == 0)
     {
-      ret = syscall (__NR_rseq, &pd->rseq_area, sizeof (pd->rseq_area),
+      /* A successful registration should set the cpu id.  */
+      TEST_VERIFY (local_rseq.cpu_id >= 0);
+
+      /* Test we can also unregister rseq.  */
+      ret = syscall (__NR_rseq, &local_rseq, RSEQ_AREA_SIZE_INITIAL,
                      RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
       TEST_COMPARE (ret, 0);
-      pd->rseq_area.cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED;
     }
   else
     {
-      TEST_VERIFY (errno != -EINVAL);
-      TEST_VERIFY (errno != -EBUSY);
+      /* Check if we failed with EINVAL which would mean an invalid rseq flags,
+         a mis-aligned rseq area address or an incorrect rseq size.  */
+      TEST_VERIFY (errno != EINVAL);
+
+      /* Check if we failed with EBUSY which means an existing rseq
+         registration. */
+      TEST_VERIFY (errno != EBUSY);
+
+      /* Check if we failed with EFAULT which means an invalid rseq area
+         address.  */
+      TEST_VERIFY (errno != EFAULT);
     }
 }
 
@@ -71,6 +108,13 @@  proc_func (void *ignored)
 static int
 do_test (void)
 {
+  printf ("info: __rseq_size: %u\n", __rseq_size);
+  printf ("info: __rseq_offset: %td\n", __rseq_offset);
+  printf ("info: __rseq_flags: %u\n", __rseq_flags);
+  printf ("info: getauxval (AT_RSEQ_FEATURE_SIZE): %ld\n",
+          getauxval (AT_RSEQ_FEATURE_SIZE));
+  printf ("info: getauxval (AT_RSEQ_ALIGN): %ld\n", getauxval (AT_RSEQ_ALIGN));
+
   puts ("info: checking main thread");
   check_rseq_disabled ();
 
diff --git a/sysdeps/unix/sysv/linux/tst-rseq-nptl-static.c b/sysdeps/unix/sysv/linux/tst-rseq-nptl-static.c
new file mode 100644
index 0000000000..6e2c923bb9
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-rseq-nptl-static.c
@@ -0,0 +1 @@ 
+#include "tst-rseq-nptl.c"
diff --git a/sysdeps/unix/sysv/linux/tst-rseq-static.c b/sysdeps/unix/sysv/linux/tst-rseq-static.c
new file mode 100644
index 0000000000..1d97f3bd3d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-rseq-static.c
@@ -0,0 +1 @@ 
+#include "tst-rseq.c"
diff --git a/sysdeps/unix/sysv/linux/tst-rseq.c b/sysdeps/unix/sysv/linux/tst-rseq.c
index d1ae16b953..7bdda17feb 100644
--- a/sysdeps/unix/sysv/linux/tst-rseq.c
+++ b/sysdeps/unix/sysv/linux/tst-rseq.c
@@ -19,6 +19,8 @@ 
    not linked against libpthread.  */
 
 #include <support/check.h>
+#include <support/namespace.h>
+#include <support/xthread.h>
 #include <stdio.h>
 #include <sys/rseq.h>
 #include <unistd.h>
@@ -32,25 +34,69 @@ 
 # include <sys/auxv.h>
 # include <thread_pointer.h>
 # include <tls.h>
+# include <sys/auxv.h>
 # include "tst-rseq.h"
 
 static void
 do_rseq_main_test (void)
 {
-  struct pthread *pd = THREAD_SELF;
-  size_t rseq_feature_size = MIN (MAX (getauxval (AT_RSEQ_FEATURE_SIZE),
-                                       RSEQ_AREA_SIZE_INITIAL_USED),
-                                  RSEQ_AREA_SIZE_MAX_USED);
+  size_t rseq_align = MAX (getauxval (AT_RSEQ_ALIGN), RSEQ_MIN_ALIGN);
+  size_t rseq_feature_size = MAX (getauxval (AT_RSEQ_FEATURE_SIZE),
+                                  RSEQ_AREA_SIZE_INITIAL_USED);
+  size_t rseq_alloc_size = roundup (MAX (rseq_feature_size,
+                                    RSEQ_AREA_SIZE_INITIAL_USED), rseq_align);
+  struct rseq *rseq_abi = __thread_pointer () + __rseq_offset;
 
   TEST_VERIFY_EXIT (rseq_thread_registered ());
+
+  /* __rseq_flags is unused and should always be '0'.  */
   TEST_COMPARE (__rseq_flags, 0);
-  TEST_VERIFY ((char *) __thread_pointer () + __rseq_offset
-               == (char *) &pd->rseq_area);
+
+  /* When rseq is registered, __rseq_size should report the feature size.  */
   TEST_COMPARE (__rseq_size, rseq_feature_size);
+
+  /* When rseq is registered, the 'cpu_id' field should be set to a valid cpu
+   * number.  */
+  TEST_VERIFY ((int32_t) rseq_abi->cpu_id >= 0);
+
+  /* The rseq area address must be aligned.  */
+  TEST_VERIFY (((unsigned long) rseq_abi % rseq_align) == 0);
+
+#if TLS_TCB_AT_TP
+  /* The rseq area block should come before the thread pointer and be at least
+     32 bytes. */
+  TEST_VERIFY (__rseq_offset <= -RSEQ_AREA_SIZE_INITIAL);
+#elif TLS_DTV_AT_TP
+  /* The rseq area block should come after the thread pointer. */
+  TEST_VERIFY (__rseq_offset >= 0);
+#else
+# error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
+#endif
+
+  /* Test a rseq registration with the same arguments as the internal
+     registration which should fail with errno == EBUSY.  */
+  TEST_VERIFY (((unsigned long) rseq_abi % rseq_align) == 0);
+  TEST_VERIFY (__rseq_size <= rseq_alloc_size);
+  int ret = syscall (__NR_rseq, rseq_abi, rseq_alloc_size, 0, RSEQ_SIG);
+  TEST_VERIFY (ret != 0);
+  TEST_COMPARE (errno, EBUSY);
+}
+
+static void *
+thread_func (void *ignored)
+{
+  do_rseq_main_test ();
+  return NULL;
 }
 
 static void
-do_rseq_test (void)
+proc_func (void *ignored)
+{
+  do_rseq_main_test ();
+}
+
+static int
+do_test (void)
 {
   if (!rseq_available ())
     {
@@ -62,21 +108,27 @@  do_rseq_test (void)
   printf ("info: getauxval (AT_RSEQ_FEATURE_SIZE): %ld\n",
           getauxval (AT_RSEQ_FEATURE_SIZE));
   printf ("info: getauxval (AT_RSEQ_ALIGN): %ld\n", getauxval (AT_RSEQ_ALIGN));
+
+  puts ("info: checking main thread");
+  do_rseq_main_test ();
+
+  puts ("info: checking main thread (2)");
   do_rseq_main_test ();
+
+  puts ("info: checking new thread");
+  xpthread_join (xpthread_create (NULL, thread_func, NULL));
+
+  puts ("info: checking subprocess");
+  support_isolate_in_subprocess (proc_func, NULL);
+
+  return 0;
 }
 #else /* RSEQ_SIG */
-static void
-do_rseq_test (void)
-{
-  FAIL_UNSUPPORTED ("glibc does not define RSEQ_SIG, skipping test");
-}
-#endif /* RSEQ_SIG */
-
 static int
 do_test (void)
 {
-  do_rseq_test ();
-  return 0;
+  FAIL_UNSUPPORTED ("glibc does not define RSEQ_SIG, skipping test");
 }
+#endif /* RSEQ_SIG */
 
 #include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/tst-rseq.h b/sysdeps/unix/sysv/linux/tst-rseq.h
index 7f82554e83..75dd8efb2e 100644
--- a/sysdeps/unix/sysv/linux/tst-rseq.h
+++ b/sysdeps/unix/sysv/linux/tst-rseq.h
@@ -28,7 +28,7 @@ 
 static inline bool
 rseq_thread_registered (void)
 {
-  return THREAD_GETMEM_VOLATILE (THREAD_SELF, rseq_area.cpu_id) >= 0;
+  return RSEQ_GETMEM_VOLATILE (cpu_id) >= 0;
 }
 
 static inline int