[RFC] pthread_create: Name stacks in /proc/<pid>/maps

Message ID 20230927034527.1648061-1-irogers@google.com
State Superseded
Headers
Series [RFC] pthread_create: Name stacks in /proc/<pid>/maps |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_check--master-arm fail Testing failed
redhat-pt-bot/TryBot-32bit fail Patch caused testsuite regressions
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 fail Testing failed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 fail Testing failed
linaro-tcwg-bot/tcwg_glibc_build--master-arm fail Testing failed

Commit Message

Ian Rogers Sept. 27, 2023, 3:45 a.m. UTC
  Linux 4.5 removed thread stack annotations due to the complexity of
computing them:
https://github.com/torvalds/linux/commit/65376df582174ffcec9e6471bf5b0dd79ba05e4a

This untested (other than building) RFC patch uses
PR_SET_VMA_ANON_NAME to name stacks again as part of thread
creation. PR_SET_VMA_ANON_NAME was added in:
https://lore.kernel.org/lkml/1383170047-21074-2-git-send-email-ccross@android.com/

The patch is intended to create discussion and possibly serve as an
implementation. When I try to test it current I get unrelated
failures and so guidance would be appreciated.

The naming of stacks can be useful in situations like debugging and
profiling, for example, to differentiate stack from heap memory accesses.
---
 nptl/allocatestack.c  | 28 ++++++++++++++++++++++++++++
 nptl/pthread_create.c |  5 +++++
 2 files changed, 33 insertions(+)
  

Comments

Adhemerval Zanella Netto Sept. 27, 2023, 5:55 p.m. UTC | #1
On 27/09/23 00:45, Ian Rogers wrote:
> Linux 4.5 removed thread stack annotations due to the complexity of
> computing them:
> https://github.com/torvalds/linux/commit/65376df582174ffcec9e6471bf5b0dd79ba05e4a
> 
> This untested (other than building) RFC patch uses
> PR_SET_VMA_ANON_NAME to name stacks again as part of thread
> creation. PR_SET_VMA_ANON_NAME was added in:
> https://lore.kernel.org/lkml/1383170047-21074-2-git-send-email-ccross@android.com/

I think referring to this patch is somewhat confusing because this is a
old version with a slight different API, which has been fixed on the
final kABI (the user-provided string is just a pointer to user process,
while on installed version the string is copied and kept by the kernel).

So I think it would be better to reference to the actual kernel commit
9a10064f5625d5572c3626c1516e0bebc6c9fe9b (added on 5.17).

> 
> The patch is intended to create discussion and possibly serve as an
> implementation. When I try to test it current I get unrelated
> failures and so guidance would be appreciated.
> 
> The naming of stacks can be useful in situations like debugging and
> profiling, for example, to differentiate stack from heap memory accesses.

An extra prctl on each thread creation should be fine, although I am not 
sure about other performance implications.  It seems that the fork performance 
issues is already handled by the kernel, so adding this as default feature would
only add some contention on extra mmap creation during thread creation (since
this pcrtl requires the mmap lock). If this incurs in some performance issues
in heavily content mmap workloads, it might be better to add a way to disable
it through a tunable.

> ---
>  nptl/allocatestack.c  | 28 ++++++++++++++++++++++++++++
>  nptl/pthread_create.c |  5 +++++
>  2 files changed, 33 insertions(+)
> 
> diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
> index f9d8cdfd08..bf8297950f 100644
> --- a/nptl/allocatestack.c
> +++ b/nptl/allocatestack.c
> @@ -23,6 +23,7 @@
>  #include <unistd.h>
>  #include <sys/mman.h>
>  #include <sys/param.h>
> +#include <sys/prctl.h>
>  #include <dl-sysdep.h>
>  #include <dl-tls.h>
>  #include <tls.h>
> @@ -577,3 +578,30 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>  
>    return 0;
>  }
> +
> +#if defined(PR_SET_VMA) && defined(PR_SET_VMA_ANON_NAME)

I think this is not ideal, this support should not be conditional based 
on installed kernel support (similar on how we handle syscalls now).
Instead, define them for internal used if linux/prctl.h does not define
them:

diff --git a/include/sys/prctl.h b/include/sys/prctl.h
index d33f3a290e..971f73914a 100644
--- a/include/sys/prctl.h
+++ b/include/sys/prctl.h
@@ -3,6 +3,11 @@

 # ifndef _ISOMAC

+#  ifndef PR_SET_VMA
+#   define PR_SET_VMA          0x53564d41
+#   define PR_SET_VMA_ANON_NAME 0
+#  endif
+
 extern int __prctl (int __option, ...);
 libc_hidden_proto (__prctl)

It would fix the !PR_SET_VMA case as well (triggered by the aarch64 bot [2]).

> +/* Use PR_SET_VMA_ANON_NAME to name the thread's stack stack:<tid> */
> +static void
> +name_stack_maps (struct pthread *pd, bool set)
> +{
> +  void *stack = THREAD_GETMEM (pd, stackblock);
> +  size_t stacksize = THREAD_GETMEM (pd, stackblock_size);
> +  if (!set) {
> +    __prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, stack, stacksize, NULL);
> +    return;
> +  } else {

Style issues, curly bracket in a new line, space after the function
call, no need to add brackets for one line call:

  if (!set)
    __prctl (PR_SET_VMA, PR_SET_VMA_ANON_NAME, stack, stacksize, NULL);
  else
    {
      [...];
    }

> +#define STACK_MAPS_PREFIX "stack:"
> +    char stack_name[sizeof(STACK_MAPS_PREFIX)+11];

We have INT_BUFSIZE_BOUND macro to get the expected bounds for formatting
integers:

  char stack_name[sizeof (STACK_MAPS_PREFIX)
                  + INT_BUFSIZE_BOUND (unsigned int)];

> +
> +    snprintf(stack_name, sizeof(stack_name), STACK_MAPS_PREFIX "%u",
> +             (unsigned)THREAD_GETMEM (pd, tid));

You need to use the internal definition (__snprintf) to avoid linknamespace
issues (this what caused patchwork buildbot to mark this patch as failed [1]).

Also always use the full type (unsigned int), I think printing as unsigned
should be ok (since at the time of call the thread id is always valid).

> +    __prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, stack, stacksize, stack_name);
> +#undef STACK_MAPS_PREFIX
> +  }
> +}
> +#else /* defined(PR_SET_VMA) && defined(PR_SET_VMA_ANON_NAME) */
> +static void
> +name_stack_maps (struct pthread *pd)
> +{
> +}
> +#endif
> diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
> index 6a41d50109..0e36d66f50 100644
> --- a/nptl/pthread_create.c
> +++ b/nptl/pthread_create.c
> @@ -369,6 +369,9 @@ start_thread (void *arg)
>    /* Initialize pointers to locale data.  */
>    __ctype_init ();
>  
> +  /* Set up /proc/<pid>/maps entry for stack to stack:tid. */
> +  name_stack_maps(pd, true);

Style issue, space after function name.

> +
>    /* Register rseq TLS to the kernel.  */
>    {
>      bool do_rseq = THREAD_GETMEM (pd, flags) & ATTR_FLAG_DO_RSEQ;
> @@ -571,6 +574,8 @@ start_thread (void *arg)
>      /* Free the TCB.  */
>      __nptl_free_tcb (pd);
>  
> +  /* Release memory for /proc/<pid>/maps. */

This comment is misleading, it releases the mapping name of associated
map, not the map itself.  And since the name allocation is an implementation
detail of the kABI, I think the comment should say the the mapping between
thread and a identifier is removed since it uses the thread id and the
same stack might be used by a different thread in the future.

> +  name_stack_maps(pd, false);
>  out:
>    /* We cannot call '_exit' here.  '_exit' will terminate the process.
>  


[1] https://patchwork.sourceware.org/project/glibc/patch/20230927034527.1648061-1-irogers@google.com/
[2] https://ci.linaro.org/job/tcwg_glibc_build--master-aarch64-precommit/768/artifact/artifacts/artifacts.precommit/notify/mail-body.txt
  
Ian Rogers Sept. 27, 2023, 7:07 p.m. UTC | #2
On Wed, Sep 27, 2023 at 10:56 AM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 27/09/23 00:45, Ian Rogers wrote:
> > Linux 4.5 removed thread stack annotations due to the complexity of
> > computing them:
> > https://github.com/torvalds/linux/commit/65376df582174ffcec9e6471bf5b0dd79ba05e4a
> >
> > This untested (other than building) RFC patch uses
> > PR_SET_VMA_ANON_NAME to name stacks again as part of thread
> > creation. PR_SET_VMA_ANON_NAME was added in:
> > https://lore.kernel.org/lkml/1383170047-21074-2-git-send-email-ccross@android.com/
>
> I think referring to this patch is somewhat confusing because this is a
> old version with a slight different API, which has been fixed on the
> final kABI (the user-provided string is just a pointer to user process,
> while on installed version the string is copied and kept by the kernel).
>
> So I think it would be better to reference to the actual kernel commit
> 9a10064f5625d5572c3626c1516e0bebc6c9fe9b (added on 5.17).
>
> >
> > The patch is intended to create discussion and possibly serve as an
> > implementation. When I try to test it current I get unrelated
> > failures and so guidance would be appreciated.
> >
> > The naming of stacks can be useful in situations like debugging and
> > profiling, for example, to differentiate stack from heap memory accesses.
>
> An extra prctl on each thread creation should be fine, although I am not
> sure about other performance implications.  It seems that the fork performance
> issues is already handled by the kernel, so adding this as default feature would
> only add some contention on extra mmap creation during thread creation (since
> this pcrtl requires the mmap lock). If this incurs in some performance issues
> in heavily content mmap workloads, it might be better to add a way to disable
> it through a tunable.

Thanks! Should the work to add a tunable be followed up based on an
observed problem? The mmap lock contention issue may well be resolved:
https://lwn.net/Articles/906852/
in which case a tunable will be baggage.

I'll send a fresh RFC shortly. Thanks for catching the many style and
other issues, I've never posted here before and the help is very much
appreciated!
Ian


Ian


> > ---
> >  nptl/allocatestack.c  | 28 ++++++++++++++++++++++++++++
> >  nptl/pthread_create.c |  5 +++++
> >  2 files changed, 33 insertions(+)
> >
> > diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
> > index f9d8cdfd08..bf8297950f 100644
> > --- a/nptl/allocatestack.c
> > +++ b/nptl/allocatestack.c
> > @@ -23,6 +23,7 @@
> >  #include <unistd.h>
> >  #include <sys/mman.h>
> >  #include <sys/param.h>
> > +#include <sys/prctl.h>
> >  #include <dl-sysdep.h>
> >  #include <dl-tls.h>
> >  #include <tls.h>
> > @@ -577,3 +578,30 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
> >
> >    return 0;
> >  }
> > +
> > +#if defined(PR_SET_VMA) && defined(PR_SET_VMA_ANON_NAME)
>
> I think this is not ideal, this support should not be conditional based
> on installed kernel support (similar on how we handle syscalls now).
> Instead, define them for internal used if linux/prctl.h does not define
> them:
>
> diff --git a/include/sys/prctl.h b/include/sys/prctl.h
> index d33f3a290e..971f73914a 100644
> --- a/include/sys/prctl.h
> +++ b/include/sys/prctl.h
> @@ -3,6 +3,11 @@
>
>  # ifndef _ISOMAC
>
> +#  ifndef PR_SET_VMA
> +#   define PR_SET_VMA          0x53564d41
> +#   define PR_SET_VMA_ANON_NAME 0
> +#  endif
> +
>  extern int __prctl (int __option, ...);
>  libc_hidden_proto (__prctl)
>
> It would fix the !PR_SET_VMA case as well (triggered by the aarch64 bot [2]).
>
> > +/* Use PR_SET_VMA_ANON_NAME to name the thread's stack stack:<tid> */
> > +static void
> > +name_stack_maps (struct pthread *pd, bool set)
> > +{
> > +  void *stack = THREAD_GETMEM (pd, stackblock);
> > +  size_t stacksize = THREAD_GETMEM (pd, stackblock_size);
> > +  if (!set) {
> > +    __prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, stack, stacksize, NULL);
> > +    return;
> > +  } else {
>
> Style issues, curly bracket in a new line, space after the function
> call, no need to add brackets for one line call:
>
>   if (!set)
>     __prctl (PR_SET_VMA, PR_SET_VMA_ANON_NAME, stack, stacksize, NULL);
>   else
>     {
>       [...];
>     }
>
> > +#define STACK_MAPS_PREFIX "stack:"
> > +    char stack_name[sizeof(STACK_MAPS_PREFIX)+11];
>
> We have INT_BUFSIZE_BOUND macro to get the expected bounds for formatting
> integers:
>
>   char stack_name[sizeof (STACK_MAPS_PREFIX)
>                   + INT_BUFSIZE_BOUND (unsigned int)];
>
> > +
> > +    snprintf(stack_name, sizeof(stack_name), STACK_MAPS_PREFIX "%u",
> > +             (unsigned)THREAD_GETMEM (pd, tid));
>
> You need to use the internal definition (__snprintf) to avoid linknamespace
> issues (this what caused patchwork buildbot to mark this patch as failed [1]).
>
> Also always use the full type (unsigned int), I think printing as unsigned
> should be ok (since at the time of call the thread id is always valid).
>
> > +    __prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, stack, stacksize, stack_name);
> > +#undef STACK_MAPS_PREFIX
> > +  }
> > +}
> > +#else /* defined(PR_SET_VMA) && defined(PR_SET_VMA_ANON_NAME) */
> > +static void
> > +name_stack_maps (struct pthread *pd)
> > +{
> > +}
> > +#endif
> > diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
> > index 6a41d50109..0e36d66f50 100644
> > --- a/nptl/pthread_create.c
> > +++ b/nptl/pthread_create.c
> > @@ -369,6 +369,9 @@ start_thread (void *arg)
> >    /* Initialize pointers to locale data.  */
> >    __ctype_init ();
> >
> > +  /* Set up /proc/<pid>/maps entry for stack to stack:tid. */
> > +  name_stack_maps(pd, true);
>
> Style issue, space after function name.
>
> > +
> >    /* Register rseq TLS to the kernel.  */
> >    {
> >      bool do_rseq = THREAD_GETMEM (pd, flags) & ATTR_FLAG_DO_RSEQ;
> > @@ -571,6 +574,8 @@ start_thread (void *arg)
> >      /* Free the TCB.  */
> >      __nptl_free_tcb (pd);
> >
> > +  /* Release memory for /proc/<pid>/maps. */
>
> This comment is misleading, it releases the mapping name of associated
> map, not the map itself.  And since the name allocation is an implementation
> detail of the kABI, I think the comment should say the the mapping between
> thread and a identifier is removed since it uses the thread id and the
> same stack might be used by a different thread in the future.
>
> > +  name_stack_maps(pd, false);
> >  out:
> >    /* We cannot call '_exit' here.  '_exit' will terminate the process.
> >
>
>
> [1] https://patchwork.sourceware.org/project/glibc/patch/20230927034527.1648061-1-irogers@google.com/
> [2] https://ci.linaro.org/job/tcwg_glibc_build--master-aarch64-precommit/768/artifact/artifacts/artifacts.precommit/notify/mail-body.txt
  
Adhemerval Zanella Netto Sept. 27, 2023, 7:33 p.m. UTC | #3
On 27/09/23 16:07, Ian Rogers wrote:
> On Wed, Sep 27, 2023 at 10:56 AM Adhemerval Zanella Netto
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 27/09/23 00:45, Ian Rogers wrote:
>>> Linux 4.5 removed thread stack annotations due to the complexity of
>>> computing them:
>>> https://github.com/torvalds/linux/commit/65376df582174ffcec9e6471bf5b0dd79ba05e4a
>>>
>>> This untested (other than building) RFC patch uses
>>> PR_SET_VMA_ANON_NAME to name stacks again as part of thread
>>> creation. PR_SET_VMA_ANON_NAME was added in:
>>> https://lore.kernel.org/lkml/1383170047-21074-2-git-send-email-ccross@android.com/
>>
>> I think referring to this patch is somewhat confusing because this is a
>> old version with a slight different API, which has been fixed on the
>> final kABI (the user-provided string is just a pointer to user process,
>> while on installed version the string is copied and kept by the kernel).
>>
>> So I think it would be better to reference to the actual kernel commit
>> 9a10064f5625d5572c3626c1516e0bebc6c9fe9b (added on 5.17).
>>
>>>
>>> The patch is intended to create discussion and possibly serve as an
>>> implementation. When I try to test it current I get unrelated
>>> failures and so guidance would be appreciated.
>>>
>>> The naming of stacks can be useful in situations like debugging and
>>> profiling, for example, to differentiate stack from heap memory accesses.
>>
>> An extra prctl on each thread creation should be fine, although I am not
>> sure about other performance implications.  It seems that the fork performance
>> issues is already handled by the kernel, so adding this as default feature would
>> only add some contention on extra mmap creation during thread creation (since
>> this pcrtl requires the mmap lock). If this incurs in some performance issues
>> in heavily content mmap workloads, it might be better to add a way to disable
>> it through a tunable.
> 
> Thanks! Should the work to add a tunable be followed up based on an
> observed problem? The mmap lock contention issue may well be resolved:
> https://lwn.net/Articles/906852/
> in which case a tunable will be baggage.

Right, but this is not yet fixed if I understand correctly.  Also, we
do not consider the tunables de-facto ABI, so we are free to change over 
releases.

But I am bringing this only because I am not sure if this would really
be a performance issues; in fact I think this would be an issues only
on very specific workloads (highly concurrent that issues a lot of
mmap/munmap calls). 

> 
> I'll send a fresh RFC shortly. Thanks for catching the many style and
> other issues, I've never posted here before and the help is very much
> appreciated!
> Ian

There are a couple of other style issues, I will reply there.
  

Patch

diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index f9d8cdfd08..bf8297950f 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -23,6 +23,7 @@ 
 #include <unistd.h>
 #include <sys/mman.h>
 #include <sys/param.h>
+#include <sys/prctl.h>
 #include <dl-sysdep.h>
 #include <dl-tls.h>
 #include <tls.h>
@@ -577,3 +578,30 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 
   return 0;
 }
+
+#if defined(PR_SET_VMA) && defined(PR_SET_VMA_ANON_NAME)
+/* Use PR_SET_VMA_ANON_NAME to name the thread's stack stack:<tid> */
+static void
+name_stack_maps (struct pthread *pd, bool set)
+{
+  void *stack = THREAD_GETMEM (pd, stackblock);
+  size_t stacksize = THREAD_GETMEM (pd, stackblock_size);
+  if (!set) {
+    __prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, stack, stacksize, NULL);
+    return;
+  } else {
+#define STACK_MAPS_PREFIX "stack:"
+    char stack_name[sizeof(STACK_MAPS_PREFIX)+11];
+
+    snprintf(stack_name, sizeof(stack_name), STACK_MAPS_PREFIX "%u",
+             (unsigned)THREAD_GETMEM (pd, tid));
+    __prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, stack, stacksize, stack_name);
+#undef STACK_MAPS_PREFIX
+  }
+}
+#else /* defined(PR_SET_VMA) && defined(PR_SET_VMA_ANON_NAME) */
+static void
+name_stack_maps (struct pthread *pd)
+{
+}
+#endif
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 6a41d50109..0e36d66f50 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -369,6 +369,9 @@  start_thread (void *arg)
   /* Initialize pointers to locale data.  */
   __ctype_init ();
 
+  /* Set up /proc/<pid>/maps entry for stack to stack:tid. */
+  name_stack_maps(pd, true);
+
   /* Register rseq TLS to the kernel.  */
   {
     bool do_rseq = THREAD_GETMEM (pd, flags) & ATTR_FLAG_DO_RSEQ;
@@ -571,6 +574,8 @@  start_thread (void *arg)
     /* Free the TCB.  */
     __nptl_free_tcb (pd);
 
+  /* Release memory for /proc/<pid>/maps. */
+  name_stack_maps(pd, false);
 out:
   /* We cannot call '_exit' here.  '_exit' will terminate the process.