[1/8] nptl: Perform signal initialization upon pthread_create

Message ID 4bbcddccf8d3c13f1f9a9032920c71918230d389.1621600831.git.fweimer@redhat.com
State Committed
Commit 2f69522d460611b1018e15df6c238dda2d8d6609
Headers
Series nptl: Complete libpthread move |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Florian Weimer May 21, 2021, 12:45 p.m. UTC
  Install signal handlers and unblock signals before pthread_create
creates the first thread.

create_thread in sysdeps/unix/sysv/linux/createthread.c can send
SIGCANCEL to the current thread, so the SIGCANCEL handler is currently
needed even if pthread_cancel is never called.  (The way timer_create
uses SIGCANCEL does not need a signal handler; both SIG_DFL and SIG_IGN
dispositions should work.)
---
 nptl/Versions         |  5 ++-
 nptl/nptl-init.c      | 75 ------------------------------------
 nptl/pthreadP.h       |  6 +++
 nptl/pthread_cancel.c | 88 ++++++++++++++++++++++++++++++++++++++-----
 nptl/pthread_create.c | 45 +++++++++++++++++++++-
 5 files changed, 131 insertions(+), 88 deletions(-)
  

Comments

Adhemerval Zanella Netto May 21, 2021, 8:03 p.m. UTC | #1
On 21/05/2021 09:45, Florian Weimer via Libc-alpha wrote:
> Install signal handlers and unblock signals before pthread_create
> creates the first thread.
> 
> create_thread in sysdeps/unix/sysv/linux/createthread.c can send
> SIGCANCEL to the current thread, so the SIGCANCEL handler is currently
> needed even if pthread_cancel is never called.  (The way timer_create
> uses SIGCANCEL does not need a signal handler; both SIG_DFL and SIG_IGN
> dispositions should work.)

LGTM, thanks.  I have only a question about the PTHREAD_STATIC_FN_REQUIRE
addition.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> ---
>  nptl/Versions         |  5 ++-
>  nptl/nptl-init.c      | 75 ------------------------------------
>  nptl/pthreadP.h       |  6 +++
>  nptl/pthread_cancel.c | 88 ++++++++++++++++++++++++++++++++++++++-----
>  nptl/pthread_create.c | 45 +++++++++++++++++++++-
>  5 files changed, 131 insertions(+), 88 deletions(-)
> 
> diff --git a/nptl/Versions b/nptl/Versions
> index e7883cbb49..d96b830d05 100644
> --- a/nptl/Versions
> +++ b/nptl/Versions
> @@ -367,8 +367,6 @@ libc {
>      tss_set;
>    }
>    GLIBC_PRIVATE {
> -     __nptl_create_event;
> -     __nptl_death_event;
>      __default_pthread_attr;
>      __default_pthread_attr_lock;
>      __futex_abstimed_wait64;
> @@ -386,11 +384,14 @@ libc {
>      __lll_trylock_elision;
>      __lll_unlock_elision;
>      __mutex_aconf;
> +    __nptl_create_event;
>      __nptl_deallocate_stack;
>      __nptl_deallocate_tsd;
> +    __nptl_death_event;
>      __nptl_free_tcb;
>      __nptl_nthreads;
>      __nptl_setxid_sighandler;
> +    __nptl_sigcancel_handler;
>      __nptl_stack_list_add;
>      __nptl_stack_list_del;
>      __pthread_attr_copy;

Ok.

> diff --git a/nptl/nptl-init.c b/nptl/nptl-init.c
> index f4b86fbfaf..bc4831ac89 100644
> --- a/nptl/nptl-init.c
> +++ b/nptl/nptl-init.c
> @@ -44,84 +44,9 @@ size_t __static_tls_align_m1;
>  /* Version of the library, used in libthread_db to detect mismatches.  */
>  static const char nptl_version[] __attribute_used__ = VERSION;
>  
> -/* For asynchronous cancellation we use a signal.  This is the handler.  */
> -static void
> -sigcancel_handler (int sig, siginfo_t *si, void *ctx)
> -{
> -  /* Safety check.  It would be possible to call this function for
> -     other signals and send a signal from another process.  This is not
> -     correct and might even be a security problem.  Try to catch as
> -     many incorrect invocations as possible.  */
> -  if (sig != SIGCANCEL
> -      || si->si_pid != __getpid()
> -      || si->si_code != SI_TKILL)
> -    return;
> -
> -  struct pthread *self = THREAD_SELF;
> -
> -  int oldval = THREAD_GETMEM (self, cancelhandling);
> -  while (1)
> -    {
> -      /* We are canceled now.  When canceled by another thread this flag
> -	 is already set but if the signal is directly send (internally or
> -	 from another process) is has to be done here.  */
> -      int newval = oldval | CANCELING_BITMASK | CANCELED_BITMASK;
> -
> -      if (oldval == newval || (oldval & EXITING_BITMASK) != 0)
> -	/* Already canceled or exiting.  */
> -	break;
> -
> -      int curval = THREAD_ATOMIC_CMPXCHG_VAL (self, cancelhandling, newval,
> -					      oldval);
> -      if (curval == oldval)
> -	{
> -	  /* Set the return value.  */
> -	  THREAD_SETMEM (self, result, PTHREAD_CANCELED);
> -
> -	  /* Make sure asynchronous cancellation is still enabled.  */
> -	  if ((newval & CANCELTYPE_BITMASK) != 0)
> -	    /* Run the registered destructors and terminate the thread.  */
> -	    __do_cancel ();
> -
> -	  break;
> -	}
> -
> -      oldval = curval;
> -    }
> -}
> -
> -
> -/* When using __thread for this, we do it in libc so as not
> -   to give libpthread its own TLS segment just for this.  */
> -extern void **__libc_dl_error_tsd (void) __attribute__ ((const));
> -
> -
>  void
>  __pthread_initialize_minimal_internal (void)
>  {
> -  struct sigaction sa;
> -  __sigemptyset (&sa.sa_mask);
> -
> -  /* Install the cancellation signal handler.  If for some reason we
> -     cannot install the handler we do not abort.  Maybe we should, but
> -     it is only asynchronous cancellation which is affected.  */
> -  sa.sa_sigaction = sigcancel_handler;
> -  sa.sa_flags = SA_SIGINFO;
> -  (void) __libc_sigaction (SIGCANCEL, &sa, NULL);
> -
> -  /* Install the handle to change the threads' uid/gid.  */
> -  sa.sa_sigaction = __nptl_setxid_sighandler;
> -  sa.sa_flags = SA_SIGINFO | SA_RESTART;
> -  (void) __libc_sigaction (SIGSETXID, &sa, NULL);
> -
> -  /* The parent process might have left the signals blocked.  Just in
> -     case, unblock it.  We reuse the signal mask in the sigaction
> -     structure.  It is already cleared.  */
> -  __sigaddset (&sa.sa_mask, SIGCANCEL);
> -  __sigaddset (&sa.sa_mask, SIGSETXID);
> -  INTERNAL_SYSCALL_CALL (rt_sigprocmask, SIG_UNBLOCK, &sa.sa_mask,
> -			 NULL, __NSIG_BYTES);
> -
>    /* Get the size of the static and alignment requirements for the TLS
>       block.  */
>    size_t static_tls_align;

Ok.

> diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
> index f93806e540..497c2ad3d9 100644
> --- a/nptl/pthreadP.h
> +++ b/nptl/pthreadP.h
> @@ -571,6 +571,12 @@ libc_hidden_proto (__pthread_attr_setsigmask_internal)
>  extern __typeof (pthread_attr_getsigmask_np) __pthread_attr_getsigmask_np;
>  libc_hidden_proto (__pthread_attr_getsigmask_np)
>  
> +/* The cancellation signal handler defined alongside with
> +   pthread_cancel.  This is included in statically linked binaries
> +   only if pthread_cancel is linked in.  */
> +void __nptl_sigcancel_handler (int sig, siginfo_t *si, void *ctx);
> +libc_hidden_proto (__nptl_sigcancel_handler)
> +
>  /* Special versions which use non-exported functions.  */
>  extern void __pthread_cleanup_push (struct _pthread_cleanup_buffer *buffer,
>  				    void (*routine) (void *), void *arg);

Ok.

> diff --git a/nptl/pthread_cancel.c b/nptl/pthread_cancel.c
> index e4ad602900..802c691874 100644
> --- a/nptl/pthread_cancel.c
> +++ b/nptl/pthread_cancel.c
> @@ -26,6 +26,63 @@
>  #include <unwind-link.h>
>  #include <stdio.h>
>  #include <gnu/lib-names.h>
> +#include <sys/single_threaded.h>
> +
> +/* For asynchronous cancellation we use a signal.  This is the core
> +   logic of the signal handler.  */
> +static void
> +sigcancel_handler (void)
> +{
> +  struct pthread *self = THREAD_SELF;
> +
> +  int oldval = THREAD_GETMEM (self, cancelhandling);
> +  while (1)
> +    {
> +      /* We are canceled now.  When canceled by another thread this flag
> +	 is already set but if the signal is directly send (internally or
> +	 from another process) is has to be done here.  */
> +      int newval = oldval | CANCELING_BITMASK | CANCELED_BITMASK;
> +
> +      if (oldval == newval || (oldval & EXITING_BITMASK) != 0)
> +	/* Already canceled or exiting.  */
> +	break;
> +
> +      int curval = THREAD_ATOMIC_CMPXCHG_VAL (self, cancelhandling, newval,
> +					      oldval);
> +      if (curval == oldval)
> +	{
> +	  /* Set the return value.  */
> +	  THREAD_SETMEM (self, result, PTHREAD_CANCELED);
> +
> +	  /* Make sure asynchronous cancellation is still enabled.  */
> +	  if ((newval & CANCELTYPE_BITMASK) != 0)
> +	    /* Run the registered destructors and terminate the thread.  */
> +	    __do_cancel ();
> +
> +	  break;
> +	}
> +
> +      oldval = curval;
> +    }
> +}
> +
> +/* This is the actually installed SIGCANCEL handler.  It adds some
> +   safety checks before performing the cancellation.  */
> +void
> +__nptl_sigcancel_handler (int sig, siginfo_t *si, void *ctx)
> +{
> +  /* Safety check.  It would be possible to call this function for
> +     other signals and send a signal from another process.  This is not
> +     correct and might even be a security problem.  Try to catch as
> +     many incorrect invocations as possible.  */
> +  if (sig != SIGCANCEL
> +      || si->si_pid != __getpid()
> +      || si->si_code != SI_TKILL)
> +    return;
> +
> +  sigcancel_handler ();
> +}
> +libc_hidden_def (__nptl_sigcancel_handler)
>  
>  int
>  __pthread_cancel (pthread_t th)

Ok.

> @@ -72,14 +129,23 @@ __pthread_cancel (pthread_t th)
>  						    oldval))
>  	    goto again;
>  
> -	  /* The cancellation handler will take care of marking the
> -	     thread as canceled.  */
> -	  pid_t pid = __getpid ();
> -
> -	  int val = INTERNAL_SYSCALL_CALL (tgkill, pid, pd->tid,
> -					   SIGCANCEL);
> -	  if (INTERNAL_SYSCALL_ERROR_P (val))
> -	    result = INTERNAL_SYSCALL_ERRNO (val);
> +	  if (pd == THREAD_SELF)
> +	    /* This is not merely an optimization: An application may
> +	       call pthread_cancel (pthread_self ()) without calling
> +	       pthread_create, so the signal handler may not have been
> +	       set up for a self-cancel.  */
> +	    sigcancel_handler ();
> +	  else
> +	    {
> +	      /* The cancellation handler will take care of marking the
> +		 thread as canceled.  */
> +	      pid_t pid = __getpid ();
> +
> +	      int val = INTERNAL_SYSCALL_CALL (tgkill, pid, pd->tid,
> +					       SIGCANCEL);
> +	      if (INTERNAL_SYSCALL_ERROR_P (val))
> +		result = INTERNAL_SYSCALL_ERRNO (val);
> +	    }
>  
>  	  break;
>  	}

Ok.

> @@ -106,4 +172,8 @@ versioned_symbol (libc, __pthread_cancel, pthread_cancel, GLIBC_2_34);
>  compat_symbol (libpthread, __pthread_cancel, pthread_cancel, GLIBC_2_0);
>  #endif
>  
> -PTHREAD_STATIC_FN_REQUIRE (__pthread_create)
> +/* Ensure that the unwinder is always linked in (the __pthread_unwind
> +   reference from __do_cancel is weak).  Use ___pthread_unwind_next
> +   (three underscores) to produce a strong reference to the same
> +   file.  */
> +PTHREAD_STATIC_FN_REQUIRE (___pthread_unwind_next)

Why __pthread_unwind is marked as weak now? Would be simpler to unmark
is as weak and avoid the PTHREAD_STATIC_FN_REQUIRE?

> diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
> index 770656453d..772b5efcc6 100644
> --- a/nptl/pthread_create.c
> +++ b/nptl/pthread_create.c
> @@ -56,6 +56,43 @@ static struct rtld_global *__nptl_rtld_global __attribute_used__
>    = &_rtld_global;
>  #endif
>  
> +/* This performs the initialization necessary when going from
> +   single-threaded to multi-threaded mode for the first time.  */
> +static void
> +late_init (void)
> +{
> +  struct sigaction sa;
> +  __sigemptyset (&sa.sa_mask);
> +
> +  /* Install the cancellation signal handler (in static builds only if
> +     pthread_cancel has been linked in).  If for some reason we cannot
> +     install the handler we do not abort.  Maybe we should, but it is
> +     only asynchronous cancellation which is affected.  */
> +#ifndef SHARED
> +  extern __typeof (__nptl_sigcancel_handler) __nptl_sigcancel_handler
> +    __attribute__ ((weak));
> +  if (__nptl_sigcancel_handler != NULL)
> +#endif
> +    {
> +      sa.sa_sigaction = __nptl_sigcancel_handler;
> +      sa.sa_flags = SA_SIGINFO;
> +      (void) __libc_sigaction (SIGCANCEL, &sa, NULL);
> +    }
> +
> +  /* Install the handle to change the threads' uid/gid.  */
> +  sa.sa_sigaction = __nptl_setxid_sighandler;
> +  sa.sa_flags = SA_SIGINFO | SA_RESTART;
> +  (void) __libc_sigaction (SIGSETXID, &sa, NULL);
> +
> +  /* The parent process might have left the signals blocked.  Just in
> +     case, unblock it.  We reuse the signal mask in the sigaction
> +     structure.  It is already cleared.  */
> +  __sigaddset (&sa.sa_mask, SIGCANCEL);
> +  __sigaddset (&sa.sa_mask, SIGSETXID);
> +  INTERNAL_SYSCALL_CALL (rt_sigprocmask, SIG_UNBLOCK, &sa.sa_mask,
> +			 NULL, __NSIG_BYTES);
> +}
> +
>  /* Code to allocate and deallocate a stack.  */
>  #include "allocatestack.c"
>  

Ok.

> @@ -459,9 +496,13 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
>  {
>    STACK_VARIABLES;
>  
> -  /* Avoid a data race in the multi-threaded case.  */
> +  /* Avoid a data race in the multi-threaded case, and call the
> +     deferred initialization only once.  */
>    if (__libc_single_threaded)
> -    __libc_single_threaded = 0;
> +    {
> +      late_init ();
> +      __libc_single_threaded = 0;
> +    }
>  
>    const struct pthread_attr *iattr = (struct pthread_attr *) attr;
>    union pthread_attr_transparent default_attr;
> 

Ok.
  
Florian Weimer May 21, 2021, 8:15 p.m. UTC | #2
* Adhemerval Zanella:

>> @@ -106,4 +172,8 @@ versioned_symbol (libc, __pthread_cancel, pthread_cancel, GLIBC_2_34);
>>  compat_symbol (libpthread, __pthread_cancel, pthread_cancel, GLIBC_2_0);
>>  #endif
>>  
>> -PTHREAD_STATIC_FN_REQUIRE (__pthread_create)
>> +/* Ensure that the unwinder is always linked in (the __pthread_unwind
>> +   reference from __do_cancel is weak).  Use ___pthread_unwind_next
>> +   (three underscores) to produce a strong reference to the same
>> +   file.  */
>> +PTHREAD_STATIC_FN_REQUIRE (___pthread_unwind_next)
>
> Why __pthread_unwind is marked as weak now? Would be simpler to unmark
> is as weak and avoid the PTHREAD_STATIC_FN_REQUIRE?

It's related to the compiled-in cancellation points.  The references to
__pthread_enable_asynccancel are strong, and that disassembles to:

0000000000000000 <__pthread_enable_asynccancel>:
   0:	mov    %fs:0x10,%rcx
   9:	mov    %fs:0x308,%r8d
  12:	mov    %r8d,%edx
  15:	or     $0x2,%edx
  18:	cmp    %edx,%r8d
  1b:	je     35 <__pthread_enable_asynccancel+0x35>
  1d:	mov    %r8d,%eax
  20:	lock cmpxchg %edx,0x308(%rcx)
  28:	cmp    %eax,%r8d
  2b:	jne    40 <__pthread_enable_asynccancel+0x40>
  2d:	and    $0xffffffbb,%edx
  30:	cmp    $0xa,%edx
  33:	je     45 <__pthread_enable_asynccancel+0x45>
  35:	mov    %r8d,%eax
  38:	retq   
  39:	nopl   0x0(%rax)
  40:	mov    %eax,%r8d
  43:	jmp    12 <__pthread_enable_asynccancel+0x12>
  45:	push   %rax
  46:	movq   $0xffffffffffffffff,%fs:0x630
  53:	mov    %fs:0x10,%rax
  5c:	lock orl $0x10,0x308(%rax)
  64:	mov    %fs:0x300,%rdi
  6d:	callq  72 <__pthread_enable_asynccancel+0x72>
			6e: R_X86_64_PLT32	__pthread_unwind-0x4
  72:	data16 nopw %cs:0x0(%rax,%rax,1)
  7d:	nopl   (%rax)

And the unwind reference is weak:

   17: 0000000000000000      0 NOTYPE  WEAK   HIDDEN     UNDEF __pthread_unwind

There's no address check because the cancellation state in the TCB can
only change if pthread_cancel is linked in, and that really should pull
in the unwinder, too.  That's why I added the PTHREAD_STATIC_FN_REQUIRE.
Without it, I think some of the existing tests will fail (the
pthread_create PTHREAD_STATIC_FN_REQUIREs are apparently not quite
correct).

I think it's always been this way, so that the unwinder is optional in
static links (in theory, in practice it is still pulled in by the bits
that are compiled with -fexceptions due to the _Unwind_Resume construct,
hence the need for elf/static-stubs.c).

Thanks,
Florian
  
Adhemerval Zanella Netto May 21, 2021, 8:23 p.m. UTC | #3
On 21/05/2021 17:15, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>>> @@ -106,4 +172,8 @@ versioned_symbol (libc, __pthread_cancel, pthread_cancel, GLIBC_2_34);
>>>  compat_symbol (libpthread, __pthread_cancel, pthread_cancel, GLIBC_2_0);
>>>  #endif
>>>  
>>> -PTHREAD_STATIC_FN_REQUIRE (__pthread_create)
>>> +/* Ensure that the unwinder is always linked in (the __pthread_unwind
>>> +   reference from __do_cancel is weak).  Use ___pthread_unwind_next
>>> +   (three underscores) to produce a strong reference to the same
>>> +   file.  */
>>> +PTHREAD_STATIC_FN_REQUIRE (___pthread_unwind_next)
>>
>> Why __pthread_unwind is marked as weak now? Would be simpler to unmark
>> is as weak and avoid the PTHREAD_STATIC_FN_REQUIRE?
> 
> It's related to the compiled-in cancellation points.  The references to
> __pthread_enable_asynccancel are strong, and that disassembles to:
> 
> 0000000000000000 <__pthread_enable_asynccancel>:
>    0:	mov    %fs:0x10,%rcx
>    9:	mov    %fs:0x308,%r8d
>   12:	mov    %r8d,%edx
>   15:	or     $0x2,%edx
>   18:	cmp    %edx,%r8d
>   1b:	je     35 <__pthread_enable_asynccancel+0x35>
>   1d:	mov    %r8d,%eax
>   20:	lock cmpxchg %edx,0x308(%rcx)
>   28:	cmp    %eax,%r8d
>   2b:	jne    40 <__pthread_enable_asynccancel+0x40>
>   2d:	and    $0xffffffbb,%edx
>   30:	cmp    $0xa,%edx
>   33:	je     45 <__pthread_enable_asynccancel+0x45>
>   35:	mov    %r8d,%eax
>   38:	retq   
>   39:	nopl   0x0(%rax)
>   40:	mov    %eax,%r8d
>   43:	jmp    12 <__pthread_enable_asynccancel+0x12>
>   45:	push   %rax
>   46:	movq   $0xffffffffffffffff,%fs:0x630
>   53:	mov    %fs:0x10,%rax
>   5c:	lock orl $0x10,0x308(%rax)
>   64:	mov    %fs:0x300,%rdi
>   6d:	callq  72 <__pthread_enable_asynccancel+0x72>
> 			6e: R_X86_64_PLT32	__pthread_unwind-0x4
>   72:	data16 nopw %cs:0x0(%rax,%rax,1)
>   7d:	nopl   (%rax)
> 
> And the unwind reference is weak:
> 
>    17: 0000000000000000      0 NOTYPE  WEAK   HIDDEN     UNDEF __pthread_unwind
> 
> There's no address check because the cancellation state in the TCB can
> only change if pthread_cancel is linked in, and that really should pull
> in the unwinder, too.  That's why I added the PTHREAD_STATIC_FN_REQUIRE.
> Without it, I think some of the existing tests will fail (the
> pthread_create PTHREAD_STATIC_FN_REQUIREs are apparently not quite
> correct).
> 
> I think it's always been this way, so that the unwinder is optional in
> static links (in theory, in practice it is still pulled in by the bits
> that are compiled with -fexceptions due to the _Unwind_Resume construct,
> hence the need for elf/static-stubs.c).

Right, it does seem unnecessary to pull the unwinder for SYSCALL_CANCEL.
  

Patch

diff --git a/nptl/Versions b/nptl/Versions
index e7883cbb49..d96b830d05 100644
--- a/nptl/Versions
+++ b/nptl/Versions
@@ -367,8 +367,6 @@  libc {
     tss_set;
   }
   GLIBC_PRIVATE {
-     __nptl_create_event;
-     __nptl_death_event;
     __default_pthread_attr;
     __default_pthread_attr_lock;
     __futex_abstimed_wait64;
@@ -386,11 +384,14 @@  libc {
     __lll_trylock_elision;
     __lll_unlock_elision;
     __mutex_aconf;
+    __nptl_create_event;
     __nptl_deallocate_stack;
     __nptl_deallocate_tsd;
+    __nptl_death_event;
     __nptl_free_tcb;
     __nptl_nthreads;
     __nptl_setxid_sighandler;
+    __nptl_sigcancel_handler;
     __nptl_stack_list_add;
     __nptl_stack_list_del;
     __pthread_attr_copy;
diff --git a/nptl/nptl-init.c b/nptl/nptl-init.c
index f4b86fbfaf..bc4831ac89 100644
--- a/nptl/nptl-init.c
+++ b/nptl/nptl-init.c
@@ -44,84 +44,9 @@  size_t __static_tls_align_m1;
 /* Version of the library, used in libthread_db to detect mismatches.  */
 static const char nptl_version[] __attribute_used__ = VERSION;
 
-/* For asynchronous cancellation we use a signal.  This is the handler.  */
-static void
-sigcancel_handler (int sig, siginfo_t *si, void *ctx)
-{
-  /* Safety check.  It would be possible to call this function for
-     other signals and send a signal from another process.  This is not
-     correct and might even be a security problem.  Try to catch as
-     many incorrect invocations as possible.  */
-  if (sig != SIGCANCEL
-      || si->si_pid != __getpid()
-      || si->si_code != SI_TKILL)
-    return;
-
-  struct pthread *self = THREAD_SELF;
-
-  int oldval = THREAD_GETMEM (self, cancelhandling);
-  while (1)
-    {
-      /* We are canceled now.  When canceled by another thread this flag
-	 is already set but if the signal is directly send (internally or
-	 from another process) is has to be done here.  */
-      int newval = oldval | CANCELING_BITMASK | CANCELED_BITMASK;
-
-      if (oldval == newval || (oldval & EXITING_BITMASK) != 0)
-	/* Already canceled or exiting.  */
-	break;
-
-      int curval = THREAD_ATOMIC_CMPXCHG_VAL (self, cancelhandling, newval,
-					      oldval);
-      if (curval == oldval)
-	{
-	  /* Set the return value.  */
-	  THREAD_SETMEM (self, result, PTHREAD_CANCELED);
-
-	  /* Make sure asynchronous cancellation is still enabled.  */
-	  if ((newval & CANCELTYPE_BITMASK) != 0)
-	    /* Run the registered destructors and terminate the thread.  */
-	    __do_cancel ();
-
-	  break;
-	}
-
-      oldval = curval;
-    }
-}
-
-
-/* When using __thread for this, we do it in libc so as not
-   to give libpthread its own TLS segment just for this.  */
-extern void **__libc_dl_error_tsd (void) __attribute__ ((const));
-
-
 void
 __pthread_initialize_minimal_internal (void)
 {
-  struct sigaction sa;
-  __sigemptyset (&sa.sa_mask);
-
-  /* Install the cancellation signal handler.  If for some reason we
-     cannot install the handler we do not abort.  Maybe we should, but
-     it is only asynchronous cancellation which is affected.  */
-  sa.sa_sigaction = sigcancel_handler;
-  sa.sa_flags = SA_SIGINFO;
-  (void) __libc_sigaction (SIGCANCEL, &sa, NULL);
-
-  /* Install the handle to change the threads' uid/gid.  */
-  sa.sa_sigaction = __nptl_setxid_sighandler;
-  sa.sa_flags = SA_SIGINFO | SA_RESTART;
-  (void) __libc_sigaction (SIGSETXID, &sa, NULL);
-
-  /* The parent process might have left the signals blocked.  Just in
-     case, unblock it.  We reuse the signal mask in the sigaction
-     structure.  It is already cleared.  */
-  __sigaddset (&sa.sa_mask, SIGCANCEL);
-  __sigaddset (&sa.sa_mask, SIGSETXID);
-  INTERNAL_SYSCALL_CALL (rt_sigprocmask, SIG_UNBLOCK, &sa.sa_mask,
-			 NULL, __NSIG_BYTES);
-
   /* Get the size of the static and alignment requirements for the TLS
      block.  */
   size_t static_tls_align;
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
index f93806e540..497c2ad3d9 100644
--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
@@ -571,6 +571,12 @@  libc_hidden_proto (__pthread_attr_setsigmask_internal)
 extern __typeof (pthread_attr_getsigmask_np) __pthread_attr_getsigmask_np;
 libc_hidden_proto (__pthread_attr_getsigmask_np)
 
+/* The cancellation signal handler defined alongside with
+   pthread_cancel.  This is included in statically linked binaries
+   only if pthread_cancel is linked in.  */
+void __nptl_sigcancel_handler (int sig, siginfo_t *si, void *ctx);
+libc_hidden_proto (__nptl_sigcancel_handler)
+
 /* Special versions which use non-exported functions.  */
 extern void __pthread_cleanup_push (struct _pthread_cleanup_buffer *buffer,
 				    void (*routine) (void *), void *arg);
diff --git a/nptl/pthread_cancel.c b/nptl/pthread_cancel.c
index e4ad602900..802c691874 100644
--- a/nptl/pthread_cancel.c
+++ b/nptl/pthread_cancel.c
@@ -26,6 +26,63 @@ 
 #include <unwind-link.h>
 #include <stdio.h>
 #include <gnu/lib-names.h>
+#include <sys/single_threaded.h>
+
+/* For asynchronous cancellation we use a signal.  This is the core
+   logic of the signal handler.  */
+static void
+sigcancel_handler (void)
+{
+  struct pthread *self = THREAD_SELF;
+
+  int oldval = THREAD_GETMEM (self, cancelhandling);
+  while (1)
+    {
+      /* We are canceled now.  When canceled by another thread this flag
+	 is already set but if the signal is directly send (internally or
+	 from another process) is has to be done here.  */
+      int newval = oldval | CANCELING_BITMASK | CANCELED_BITMASK;
+
+      if (oldval == newval || (oldval & EXITING_BITMASK) != 0)
+	/* Already canceled or exiting.  */
+	break;
+
+      int curval = THREAD_ATOMIC_CMPXCHG_VAL (self, cancelhandling, newval,
+					      oldval);
+      if (curval == oldval)
+	{
+	  /* Set the return value.  */
+	  THREAD_SETMEM (self, result, PTHREAD_CANCELED);
+
+	  /* Make sure asynchronous cancellation is still enabled.  */
+	  if ((newval & CANCELTYPE_BITMASK) != 0)
+	    /* Run the registered destructors and terminate the thread.  */
+	    __do_cancel ();
+
+	  break;
+	}
+
+      oldval = curval;
+    }
+}
+
+/* This is the actually installed SIGCANCEL handler.  It adds some
+   safety checks before performing the cancellation.  */
+void
+__nptl_sigcancel_handler (int sig, siginfo_t *si, void *ctx)
+{
+  /* Safety check.  It would be possible to call this function for
+     other signals and send a signal from another process.  This is not
+     correct and might even be a security problem.  Try to catch as
+     many incorrect invocations as possible.  */
+  if (sig != SIGCANCEL
+      || si->si_pid != __getpid()
+      || si->si_code != SI_TKILL)
+    return;
+
+  sigcancel_handler ();
+}
+libc_hidden_def (__nptl_sigcancel_handler)
 
 int
 __pthread_cancel (pthread_t th)
@@ -72,14 +129,23 @@  __pthread_cancel (pthread_t th)
 						    oldval))
 	    goto again;
 
-	  /* The cancellation handler will take care of marking the
-	     thread as canceled.  */
-	  pid_t pid = __getpid ();
-
-	  int val = INTERNAL_SYSCALL_CALL (tgkill, pid, pd->tid,
-					   SIGCANCEL);
-	  if (INTERNAL_SYSCALL_ERROR_P (val))
-	    result = INTERNAL_SYSCALL_ERRNO (val);
+	  if (pd == THREAD_SELF)
+	    /* This is not merely an optimization: An application may
+	       call pthread_cancel (pthread_self ()) without calling
+	       pthread_create, so the signal handler may not have been
+	       set up for a self-cancel.  */
+	    sigcancel_handler ();
+	  else
+	    {
+	      /* The cancellation handler will take care of marking the
+		 thread as canceled.  */
+	      pid_t pid = __getpid ();
+
+	      int val = INTERNAL_SYSCALL_CALL (tgkill, pid, pd->tid,
+					       SIGCANCEL);
+	      if (INTERNAL_SYSCALL_ERROR_P (val))
+		result = INTERNAL_SYSCALL_ERRNO (val);
+	    }
 
 	  break;
 	}
@@ -106,4 +172,8 @@  versioned_symbol (libc, __pthread_cancel, pthread_cancel, GLIBC_2_34);
 compat_symbol (libpthread, __pthread_cancel, pthread_cancel, GLIBC_2_0);
 #endif
 
-PTHREAD_STATIC_FN_REQUIRE (__pthread_create)
+/* Ensure that the unwinder is always linked in (the __pthread_unwind
+   reference from __do_cancel is weak).  Use ___pthread_unwind_next
+   (three underscores) to produce a strong reference to the same
+   file.  */
+PTHREAD_STATIC_FN_REQUIRE (___pthread_unwind_next)
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 770656453d..772b5efcc6 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -56,6 +56,43 @@  static struct rtld_global *__nptl_rtld_global __attribute_used__
   = &_rtld_global;
 #endif
 
+/* This performs the initialization necessary when going from
+   single-threaded to multi-threaded mode for the first time.  */
+static void
+late_init (void)
+{
+  struct sigaction sa;
+  __sigemptyset (&sa.sa_mask);
+
+  /* Install the cancellation signal handler (in static builds only if
+     pthread_cancel has been linked in).  If for some reason we cannot
+     install the handler we do not abort.  Maybe we should, but it is
+     only asynchronous cancellation which is affected.  */
+#ifndef SHARED
+  extern __typeof (__nptl_sigcancel_handler) __nptl_sigcancel_handler
+    __attribute__ ((weak));
+  if (__nptl_sigcancel_handler != NULL)
+#endif
+    {
+      sa.sa_sigaction = __nptl_sigcancel_handler;
+      sa.sa_flags = SA_SIGINFO;
+      (void) __libc_sigaction (SIGCANCEL, &sa, NULL);
+    }
+
+  /* Install the handle to change the threads' uid/gid.  */
+  sa.sa_sigaction = __nptl_setxid_sighandler;
+  sa.sa_flags = SA_SIGINFO | SA_RESTART;
+  (void) __libc_sigaction (SIGSETXID, &sa, NULL);
+
+  /* The parent process might have left the signals blocked.  Just in
+     case, unblock it.  We reuse the signal mask in the sigaction
+     structure.  It is already cleared.  */
+  __sigaddset (&sa.sa_mask, SIGCANCEL);
+  __sigaddset (&sa.sa_mask, SIGSETXID);
+  INTERNAL_SYSCALL_CALL (rt_sigprocmask, SIG_UNBLOCK, &sa.sa_mask,
+			 NULL, __NSIG_BYTES);
+}
+
 /* Code to allocate and deallocate a stack.  */
 #include "allocatestack.c"
 
@@ -459,9 +496,13 @@  __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 {
   STACK_VARIABLES;
 
-  /* Avoid a data race in the multi-threaded case.  */
+  /* Avoid a data race in the multi-threaded case, and call the
+     deferred initialization only once.  */
   if (__libc_single_threaded)
-    __libc_single_threaded = 0;
+    {
+      late_init ();
+      __libc_single_threaded = 0;
+    }
 
   const struct pthread_attr *iattr = (struct pthread_attr *) attr;
   union pthread_attr_transparent default_attr;