linux: spawni.c: simplify error reporting to parent

Message ID 1474405260-16657-1-git-send-email-rv@rasmusvillemoes.dk
State New, archived
Headers

Commit Message

Rasmus Villemoes Sept. 20, 2016, 9:01 p.m. UTC
  Using VFORK already ensures that the parent does not run until the
child has either exec'ed succesfully or called _exit. Hence we don't
need to read from a CLOEXEC pipe to ensure proper synchronization - we
just make explicit use of the fact the the child and parent run in the
same VM, so the child can write an error code to a field of the
posix_spawn_args struct instead of sending it through a pipe.

To ensure that this mechanism really works, the parent initializes the
field to -1 and the child writes 0 before execing.

This eliminates some annoying bookkeeping that is necessary to avoid
the file actions from clobbering the write end of the pipe, and
getting rid of the pipe creation in the first place means fewer system
calls (four in the parent, usually one in the child) and fewer
chanches for the spawn to fail (e.g. if we're close to EMFILE).

Signed-off-by: Rasmus Villemoes <rv@rasmusvillemoes.dk>
---
 sysdeps/unix/sysv/linux/spawni.c | 71 ++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 46 deletions(-)
  

Comments

Adhemerval Zanella Netto Sept. 22, 2016, 8:53 p.m. UTC | #1
LGTM, I think it addressed all the comments.

On 20/09/2016 18:01, Rasmus Villemoes wrote:
> Using VFORK already ensures that the parent does not run until the
> child has either exec'ed succesfully or called _exit. Hence we don't
> need to read from a CLOEXEC pipe to ensure proper synchronization - we
> just make explicit use of the fact the the child and parent run in the
> same VM, so the child can write an error code to a field of the
> posix_spawn_args struct instead of sending it through a pipe.
> 
> To ensure that this mechanism really works, the parent initializes the
> field to -1 and the child writes 0 before execing.
> 
> This eliminates some annoying bookkeeping that is necessary to avoid
> the file actions from clobbering the write end of the pipe, and
> getting rid of the pipe creation in the first place means fewer system
> calls (four in the parent, usually one in the child) and fewer
> chanches for the spawn to fail (e.g. if we're close to EMFILE).
> 
> Signed-off-by: Rasmus Villemoes <rv@rasmusvillemoes.dk>
> ---
>  sysdeps/unix/sysv/linux/spawni.c | 71 ++++++++++++++--------------------------
>  1 file changed, 25 insertions(+), 46 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
> index bb3eecf..c0e15ac 100644
> --- a/sysdeps/unix/sysv/linux/spawni.c
> +++ b/sysdeps/unix/sysv/linux/spawni.c
> @@ -17,6 +17,7 @@
>     <http://www.gnu.org/licenses/>.  */
>  
>  #include <spawn.h>
> +#include <assert.h>
>  #include <fcntl.h>
>  #include <paths.h>
>  #include <string.h>
> @@ -44,11 +45,12 @@
>     3. Child must synchronize with parent to enforce 2. and to possible
>        return execv issues.
>  
> -   The first issue is solved by blocking all signals in child, even the
> -   NPTL-internal ones (SIGCANCEL and SIGSETXID).  The second and third issue
> -   is done by a stack allocation in parent and a synchronization with using
> -   a pipe or waitpid (in case or error).  The pipe has the advantage of
> -   allowing the child the communicate an exec error.  */
> +   The first issue is solved by blocking all signals in child, even
> +   the NPTL-internal ones (SIGCANCEL and SIGSETXID).  The second and
> +   third issue is done by a stack allocation in parent, and by using a
> +   field in struct spawn_args where the child can write an error
> +   code. CLONE_VFORK ensures that the parent does not run until the
> +   child has either exec'ed successfully or exited.  */
>  
>  
>  /* The Unix standard contains a long explanation of the way to signal
> @@ -79,7 +81,6 @@
>  
>  struct posix_spawn_args
>  {
> -  int pipe[2];
>    sigset_t oldmask;
>    const char *file;
>    int (*exec) (const char *, char *const *, char *const *);
> @@ -89,6 +90,7 @@ struct posix_spawn_args
>    ptrdiff_t argc;
>    char *const *envp;
>    int xflags;
> +  int err;
>  };
>  
>  /* Older version requires that shell script without shebang definition
> @@ -125,11 +127,8 @@ __spawni_child (void *arguments)
>    struct posix_spawn_args *args = arguments;
>    const posix_spawnattr_t *restrict attr = args->attr;
>    const posix_spawn_file_actions_t *file_actions = args->fa;
> -  int p = args->pipe[1];
>    int ret;
>  
> -  close_not_cancel (args->pipe[0]);
> -
>    /* The child must ensure that no signal handler are enabled because it shared
>       memory with parent, so the signal disposition must be either SIG_DFL or
>       SIG_IGN.  It does by iterating over all signals and although it could
> @@ -203,17 +202,6 @@ __spawni_child (void *arguments)
>  	{
>  	  struct __spawn_action *action = &file_actions->__actions[cnt];
>  
> -	  /* Dup the pipe fd onto an unoccupied one to avoid any file
> -	     operation to clobber it.  */
> -	  if ((action->action.close_action.fd == p)
> -	      || (action->action.open_action.fd == p)
> -	      || (action->action.dup2_action.fd == p))
> -	    {
> -	      if ((ret = __dup (p)) < 0)
> -		goto fail;
> -	      p = ret;
> -	    }
> -
>  	  switch (action->tag)
>  	    {
>  	    case spawn_do_close:
> @@ -273,6 +261,7 @@ __spawni_child (void *arguments)
>    __sigprocmask (SIG_SETMASK, (attr->__flags & POSIX_SPAWN_SETSIGMASK)
>  		 ? &attr->__ss : &args->oldmask, 0);
>  
> +  args->err = 0;
>    args->exec (args->file, args->argv, args->envp);
>  
>    /* This is compatibility function required to enable posix_spawn run
> @@ -280,14 +269,13 @@ __spawni_child (void *arguments)
>       (2.15).  */
>    maybe_script_execute (args);
>  
> -  ret = -errno;
> -
>  fail:
> -  /* Since sizeof errno < PIPE_BUF, the write is atomic. */
> -  ret = -ret;
> -  if (ret)
> -    while (write_not_cancel (p, &ret, sizeof ret) < 0)
> -      continue;
> +  /* errno should have an appropriate non-zero value; otherwise,
> +     there's a bug in glibc or the kernel.  For lack of an error code
> +     (EINTERNALBUG) describing that, use ECHILD.  Another option would
> +     be to set args->err to some negative sentinel and have the parent
> +     abort(), but that seems needlessly harsh.  */
> +  args->err = errno ? : ECHILD;
>    _exit (SPAWN_ERROR);
>  }
>  
> @@ -304,9 +292,6 @@ __spawnix (pid_t * pid, const char *file,
>    struct posix_spawn_args args;
>    int ec;
>  
> -  if (__pipe2 (args.pipe, O_CLOEXEC))
> -    return errno;
> -
>    /* To avoid imposing hard limits on posix_spawn{p} the total number of
>       arguments is first calculated to allocate a mmap to hold all possible
>       values.  */
> @@ -333,15 +318,14 @@ __spawnix (pid_t * pid, const char *file,
>    void *stack = __mmap (NULL, stack_size, prot,
>  			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
>    if (__glibc_unlikely (stack == MAP_FAILED))
> -    {
> -      close_not_cancel (args.pipe[0]);
> -      close_not_cancel (args.pipe[1]);
> -      return errno;
> -    }
> +    return errno;
>  
>    /* Disable asynchronous cancellation.  */
>    int cs = LIBC_CANCEL_ASYNC ();
>  
> +  /* Child must set args.err to something non-negative - we rely on
> +     the parent and child sharing VM.  */
> +  args.err = -1;
>    args.file = file;
>    args.exec = exec;
>    args.fa = file_actions;
> @@ -355,9 +339,8 @@ __spawnix (pid_t * pid, const char *file,
>  
>    /* The clone flags used will create a new child that will run in the same
>       memory space (CLONE_VM) and the execution of calling thread will be
> -     suspend until the child calls execve or _exit.  These condition as
> -     signal below either by pipe write (_exit with SPAWN_ERROR) or
> -     a successful execve.
> +     suspend until the child calls execve or _exit.
> +
>       Also since the calling thread execution will be suspend, there is not
>       need for CLONE_SETTLS.  Although parent and child share the same TLS
>       namespace, there will be no concurrent access for TLS variables (errno
> @@ -365,22 +348,18 @@ __spawnix (pid_t * pid, const char *file,
>    new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
>  		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
>  
> -  close_not_cancel (args.pipe[1]);
> -
>    if (new_pid > 0)
>      {
> -      if (__read (args.pipe[0], &ec, sizeof ec) != sizeof ec)
> -	ec = 0;
> -      else
> -	__waitpid (new_pid, NULL, 0);
> +      ec = args.err;
> +      assert (ec >= 0);
> +      if (ec != 0)
> +	  __waitpid (new_pid, NULL, 0);
>      }
>    else
>      ec = -new_pid;
>  
>    __munmap (stack, stack_size);
>  
> -  close_not_cancel (args.pipe[0]);
> -
>    if ((ec == 0) && (pid != NULL))
>      *pid = new_pid;
>  
>
  
Florian Weimer Sept. 23, 2016, 5:21 a.m. UTC | #2
* Rasmus Villemoes:

> +      ec = args.err;
> +      assert (ec >= 0);
> +      if (ec != 0)
> +	  __waitpid (new_pid, NULL, 0);

One minor issue: Now that the variable name “ec” appears in an
assertion, it is a good idea to rename it to “error_code_from_child“
or something similar, so that the assertion message is more
meaningful.
  
Rasmus Villemoes Sept. 23, 2016, 7:09 p.m. UTC | #3
On Fri, Sep 23 2016, Florian Weimer <fw@deneb.enyo.de> wrote:

> * Rasmus Villemoes:
>
>> +      ec = args.err;
>> +      assert (ec >= 0);
>> +      if (ec != 0)
>> +	  __waitpid (new_pid, NULL, 0);
>
> One minor issue: Now that the variable name “ec” appears in an
> assertion, it is a good idea to rename it to “error_code_from_child“
> or something similar, so that the assertion message is more
> meaningful.

IMO, that's already addressed by the comments above the initialization
in the parent and exit path in the child. An assert message would have
to be overly verbose to make sense without the context of the code it
appears in. In any case, error_code_from_child is a bad name if we hit
the "ec = -new_pid" branch.

Rasmus
  
Adhemerval Zanella Netto Sept. 23, 2016, 7:28 p.m. UTC | #4
On 23/09/2016 16:09, Rasmus Villemoes wrote:
> On Fri, Sep 23 2016, Florian Weimer <fw@deneb.enyo.de> wrote:
> 
>> * Rasmus Villemoes:
>>
>>> +      ec = args.err;
>>> +      assert (ec >= 0);
>>> +      if (ec != 0)
>>> +	  __waitpid (new_pid, NULL, 0);
>>
>> One minor issue: Now that the variable name “ec” appears in an
>> assertion, it is a good idea to rename it to “error_code_from_child“
>> or something similar, so that the assertion message is more
>> meaningful.
> 
> IMO, that's already addressed by the comments above the initialization
> in the parent and exit path in the child. An assert message would have
> to be overly verbose to make sense without the context of the code it
> appears in. In any case, error_code_from_child is a bad name if we hit
> the "ec = -new_pid" branch.

Also, if CLONE_VM is not really creating a thread with shared VM it means
something really broken in the system.
  
Florian Weimer Sept. 23, 2016, 8:36 p.m. UTC | #5
* Adhemerval Zanella:

> On 23/09/2016 16:09, Rasmus Villemoes wrote:
>> On Fri, Sep 23 2016, Florian Weimer <fw@deneb.enyo.de> wrote:
>> 
>>> * Rasmus Villemoes:
>>>
>>>> +      ec = args.err;
>>>> +      assert (ec >= 0);
>>>> +      if (ec != 0)
>>>> +	  __waitpid (new_pid, NULL, 0);
>>>
>>> One minor issue: Now that the variable name “ec” appears in an
>>> assertion, it is a good idea to rename it to “error_code_from_child“
>>> or something similar, so that the assertion message is more
>>> meaningful.
>> 
>> IMO, that's already addressed by the comments above the initialization
>> in the parent and exit path in the child. An assert message would have
>> to be overly verbose to make sense without the context of the code it
>> appears in. In any case, error_code_from_child is a bad name if we hit
>> the "ec = -new_pid" branch.
>
> Also, if CLONE_VM is not really creating a thread with shared VM it means
> something really broken in the system.

Agreed.  I have no objections anymore to the patch as-is.
  
Rasmus Villemoes Sept. 27, 2016, 8:26 p.m. UTC | #6
On Fri, Sep 23 2016, Florian Weimer <fw@deneb.enyo.de> wrote:

> * Adhemerval Zanella:
>
>> On 23/09/2016 16:09, Rasmus Villemoes wrote:
>>> On Fri, Sep 23 2016, Florian Weimer <fw@deneb.enyo.de> wrote:
>>> 
>>>> * Rasmus Villemoes:
>>>>
>>>>> +      ec = args.err;
>>>>> +      assert (ec >= 0);
>>>>> +      if (ec != 0)
>>>>> +	  __waitpid (new_pid, NULL, 0);
>>>>
>>>> One minor issue: Now that the variable name “ec” appears in an
>>>> assertion, it is a good idea to rename it to “error_code_from_child“
>>>> or something similar, so that the assertion message is more
>>>> meaningful.
>>> 
>>> IMO, that's already addressed by the comments above the initialization
>>> in the parent and exit path in the child. An assert message would have
>>> to be overly verbose to make sense without the context of the code it
>>> appears in. In any case, error_code_from_child is a bad name if we hit
>>> the "ec = -new_pid" branch.
>>
>> Also, if CLONE_VM is not really creating a thread with shared VM it means
>> something really broken in the system.
>

I'd be more worried about CLONE_VFORK not being absolutely honoured in
all scenarios (emulation, ptrace, ...).  But that's what we now have the
assert for.

> Agreed.  I have no objections anymore to the patch as-is.

Thanks. Could someone with commit access take this?

Rasmus
  
Adhemerval Zanella Netto Sept. 27, 2016, 9:14 p.m. UTC | #7
On 27/09/2016 13:26, Rasmus Villemoes wrote:
> On Fri, Sep 23 2016, Florian Weimer <fw@deneb.enyo.de> wrote:
> 
>> * Adhemerval Zanella:
>>
>>> On 23/09/2016 16:09, Rasmus Villemoes wrote:
>>>> On Fri, Sep 23 2016, Florian Weimer <fw@deneb.enyo.de> wrote:
>>>>
>>>>> * Rasmus Villemoes:
>>>>>
>>>>>> +      ec = args.err;
>>>>>> +      assert (ec >= 0);
>>>>>> +      if (ec != 0)
>>>>>> +	  __waitpid (new_pid, NULL, 0);
>>>>>
>>>>> One minor issue: Now that the variable name “ec” appears in an
>>>>> assertion, it is a good idea to rename it to “error_code_from_child“
>>>>> or something similar, so that the assertion message is more
>>>>> meaningful.
>>>>
>>>> IMO, that's already addressed by the comments above the initialization
>>>> in the parent and exit path in the child. An assert message would have
>>>> to be overly verbose to make sense without the context of the code it
>>>> appears in. In any case, error_code_from_child is a bad name if we hit
>>>> the "ec = -new_pid" branch.
>>>
>>> Also, if CLONE_VM is not really creating a thread with shared VM it means
>>> something really broken in the system.
>>
> 
> I'd be more worried about CLONE_VFORK not being absolutely honoured in
> all scenarios (emulation, ptrace, ...).  But that's what we now have the
> assert for.
> 
>> Agreed.  I have no objections anymore to the patch as-is.
> 
> Thanks. Could someone with commit access take this?
> 
> Rasmus
> 

I will commit for you.
  
Rich Felker Sept. 28, 2016, 2:14 p.m. UTC | #8
On Tue, Sep 20, 2016 at 11:01:00PM +0200, Rasmus Villemoes wrote:
> Using VFORK already ensures that the parent does not run until the
> child has either exec'ed succesfully or called _exit. Hence we don't
> need to read from a CLOEXEC pipe to ensure proper synchronization - we
> just make explicit use of the fact the the child and parent run in the
> same VM, so the child can write an error code to a field of the
> posix_spawn_args struct instead of sending it through a pipe.
> 
> To ensure that this mechanism really works, the parent initializes the
> field to -1 and the child writes 0 before execing.
> 
> This eliminates some annoying bookkeeping that is necessary to avoid
> the file actions from clobbering the write end of the pipe, and
> getting rid of the pipe creation in the first place means fewer system
> calls (four in the parent, usually one in the child) and fewer
> chanches for the spawn to fail (e.g. if we're close to EMFILE).

This is a bad idea for at least one reason: running under strace seems
to cause vfork _not_ to wait in the parent, leading to stack
clobbering and runaway wrong code execution. I have not tested lately
so I don't have a recipe to reproduce it, but I know this was one of
the motivations for musl's use of a pipe.

Rich
  
Andreas Schwab Sept. 28, 2016, 3:03 p.m. UTC | #9
On Sep 28 2016, Rich Felker <dalias@libc.org> wrote:

> This is a bad idea for at least one reason: running under strace seems
> to cause vfork _not_ to wait in the parent, leading to stack
> clobbering and runaway wrong code execution.

This is not true.  With modern kernels implementing ptrace events a
tracer can correctly handle all clone variants including vfork.

Andreas.
  
Rich Felker Sept. 28, 2016, 3:22 p.m. UTC | #10
On Wed, Sep 28, 2016 at 05:03:41PM +0200, Andreas Schwab wrote:
> On Sep 28 2016, Rich Felker <dalias@libc.org> wrote:
> 
> > This is a bad idea for at least one reason: running under strace seems
> > to cause vfork _not_ to wait in the parent, leading to stack
> > clobbering and runaway wrong code execution.
> 
> This is not true.  With modern kernels implementing ptrace events a
> tracer can correctly handle all clone variants including vfork.

The problem might very well be limited to older (but still supported,
I think) kernels or older versions of strace; I'm not sure. That's
what I meant by saying I don't have a formula to reproduce the issue
at the moment.

Rich
  
Adhemerval Zanella Netto Sept. 28, 2016, 6:12 p.m. UTC | #11
On 28/09/2016 08:22, Rich Felker wrote:
> On Wed, Sep 28, 2016 at 05:03:41PM +0200, Andreas Schwab wrote:
>> On Sep 28 2016, Rich Felker <dalias@libc.org> wrote:
>>
>>> This is a bad idea for at least one reason: running under strace seems
>>> to cause vfork _not_ to wait in the parent, leading to stack
>>> clobbering and runaway wrong code execution.
>>
>> This is not true.  With modern kernels implementing ptrace events a
>> tracer can correctly handle all clone variants including vfork.
> 
> The problem might very well be limited to older (but still supported,
> I think) kernels or older versions of strace; I'm not sure. That's
> what I meant by saying I don't have a formula to reproduce the issue
> at the moment.
> 
> Rich
> 

I am not convinced this specific issue should be a blocker for this
patch.  It seems quite limited in scope (running the process in
strace) on older/not supported kernel. And the behaviour is clearly
a kernel issue.

Also I checked with CentOS 6 (2.6.32-642.el6.x86_64, strace 4.8) and
it seems to follow the expected behaviour.  I would also consider
any deviation in more recent Linux releases as a kernel regression 
and I see no point and trying to support it.
  
Florian Weimer Oct. 6, 2016, 12:52 p.m. UTC | #12
On 09/28/2016 08:12 PM, Adhemerval Zanella wrote:
> On 28/09/2016 08:22, Rich Felker wrote:
>> On Wed, Sep 28, 2016 at 05:03:41PM +0200, Andreas Schwab wrote:
>>> On Sep 28 2016, Rich Felker <dalias@libc.org> wrote:
>>>
>>>> This is a bad idea for at least one reason: running under strace seems
>>>> to cause vfork _not_ to wait in the parent, leading to stack
>>>> clobbering and runaway wrong code execution.
>>>
>>> This is not true.  With modern kernels implementing ptrace events a
>>> tracer can correctly handle all clone variants including vfork.
>>
>> The problem might very well be limited to older (but still supported,
>> I think) kernels or older versions of strace; I'm not sure. That's
>> what I meant by saying I don't have a formula to reproduce the issue
>> at the moment.

Rich, I value your comments, I really do, but this isn't actionable 
information, I'm afraid.

> I am not convinced this specific issue should be a blocker for this
> patch.  It seems quite limited in scope (running the process in
> strace) on older/not supported kernel. And the behaviour is clearly
> a kernel issue.
>
> Also I checked with CentOS 6 (2.6.32-642.el6.x86_64, strace 4.8) and
> it seems to follow the expected behaviour.  I would also consider
> any deviation in more recent Linux releases as a kernel regression
> and I see no point and trying to support it.

Agreed, seems reasonable.

Thanks,
Florian
  

Patch

diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index bb3eecf..c0e15ac 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -17,6 +17,7 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <spawn.h>
+#include <assert.h>
 #include <fcntl.h>
 #include <paths.h>
 #include <string.h>
@@ -44,11 +45,12 @@ 
    3. Child must synchronize with parent to enforce 2. and to possible
       return execv issues.
 
-   The first issue is solved by blocking all signals in child, even the
-   NPTL-internal ones (SIGCANCEL and SIGSETXID).  The second and third issue
-   is done by a stack allocation in parent and a synchronization with using
-   a pipe or waitpid (in case or error).  The pipe has the advantage of
-   allowing the child the communicate an exec error.  */
+   The first issue is solved by blocking all signals in child, even
+   the NPTL-internal ones (SIGCANCEL and SIGSETXID).  The second and
+   third issue is done by a stack allocation in parent, and by using a
+   field in struct spawn_args where the child can write an error
+   code. CLONE_VFORK ensures that the parent does not run until the
+   child has either exec'ed successfully or exited.  */
 
 
 /* The Unix standard contains a long explanation of the way to signal
@@ -79,7 +81,6 @@ 
 
 struct posix_spawn_args
 {
-  int pipe[2];
   sigset_t oldmask;
   const char *file;
   int (*exec) (const char *, char *const *, char *const *);
@@ -89,6 +90,7 @@  struct posix_spawn_args
   ptrdiff_t argc;
   char *const *envp;
   int xflags;
+  int err;
 };
 
 /* Older version requires that shell script without shebang definition
@@ -125,11 +127,8 @@  __spawni_child (void *arguments)
   struct posix_spawn_args *args = arguments;
   const posix_spawnattr_t *restrict attr = args->attr;
   const posix_spawn_file_actions_t *file_actions = args->fa;
-  int p = args->pipe[1];
   int ret;
 
-  close_not_cancel (args->pipe[0]);
-
   /* The child must ensure that no signal handler are enabled because it shared
      memory with parent, so the signal disposition must be either SIG_DFL or
      SIG_IGN.  It does by iterating over all signals and although it could
@@ -203,17 +202,6 @@  __spawni_child (void *arguments)
 	{
 	  struct __spawn_action *action = &file_actions->__actions[cnt];
 
-	  /* Dup the pipe fd onto an unoccupied one to avoid any file
-	     operation to clobber it.  */
-	  if ((action->action.close_action.fd == p)
-	      || (action->action.open_action.fd == p)
-	      || (action->action.dup2_action.fd == p))
-	    {
-	      if ((ret = __dup (p)) < 0)
-		goto fail;
-	      p = ret;
-	    }
-
 	  switch (action->tag)
 	    {
 	    case spawn_do_close:
@@ -273,6 +261,7 @@  __spawni_child (void *arguments)
   __sigprocmask (SIG_SETMASK, (attr->__flags & POSIX_SPAWN_SETSIGMASK)
 		 ? &attr->__ss : &args->oldmask, 0);
 
+  args->err = 0;
   args->exec (args->file, args->argv, args->envp);
 
   /* This is compatibility function required to enable posix_spawn run
@@ -280,14 +269,13 @@  __spawni_child (void *arguments)
      (2.15).  */
   maybe_script_execute (args);
 
-  ret = -errno;
-
 fail:
-  /* Since sizeof errno < PIPE_BUF, the write is atomic. */
-  ret = -ret;
-  if (ret)
-    while (write_not_cancel (p, &ret, sizeof ret) < 0)
-      continue;
+  /* errno should have an appropriate non-zero value; otherwise,
+     there's a bug in glibc or the kernel.  For lack of an error code
+     (EINTERNALBUG) describing that, use ECHILD.  Another option would
+     be to set args->err to some negative sentinel and have the parent
+     abort(), but that seems needlessly harsh.  */
+  args->err = errno ? : ECHILD;
   _exit (SPAWN_ERROR);
 }
 
@@ -304,9 +292,6 @@  __spawnix (pid_t * pid, const char *file,
   struct posix_spawn_args args;
   int ec;
 
-  if (__pipe2 (args.pipe, O_CLOEXEC))
-    return errno;
-
   /* To avoid imposing hard limits on posix_spawn{p} the total number of
      arguments is first calculated to allocate a mmap to hold all possible
      values.  */
@@ -333,15 +318,14 @@  __spawnix (pid_t * pid, const char *file,
   void *stack = __mmap (NULL, stack_size, prot,
 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
   if (__glibc_unlikely (stack == MAP_FAILED))
-    {
-      close_not_cancel (args.pipe[0]);
-      close_not_cancel (args.pipe[1]);
-      return errno;
-    }
+    return errno;
 
   /* Disable asynchronous cancellation.  */
   int cs = LIBC_CANCEL_ASYNC ();
 
+  /* Child must set args.err to something non-negative - we rely on
+     the parent and child sharing VM.  */
+  args.err = -1;
   args.file = file;
   args.exec = exec;
   args.fa = file_actions;
@@ -355,9 +339,8 @@  __spawnix (pid_t * pid, const char *file,
 
   /* The clone flags used will create a new child that will run in the same
      memory space (CLONE_VM) and the execution of calling thread will be
-     suspend until the child calls execve or _exit.  These condition as
-     signal below either by pipe write (_exit with SPAWN_ERROR) or
-     a successful execve.
+     suspend until the child calls execve or _exit.
+
      Also since the calling thread execution will be suspend, there is not
      need for CLONE_SETTLS.  Although parent and child share the same TLS
      namespace, there will be no concurrent access for TLS variables (errno
@@ -365,22 +348,18 @@  __spawnix (pid_t * pid, const char *file,
   new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
 		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
 
-  close_not_cancel (args.pipe[1]);
-
   if (new_pid > 0)
     {
-      if (__read (args.pipe[0], &ec, sizeof ec) != sizeof ec)
-	ec = 0;
-      else
-	__waitpid (new_pid, NULL, 0);
+      ec = args.err;
+      assert (ec >= 0);
+      if (ec != 0)
+	  __waitpid (new_pid, NULL, 0);
     }
   else
     ec = -new_pid;
 
   __munmap (stack, stack_size);
 
-  close_not_cancel (args.pipe[0]);
-
   if ((ec == 0) && (pid != NULL))
     *pid = new_pid;