Patchwork [v2,1/5] mips: Do not malloc on getdents64 fallback

login
register
mail settings
Submitter Adhemerval Zanella Netto
Date July 31, 2019, 6:31 p.m.
Message ID <20190731183136.21545-1-adhemerval.zanella@linaro.org>
Download mbox | patch
Permalink /patch/33870/
State New
Headers show

Comments

Adhemerval Zanella Netto - July 31, 2019, 6:31 p.m.
This patch changes how the fallback getdents64 implementation calls
non-LFS getdents by replacing the scratch_buffer with static buffer
plus a loop on getdents calls.  This avoids the potential malloc
call on scratch_buffer_set_array_size for large input buffer size
at the cost of more getdents syscalls.

It also adds a small optimization for older kernels, where the first
ENOSYS failure for getdents64 disable subsequent calls.

Check the dirent tests on a mips64-linux-gnu with getdents64 code
disabled.

	* sysdeps/unix/sysv/linux/mips/mips64/getdents64.c (__getdents64):
	Add small optimization for older kernel to avoid issuing
	__NR_getdents64 on each call and replace scratch_buffer usage with
	a static allocated buffer.
---
 .../unix/sysv/linux/mips/mips64/getdents64.c  | 122 ++++++++----------
 1 file changed, 54 insertions(+), 68 deletions(-)
Adhemerval Zanella Netto - Aug. 28, 2019, 2:09 p.m.
Ping.

On 31/07/2019 15:31, Adhemerval Zanella wrote:
> This patch changes how the fallback getdents64 implementation calls
> non-LFS getdents by replacing the scratch_buffer with static buffer
> plus a loop on getdents calls.  This avoids the potential malloc
> call on scratch_buffer_set_array_size for large input buffer size
> at the cost of more getdents syscalls.
> 
> It also adds a small optimization for older kernels, where the first
> ENOSYS failure for getdents64 disable subsequent calls.
> 
> Check the dirent tests on a mips64-linux-gnu with getdents64 code
> disabled.
> 
> 	* sysdeps/unix/sysv/linux/mips/mips64/getdents64.c (__getdents64):
> 	Add small optimization for older kernel to avoid issuing
> 	__NR_getdents64 on each call and replace scratch_buffer usage with
> 	a static allocated buffer.
> ---
>  .../unix/sysv/linux/mips/mips64/getdents64.c  | 122 ++++++++----------
>  1 file changed, 54 insertions(+), 68 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
> index 8bf3abb0e0..3b5afd9324 100644
> --- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
> +++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
> @@ -22,98 +22,84 @@
>  #include <assert.h>
>  #include <sys/param.h>
>  #include <unistd.h>
> -#include <scratch_buffer.h>
>  #include <limits.h>
>  
>  ssize_t
> -__getdents64 (int fd, void *buf0, size_t nbytes)
> +__getdents64 (int fd, void *buf, size_t nbytes)
>  {
> -  char *buf = buf0;
> -
>    /* The system call takes an unsigned int argument, and some length
>       checks in the kernel use an int type.  */
>    if (nbytes > INT_MAX)
>      nbytes = INT_MAX;
>  
>  #ifdef __NR_getdents64
> -  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
> -  if (ret != -1)
> -    return ret;
> +  static bool getdents64_supportted = true;
> +  if (atomic_load_relaxed (&getdents64_supportted))
> +    {
> +      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
> +      if (ret >= 0 || errno != ENOSYS)
> +	return ret;
> +
> +      atomic_store_relaxed (&getdents64_supportted, false);
> +    }
>  #endif
>  
>    /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
> -     If syscall is not available it need to fallback to non-LFS one.  */
> +     If the syscall is not available it need to fallback to the non-LFS one.
> +     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
> +     would make the syscall non async-signal-safe) it uses a limited buffer.
> +     This is sub-optimal for large NBYTES, however this is a fallback
> +     mechanism to emulate a syscall that kernel should provide.   */
>  
> +  enum { KBUF_SIZE = 1024 };
>    struct kernel_dirent
> -    {
> -      unsigned long d_ino;
> -      unsigned long d_off;
> -      unsigned short int d_reclen;
> -      char d_name[256];
> -    };
> -
> -  const size_t size_diff = (offsetof (struct dirent64, d_name)
> -			   - offsetof (struct kernel_dirent, d_name));
> -
> -  size_t red_nbytes = MIN (nbytes
> -			   - ((nbytes / (offsetof (struct dirent64, d_name)
> -					 + 14)) * size_diff),
> -			   nbytes - size_diff);
> -
> -  struct scratch_buffer tmpbuf;
> -  scratch_buffer_init (&tmpbuf);
> -  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))
> -    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);
> -
> -  struct kernel_dirent *skdp, *kdp;
> -  skdp = kdp = tmpbuf.data;
> -
> -  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);
> -  if (retval == -1)
> -    {
> -      scratch_buffer_free (&tmpbuf);
> -      return -1;
> -    }
> +  {
> +    unsigned long d_ino;
> +    unsigned long d_off;
> +    unsigned short int d_reclen;
> +    char d_name[1];
> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
>  
> -  off64_t last_offset = -1;
>    struct dirent64 *dp = (struct dirent64 *) buf;
> -  while ((char *) kdp < (char *) skdp + retval)
> +
> +  size_t nb = 0;
> +  off64_t last_offset = -1;
> +
> +  ssize_t r;
> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
>      {
> -      const size_t alignment = _Alignof (struct dirent64);
> -      /* Since kdp->d_reclen is already aligned for the kernel structure
> -	 this may compute a value that is bigger than necessary.  */
> -      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
> -			   & ~(alignment - 1));
> -      if ((char *) dp + new_reclen > buf + nbytes)
> -        {
> -	  /* Our heuristic failed.  We read too many entries.  Reset
> -	     the stream.  */
> -	  assert (last_offset != -1);
> -	  __lseek64 (fd, last_offset, SEEK_SET);
> -
> -	  if ((char *) dp == buf)
> +      struct kernel_dirent *skdp, *kdp;
> +      skdp = kdp = kbuf;
> +
> +      while ((char *) kdp < (char *) skdp + r)
> +	{
> +	  const size_t alignment = _Alignof (struct dirent64);
> +	  size_t new_reclen = ((kdp->d_reclen + alignment - 1)
> +			      & ~(alignment - 1));
> +	  if (nb + new_reclen > nbytes)
>  	    {
> -	      scratch_buffer_free (&tmpbuf);
> -	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
> +		/* The new entry will overflow the input buffer, rewind to
> +		   last obtained entry and return.  */
> +	       __lseek64 (fd, last_offset, SEEK_SET);
> +	       goto out;
>  	    }
> +	  nb += new_reclen;
>  
> -	  break;
> -	}
> -
> -      last_offset = kdp->d_off;
> -      dp->d_ino = kdp->d_ino;
> -      dp->d_off = kdp->d_off;
> -      dp->d_reclen = new_reclen;
> -      dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
> -      memcpy (dp->d_name, kdp->d_name,
> -	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
> +	  dp->d_ino = kdp->d_ino;
> +	  dp->d_off = last_offset = kdp->d_off;
> +	  dp->d_reclen = new_reclen;
> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
> +	  memcpy (dp->d_name, kdp->d_name,
> +		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
>  
> -      dp = (struct dirent64 *) ((char *) dp + new_reclen);
> -      kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
> +	  dp = (struct dirent64 *) ((char *) dp + new_reclen);
> +	  kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
> +	}
>      }
>  
> -  scratch_buffer_free (&tmpbuf);
> -  return (char *) dp - buf;
> +out:
> +  return (char *) dp - (char *) buf;
>  }
>  libc_hidden_def (__getdents64)
>  weak_alias (__getdents64, getdents64)
>
Andreas Schwab - Aug. 28, 2019, 2:34 p.m.
On Jul 31 2019, Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:

>  #ifdef __NR_getdents64
> -  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
> -  if (ret != -1)
> -    return ret;
> +  static bool getdents64_supportted = true;

s/supportted/supported/

Andreas.
Florian Weimer - Aug. 28, 2019, 2:42 p.m.
* Adhemerval Zanella:

>    struct kernel_dirent
> +  {
> +    unsigned long d_ino;
> +    unsigned long d_off;
> +    unsigned short int d_reclen;
> +    char d_name[1];
> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];

I think it's still not clear to me in which cases we actually need to
move the dirent entries in the buffer.  My impression is that we just
need to move d_name by one byte because before, d_type was after the
name, and afterwards, it comes before the name.  But the record
boundaries are unchanged.

Thanks,
Florian
Adhemerval Zanella Netto - Aug. 28, 2019, 5:01 p.m.
On 28/08/2019 11:34, Andreas Schwab wrote:
> On Jul 31 2019, Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:
> 
>>  #ifdef __NR_getdents64
>> -  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
>> -  if (ret != -1)
>> -    return ret;
>> +  static bool getdents64_supportted = true;
> 
> s/supportted/supported/
> 
> Andreas.
> 

Fixed, thanks.
Florian Weimer - Aug. 30, 2019, 9:52 a.m.
Sorry, I missed that despite all the 64s in the patch, this code is also
used on 32-bit architectures.  The review below should take this new (to
me) piece of information into account.

* Adhemerval Zanella:

> +  static bool getdents64_supportted = true;
> +  if (atomic_load_relaxed (&getdents64_supportted))

Our atomics do not support bool, only 4 bytes and 8 bytes (if
__HAVE_64B_ATOMICS) is defined.  See __atomic_check_size.

Probably it will work if it compiles, but I haven't checked that.

>    /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
> +     If the syscall is not available it need to fallback to the non-LFS one.
> +     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
> +     would make the syscall non async-signal-safe) it uses a limited buffer.
> +     This is sub-optimal for large NBYTES, however this is a fallback
> +     mechanism to emulate a syscall that kernel should provide.   */
>  
> +  enum { KBUF_SIZE = 1024 };

The choice of size needs a comment.  I think the largest possible
practical length of the d_name member are 255 Unicode characters in the
BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from
the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux
in reality.)

>    struct kernel_dirent
> +  {
> +    unsigned long d_ino;
> +    unsigned long d_off;
> +    unsigned short int d_reclen;
> +    char d_name[1];
> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;

I would define kbuf as a char array, and perhaps leave out the d_name
member in struct kernel_dirent.  You can copy out the struct
kernel_dirent using memcpy, which GCC should optimize away.

Ideally, we would perform the conversion in-line, with a forward scan to
make the d_reclen members point backwards, followed by a backwards scan
to move everything in place.  This would reduce stack usage quite
significantly and avoid a hard restriction on d_name length.

>    struct dirent64 *dp = (struct dirent64 *) buf;
> +
> +  size_t nb = 0;
> +  off64_t last_offset = -1;
> +
> +  ssize_t r;
> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
>      {

Sorry, I don't see how the outer loop is exited.  I think we should
remove it because it does not seem necessary.

> +      struct kernel_dirent *skdp, *kdp;
> +      skdp = kdp = kbuf;
> +
> +      while ((char *) kdp < (char *) skdp + r)
> +	{
> +	  const size_t alignment = _Alignof (struct dirent64);
> +	  size_t new_reclen = ((kdp->d_reclen + alignment - 1)
> +			      & ~(alignment - 1));

I think this is the roundup macro.  If you use that, I think you don't
need the alignment variable.

Is the length really correct, though?  I'd expect it to grow by the
additional size of the d_ino and d_off members.  I think it would be
best recompute it from scratch, using the actual length of d_name.

> +	  if (nb + new_reclen > nbytes)
>  	    {
> +		/* The new entry will overflow the input buffer, rewind to
> +		   last obtained entry and return.  */
> +	       __lseek64 (fd, last_offset, SEEK_SET);

I don't think last_offset is guaranteed to have been set with a proper
offset at this point.  Given that d_name is essentially of unbounded
length, even expanding the first entry can cause failure.

Maybe it's possible to avoid this corner case by limiting the amount of
data being read so that we know that the application-supplied buffer is
always large enough for any possible expansion.  I think the worse-case
growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we
should divide the buffer size by 1.6 and use that?

> +	       goto out;
>  	    }
> +	  nb += new_reclen;
>  
> +	  dp->d_ino = kdp->d_ino;
> +	  dp->d_off = last_offset = kdp->d_off;
> +	  dp->d_reclen = new_reclen;
> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);

I think instead of reading through kdp, you should use char *s and
memcpy, to avoid the aliasing violation, as discussed above.  Likewise
for writing to dp.

> +	  memcpy (dp->d_name, kdp->d_name,
> +		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));

See above, I have concerns about the length.

Thanks,
Florian
Adhemerval Zanella Netto - Aug. 30, 2019, 12:52 p.m.
On 30/08/2019 06:52, Florian Weimer wrote:
> Sorry, I missed that despite all the 64s in the patch, this code is also
> used on 32-bit architectures.  The review below should take this new (to
> me) piece of information into account.
> 
> * Adhemerval Zanella:
> 
>> +  static bool getdents64_supportted = true;
>> +  if (atomic_load_relaxed (&getdents64_supportted))
> 
> Our atomics do not support bool, only 4 bytes and 8 bytes (if
> __HAVE_64B_ATOMICS) is defined.  See __atomic_check_size.
> 
> Probably it will work if it compiles, but I haven't checked that.

MIPS in fact does not define USE_ATOMIC_COMPILER_BUILTINS and thus it uses
the __atomic_check_size_ls macro.  In any case, I changed to a int for this 
case.

> 
>>    /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
>> +     If the syscall is not available it need to fallback to the non-LFS one.
>> +     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
>> +     would make the syscall non async-signal-safe) it uses a limited buffer.
>> +     This is sub-optimal for large NBYTES, however this is a fallback
>> +     mechanism to emulate a syscall that kernel should provide.   */
>>  
>> +  enum { KBUF_SIZE = 1024 };
> 
> The choice of size needs a comment.  I think the largest possible
> practical length of the d_name member are 255 Unicode characters in the
> BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from
> the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux
> in reality.)

I picked the buffer as an arbitrary value, what about:

  /* The largest possible practical length of the d_name member are 255
     Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus
     18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)
     / 776 (mips64n32) bytes total.  Ensure that the minimum size hold at
     least one entry.  */
  enum { KBUF_SIZE = 1024 };


> 
>>    struct kernel_dirent
>> +  {
>> +    unsigned long d_ino;
>> +    unsigned long d_off;
>> +    unsigned short int d_reclen;
>> +    char d_name[1];
>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
>> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
> 
> I would define kbuf as a char array, and perhaps leave out the d_name
> member in struct kernel_dirent.  You can copy out the struct
> kernel_dirent using memcpy, which GCC should optimize away.

I defined the buffer as 'struct kernel_dirent' to make it easier to align
for the required fields.  It allows simplify the access on the loop to
avoid memcpy calls.

> 
> Ideally, we would perform the conversion in-line, with a forward scan to
> make the d_reclen members point backwards, followed by a backwards scan
> to move everything in place.  This would reduce stack usage quite
> significantly and avoid a hard restriction on d_name length.

I take this fallback code as a best effort due the restrictions we have
(make it async-signal-safe with bounded stack allocation).  Even with a
clever buffer managements we can't remove the d_name length with the
aforementioned restrictions.  The proper solution is indeed using the
syscall.

> 
>>    struct dirent64 *dp = (struct dirent64 *) buf;
>> +
>> +  size_t nb = 0;
>> +  off64_t last_offset = -1;
>> +
>> +  ssize_t r;
>> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
>>      {
> 
> Sorry, I don't see how the outer loop is exited.  I think we should
> remove it because it does not seem necessary.

We still need to handle the cases where NBYTES are larger than the temporary
buffer, because it might require multiple getdents calls.


> 
>> +      struct kernel_dirent *skdp, *kdp;
>> +      skdp = kdp = kbuf;
>> +
>> +      while ((char *) kdp < (char *) skdp + r)
>> +	{
>> +	  const size_t alignment = _Alignof (struct dirent64);
>> +	  size_t new_reclen = ((kdp->d_reclen + alignment - 1)
>> +			      & ~(alignment - 1));
> 
> I think this is the roundup macro.  If you use that, I think you don't
> need the alignment variable.

I changed to use ALIGN_UP from libc-pointer-arith.h.

> 
> Is the length really correct, though?  I'd expect it to grow by the
> additional size of the d_ino and d_off members.  I think it would be
> best recompute it from scratch, using the actual length of d_name.

It it because you are referencing to an older patch version, checking on
mips64-n32 I adjusted to:

  const size_t size_diff = (offsetof (struct dirent64, d_name)
                           - offsetof (struct kernel_dirent, d_name));
  [...]
               size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,
                                        _Alignof (struct dirent64));
  [...]

> 
>> +	  if (nb + new_reclen > nbytes)
>>  	    {
>> +		/* The new entry will overflow the input buffer, rewind to
>> +		   last obtained entry and return.  */
>> +	       __lseek64 (fd, last_offset, SEEK_SET);
> 
> I don't think last_offset is guaranteed to have been set with a proper
> offset at this point.  Given that d_name is essentially of unbounded
> length, even expanding the first entry can cause failure.
> 
> Maybe it's possible to avoid this corner case by limiting the amount of
> data being read so that we know that the application-supplied buffer is
> always large enough for any possible expansion.  I think the worse-case
> growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we
> should divide the buffer size by 1.6 and use that?

For this case I really think we just need to return an error to user:

            if (nb + new_reclen > nbytes)
            {   
		/* Entry is too large for the static buffer.  */
		if (last_offset == -1)
		  {
		    __set_errno (EINVAL);
		    return -1;
		  }
                /* The new entry will overflow the input buffer, rewind to
                   last obtained entry and return.  */
               __lseek64 (fd, last_offset, SEEK_SET);
               goto out;
            }

Again I see this fallback code as a best-effort since we are emulating the
syscall with additional restraints.  Most of time glibc tries to play smart
emulating a syscall ended in a lot of headaches...


> 
>> +	       goto out;
>>  	    }
>> +	  nb += new_reclen;
>>  
>> +	  dp->d_ino = kdp->d_ino;
>> +	  dp->d_off = last_offset = kdp->d_off;
>> +	  dp->d_reclen = new_reclen;
>> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
> 
> I think instead of reading through kdp, you should use char *s and
> memcpy, to avoid the aliasing violation, as discussed above.  Likewise
> for writing to dp.

I think if we proper setting the buffer alignment there is no need to do it.
Also the problem of using memcpy here is for mips64n32 the size is *not* 
equal for dp and kdp, each would require an extra step as:

   {
     typeof (kdp->d_ino) kino;
     memcpy (&kino, &kdp_d->ino, sizeof (kino));
     typeof (dp->d_ino) dino = kino;
     memcpy (&dp->d_ino, &kino, sizeof (dino));
   }

> 
>> +	  memcpy (dp->d_name, kdp->d_name,
>> +		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
> 
> See above, I have concerns about the length.
> 
> Thanks,
> Florian
>
Florian Weimer - Sept. 2, 2019, 12:59 p.m.
* Adhemerval Zanella:

>> The choice of size needs a comment.  I think the largest possible
>> practical length of the d_name member are 255 Unicode characters in the
>> BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from
>> the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux
>> in reality.)
>
> I picked the buffer as an arbitrary value, what about:
>
>   /* The largest possible practical length of the d_name member are 255
>      Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus
>      18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)
>      / 776 (mips64n32) bytes total.  Ensure that the minimum size hold at
>      least one entry.  */
>   enum { KBUF_SIZE = 1024 };

“holds”

Looks good.

>>>    struct kernel_dirent
>>> +  {
>>> +    unsigned long d_ino;
>>> +    unsigned long d_off;
>>> +    unsigned short int d_reclen;
>>> +    char d_name[1];
>>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
>>> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
>> 
>> I would define kbuf as a char array, and perhaps leave out the d_name
>> member in struct kernel_dirent.  You can copy out the struct
>> kernel_dirent using memcpy, which GCC should optimize away.
>
> I defined the buffer as 'struct kernel_dirent' to make it easier to align
> for the required fields.  It allows simplify the access on the loop to
> avoid memcpy calls.

But the code is invalid C as a result of this.  We do not compile glibc
with -fno-strict-aliasing, after all.

>>>    struct dirent64 *dp = (struct dirent64 *) buf;
>>> +
>>> +  size_t nb = 0;
>>> +  off64_t last_offset = -1;
>>> +
>>> +  ssize_t r;
>>> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
>>>      {
>> 
>> Sorry, I don't see how the outer loop is exited.  I think we should
>> remove it because it does not seem necessary.
>
> We still need to handle the cases where NBYTES are larger than the temporary
> buffer, because it might require multiple getdents calls.

Why?  The application (or readdir) will call us again to get more entries.

>> Is the length really correct, though?  I'd expect it to grow by the
>> additional size of the d_ino and d_off members.  I think it would be
>> best recompute it from scratch, using the actual length of d_name.
>
> It it because you are referencing to an older patch version, checking on
> mips64-n32 I adjusted to:
>
>   const size_t size_diff = (offsetof (struct dirent64, d_name)
>                            - offsetof (struct kernel_dirent, d_name));
>   [...]
>                size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,
>                                         _Alignof (struct dirent64));
>   [...]

Okay, this needs a comment that this is a conservative approximation
(some of size_diff might fit into the existing padding for alignment).

>>> +	  if (nb + new_reclen > nbytes)
>>>  	    {
>>> +		/* The new entry will overflow the input buffer, rewind to
>>> +		   last obtained entry and return.  */
>>> +	       __lseek64 (fd, last_offset, SEEK_SET);
>> 
>> I don't think last_offset is guaranteed to have been set with a proper
>> offset at this point.  Given that d_name is essentially of unbounded
>> length, even expanding the first entry can cause failure.
>> 
>> Maybe it's possible to avoid this corner case by limiting the amount of
>> data being read so that we know that the application-supplied buffer is
>> always large enough for any possible expansion.  I think the worse-case
>> growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we
>> should divide the buffer size by 1.6 and use that?
>
> For this case I really think we just need to return an error to user:
>
>             if (nb + new_reclen > nbytes)
>             {   
> 		/* Entry is too large for the static buffer.  */

Fixed-size buffer, it's not static. 8-)

> 		if (last_offset == -1)
> 		  {
> 		    __set_errno (EINVAL);
> 		    return -1;
> 		  }
>                 /* The new entry will overflow the input buffer, rewind to
>                    last obtained entry and return.  */
>                __lseek64 (fd, last_offset, SEEK_SET);
>                goto out;
>             }
>
> Again I see this fallback code as a best-effort since we are emulating the
> syscall with additional restraints.  Most of time glibc tries to play smart
> emulating a syscall ended in a lot of headaches...

I don't disagree.

Which error code does the kernel return if no entry can be read at all?
We should mirror that.

>>> +	       goto out;
>>>  	    }
>>> +	  nb += new_reclen;
>>>  
>>> +	  dp->d_ino = kdp->d_ino;
>>> +	  dp->d_off = last_offset = kdp->d_off;
>>> +	  dp->d_reclen = new_reclen;
>>> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
>> 
>> I think instead of reading through kdp, you should use char *s and
>> memcpy, to avoid the aliasing violation, as discussed above.  Likewise
>> for writing to dp.
>
> I think if we proper setting the buffer alignment there is no need to do it.
> Also the problem of using memcpy here is for mips64n32 the size is *not* 
> equal for dp and kdp, each would require an extra step as:
>
>    {
>      typeof (kdp->d_ino) kino;
>      memcpy (&kino, &kdp_d->ino, sizeof (kino));
>      typeof (dp->d_ino) dino = kino;
>      memcpy (&dp->d_ino, &kino, sizeof (dino));
>    }

I think that's just the price of writing correct C.  It's also what the
kernel does.

I don't even think there's a requirement that the byte buffer passed to
getdents64 has any kind of alignment.

Thanks,
Florian

Patch

diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
index 8bf3abb0e0..3b5afd9324 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
+++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
@@ -22,98 +22,84 @@ 
 #include <assert.h>
 #include <sys/param.h>
 #include <unistd.h>
-#include <scratch_buffer.h>
 #include <limits.h>
 
 ssize_t
-__getdents64 (int fd, void *buf0, size_t nbytes)
+__getdents64 (int fd, void *buf, size_t nbytes)
 {
-  char *buf = buf0;
-
   /* The system call takes an unsigned int argument, and some length
      checks in the kernel use an int type.  */
   if (nbytes > INT_MAX)
     nbytes = INT_MAX;
 
 #ifdef __NR_getdents64
-  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
-  if (ret != -1)
-    return ret;
+  static bool getdents64_supportted = true;
+  if (atomic_load_relaxed (&getdents64_supportted))
+    {
+      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
+      if (ret >= 0 || errno != ENOSYS)
+	return ret;
+
+      atomic_store_relaxed (&getdents64_supportted, false);
+    }
 #endif
 
   /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
-     If syscall is not available it need to fallback to non-LFS one.  */
+     If the syscall is not available it need to fallback to the non-LFS one.
+     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
+     would make the syscall non async-signal-safe) it uses a limited buffer.
+     This is sub-optimal for large NBYTES, however this is a fallback
+     mechanism to emulate a syscall that kernel should provide.   */
 
+  enum { KBUF_SIZE = 1024 };
   struct kernel_dirent
-    {
-      unsigned long d_ino;
-      unsigned long d_off;
-      unsigned short int d_reclen;
-      char d_name[256];
-    };
-
-  const size_t size_diff = (offsetof (struct dirent64, d_name)
-			   - offsetof (struct kernel_dirent, d_name));
-
-  size_t red_nbytes = MIN (nbytes
-			   - ((nbytes / (offsetof (struct dirent64, d_name)
-					 + 14)) * size_diff),
-			   nbytes - size_diff);
-
-  struct scratch_buffer tmpbuf;
-  scratch_buffer_init (&tmpbuf);
-  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))
-    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);
-
-  struct kernel_dirent *skdp, *kdp;
-  skdp = kdp = tmpbuf.data;
-
-  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);
-  if (retval == -1)
-    {
-      scratch_buffer_free (&tmpbuf);
-      return -1;
-    }
+  {
+    unsigned long d_ino;
+    unsigned long d_off;
+    unsigned short int d_reclen;
+    char d_name[1];
+  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
+  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
 
-  off64_t last_offset = -1;
   struct dirent64 *dp = (struct dirent64 *) buf;
-  while ((char *) kdp < (char *) skdp + retval)
+
+  size_t nb = 0;
+  off64_t last_offset = -1;
+
+  ssize_t r;
+  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
     {
-      const size_t alignment = _Alignof (struct dirent64);
-      /* Since kdp->d_reclen is already aligned for the kernel structure
-	 this may compute a value that is bigger than necessary.  */
-      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
-			   & ~(alignment - 1));
-      if ((char *) dp + new_reclen > buf + nbytes)
-        {
-	  /* Our heuristic failed.  We read too many entries.  Reset
-	     the stream.  */
-	  assert (last_offset != -1);
-	  __lseek64 (fd, last_offset, SEEK_SET);
-
-	  if ((char *) dp == buf)
+      struct kernel_dirent *skdp, *kdp;
+      skdp = kdp = kbuf;
+
+      while ((char *) kdp < (char *) skdp + r)
+	{
+	  const size_t alignment = _Alignof (struct dirent64);
+	  size_t new_reclen = ((kdp->d_reclen + alignment - 1)
+			      & ~(alignment - 1));
+	  if (nb + new_reclen > nbytes)
 	    {
-	      scratch_buffer_free (&tmpbuf);
-	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
+		/* The new entry will overflow the input buffer, rewind to
+		   last obtained entry and return.  */
+	       __lseek64 (fd, last_offset, SEEK_SET);
+	       goto out;
 	    }
+	  nb += new_reclen;
 
-	  break;
-	}
-
-      last_offset = kdp->d_off;
-      dp->d_ino = kdp->d_ino;
-      dp->d_off = kdp->d_off;
-      dp->d_reclen = new_reclen;
-      dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
-      memcpy (dp->d_name, kdp->d_name,
-	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
+	  dp->d_ino = kdp->d_ino;
+	  dp->d_off = last_offset = kdp->d_off;
+	  dp->d_reclen = new_reclen;
+	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
+	  memcpy (dp->d_name, kdp->d_name,
+		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
 
-      dp = (struct dirent64 *) ((char *) dp + new_reclen);
-      kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
+	  dp = (struct dirent64 *) ((char *) dp + new_reclen);
+	  kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
+	}
     }
 
-  scratch_buffer_free (&tmpbuf);
-  return (char *) dp - buf;
+out:
+  return (char *) dp - (char *) buf;
 }
 libc_hidden_def (__getdents64)
 weak_alias (__getdents64, getdents64)