[5/5] stdlib: Remove lstat usage from realpath [BZ #24970]

Message ID 20201224151701.1751008-6-adhemerval.zanella@linaro.org
State Superseded
Headers
Series Fix multiple realpath issues |

Commit Message

Adhemerval Zanella Netto Dec. 24, 2020, 3:17 p.m. UTC
  The readlink already tells whether the file is a symlink, so there is
no need to call lstat to check it.  However for '..' it requires an
extra readlink check if the previous component can be really accessed,
otherwise the next iteration will check a possible valid path and end
early.  It should performance-wise acceptable and a gain over lstat,
afaik symlink should not update any inode information.

It also fixes the stdlib/test-canon issued from the gnulib sync.

Checked on x86_64-linux-gnu.
---
 include/scratch_buffer.h |  21 ++++++
 stdlib/canonicalize.c    | 139 +++++++++++++++++++++++----------------
 2 files changed, 102 insertions(+), 58 deletions(-)
  

Comments

Paul Eggert Dec. 25, 2020, 12:27 a.m. UTC | #1
This email finishes the review of this proposed glibc patchset. (I 
didn't look at patch 4/5 for test cases.)

On 12/24/20 7:17 AM, Adhemerval Zanella wrote:

> +/* Check if BUFFER is using the internal buffer.  */
> +static __always_inline bool
> +scratch_buffer_using_internal (struct scratch_buffer *buffer)
> +{
> +  return buffer->data == buffer->__space.__c;
> +}
> +
> +/* Return the internal buffer from BUFFER if it is dynamic allocated,
> +   otherwise returns NULL.  Initializes the BUFFER if the internal
> +   dynamic buffer is returned.  */
> +static __always_inline void *
> +scratch_buffer_take_buffer (struct scratch_buffer *buffer)
> +{
> +  if (scratch_buffer_using_internal (buffer))
> +    return NULL;
> +
> +  void *r = buffer->data;
> +  scratch_buffer_init (buffer);
> +  return r;
> +}

This combination of functions is a little tricky. Instead, how about a 
single function that duplicates the scratch buffer on the heap, and 
frees the scratch buffer? Please see the attached proposed patch for 
Gnulib, which implements this idea. (I have not installed this into Gnulib.)

Also, shouldn't we merge the already-existing Gnulib scratch_buffer 
changes into glibc, along with this new change?

>   static idx_t
> +readlink_scratch_buffer (const char *path, struct scratch_buffer *buf)
> +{
> +  ssize_t r;
> +  while (true)
> +    {
> +      ptrdiff_t bufsize = buf->length;
> +      r = __readlink (path, buf->data, bufsize - 1);
> +      if (r < bufsize - 1)
> +	break;
> +      if (!scratch_buffer_grow (buf))
> +	return -1;
> +    }
> +  return r;
> +}

This function seems to exist because the proposed code calls readlink in 
two places. Current gnulib has been changed to call it in just one 
place, so there's less need to split out the function (the splitting 
complicates out-of-memory checking).

> -  scratch_buffer_init (rname_buf);
> -  char *rname_on_stack = rname_buf->data;
> ...
> +  scratch_buffer_init (&rname_buffer);
> +  char *rname = rname_buffer.data;

Doesn't this sort of thing have the potential to run into GCC bug 93644? 
That bug tends to be flaky; change platforms, or a few lines of code, 
and the problem recurs. Although it's just a bogus warning it cannot be 
turned off Gnulib has a GCC_LINT fix for this, which glibc could use 
simply with "#define GCC_LINT 1" in the "#ifdef _GLIBC" code.

> @@ -206,6 +219,20 @@ __realpath (const char *name, char *resolved)
>           /* nothing */;
>         else if (startlen == 2 && start[0] == '.' && start[1] == '.')
>           {
> +          if (!ISSLASH (dest[-1]))
> +            *dest++ = '/';
> +          *dest = '\0';
> +
> +	  ssize_t n = readlink_scratch_buffer (rname, &link_buffer);
> +          if (n < 0)
> +            {
> +              if (errno == ENOTDIR && dest[-1] == '/')
> +                dest[-1] = '\0';
> +	      if (errno == ENOMEM)
> +		goto error_nomem;
> +              if (errno != EINVAL)
> +                goto error;
> +            }

This can call readlink twice, once with trailing slash and once without. 
Better to call it just once.

> +	      char *buf = (char*) link_buffer.data;
>   
>                 buf[n] = '\0';
>   
> @@ -279,7 +296,7 @@ __realpath (const char *name, char *resolved)
>   
>                 /* Careful here, end may be a pointer into extra_buf... */
>                 memmove (&extra_buf[n], end, len + 1);
> -              name = end = memcpy (extra_buf, buf, n);
> +              name = end = memcpy (extra_buf, link_buffer.data, n);

If buf already equals link_buffer.data, no need for the patch to change 
buf to link_buffer.data.

> -          else if (! (startlen == 0
> -                      ? stat (rname, &st) == 0 || errno == EOVERFLOW
> -                      : errno == EINVAL))
> -            goto error;

I think current Gnulib addresses this issue in a different way, that 
doesn't involve the extra readlink calls.
>     if (DOUBLE_SLASH_IS_DISTINCT_ROOT && dest == rname + 1 && !prefix_len
>         && ISSLASH (*dest) && !ISSLASH (dest[1]))
>       dest++;
> +  *dest = '\0';
>     failed = false;
>   
>   error:
> -  *dest++ = '\0';

This looks dubious, as the error case also needs *dest to be '\0' and to 
increment dest (for when returning NULL when resolved != NULL).

Proposed patch to Gnulib attached. I hope this patch (along with what's 
already in Gnulib) addresses all the issues raised in your glibc 
patches, in the sense that the relevant files can be identical in Glibc 
and in Gnulib. I haven't installed this into Gnulib master on savannah.
  
Adhemerval Zanella Netto Dec. 28, 2020, 11:42 a.m. UTC | #2
On 24/12/2020 21:27, Paul Eggert wrote:
> This email finishes the review of this proposed glibc patchset. (I didn't look at patch 4/5 for test cases.)
> 
> On 12/24/20 7:17 AM, Adhemerval Zanella wrote:
> 
>> +/* Check if BUFFER is using the internal buffer.  */
>> +static __always_inline bool
>> +scratch_buffer_using_internal (struct scratch_buffer *buffer)
>> +{
>> +  return buffer->data == buffer->__space.__c;
>> +}
>> +
>> +/* Return the internal buffer from BUFFER if it is dynamic allocated,
>> +   otherwise returns NULL.  Initializes the BUFFER if the internal
>> +   dynamic buffer is returned.  */
>> +static __always_inline void *
>> +scratch_buffer_take_buffer (struct scratch_buffer *buffer)
>> +{
>> +  if (scratch_buffer_using_internal (buffer))
>> +    return NULL;
>> +
>> +  void *r = buffer->data;
>> +  scratch_buffer_init (buffer);
>> +  return r;
>> +}
> 
> This combination of functions is a little tricky. Instead, how about a single function that duplicates the scratch buffer on the heap, and frees the scratch buffer? Please see the attached proposed patch for Gnulib, which implements this idea. (I have not installed this into Gnulib.)

Indeed this seems a better alternative.

> 
> Also, shouldn't we merge the already-existing Gnulib scratch_buffer changes into glibc, along with this new change?

Which changes are you referring? Checking against bbaba6ce5 I see all
glibc files for scratch_buffer are similar to the gnulib ones.

> 
>>   static idx_t
>> +readlink_scratch_buffer (const char *path, struct scratch_buffer *buf)
>> +{
>> +  ssize_t r;
>> +  while (true)
>> +    {
>> +      ptrdiff_t bufsize = buf->length;
>> +      r = __readlink (path, buf->data, bufsize - 1);
>> +      if (r < bufsize - 1)
>> +    break;
>> +      if (!scratch_buffer_grow (buf))
>> +    return -1;
>> +    }
>> +  return r;
>> +}
> 
> This function seems to exist because the proposed code calls readlink in two places. Current gnulib has been changed to call it in just one place, so there's less need to split out the function (the splitting complicates out-of-memory checking).

Yes, this trades a stat call by an extra readlink call.  However it seems
that your strategy to use faccessat should be better, assuming it is lighter
syscall.

> 
>> -  scratch_buffer_init (rname_buf);
>> -  char *rname_on_stack = rname_buf->data;
>> ...
>> +  scratch_buffer_init (&rname_buffer);
>> +  char *rname = rname_buffer.data;
> 
> Doesn't this sort of thing have the potential to run into GCC bug 93644? That bug tends to be flaky; change platforms, or a few lines of code, and the problem recurs. Although it's just a bogus warning it cannot be turned off Gnulib has a GCC_LINT fix for this, which glibc could use simply with "#define GCC_LINT 1" in the "#ifdef _GLIBC" code.

I wasn't aware on the GCC issue in fact (it seems to affect GCC 10/11
and I am using GCC 9.2.1).

> 
>> @@ -206,6 +219,20 @@ __realpath (const char *name, char *resolved)
>>           /* nothing */;
>>         else if (startlen == 2 && start[0] == '.' && start[1] == '.')
>>           {
>> +          if (!ISSLASH (dest[-1]))
>> +            *dest++ = '/';
>> +          *dest = '\0';
>> +
>> +      ssize_t n = readlink_scratch_buffer (rname, &link_buffer);
>> +          if (n < 0)
>> +            {
>> +              if (errno == ENOTDIR && dest[-1] == '/')
>> +                dest[-1] = '\0';
>> +          if (errno == ENOMEM)
>> +        goto error_nomem;
>> +              if (errno != EINVAL)
>> +                goto error;
>> +            }
> 
> This can call readlink twice, once with trailing slash and once without. Better to call it just once.

Right.

> 
>> +          char *buf = (char*) link_buffer.data;
>>                   buf[n] = '\0';
>>   @@ -279,7 +296,7 @@ __realpath (const char *name, char *resolved)
>>                   /* Careful here, end may be a pointer into extra_buf... */
>>                 memmove (&extra_buf[n], end, len + 1);
>> -              name = end = memcpy (extra_buf, buf, n);
>> +              name = end = memcpy (extra_buf, link_buffer.data, n);
> 
> If buf already equals link_buffer.data, no need for the patch to change buf to link_buffer.data.
> 

Indeed, this is a left over from development.

>> -          else if (! (startlen == 0
>> -                      ? stat (rname, &st) == 0 || errno == EOVERFLOW
>> -                      : errno == EINVAL))
>> -            goto error;
> 
> I think current Gnulib addresses this issue in a different way, that doesn't involve the extra readlink calls.
>>     if (DOUBLE_SLASH_IS_DISTINCT_ROOT && dest == rname + 1 && !prefix_len
>>         && ISSLASH (*dest) && !ISSLASH (dest[1]))

Yes, I noted now on gnulib master that you handled it with an faccessat 
on dir_check. 

>>       dest++;
>> +  *dest = '\0';
>>     failed = false;
>>     error:
>> -  *dest++ = '\0';
> 
> This looks dubious, as the error case also needs *dest to be '\0' and to increment dest (for when returning NULL when resolved != NULL).
> 

Yes, I think this is also a leftover from development.

> Proposed patch to Gnulib attached. I hope this patch (along with what's already in Gnulib) addresses all the issues raised in your glibc patches, in the sense that the relevant files can be identical in Glibc and in Gnulib. I haven't installed this into Gnulib master on savannah.

Changes seems ok, I will send a v3 with the proposed changes to sync
the gnulib headers, along with the scratch_buffer change, the gnulib
sync that fix all the issues, and the extra test.

Thanks for working on this.
  
Adhemerval Zanella Netto Dec. 28, 2020, 1:32 p.m. UTC | #3
On 28/12/2020 08:42, Adhemerval Zanella wrote:
>>>   static idx_t
>>> +readlink_scratch_buffer (const char *path, struct scratch_buffer *buf)
>>> +{
>>> +  ssize_t r;
>>> +  while (true)
>>> +    {
>>> +      ptrdiff_t bufsize = buf->length;
>>> +      r = __readlink (path, buf->data, bufsize - 1);
>>> +      if (r < bufsize - 1)
>>> +    break;
>>> +      if (!scratch_buffer_grow (buf))
>>> +    return -1;
>>> +    }
>>> +  return r;
>>> +}
>>
>> This function seems to exist because the proposed code calls readlink in two places. Current gnulib has been changed to call it in just one place, so there's less need to split out the function (the splitting complicates out-of-memory checking).
> 
> Yes, this trades a stat call by an extra readlink call.  However it seems
> that your strategy to use faccessat should be better, assuming it is lighter
> syscall.

Also, it would require a recent kernel (5.8) to avoid faccessat to fallback
to use __NR_faccessat (it would make each call to faccessat2 to issue
two syscall).  We can live with this and if it turns a performance issue we
ca the hack to set a global variable to avoid it.

Now this snippet:

 47 # include <sysdep.h>
 48 # ifdef __ASSUME_FACCESSAT2
 49 #  define FACCESSAT_NEVER_EOVERFLOWS __ASSUME_FACCESSAT2
 50 # else
 51 #  define FACCESSAT_NEVER_EOVERFLOWS true
 52 # endif

is not required, since Linux faccessat fallback uses LFS stat call that
does not return EOVERFLOW.  And it also bleeds Linux implementation details
on generic code.

I will send a patch to fix it.
  
Paul Eggert Dec. 28, 2020, 9:04 p.m. UTC | #4
On 12/28/20 3:42 AM, Adhemerval Zanella wrote:

> On 24/12/2020 21:27, Paul Eggert wrote:

>> Also, shouldn't we merge the already-existing Gnulib scratch_buffer changes into glibc, along with this new change?
> 
> Which changes are you referring? Checking against bbaba6ce5 I see all
> glibc files for scratch_buffer are similar to the gnulib ones.

Sorry false alarm. Don't know what I was thinking of.

I incorporated almost all the changes suggested by your 
recently-proposed glibc changes into Gnulib, by installing the attached 
into Gnulib savannah master. There are still a few minor discrepancies 
from what you proposed for glibc, and I plan to comment on those 
discrepancies in reply to your other emails.
  

Patch

diff --git a/include/scratch_buffer.h b/include/scratch_buffer.h
index c39da78629..7ae77e5160 100644
--- a/include/scratch_buffer.h
+++ b/include/scratch_buffer.h
@@ -132,4 +132,25 @@  scratch_buffer_set_array_size (struct scratch_buffer *buffer,
 			 (buffer, nelem, size));
 }
 
+/* Check if BUFFER is using the internal buffer.  */
+static __always_inline bool
+scratch_buffer_using_internal (struct scratch_buffer *buffer)
+{
+  return buffer->data == buffer->__space.__c;
+}
+
+/* Return the internal buffer from BUFFER if it is dynamic allocated,
+   otherwise returns NULL.  Initializes the BUFFER if the internal
+   dynamic buffer is returned.  */
+static __always_inline void *
+scratch_buffer_take_buffer (struct scratch_buffer *buffer)
+{
+  if (scratch_buffer_using_internal (buffer))
+    return NULL;
+
+  void *r = buffer->data;
+  scratch_buffer_init (buffer);
+  return r;
+}
+
 #endif /* _SCRATCH_BUFFER_H */
diff --git a/stdlib/canonicalize.c b/stdlib/canonicalize.c
index 9111d92a4c..78a06227d2 100644
--- a/stdlib/canonicalize.c
+++ b/stdlib/canonicalize.c
@@ -88,6 +88,21 @@ 
 
 #if !FUNC_REALPATH_WORKS || defined _LIBC
 static idx_t
+readlink_scratch_buffer (const char *path, struct scratch_buffer *buf)
+{
+  ssize_t r;
+  while (true)
+    {
+      ptrdiff_t bufsize = buf->length;
+      r = __readlink (path, buf->data, bufsize - 1);
+      if (r < bufsize - 1)
+	break;
+      if (!scratch_buffer_grow (buf))
+	return -1;
+    }
+  return r;
+}
+static idx_t
 get_path_max (void)
 {
 # ifdef PATH_MAX
@@ -144,12 +159,10 @@  __realpath (const char *name, char *resolved)
 
   struct scratch_buffer extra_buffer, link_buffer;
   struct scratch_buffer rname_buffer;
-  struct scratch_buffer *rname_buf = &rname_buffer;
   scratch_buffer_init (&extra_buffer);
   scratch_buffer_init (&link_buffer);
-  scratch_buffer_init (rname_buf);
-  char *rname_on_stack = rname_buf->data;
-  char *rname = rname_on_stack;
+  scratch_buffer_init (&rname_buffer);
+  char *rname = rname_buffer.data;
   bool end_in_extra_buffer = false;
   bool failed = true;
 
@@ -159,16 +172,16 @@  __realpath (const char *name, char *resolved)
 
   if (!IS_ABSOLUTE_FILE_NAME (name))
     {
-      while (!__getcwd (rname, rname_buf->length))
+      while (!__getcwd (rname, rname_buffer.length))
         {
           if (errno != ERANGE)
             {
               dest = rname;
               goto error;
             }
-          if (!scratch_buffer_grow (rname_buf))
+          if (!scratch_buffer_grow (&rname_buffer))
             goto error_nomem;
-          rname = rname_buf->data;
+	  rname = rname_buffer.data;
         }
       dest = __rawmemchr (rname, '\0');
       start = name;
@@ -188,7 +201,7 @@  __realpath (const char *name, char *resolved)
       start = name + prefix_len;
     }
 
-  for ( ; *start; start = end)
+  for (end = start ; *start; start = end)
     {
       /* Skip sequence of multiple file name separators.  */
       while (ISSLASH (*start))
@@ -206,6 +219,20 @@  __realpath (const char *name, char *resolved)
         /* nothing */;
       else if (startlen == 2 && start[0] == '.' && start[1] == '.')
         {
+          if (!ISSLASH (dest[-1]))
+            *dest++ = '/';
+          *dest = '\0';
+
+	  ssize_t n = readlink_scratch_buffer (rname, &link_buffer);
+          if (n < 0)
+            {
+              if (errno == ENOTDIR && dest[-1] == '/')
+                dest[-1] = '\0';
+	      if (errno == ENOMEM)
+		goto error_nomem;
+              if (errno != EINVAL)
+                goto error;
+            }
           /* Back up to previous component, ignore if at root already.  */
           if (dest > rname + prefix_len + 1)
             for (--dest; dest > rname && !ISSLASH (dest[-1]); --dest)
@@ -220,46 +247,36 @@  __realpath (const char *name, char *resolved)
           if (!ISSLASH (dest[-1]))
             *dest++ = '/';
 
-          while (rname + rname_buf->length - dest <= startlen)
+          while (rname + rname_buffer.length - dest <= startlen)
             {
               idx_t dest_offset = dest - rname;
-              if (!scratch_buffer_grow_preserve (rname_buf))
+              if (!scratch_buffer_grow_preserve (&rname_buffer))
                 goto error_nomem;
-              rname = rname_buf->data;
+              rname = rname_buffer.data;
               dest = rname + dest_offset;
             }
 
           dest = __mempcpy (dest, start, startlen);
           *dest = '\0';
 
-          /* If STARTLEN == 0, RNAME ends in '/'; use stat rather than
-             readlink, because readlink might fail with EINVAL without
-             checking whether RNAME sans '/' is valid.  */
-          struct stat st;
-          char *buf = NULL;
-          ssize_t n;
-          if (startlen != 0)
+          ssize_t n = readlink_scratch_buffer (rname, &link_buffer);
+          if (n < 0)
             {
-              while (true)
-                {
-                  buf = link_buffer.data;
-                  idx_t bufsize = link_buffer.length;
-                  n = __readlink (rname, buf, bufsize - 1);
-                  if (n < bufsize - 1)
-                    break;
-                  if (!scratch_buffer_grow (&link_buffer))
-                    goto error_nomem;
-                }
-              if (n < 0)
-                buf = NULL;
+              if (errno == ENOTDIR && dest[-1] == '/')
+                dest[-1] = '\0';
+              if (errno == ENOMEM)
+                goto error_nomem;
+              if (errno != EINVAL)
+                goto error;
             }
-          if (buf)
+          else
             {
               if (++num_links > __eloop_threshold ())
                 {
                   __set_errno (ELOOP);
                   goto error;
                 }
+	      char *buf = (char*) link_buffer.data;
 
               buf[n] = '\0';
 
@@ -279,7 +296,7 @@  __realpath (const char *name, char *resolved)
 
               /* Careful here, end may be a pointer into extra_buf... */
               memmove (&extra_buf[n], end, len + 1);
-              name = end = memcpy (extra_buf, buf, n);
+              name = end = memcpy (extra_buf, link_buffer.data, n);
               end_in_extra_buffer = true;
 
               if (IS_ABSOLUTE_FILE_NAME (buf))
@@ -309,10 +326,6 @@  __realpath (const char *name, char *resolved)
                     dest++;
                 }
             }
-          else if (! (startlen == 0
-                      ? stat (rname, &st) == 0 || errno == EOVERFLOW
-                      : errno == EINVAL))
-            goto error;
         }
     }
   if (dest > rname + prefix_len + 1 && ISSLASH (dest[-1]))
@@ -320,34 +333,44 @@  __realpath (const char *name, char *resolved)
   if (DOUBLE_SLASH_IS_DISTINCT_ROOT && dest == rname + 1 && !prefix_len
       && ISSLASH (*dest) && !ISSLASH (dest[1]))
     dest++;
+  *dest = '\0';
   failed = false;
 
 error:
-  *dest++ = '\0';
-  if (resolved != NULL && dest - rname <= get_path_max ())
-    rname = strcpy (resolved, rname);
+  if (resolved != NULL)
+    {
+      if (dest - rname <= get_path_max ())
+        rname = strcpy (resolved, rname);
+    }
+  else
+    {
+      if (rname == resolved)
+	return rname;
+
+      idx_t rname_size = dest - rname;
+      if (scratch_buffer_using_internal (&rname_buffer))
+        {
+          rname = malloc (rname_size + 1);
+	  if (rname != NULL)
+	    {
+              memcpy (rname, rname_buffer.data, rname_size);
+	      rname[rname_size] = '\0';
+	    }
+        }
+      else
+        {
+	  rname = scratch_buffer_take_buffer (&rname_buffer);
+	  char *result = realloc (rname, rname_size);
+	  if (result != NULL)
+	    rname = result;
+        }
+    }
 
 error_nomem:
-  scratch_buffer_free (&extra_buffer);
   scratch_buffer_free (&link_buffer);
-  if (failed || rname == resolved)
-    scratch_buffer_free (rname_buf);
-
-  if (failed)
-    return NULL;
-
-  if (rname == resolved)
-    return rname;
-  idx_t rname_size = dest - rname;
-  if (rname == rname_on_stack)
-    {
-      rname = malloc (rname_size);
-      if (rname == NULL)
-        return NULL;
-      return memcpy (rname, rname_on_stack, rname_size);
-    }
-  char *result = realloc (rname, rname_size);
-  return result != NULL ? result : rname;
+  scratch_buffer_free (&extra_buffer);
+  scratch_buffer_free (&rname_buffer);
+  return failed ? NULL : rname;
 }
 libc_hidden_def (__realpath)
 versioned_symbol (libc, __realpath, realpath, GLIBC_2_3);