[v3,2/2] linux: Optimize realpath

Message ID 20200818214327.3121808-2-adhemerval.zanella@linaro.org
State Superseded
Headers
Series [v3,1/2] stdlib: Use scratch_buffer on realpath (BZ #26341) |

Commit Message

Adhemerval Zanella Netto Aug. 18, 2020, 9:43 p.m. UTC
  Changes from previous version [1]:

  - Use a scratch_buffer for the readlink call.
  - Fall back to the generic implementation if the readlink syscall fails
    (for the case where /proc is not mounted).

[1] https://sourceware.org/pipermail/libc-alpha/2020-August/116969.html

---

The Linux implementation uses the trick of opening the provided path and
reading the symlink pointed to by '/proc/self/fd/<file>'.  As in the generic
implementation, the link is read using a scratch_buffer, so default
stack usage is limited to around 1k (with dynamic allocation bounded by
PATH_MAX).

Regarding syscall usage, for a successful path without symlinks it
trades 2 syscalls (getcwd/lstat) for 3 (openat, readlink, and close).
This optimization is better if the input contains multiple symlinks
(where it replaces multiple lstat calls with a single readlink).  For the
failure case it depends on whether the 'resolved' buffer is provided, in
which case the generic strategy is called (thus requiring more syscalls in general).

Checked on x86_64-linux-gnu and i686-linux-gnu.
---
 include/scratch_buffer.h                |  13 ++
 include/stdlib.h                        |  16 ++
 stdlib/Makefile                         |   2 +-
 stdlib/canonicalize.c                   | 219 +----------------------
 stdlib/realpath-impl.c                  |  25 +++
 stdlib/resolve-path.c                   | 227 ++++++++++++++++++++++++
 sysdeps/unix/sysv/linux/realpath-impl.c |  69 +++++++
 7 files changed, 352 insertions(+), 219 deletions(-)
 create mode 100644 stdlib/realpath-impl.c
 create mode 100644 stdlib/resolve-path.c
 create mode 100644 sysdeps/unix/sysv/linux/realpath-impl.c
  

Patch

diff --git a/include/scratch_buffer.h b/include/scratch_buffer.h
index c39da78629..abf44d2860 100644
--- a/include/scratch_buffer.h
+++ b/include/scratch_buffer.h
@@ -86,6 +86,19 @@  scratch_buffer_free (struct scratch_buffer *buffer)
     free (buffer->data);
 }
 
+/* Returns BUFFER->data and re-initializes the internal state if it was
+   allocated, or NULL otherwise.  */
+static inline void *
+scratch_buffer_finalize (struct scratch_buffer *buffer)
+{
+  if (buffer->data == buffer->__space.__c)
+    return NULL;
+
+  void *r = buffer->data;
+  scratch_buffer_init (buffer);
+  return r;
+}
+
 /* Grow *BUFFER by some arbitrary amount.  The buffer contents is NOT
    preserved.  Return true on success, false on allocation failure (in
    which case the old buffer is freed).  On success, the new buffer is
diff --git a/include/stdlib.h b/include/stdlib.h
index ffcefd7b85..182de52f83 100644
--- a/include/stdlib.h
+++ b/include/stdlib.h
@@ -20,6 +20,14 @@ 
 
 # include <rtld-malloc.h>
 
+# ifndef PATH_MAX
+#  ifdef MAXPATHLEN
+#   define PATH_MAX MAXPATHLEN
+#  else
+#   define PATH_MAX 1024
+#  endif
+# endif
+
 extern __typeof (strtol_l) __strtol_l;
 extern __typeof (strtoul_l) __strtoul_l;
 extern __typeof (strtoll_l) __strtoll_l;
@@ -92,6 +100,14 @@  extern int __unsetenv (const char *__name) attribute_hidden;
 extern int __clearenv (void) attribute_hidden;
 extern char *__mktemp (char *__template) __THROW __nonnull ((1));
 extern char *__canonicalize_file_name (const char *__name);
+struct scratch_buffer;
+extern ssize_t __resolve_readlink (const char *rpath,
+				   struct scratch_buffer *out)
+     attribute_hidden;
+extern char *__resolve_path (const char *name, char *resolved)
+     attribute_hidden;
+extern char *__realpath_impl (const char *name, char *resolved)
+     attribute_hidden;
 extern char *__realpath (const char *__name, char *__resolved);
 libc_hidden_proto (__realpath)
 extern int __ptsname_r (int __fd, char *__buf, size_t __buflen)
diff --git a/stdlib/Makefile b/stdlib/Makefile
index 7093b8a584..b5d806696d 100644
--- a/stdlib/Makefile
+++ b/stdlib/Makefile
@@ -53,7 +53,7 @@  routines	:=							      \
 	strtof strtod strtold						      \
 	strtof_l strtod_l strtold_l					      \
 	strtof_nan strtod_nan strtold_nan				      \
-	system canonicalize						      \
+	system canonicalize resolve-path realpath-impl			      \
 	a64l l64a							      \
 	rpmatch strfmon strfmon_l getsubopt xpg_basename fmtmsg		      \
 	strtoimax strtoumax wcstoimax wcstoumax				      \
diff --git a/stdlib/canonicalize.c b/stdlib/canonicalize.c
index 43454f140c..c068b29043 100644
--- a/stdlib/canonicalize.c
+++ b/stdlib/canonicalize.c
@@ -16,90 +16,10 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <assert.h>
 #include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <limits.h>
-#include <sys/stat.h>
 #include <errno.h>
-#include <stddef.h>
-
-#include <scratch_buffer.h>
-#include <eloop-threshold.h>
 #include <shlib-compat.h>
 
-#ifndef PATH_MAX
-# ifdef MAXPATHLEN
-#  define PATH_MAX MAXPATHLEN
-# else
-#  define PATH_MAX 1024
-# endif
-#endif
-
-static ssize_t
-resolve_readlink (const char *rpath, struct scratch_buffer *out)
-{
-  do
-    {
-      ssize_t n = __readlink (rpath, out->data, out->length);
-      if (n == -1)
-	return -1;
-      else if (n == out->length)
-	{
-	  if (out->length > PATH_MAX)
-	    {
-	      __set_errno (ENAMETOOLONG);
-	      return -1;
-	    }
-	  if (!scratch_buffer_grow (out))
-	    return -1;
-	}
-      else
-	{
-	  ((char*)out->data)[n] = '\0';
-	  return n;
-	}
-    }
-  while (true);
-}
-
-static bool
-realpath_readlink (const char *rpath, const char *end, size_t path_max,
-		   size_t st_size, struct scratch_buffer *out)
-{
-  struct scratch_buffer buf;
-  scratch_buffer_init (&buf);
-
-  if (!scratch_buffer_set_array_size (&buf, st_size + 1, sizeof (char *)))
-    return false;
-
-  bool r = false;
-
-  ssize_t n = resolve_readlink (rpath, &buf);
-  if (n == -1)
-    goto out;
-
-  size_t len = strlen (end);
-  if (path_max - buf.length <= len)
-    {
-      __set_errno (ENAMETOOLONG);
-      goto out;
-    }
-
-  if (! scratch_buffer_set_array_size (out, n + len + 1, sizeof (char *)))
-    goto out;
-
-  memmove (out->data + n, end, len + 1);
-  memcpy (out->data, buf.data, n);
-
-  r = true;
-
-out:
-  scratch_buffer_free (&buf);
-  return r;
-}
-
 /* Return the canonical absolute name of file NAME.  A canonical name
    does not contain any `.', `..' components nor any repeated path
    separators ('/') or symlinks.  All path components must exist.  If
@@ -114,14 +34,6 @@  out:
 char *
 __realpath (const char *name, char *resolved)
 {
-  char *rpath, *dest;
-  const char *start, *end, *rpath_limit;
-  const size_t path_max = PATH_MAX;
-  int num_links = 0;
-  struct scratch_buffer extra_buf;
-
-  scratch_buffer_init (&extra_buf);
-
   if (name == NULL)
     {
       /* As per Single Unix Specification V2 we must return an error if
@@ -140,136 +52,7 @@  __realpath (const char *name, char *resolved)
       return NULL;
     }
 
-  if (resolved == NULL)
-    {
-      rpath = malloc (path_max);
-      if (rpath == NULL)
-	return NULL;
-    }
-  else
-    rpath = resolved;
-  rpath_limit = rpath + path_max;
-
-  if (name[0] != '/')
-    {
-      if (__getcwd (rpath, path_max) == NULL)
-	{
-	  rpath[0] = '\0';
-	  goto error;
-	}
-      dest = __rawmemchr (rpath, '\0');
-    }
-  else
-    {
-      rpath[0] = '/';
-      dest = rpath + 1;
-    }
-
-  for (start = end = name; *start; start = end)
-    {
-      struct stat64 st;
-
-      /* Skip sequence of multiple path-separators.  */
-      while (*start == '/')
-	++start;
-
-      /* Find end of path component.  */
-      for (end = start; *end && *end != '/'; ++end)
-	/* Nothing.  */;
-
-      if (end - start == 0)
-	break;
-      else if (end - start == 1 && start[0] == '.')
-	/* nothing */;
-      else if (end - start == 2 && start[0] == '.' && start[1] == '.')
-	{
-	  /* Back up to previous component, ignore if at root already.  */
-	  if (dest > rpath + 1)
-	    while ((--dest)[-1] != '/');
-	}
-      else
-	{
-	  size_t new_size;
-
-	  if (dest[-1] != '/')
-	    *dest++ = '/';
-
-	  if (dest + (end - start) >= rpath_limit)
-	    {
-	      ptrdiff_t dest_offset = dest - rpath;
-	      char *new_rpath;
-
-	      if (resolved)
-		{
-		  __set_errno (ENAMETOOLONG);
-		  if (dest > rpath + 1)
-		    dest--;
-		  *dest = '\0';
-		  goto error;
-		}
-	      new_size = rpath_limit - rpath;
-	      if (end - start + 1 > path_max)
-		new_size += end - start + 1;
-	      else
-		new_size += path_max;
-	      new_rpath = (char *) realloc (rpath, new_size);
-	      if (new_rpath == NULL)
-		goto error;
-	      rpath = new_rpath;
-	      rpath_limit = rpath + new_size;
-
-	      dest = rpath + dest_offset;
-	    }
-
-	  dest = __mempcpy (dest, start, end - start);
-	  *dest = '\0';
-
-	  if (__lstat64 (rpath, &st) < 0)
-	    goto error;
-
-	  if (S_ISLNK (st.st_mode))
-	    {
-	      if (++num_links > __eloop_threshold ())
-		{
-		  __set_errno (ELOOP);
-		  goto error;
-		}
-
-	      if (! realpath_readlink (rpath, end, path_max, st.st_size,
-				       &extra_buf))
-		goto error;
-
-	      name = end = extra_buf.data;
-
-	      if (((char *)extra_buf.data)[0] == '/')
-		dest = rpath + 1;	/* It's an absolute symlink */
-	      else
-		/* Back up to previous component, ignore if at root already: */
-		if (dest > rpath + 1)
-		  while ((--dest)[-1] != '/');
-	    }
-	  else if (!S_ISDIR (st.st_mode) && *end != '\0')
-	    {
-	      __set_errno (ENOTDIR);
-	      goto error;
-	    }
-	}
-    }
-  if (dest > rpath + 1 && dest[-1] == '/')
-    --dest;
-  *dest = '\0';
-
-  scratch_buffer_free (&extra_buf);
-
-  assert (resolved == NULL || resolved == rpath);
-  return rpath;
-
-error:
-  assert (resolved == NULL || resolved == rpath);
-  if (resolved == NULL)
-    free (rpath);
-  scratch_buffer_free (&extra_buf);
-  return NULL;
+  return __realpath_impl (name, resolved);
 }
 libc_hidden_def (__realpath)
 versioned_symbol (libc, __realpath, realpath, GLIBC_2_3);
diff --git a/stdlib/realpath-impl.c b/stdlib/realpath-impl.c
new file mode 100644
index 0000000000..e099e02c0d
--- /dev/null
+++ b/stdlib/realpath-impl.c
@@ -0,0 +1,25 @@ 
+/* realpath internal implementation.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+
+char *
+__realpath_impl (const char *name, char *resolved)
+{
+  return __resolve_path (name, resolved);
+}
diff --git a/stdlib/resolve-path.c b/stdlib/resolve-path.c
new file mode 100644
index 0000000000..bfceaef160
--- /dev/null
+++ b/stdlib/resolve-path.c
@@ -0,0 +1,227 @@ 
+/* Internal realpath function.
+   Copyright (C) 1996-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <scratch_buffer.h>
+#include <eloop-threshold.h>
+
+ssize_t
+__resolve_readlink (const char *rpath, struct scratch_buffer *out)
+{
+  do
+    {
+      ssize_t n = __readlink (rpath, out->data, out->length);
+      if (n == -1)
+	return -1;
+      else if (n == out->length)
+	{
+	  if (out->length > PATH_MAX)
+	    {
+	      __set_errno (ENAMETOOLONG);
+	      return -1;
+	    }
+	  if (!scratch_buffer_grow (out))
+	    return -1;
+	}
+      else
+	{
+	  ((char*)out->data)[n] = '\0';
+	  return n;
+	}
+    }
+  while (true);
+}
+
+static bool
+realpath_readlink (const char *rpath, const char *end, size_t path_max,
+		   size_t st_size, struct scratch_buffer *out)
+{
+  struct scratch_buffer buf;
+  scratch_buffer_init (&buf);
+
+  if (!scratch_buffer_set_array_size (&buf, st_size + 1, sizeof (char *)))
+    return false;
+
+  bool r = false;
+
+  ssize_t n = __resolve_readlink (rpath, &buf);
+  if (n == -1)
+    goto out;
+
+  size_t len = strlen (end);
+  if (path_max - buf.length <= len)
+    {
+      __set_errno (ENAMETOOLONG);
+      goto out;
+    }
+
+  if (! scratch_buffer_set_array_size (out, n + len + 1, sizeof (char *)))
+    goto out;
+
+  memmove (out->data + n, end, len + 1);
+  memcpy (out->data, buf.data, n);
+
+  r = true;
+
+out:
+  scratch_buffer_free (&buf);
+  return r;
+}
+
+char *
+__resolve_path (const char *name, char *resolved)
+{
+  char *rpath, *dest;
+  const char *start, *end, *rpath_limit;
+  const size_t path_max = PATH_MAX;
+  int num_links = 0;
+  struct scratch_buffer extra_buf;
+
+  scratch_buffer_init (&extra_buf);
+
+  if (resolved == NULL)
+    {
+      rpath = malloc (path_max);
+      if (rpath == NULL)
+	return NULL;
+    }
+  else
+    rpath = resolved;
+  rpath_limit = rpath + path_max;
+
+  if (name[0] != '/')
+    {
+      if (__getcwd (rpath, path_max) == NULL)
+	{
+	  rpath[0] = '\0';
+	  goto error;
+	}
+      dest = __rawmemchr (rpath, '\0');
+    }
+  else
+    {
+      rpath[0] = '/';
+      dest = rpath + 1;
+    }
+
+  for (start = end = name; *start; start = end)
+    {
+      struct stat64 st;
+
+      /* Skip sequence of multiple path-separators.  */
+      while (*start == '/')
+	++start;
+
+      /* Find end of path component.  */
+      for (end = start; *end && *end != '/'; ++end)
+	/* Nothing.  */;
+
+      if (end - start == 0)
+	break;
+      else if (end - start == 1 && start[0] == '.')
+	/* nothing */;
+      else if (end - start == 2 && start[0] == '.' && start[1] == '.')
+	{
+	  /* Back up to previous component, ignore if at root already.  */
+	  if (dest > rpath + 1)
+	    while ((--dest)[-1] != '/');
+	}
+      else
+	{
+	  size_t new_size;
+
+	  if (dest[-1] != '/')
+	    *dest++ = '/';
+
+	  if (dest + (end - start) >= rpath_limit)
+	    {
+	      ptrdiff_t dest_offset = dest - rpath;
+	      char *new_rpath;
+
+	      if (resolved)
+		{
+		  __set_errno (ENAMETOOLONG);
+		  if (dest > rpath + 1)
+		    dest--;
+		  *dest = '\0';
+		  goto error;
+		}
+	      new_size = rpath_limit - rpath;
+	      if (end - start + 1 > path_max)
+		new_size += end - start + 1;
+	      else
+		new_size += path_max;
+	      new_rpath = (char *) realloc (rpath, new_size);
+	      if (new_rpath == NULL)
+		goto error;
+	      rpath = new_rpath;
+	      rpath_limit = rpath + new_size;
+
+	      dest = rpath + dest_offset;
+	    }
+
+	  dest = __mempcpy (dest, start, end - start);
+	  *dest = '\0';
+
+	  if (__lstat64 (rpath, &st) < 0)
+	    goto error;
+
+	  if (S_ISLNK (st.st_mode))
+	    {
+	      if (++num_links > __eloop_threshold ())
+		{
+		  __set_errno (ELOOP);
+		  goto error;
+		}
+
+	      if (! realpath_readlink (rpath, end, path_max, st.st_size,
+				       &extra_buf))
+		goto error;
+
+	      name = end = extra_buf.data;
+
+	      if (((char *)extra_buf.data)[0] == '/')
+		dest = rpath + 1;	/* It's an absolute symlink */
+	      else
+		/* Back up to previous component, ignore if at root already: */
+		if (dest > rpath + 1)
+		  while ((--dest)[-1] != '/');
+	    }
+	  else if (!S_ISDIR (st.st_mode) && *end != '\0')
+	    {
+	      __set_errno (ENOTDIR);
+	      goto error;
+	    }
+	}
+    }
+  if (dest > rpath + 1 && dest[-1] == '/')
+    --dest;
+  *dest = '\0';
+
+  scratch_buffer_free (&extra_buf);
+  return rpath;
+
+error:
+  if (resolved == NULL)
+    free (rpath);
+  scratch_buffer_free (&extra_buf);
+  return NULL;
+}
diff --git a/sysdeps/unix/sysv/linux/realpath-impl.c b/sysdeps/unix/sysv/linux/realpath-impl.c
new file mode 100644
index 0000000000..aa3d0850f0
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/realpath-impl.c
@@ -0,0 +1,69 @@ 
+/* Return the canonical absolute name of a given file.  Linux version.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include <not-cancel.h>
+#include <fd_to_filename.h>
+#include <scratch_buffer.h>
+
+/* The Linux implementation optimizes the worst case where the path contains
+   multiple symlinks by making the kernel resolve and return the full path
+   by opening NAME and reading the resulting /proc/self/fd entry.  */
+
+char *
+__realpath_impl (const char *name, char *resolved)
+{
+  int fd = __open64_nocancel (name, O_PATH | O_NONBLOCK | O_CLOEXEC);
+  if (fd == -1)
+    {
+      /* If the call fails with either EACCES or ENOENT and resolved_path is
+	 not NULL, then the prefix of path that is not readable or does not
+	 exist is returned in resolved_path.  This is a GNU extension.  */
+      if (resolved != NULL)
+	__resolve_path (name, resolved);
+      return NULL;
+    }
+
+  struct fd_to_filename fdfilename;
+  const char *procname = __fd_to_filename (fd, &fdfilename);
+
+  struct scratch_buffer buf;
+  scratch_buffer_init (&buf);
+
+  ssize_t len = __resolve_readlink (procname, &buf);
+  __close_nocancel_nostatus (fd);
+
+  if (len < 0)
+    {
+      /* Fall back to the generic implementation if /proc is not mounted.  */
+      scratch_buffer_free (&buf);
+      return __resolve_path (name, resolved);
+    }
+
+  char *r;
+  if (resolved != NULL)
+    r = strcpy (resolved, buf.data);
+  else
+    {
+      /* If the buffer was allocated, return it instead of duplicating it.  */
+      r = scratch_buffer_finalize (&buf);
+      r = r ?: __strdup (buf.data);
+    }
+  scratch_buffer_free (&buf);
+  return r;
+}