[RFC] Linux: Add seccomp probing to faccessat2

Message ID 87zh3628ua.fsf@oldenburg2.str.redhat.com
State Dropped
Headers
Series [RFC] Linux: Add seccomp probing to faccessat2 |

Commit Message

Florian Weimer Nov. 24, 2020, 1:16 p.m. UTC
  Some container runtimes cause faccessat2 to fail unconditionally
with EPERM.  Since it is conceivable that the real faccessat2
implementation can return EPERM (e.g., triggered by a Linux
Security Module), unconditional fallback to the incorrect workaround
on EPERM seems wrong.  Instead, a probing sequence attempts to
figure out whether the error comes from a seccomp filter or the
kernel.

Related kernel discussion:

<https://lore.kernel.org/linux-api/87lfer2c0b.fsf@oldenburg2.str.redhat.com/>

Fixes commit 3d3ab573a5f3071992cbc4f57d50d1d29d55bde2
("Linux: Use faccessat2 to implement faccessat (bug 18683)").


This is not a real submission, I just want to show how this would like
in glibc.  I haven't actually tested it.  As I said on the kernel
thread, I'd like to see some reluctant support from kernel developers
before we go in this direction.

---
 sysdeps/unix/sysv/linux/faccessat.c | 39 +++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)
  

Comments

Adhemerval Zanella Nov. 24, 2020, 5:39 p.m. UTC | #1
On 24/11/2020 10:16, Florian Weimer via Libc-alpha wrote:
> Some container runtimes cause faccessat2 to fail unconditionally
> with EPERM.  Since it is conceivable that the real faccessat2
> implementation can return EPERM (e.g., triggered by a Linux
> Security Module), unconditional fallback to the incorrect workaround
> on EPERM seems wrong.  Instead, a probing sequence attempts to
> figure out whether the error comes from a seccomp filter or the
> kernel.
> 
> Related kernel discussion:
> 
> <https://lore.kernel.org/linux-api/87lfer2c0b.fsf@oldenburg2.str.redhat.com/>
> 
> Fixes commit 3d3ab573a5f3071992cbc4f57d50d1d29d55bde2
> ("Linux: Use faccessat2 to implement faccessat (bug 18683)").
> 
> 
> This is not a real submission, I just want to show how this would like
> in glibc.  I haven't actually tested it.  As I said on the kernel
> thread, I'd like to see some reluctant support from kernel developers
> before we go in this direction.

My understanding from the discussion you had start at opencontainers [1]
is the standard and the problematic container runtimes need to be changed
to return ENOSYS and that the probing mechanism is fragile and hacky,
specially when used along syscalls that potentially may return EPERM.

I personally really dislike this hack solution for a ill-defined interface,
I don't think this change should be pushed upstream.

[1] https://groups.google.com/a/opencontainers.org/g/dev/c/gj9ErIn5LQI


> 
> ---
>  sysdeps/unix/sysv/linux/faccessat.c | 39 +++++++++++++++++++++++++++++++------
>  1 file changed, 33 insertions(+), 6 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/faccessat.c b/sysdeps/unix/sysv/linux/faccessat.c
> index 5d078371b5..e39c046472 100644
> --- a/sysdeps/unix/sysv/linux/faccessat.c
> +++ b/sysdeps/unix/sysv/linux/faccessat.c
> @@ -17,26 +17,53 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #include <fcntl.h>
> -#include <unistd.h>
> -#include <sys/types.h>
> +#include <stdbool.h>
>  #include <sys/stat.h>
> +#include <sys/types.h>
>  #include <sysdep.h>
> +#include <unistd.h>
>  
> +#if !__ASSUME_FACCESSAT2
> +/* Used to make sure that an EPERM error came from the kernel and not
> +   a system call filter.  */
> +static bool
> +check_for_eperm (int fd, const char *file, int mode, int flag)
> +{
> +  int ret = INTERNAL_SYSCALL_CALL (faccessat2, fd, file, mode, flag);
> +  return (INTERNAL_SYSCALL_ERROR_P (ret)
> +	  && INTERNAL_SYSCALL_ERRNO (ret) == EPERM);
> +}
> +#endif
>  
>  int
>  faccessat (int fd, const char *file, int mode, int flag)
>  {
> -  int ret = INLINE_SYSCALL_CALL (faccessat2, fd, file, mode, flag);
>  #if __ASSUME_FACCESSAT2
> -  return ret;
> +  return INLINE_SYSCALL_CALL (faccessat2, fd, file, mode, flag);
>  #else
> -  if (ret == 0 || errno != ENOSYS)
> +  /* Prefer the old system call if no flags are specified, to avoid
> +     any complex fallback in that case.  */
> +  if (flag == 0)
> +    return INLINE_SYSCALL (faccessat, 3, fd, file, mode);
> +
> +  int ret = INLINE_SYSCALL_CALL (faccessat2, fd, file, mode, flag);
> +  if (ret == 0 || (errno != ENOSYS && errno != EPERM))
> +    return ret;
> +
> +  /* Workaround for seccomp filters and Linux containers: Check that
> +     the EPERM system call is real by probing for known error
> +     conditions.  If either probe does not fail with EPERM, it
> +     suggests that there is no seccomp filter in place, and the
> +     initial EPERM error came from the kernel.  */
> +  if (errno == EPERM
> +      && (!check_for_eperm (-1, "", 0, 0) /* EBADFD expected.  */
> +	  || !check_for_eperm (fd, NULL, 0, 0))) /* EFAULT expected.  */
>      return ret;
>  
>    if (flag & ~(AT_SYMLINK_NOFOLLOW | AT_EACCESS))
>      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
>  
> -  if ((flag == 0 || ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure)))
> +  if ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure)
>      return INLINE_SYSCALL (faccessat, 3, fd, file, mode);
>  
>    struct stat64 stats;
>
  

Patch

diff --git a/sysdeps/unix/sysv/linux/faccessat.c b/sysdeps/unix/sysv/linux/faccessat.c
index 5d078371b5..e39c046472 100644
--- a/sysdeps/unix/sysv/linux/faccessat.c
+++ b/sysdeps/unix/sysv/linux/faccessat.c
@@ -17,26 +17,53 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <fcntl.h>
-#include <unistd.h>
-#include <sys/types.h>
+#include <stdbool.h>
 #include <sys/stat.h>
+#include <sys/types.h>
 #include <sysdep.h>
+#include <unistd.h>
 
+#if !__ASSUME_FACCESSAT2
+/* Used to make sure that an EPERM error came from the kernel and not
+   a system call filter.  */
+static bool
+check_for_eperm (int fd, const char *file, int mode, int flag)
+{
+  int ret = INTERNAL_SYSCALL_CALL (faccessat2, fd, file, mode, flag);
+  return (INTERNAL_SYSCALL_ERROR_P (ret)
+	  && INTERNAL_SYSCALL_ERRNO (ret) == EPERM);
+}
+#endif
 
 int
 faccessat (int fd, const char *file, int mode, int flag)
 {
-  int ret = INLINE_SYSCALL_CALL (faccessat2, fd, file, mode, flag);
 #if __ASSUME_FACCESSAT2
-  return ret;
+  return INLINE_SYSCALL_CALL (faccessat2, fd, file, mode, flag);
 #else
-  if (ret == 0 || errno != ENOSYS)
+  /* Prefer the old system call if no flags are specified, to avoid
+     any complex fallback in that case.  */
+  if (flag == 0)
+    return INLINE_SYSCALL (faccessat, 3, fd, file, mode);
+
+  int ret = INLINE_SYSCALL_CALL (faccessat2, fd, file, mode, flag);
+  if (ret == 0 || (errno != ENOSYS && errno != EPERM))
+    return ret;
+
+  /* Workaround for seccomp filters and Linux containers: Check that
+     the EPERM system call is real by probing for known error
+     conditions.  If either probe does not fail with EPERM, it
+     suggests that there is no seccomp filter in place, and the
+     initial EPERM error came from the kernel.  */
+  if (errno == EPERM
+      && (!check_for_eperm (-1, "", 0, 0) /* EBADFD expected.  */
+	  || !check_for_eperm (fd, NULL, 0, 0))) /* EFAULT expected.  */
     return ret;
 
   if (flag & ~(AT_SYMLINK_NOFOLLOW | AT_EACCESS))
     return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
 
-  if ((flag == 0 || ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure)))
+  if ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure)
     return INLINE_SYSCALL (faccessat, 3, fd, file, mode);
 
   struct stat64 stats;