[5/6] Optimize i386 syscall inlining

Message ID 20151012232028.GC8797@intel.com
State New, archived
Headers

Commit Message

Lu, Hongjiu Oct. 12, 2015, 11:20 p.m. UTC
  Since GCC 5 and above can properly spill %ebx when needed, we can inline
syscalls with 6 arguments if GCC 5 or above is used to compile glibc.
This patch rewrites the INTERNAL_SYSCALL macros and skips
__libc_do_syscall when glibc is compiled with GCC 5 or above.

For sysdeps/unix/sysv/linux/i386/brk.c, with -O2 -march=i686
-mtune=generic, GCC 5.2 now generates:

<__brk>:
   0:	push   %ebx
   1:	mov    $0x2d,%eax
   6:	mov    0x8(%esp),%ebx
   a:	call   b <__brk+0xb>	b: R_386_PC32	__x86.get_pc_thunk.dx
   f:	add    $0x2,%edx	11: R_386_GOTPC	_GLOBAL_OFFSET_TABLE_
  15:	call   *%gs:0x10
  1c:	mov    0x0(%edx),%edx	1e: R_386_GOT32	__curbrk
  22:	cmp    %eax,%ebx
  24:	mov    %eax,(%edx)
  26:	ja     30 <__brk+0x30>
  28:	xor    %eax,%eax
  2a:	pop    %ebx
  2b:	ret

instead of

<__brk>:
   0:	push   %ebx
   1:	mov    0x8(%esp),%ecx
   5:	call   6 <__brk+0x6>	6: R_386_PC32	__x86.get_pc_thunk.bx
   a:	add    $0x2,%ebx	c: R_386_GOTPC	_GLOBAL_OFFSET_TABLE_
  10:	xchg   %ecx,%ebx
  12:	mov    $0x2d,%eax
  17:	call   *%gs:0x10
  1e:	xchg   %ecx,%ebx
  20:	mov    %eax,%edx
  22:	mov    0x0(%ebx),%eax	24: R_386_GOT32	__curbrk
  28:	mov    %edx,(%eax)
  2a:	xor    %eax,%eax
  2c:	cmp    %edx,%ecx
  2e:	ja     38 <__brk+0x38>
  30:	pop    %ebx
  31:	ret

The new one is shorter by 3 instructions (13 instead of 16).

	* sysdeps/unix/sysv/linux/i386/libc-do-syscall.S
	(__libc_do_syscall): Define only if !__GNUC_PREREQ (5,0).
	* sysdeps/unix/sysv/linux/i386/sysdep.h: Define assembler macros
	only if !__GNUC_PREREQ (5,0).
	(INTERNAL_SYSCALL_MAIN_6): Optimize for GCC 5.
	(INTERNAL_SYSCALL_MAIN_INLINE): Likewise.
	(INTERNAL_SYSCALL_NCS): Likewise.
	(LOADREGS_0): New macro for GCC 5.
	(ASMARGS_0): Likewise.
	(LOADREGS_1): Likewise.
	(ASMARGS_1): Likewise.
	(LOADREGS_2): Likewise.
	(ASMARGS_2): Likewise.
	(LOADREGS_3): Likewise.
	(ASMARGS_3): Likewise.
	(LOADREGS_4): Likewise.
	(ASMARGS_4): Likewise.
	(LOADREGS_5): Likewise.
	(ASMARGS_5): Likewise.
	(LOADREGS_6): Likewise.
	(ASMARGS_6): Likewise.
---
 sysdeps/unix/sysv/linux/i386/libc-do-syscall.S |   3 +
 sysdeps/unix/sysv/linux/i386/sysdep.h          | 115 ++++++++++++++++++++++---
 2 files changed, 107 insertions(+), 11 deletions(-)
  

Comments

H.J. Lu Oct. 15, 2015, 12:22 p.m. UTC | #1
On Mon, Oct 12, 2015 at 4:20 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
> Since GCC 5 and above can properly spill %ebx when needed, we can inline
> syscalls with 6 arguments if GCC 5 or above is used to compile glibc.
> This patch rewrites INTERNAL_SYSCALL macros and skips __libc_do_syscall
> for GCC 5.
>
> For sysdeps/unix/sysv/linux/i386/brk.c, with -O2 -march=i686
> -mtune=generic, GCC 5.2 now generates:
>
> <__brk>:
>    0:   push   %ebx
>    1:   mov    $0x2d,%eax
>    6:   mov    0x8(%esp),%ebx
>    a:   call   b <__brk+0xb>    b: R_386_PC32   __x86.get_pc_thunk.dx
>    f:   add    $0x2,%edx        11: R_386_GOTPC _GLOBAL_OFFSET_TABLE_
>   15:   call   *%gs:0x10
>   1c:   mov    0x0(%edx),%edx   1e: R_386_GOT32 __curbrk
>   22:   cmp    %eax,%ebx
>   24:   mov    %eax,(%edx)
>   26:   ja     30 <__brk+0x30>
>   28:   xor    %eax,%eax
>   2a:   pop    %ebx
>   2b:   ret
>
> instead of
>
> <__brk>:
>    0:   push   %ebx
>    1:   mov    0x8(%esp),%ecx
>    5:   call   6 <__brk+0x6>    6: R_386_PC32   __x86.get_pc_thunk.bx
>    a:   add    $0x2,%ebx        c: R_386_GOTPC  _GLOBAL_OFFSET_TABLE_
>   10:   xchg   %ecx,%ebx
>   12:   mov    $0x2d,%eax
>   17:   call   *%gs:0x10
>   1e:   xchg   %ecx,%ebx
>   20:   mov    %eax,%edx
>   22:   mov    0x0(%ebx),%eax   24: R_386_GOT32 __curbrk
>   28:   mov    %edx,(%eax)
>   2a:   xor    %eax,%eax
>   2c:   cmp    %edx,%ecx
>   2e:   ja     38 <__brk+0x38>
>   30:   pop    %ebx
>   31:   ret
>
> The new one is shorter by 2 instructions.
>
>         * sysdeps/unix/sysv/linux/i386/libc-do-syscall.S
>         (__libc_do_syscall): Defined only if !__GNUC_PREREQ (5,0).
>         * sysdeps/unix/sysv/linux/i386/sysdep.h: Define assembler macros
>         only if !__GNUC_PREREQ (5,0).
>         (INTERNAL_SYSCALL_MAIN_6): Optimize for GCC 5.
>         (INTERNAL_SYSCALL_MAIN_INLINE): Likewise.
>         (INTERNAL_SYSCALL_NCS): Likewise.
>         (LOADREGS_0): New macro for GCC 5.
>         (ASMARGS_0): Likewise.
>         (LOADREGS_1): Likewise.
>         (ASMARGS_1): Likewise.
>         (LOADREGS_2): Likewise.
>         (ASMARGS_2): Likewise.
>         (LOADREGS_3): Likewise.
>         (ASMARGS_3): Likewise.
>         (LOADREGS_4): Likewise.
>         (ASMARGS_4): Likewise.
>         (LOADREGS_5): Likewise.
>         (ASMARGS_5): Likewise.
>         (LOADREGS_6): Likewise.
>         (ASMARGS_6): Likewise.

I am checking in this patch now.
  
Roland McGrath Oct. 15, 2015, 7:54 p.m. UTC | #2
This needs comments explaining why the compiler version conditional
makes sense.
  
Andreas Schwab Oct. 21, 2015, 2:18 p.m. UTC | #3
gcc ../sysdeps/unix/sysv/linux/posix_fallocate64.c -c -std=gnu99 -fgnu89-inline  -D_FORTIFY_SOURCE=2 -O2 -U_FORTIFY_SOURCE -Wall -Werror -Wundef -Wwrite-strings -fasynchronous-unwind-tables -fmerge-all-constants -fmessage-length=0 -frounding-math -funwind-tables -g -grecord-gcc-switches -Wstrict-prototypes  -pg -fexceptions  -ftls-model=initial-exec      -I../include -I/home/abuild/rpmbuild/BUILD/glibc-2.22.90/cc-base/io  -I/home/abuild/rpmbuild/BUILD/glibc-2.22.90/cc-base  -I../sysdeps/unix/sysv/linux/i386  -I../sysdeps/unix/sysv/linux/x86  -I../sysdeps/i386/nptl  -I../sysdeps/unix/sysv/linux/include -I../sysdeps/unix/sysv/linux  -I../sysdeps/nptl  -I../sysdeps/pthread  -I../sysdeps/gnu  -I../sysdeps/unix/inet  -I../sysdeps/unix/sysv  -I../sysdeps/unix/i386  -I../sysdeps/unix  -I../sysdeps/posix  -I../sysdeps/i386/i586  -I../sysdeps/i386/fpu  -I../sysdeps/x86/fpu/include -I../sysdeps/x86/fpu  -I../sysdeps/i386  -I../sysdeps/x86  -I../sysdeps/wordsize-32  -I../sysdeps/ieee754/ldbl-96  -I../sysdeps/ieee754/dbl-64  -I../sysdeps/ieee754/flt-32  -I../sysdeps/ieee754  -I../sysdeps/generic  -I.. -I../libio -I.   -D_LIBC_REENTRANT -include /home/abuild/rpmbuild/BUILD/glibc-2.22.90/cc-base/libc-modules.h -DMODULE_NAME=libc -include ../include/libc-symbols.h  -DPROF      -o /home/abuild/rpmbuild/BUILD/glibc-2.22.90/cc-base/io/posix_fallocate64.op -MD -MP -MF /home/abuild/rpmbuild/BUILD/glibc-2.22.90/cc-base/io/posix_fallocate64.op.dt -MT /home/abuild/rpmbuild/BUILD/glibc-2.22.90/cc-base/io/posix_fallocate64.op
../sysdeps/unix/sysv/linux/posix_fallocate.c: In function 'posix_fallocate':
../sysdeps/unix/sysv/linux/posix_fallocate.c:39:1: error: bp cannot be used in asm here

Andreas.
  
H.J. Lu Oct. 21, 2015, 2:34 p.m. UTC | #4
On Wed, Oct 21, 2015 at 7:18 AM, Andreas Schwab <schwab@suse.de> wrote:
> gcc ../sysdeps/unix/sysv/linux/posix_fallocate64.c -c -std=gnu99 -fgnu89-inline  -D_FORTIFY_SOURCE=2 -O2 -U_FORTIFY_SOURCE -Wall -Werror -Wundef -Wwrite-strings -fasynchronous-unwind-tables -fmerge-all-constants -fmessage-length=0 -frounding-math -funwind-tables -g -grecord-gcc-switches -Wstrict-prototypes  -pg -fexceptions  -ftls-model=initial-exec      -I../include -I/home/abuild/rpmbuild/BUILD/glibc-2.22.90/cc-base/io  -I/home/abuild/rpmbuild/BUILD/glibc-2.22.90/cc-base  -I../sysdeps/unix/sysv/linux/i386  -I../sysdeps/unix/sysv/linux/x86  -I../sysdeps/i386/nptl  -I../sysdeps/unix/sysv/linux/include -I../sysdeps/unix/sysv/linux  -I../sysdeps/nptl  -I../sysdeps/pthread  -I../sysdeps/gnu  -I../sysdeps/unix/inet  -I../sysdeps/unix/sysv  -I../sysdeps/unix/i386  -I../sysdeps/unix  -I../sysdeps/posix  -I../sysdeps/i386/i586  -I../sysdeps/i386/fpu  -I../sysdeps/x86/fpu/include -I../sysdeps/x86/fpu  -I../sysdeps/i386  -I../sysdeps/x86  -I../sysdeps/wordsize-32  -I../sysdeps/ieee754/ldbl-96  -I../sysdeps/ieee754/dbl-64  -I../sysdeps/ieee754/flt-32  -I../sysdeps/ieee754  -I../sysdeps/generic  -I.. -I../libio -I.   -D_LIBC_REENTRANT -include /home/abuild/rpmbuild/BUILD/glibc-2.22.90/cc-base/libc-modules.h -DMODULE_NAME=libc -include ../include/libc-symbols.h  -DPROF      -o /home/abuild/rpmbuild/BUILD/glibc-2.22.90/cc-base/io/posix_fallocate64.op -MD -MP -MF /home/abuild/rpmbuild/BUILD/glibc-2.22.90/cc-base/io/posix_fallocate64.op.dt -MT /home/abuild/rpmbuild/BUILD/glibc-2.22.90/cc-base/io/posix_fallocate64.op
> ../sysdeps/unix/sysv/linux/posix_fallocate.c: In function 'posix_fallocate':
> ../sysdeps/unix/sysv/linux/posix_fallocate.c:39:1: error: bp cannot be used in asm here

Which GCC are you using?
  
Andreas Schwab Oct. 21, 2015, 3:01 p.m. UTC | #5
"H.J. Lu" <hjl.tools@gmail.com> writes:

> Which GCC are you using?

https://build.opensuse.org/package/show/openSUSE:Factory/gcc5

Andreas.
  
Joseph Myers Oct. 21, 2015, 4:35 p.m. UTC | #6
I suppose this illustrates that we could do with buildbot slaves using 
--enable-profile (well, there are several improvements we could do with as 
noted at <https://sourceware.org/glibc/wiki/Buildbot>, but such 
configuration variants are among the things that would be useful).
  

Patch

diff --git a/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S b/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S
index af5c6f0..cdef3d5 100644
--- a/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S
+++ b/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S
@@ -18,6 +18,8 @@ 
 
 #include <sysdep.h>
 
+#if !__GNUC_PREREQ (5,0)
+
 /* %eax, %ecx, %edx and %esi contain the values expected by the kernel.
    %edi points to a structure with the values of %ebx, %edi and %ebp.  */
 
@@ -48,3 +50,4 @@  ENTRY (__libc_do_syscall)
 	cfi_restore (ebx)
 	ret
 END (__libc_do_syscall)
+#endif
diff --git a/sysdeps/unix/sysv/linux/i386/sysdep.h b/sysdeps/unix/sysv/linux/i386/sysdep.h
index 6197ff1..1515fa6 100644
--- a/sysdeps/unix/sysv/linux/i386/sysdep.h
+++ b/sysdeps/unix/sysv/linux/i386/sysdep.h
@@ -227,6 +227,7 @@ 
 extern int __syscall_error (int)
   attribute_hidden __attribute__ ((__regparm__ (1)));
 
+#if !__GNUC_PREREQ (5,0)
 /* We need some help from the assembler to generate optimal code.  We
    define some macros here which later will be used.  */
 asm (".L__X'%ebx = 1\n\t"
@@ -266,6 +267,7 @@  struct libc_do_syscall_args
 {
   int ebx, edi, ebp;
 };
+#endif
 
 /* Define a macro which expands inline into the wrapper code for a system
    call.  */
@@ -322,8 +324,12 @@  struct libc_do_syscall_args
     INTERNAL_SYSCALL_MAIN_INLINE(name, err, 5, args)
 /* Each object using 6-argument inline syscalls must include a
    definition of __libc_do_syscall.  */
-#define INTERNAL_SYSCALL_MAIN_6(name, err, arg1, arg2, arg3,		\
-				arg4, arg5, arg6)			\
+#if __GNUC_PREREQ (5,0)
+# define INTERNAL_SYSCALL_MAIN_6(name, err, args...) \
+    INTERNAL_SYSCALL_MAIN_INLINE(name, err, 6, args)
+#else /* GCC 5  */
+# define INTERNAL_SYSCALL_MAIN_6(name, err, arg1, arg2, arg3,		\
+				 arg4, arg5, arg6)			\
   struct libc_do_syscall_args _xv =					\
     {									\
       (int) (arg1),							\
@@ -336,14 +342,52 @@  struct libc_do_syscall_args
     : "=a" (resultvar)							\
     : "i" (__NR_##name), "c" (arg2), "d" (arg3), "S" (arg4), "D" (&_xv) \
     : "memory", "cc")
+#endif /* GCC 5  */
 #define INTERNAL_SYSCALL(name, err, nr, args...) \
   ({									      \
     register unsigned int resultvar;					      \
     INTERNAL_SYSCALL_MAIN_##nr (name, err, args);			      \
     (int) resultvar; })
 #ifdef I386_USE_SYSENTER
-# ifdef SHARED
-#  define INTERNAL_SYSCALL_MAIN_INLINE(name, err, nr, args...) \
+# if __GNUC_PREREQ (5,0)
+#  ifdef SHARED
+#   define INTERNAL_SYSCALL_MAIN_INLINE(name, err, nr, args...) \
+    LOADREGS_##nr(args)							\
+    asm volatile (							\
+    "call *%%gs:%P2"							\
+    : "=a" (resultvar)							\
+    : "a" (__NR_##name), "i" (offsetof (tcbhead_t, sysinfo))		\
+      ASMARGS_##nr(args) : "memory", "cc")
+#   define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
+  ({									\
+    register unsigned int resultvar;					\
+    LOADREGS_##nr(args)							\
+    asm volatile (							\
+    "call *%%gs:%P2"							\
+    : "=a" (resultvar)							\
+    : "a" (name), "i" (offsetof (tcbhead_t, sysinfo))			\
+      ASMARGS_##nr(args) : "memory", "cc");				\
+    (int) resultvar; })
+#  else
+#   define INTERNAL_SYSCALL_MAIN_INLINE(name, err, nr, args...) \
+    LOADREGS_##nr(args)							\
+    asm volatile (							\
+    "call *_dl_sysinfo"							\
+    : "=a" (resultvar)							\
+    : "a" (__NR_##name) ASMARGS_##nr(args) : "memory", "cc")
+#   define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
+  ({									\
+    register unsigned int resultvar;					\
+    LOADREGS_##nr(args)							\
+    asm volatile (							\
+    "call *_dl_sysinfo"							\
+    : "=a" (resultvar)							\
+    : "a" (name) ASMARGS_##nr(args) : "memory", "cc");			\
+    (int) resultvar; })
+#  endif
+# else /* GCC 5  */
+#  ifdef SHARED
+#   define INTERNAL_SYSCALL_MAIN_INLINE(name, err, nr, args...) \
     EXTRAVAR_##nr							      \
     asm volatile (							      \
     LOADARGS_##nr							      \
@@ -353,7 +397,7 @@  struct libc_do_syscall_args
     : "=a" (resultvar)							      \
     : "i" (__NR_##name), "i" (offsetof (tcbhead_t, sysinfo))		      \
       ASMFMT_##nr(args) : "memory", "cc")
-#  define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
+#   define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
   ({									      \
     register unsigned int resultvar;					      \
     EXTRAVAR_##nr							      \
@@ -365,8 +409,8 @@  struct libc_do_syscall_args
     : "0" (name), "i" (offsetof (tcbhead_t, sysinfo))			      \
       ASMFMT_##nr(args) : "memory", "cc");				      \
     (int) resultvar; })
-# else
-#  define INTERNAL_SYSCALL_MAIN_INLINE(name, err, nr, args...) \
+#  else
+#   define INTERNAL_SYSCALL_MAIN_INLINE(name, err, nr, args...) \
     EXTRAVAR_##nr							      \
     asm volatile (							      \
     LOADARGS_##nr							      \
@@ -375,7 +419,7 @@  struct libc_do_syscall_args
     RESTOREARGS_##nr							      \
     : "=a" (resultvar)							      \
     : "i" (__NR_##name) ASMFMT_##nr(args) : "memory", "cc")
-#  define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
+#   define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
   ({									      \
     register unsigned int resultvar;					      \
     EXTRAVAR_##nr							      \
@@ -386,9 +430,27 @@  struct libc_do_syscall_args
     : "=a" (resultvar)							      \
     : "0" (name) ASMFMT_##nr(args) : "memory", "cc");			      \
     (int) resultvar; })
-# endif
+#  endif
+# endif /* GCC 5  */
 #else
-# define INTERNAL_SYSCALL_MAIN_INLINE(name, err, nr, args...) \
+# if __GNUC_PREREQ (5,0)
+#  define INTERNAL_SYSCALL_MAIN_INLINE(name, err, nr, args...) \
+    LOADREGS_##nr(args)							\
+    asm volatile (							\
+    "int $0x80"								\
+    : "=a" (resultvar)							\
+    : "a" (__NR_##name) ASMARGS_##nr(args) : "memory", "cc")
+#  define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
+  ({									\
+    register unsigned int resultvar;					\
+    LOADREGS_##nr(args)							\
+    asm volatile (							\
+    "int $0x80"								\
+    : "=a" (resultvar)							\
+    : "a" (name) ASMARGS_##nr(args) : "memory", "cc");			\
+    (int) resultvar; })
+# else /* GCC 5  */
+#  define INTERNAL_SYSCALL_MAIN_INLINE(name, err, nr, args...) \
     EXTRAVAR_##nr							      \
     asm volatile (							      \
     LOADARGS_##nr							      \
@@ -397,7 +459,7 @@  struct libc_do_syscall_args
     RESTOREARGS_##nr							      \
     : "=a" (resultvar)							      \
     : "i" (__NR_##name) ASMFMT_##nr(args) : "memory", "cc")
-# define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
+#  define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
   ({									      \
     register unsigned int resultvar;					      \
     EXTRAVAR_##nr							      \
@@ -408,6 +470,7 @@  struct libc_do_syscall_args
     : "=a" (resultvar)							      \
     : "0" (name) ASMFMT_##nr(args) : "memory", "cc");			      \
     (int) resultvar; })
+# endif /* GCC 5  */
 #endif
 
 #undef INTERNAL_SYSCALL_DECL
@@ -472,6 +535,36 @@  struct libc_do_syscall_args
 # define RESTOREARGS_5
 #endif
 
+#if __GNUC_PREREQ (5,0)
+# define LOADREGS_0()
+# define ASMARGS_0()
+# define LOADREGS_1(arg1) \
+	LOADREGS_0 ()
+# define ASMARGS_1(arg1) \
+	ASMARGS_0 (), "b" ((unsigned int) (arg1))
+# define LOADREGS_2(arg1, arg2) \
+	LOADREGS_1 (arg1)
+# define ASMARGS_2(arg1, arg2) \
+	ASMARGS_1 (arg1), "c" ((unsigned int) (arg2))
+# define LOADREGS_3(arg1, arg2, arg3) \
+	LOADREGS_2 (arg1, arg2)
+# define ASMARGS_3(arg1, arg2, arg3) \
+	ASMARGS_2 (arg1, arg2), "d" ((unsigned int) (arg3))
+# define LOADREGS_4(arg1, arg2, arg3, arg4) \
+	LOADREGS_3 (arg1, arg2, arg3)
+# define ASMARGS_4(arg1, arg2, arg3, arg4) \
+	ASMARGS_3 (arg1, arg2, arg3), "S" ((unsigned int) (arg4))
+# define LOADREGS_5(arg1, arg2, arg3, arg4, arg5) \
+	LOADREGS_4 (arg1, arg2, arg3, arg4)
+# define ASMARGS_5(arg1, arg2, arg3, arg4, arg5) \
+	ASMARGS_4 (arg1, arg2, arg3, arg4), "D" ((unsigned int) (arg5))
+# define LOADREGS_6(arg1, arg2, arg3, arg4, arg5, arg6) \
+	register unsigned int _a6 asm ("ebp") = (unsigned int) (arg6); \
+	LOADREGS_5 (arg1, arg2, arg3, arg4, arg5)
+# define ASMARGS_6(arg1, arg2, arg3, arg4, arg5, arg6) \
+	ASMARGS_5 (arg1, arg2, arg3, arg4, arg5), "r" (_a6)
+#endif /* GCC 5  */
+
 #define ASMFMT_0()
 #ifdef __PIC__
 # define ASMFMT_1(arg1) \