diff mbox

Save and restore xmm0-xmm7 in _dl_runtime_resolve

Message ID 20150711235002.GA7543@gmail.com
State Not Applicable
Headers show

Commit Message

H.J. Lu July 11, 2015, 11:50 p.m. UTC
On Sat, Jul 11, 2015 at 01:27:42PM -0700, H.J. Lu wrote:
> On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
> > On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
> > > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> > > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
> > > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> > > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
> > > >> >> wrote:
> > > >> >> > Fixed in the attached patch
> > > >> >> >
> > > >> >>
> > > >> >> I fixed some typos and updated sysdeps/i386/configure for
> > > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
> > > >> >> without on i386 and x86-64.
> > > >> >
> > > >> > Done, all works fine
> > > >> >
> > > >>
> > > >> I checked it in for you.
> > > >>
> > > > These are nice but you could have same problem with lazy tls allocation.
> > > > I wrote patch to merge trampolines, which now conflicts. Could you write
> > > > similar patch to solve that? Original purpose was to always save xmm
> > > > registers so we could use sse2 routines which speeds up lookup time.
> > > 
> > > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
> > > much gain it will give us?
> > >
> > I couldn't measure that without patch. Gain now would be big as we now
> > use byte-by-byte loop to check symbol name which is slow, especially
> > with c++ name mangling. Would be following benchmark good to measure
> > speedup or do I need to measure startup time which is bit harder?
> > 
> 
> Please try this.
> 

We have to use movups instead of movaps due to

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066


H.J.
---
 sysdeps/x86_64/dl-trampoline.S | 51 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 7 deletions(-)

Comments

H.J. Lu July 12, 2015, 9:50 p.m. UTC | #1
On Sat, Jul 11, 2015 at 4:50 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, Jul 11, 2015 at 01:27:42PM -0700, H.J. Lu wrote:
>> On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
>> > On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
>> > > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> > > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
>> > > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>> > > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
>> > > >> >> wrote:
>> > > >> >> > Fixed in the attached patch
>> > > >> >> >
>> > > >> >>
>> > > >> >> I fixed some typos and updated sysdeps/i386/configure for
>> > > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
>> > > >> >> without on i386 and x86-64.
>> > > >> >
>> > > >> > Done, all works fine
>> > > >> >
>> > > >>
>> > > >> I checked it in for you.
>> > > >>
>> > > > These are nice but you could have same problem with lazy tls allocation.
>> > > > I wrote patch to merge trampolines, which now conflicts. Could you write
>> > > > similar patch to solve that? Original purpose was to always save xmm
>> > > > registers so we could use sse2 routines which speeds up lookup time.
>> > >
>> > > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
>> > > much gain it will give us?
>> > >
>> > I couldn't measure that without patch. Gain now would be big as we now
>> > use byte-by-byte loop to check symbol name which is slow, especially
>> > with c++ name mangling. Would be following benchmark good to measure
>> > speedup or do I need to measure startup time which is bit harder?
>> >
>>
>> Please try this.
>>
>
> We have to use movups instead of movaps due to
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
>

It won't work since stack may be misaligned when SSE2 functions
are called.  Please try glibc hjl/pr18661 branch, where I fixed a few
stack misalignment bugs in x86-64 assembly code and save/restore
xmm0-xmm7,  with GCC hjl/pr58066/gcc-5-branch
diff mbox

Patch

diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 678c57f..4a1928d 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -27,26 +27,36 @@ 
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
 #ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX.  */
-# define REGISTER_SAVE_AREA	(8 * 7)
-# define REGISTER_SAVE_RAX	0
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as XMM0 to
+   XMM7.  */
+# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 8)
+/* Align XMM register save area to 16 bytes.  */
+# define REGISTER_SAVE_XMM0	0
 # define PRESERVE_BND_REGS_PREFIX
 #else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
-   BND1, BND2, BND3.  */
-# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4)
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+   BND0, BND1, BND2, BND3 and XMM0 to XMM7. */
+# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4 + 16 * 8)
 /* Align bound register save area to 16 bytes.  */
 # define REGISTER_SAVE_BND0	0
 # define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
 # define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
 # define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_BND3 + 16)
+# define REGISTER_SAVE_XMM0	(REGISTER_SAVE_BND3 + 16)
 # ifdef HAVE_MPX_SUPPORT
 #  define PRESERVE_BND_REGS_PREFIX bnd
 # else
 #  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
 # endif
 #endif
+#define REGISTER_SAVE_XMM1	(REGISTER_SAVE_XMM0 + 16)
+#define REGISTER_SAVE_XMM2	(REGISTER_SAVE_XMM1 + 16)
+#define REGISTER_SAVE_XMM3	(REGISTER_SAVE_XMM2 + 16)
+#define REGISTER_SAVE_XMM4	(REGISTER_SAVE_XMM3 + 16)
+#define REGISTER_SAVE_XMM5	(REGISTER_SAVE_XMM4 + 16)
+#define REGISTER_SAVE_XMM6	(REGISTER_SAVE_XMM5 + 16)
+#define REGISTER_SAVE_XMM7	(REGISTER_SAVE_XMM6 + 16)
+#define REGISTER_SAVE_RAX	(REGISTER_SAVE_XMM7 + 16)
 #define REGISTER_SAVE_RCX	(REGISTER_SAVE_RAX + 8)
 #define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
 #define REGISTER_SAVE_RSI	(REGISTER_SAVE_RDX + 8)
@@ -54,6 +64,10 @@ 
 #define REGISTER_SAVE_R8	(REGISTER_SAVE_RDI + 8)
 #define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
 
+#if (REGISTER_SAVE_AREA % 16) != 8
+# error REGISTER_SAVE_AREA must be an odd multiple of 8
+#endif
+
 	.text
 	.globl _dl_runtime_resolve
 	.type _dl_runtime_resolve, @function
@@ -71,6 +85,20 @@  _dl_runtime_resolve:
 	movq %rdi, REGISTER_SAVE_RDI(%rsp)
 	movq %r8, REGISTER_SAVE_R8(%rsp)
 	movq %r9, REGISTER_SAVE_R9(%rsp)
+	# movaps is 1-byte shorter.
+	# We use movups instead of movaps since __tls_get_addr may be
+	# called with misaligned stack:
+	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+	# On Intel processors since 2011, movups is as fast as movaps
+	# on aligned address.
+	movups %xmm0, REGISTER_SAVE_XMM0(%rsp)
+	movups %xmm1, REGISTER_SAVE_XMM1(%rsp)
+	movups %xmm2, REGISTER_SAVE_XMM2(%rsp)
+	movups %xmm3, REGISTER_SAVE_XMM3(%rsp)
+	movups %xmm4, REGISTER_SAVE_XMM4(%rsp)
+	movups %xmm5, REGISTER_SAVE_XMM5(%rsp)
+	movups %xmm6, REGISTER_SAVE_XMM6(%rsp)
+	movups %xmm7, REGISTER_SAVE_XMM7(%rsp)
 #ifndef __ILP32__
 	# We also have to preserve bound registers.  These are nops if
 	# Intel MPX isn't available or disabled.
@@ -123,6 +151,15 @@  _dl_runtime_resolve:
 	movq REGISTER_SAVE_RDX(%rsp), %rdx
 	movq REGISTER_SAVE_RCX(%rsp), %rcx
 	movq REGISTER_SAVE_RAX(%rsp), %rax
+	# movaps is 1-byte shorter.
+	movups REGISTER_SAVE_XMM0(%rsp), %xmm0
+	movups REGISTER_SAVE_XMM1(%rsp), %xmm1
+	movups REGISTER_SAVE_XMM2(%rsp), %xmm2
+	movups REGISTER_SAVE_XMM3(%rsp), %xmm3
+	movups REGISTER_SAVE_XMM4(%rsp), %xmm4
+	movups REGISTER_SAVE_XMM5(%rsp), %xmm5
+	movups REGISTER_SAVE_XMM6(%rsp), %xmm6
+	movups REGISTER_SAVE_XMM7(%rsp), %xmm7
 	# Adjust stack(PLT did 2 pushes)
 	addq $(REGISTER_SAVE_AREA + 16), %rsp
 	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))