[MPX] MPX-specific changes in dl_runtime routines
Commit Message
On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
> On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
> > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
> > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
> > >> >> wrote:
> > >> >> > Fixed in the attached patch
> > >> >> >
> > >> >>
> > >> >> I fixed some typos and updated sysdeps/i386/configure for
> > >> >> HAVE_MPX_SUPPORT. Please verify both with HAVE_MPX_SUPPORT and
> > >> >> without on i386 and x86-64.
> > >> >
> > >> > Done, all works fine
> > >> >
> > >>
> > >> I checked it in for you.
> > >>
> > > These are nice, but you could have the same problem with lazy TLS allocation.
> > > I wrote a patch to merge the trampolines, which now conflicts. Could you write
> > > a similar patch to solve that? The original purpose was to always save the xmm
> > > registers so we could use sse2 routines, which speeds up lookup time.
> >
> > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
> > much gain will it give us?
> >
> I couldn't measure that without the patch. The gain now would be big, as we
> currently use a byte-by-byte loop to check the symbol name, which is slow,
> especially with C++ name mangling. Would the following benchmark be good for
> measuring the speedup, or do I need to measure startup time, which is a bit harder?
>
Please try this.
H.J.
---
sysdeps/x86_64/dl-trampoline.S | 42 +++++++++++++++++++++++++++++++++++-------
1 file changed, 35 insertions(+), 7 deletions(-)
Comments
On Sat, Jul 11, 2015 at 1:27 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
>> On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
>> > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
>> > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>> > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
>> > >> >> wrote:
>> > >> >> > Fixed in the attached patch
>> > >> >> >
>> > >> >>
>> > >> >> I fixed some typos and updated sysdeps/i386/configure for
>> > >> >> HAVE_MPX_SUPPORT. Please verify both with HAVE_MPX_SUPPORT and
>> > >> >> without on i386 and x86-64.
>> > >> >
>> > >> > Done, all works fine
>> > >> >
>> > >>
>> > >> I checked it in for you.
>> > >>
>> > > These are nice, but you could have the same problem with lazy TLS allocation.
>> > > I wrote a patch to merge the trampolines, which now conflicts. Could you write
>> > > a similar patch to solve that? The original purpose was to always save the xmm
>> > > registers so we could use sse2 routines, which speeds up lookup time.
>> >
>> > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
>> > much gain will it give us?
>> >
>> I couldn't measure that without the patch. The gain now would be big, as we
>> currently use a byte-by-byte loop to check the symbol name, which is slow,
>> especially with C++ name mangling. Would the following benchmark be good for
>> measuring the speedup, or do I need to measure startup time, which is a bit harder?
>>
>
> Please try this.
>
>
> H.J.
> ---
> sysdeps/x86_64/dl-trampoline.S | 42 +++++++++++++++++++++++++++++++++++-------
> 1 file changed, 35 insertions(+), 7 deletions(-)
>
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index 678c57f..ea81d7d 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -27,26 +27,36 @@
> /* Area on stack to save and restore registers used for parameter
> passing when calling _dl_fixup. */
> #ifdef __ILP32__
> -/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX. */
> -# define REGISTER_SAVE_AREA (8 * 7)
> -# define REGISTER_SAVE_RAX 0
> +/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as XMM0 to
> + XMM7. */
> +# define REGISTER_SAVE_AREA (8 * 7 + 16 * 8)
> +/* Align XMM register save area to 16 bytes. */
> +# define REGISTER_SAVE_XMM0 0
> # define PRESERVE_BND_REGS_PREFIX
> #else
> -/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
> - BND1, BND2, BND3. */
> -# define REGISTER_SAVE_AREA (8 * 7 + 16 * 4)
> +/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
> + BND0, BND1, BND2, BND3 and XMM0 to XMM7. */
> +# define REGISTER_SAVE_AREA (8 * 7 + 16 * 4 + 16 * 8)
> /* Align bound register save area to 16 bytes. */
> # define REGISTER_SAVE_BND0 0
> # define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16)
> # define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16)
> # define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16)
> -# define REGISTER_SAVE_RAX (REGISTER_SAVE_BND3 + 16)
> +# define REGISTER_SAVE_XMM0 (REGISTER_SAVE_BND3 + 16)
> # ifdef HAVE_MPX_SUPPORT
> # define PRESERVE_BND_REGS_PREFIX bnd
> # else
> # define PRESERVE_BND_REGS_PREFIX .byte 0xf2
> # endif
> #endif
> +#define REGISTER_SAVE_XMM1 (REGISTER_SAVE_XMM0 + 16)
> +#define REGISTER_SAVE_XMM2 (REGISTER_SAVE_XMM1 + 16)
> +#define REGISTER_SAVE_XMM3 (REGISTER_SAVE_XMM2 + 16)
> +#define REGISTER_SAVE_XMM4 (REGISTER_SAVE_XMM3 + 16)
> +#define REGISTER_SAVE_XMM5 (REGISTER_SAVE_XMM4 + 16)
> +#define REGISTER_SAVE_XMM6 (REGISTER_SAVE_XMM5 + 16)
> +#define REGISTER_SAVE_XMM7 (REGISTER_SAVE_XMM6 + 16)
> +#define REGISTER_SAVE_RAX (REGISTER_SAVE_XMM7 + 16)
> #define REGISTER_SAVE_RCX (REGISTER_SAVE_RAX + 8)
> #define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8)
> #define REGISTER_SAVE_RSI (REGISTER_SAVE_RDX + 8)
> @@ -71,6 +81,15 @@ _dl_runtime_resolve:
> movq %rdi, REGISTER_SAVE_RDI(%rsp)
> movq %r8, REGISTER_SAVE_R8(%rsp)
> movq %r9, REGISTER_SAVE_R9(%rsp)
> + # movaps is 1-byte shorter.
> + movaps %xmm0, REGISTER_SAVE_XMM0(%rsp)
> + movaps %xmm1, REGISTER_SAVE_XMM1(%rsp)
> + movaps %xmm2, REGISTER_SAVE_XMM2(%rsp)
> + movaps %xmm3, REGISTER_SAVE_XMM3(%rsp)
> + movaps %xmm4, REGISTER_SAVE_XMM4(%rsp)
> + movaps %xmm5, REGISTER_SAVE_XMM5(%rsp)
> + movaps %xmm6, REGISTER_SAVE_XMM6(%rsp)
> + movaps %xmm7, REGISTER_SAVE_XMM7(%rsp)
> #ifndef __ILP32__
> # We also have to preserve bound registers. These are nops if
> # Intel MPX isn't available or disabled.
> @@ -123,6 +142,15 @@ _dl_runtime_resolve:
> movq REGISTER_SAVE_RDX(%rsp), %rdx
> movq REGISTER_SAVE_RCX(%rsp), %rcx
> movq REGISTER_SAVE_RAX(%rsp), %rax
> + # movaps is 1-byte shorter.
> + movaps REGISTER_SAVE_XMM0(%rsp), %xmm0
> + movaps REGISTER_SAVE_XMM1(%rsp), %xmm1
> + movaps REGISTER_SAVE_XMM2(%rsp), %xmm2
> + movaps REGISTER_SAVE_XMM3(%rsp), %xmm3
> + movaps REGISTER_SAVE_XMM4(%rsp), %xmm4
> + movaps REGISTER_SAVE_XMM5(%rsp), %xmm5
> + movaps REGISTER_SAVE_XMM6(%rsp), %xmm6
> + movaps REGISTER_SAVE_XMM7(%rsp), %xmm7
> # Adjust stack(PLT did 2 pushes)
> addq $(REGISTER_SAVE_AREA + 16), %rsp
> cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
> --
> 2.4.3
>
movaps doesn't work since the stack is misaligned when
__tls_get_addr is called due to:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
@@ -27,26 +27,36 @@
/* Area on stack to save and restore registers used for parameter
passing when calling _dl_fixup. */
#ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX. */
-# define REGISTER_SAVE_AREA (8 * 7)
-# define REGISTER_SAVE_RAX 0
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as XMM0 to
+ XMM7. */
+# define REGISTER_SAVE_AREA (8 * 7 + 16 * 8)
+/* Align XMM register save area to 16 bytes. */
+# define REGISTER_SAVE_XMM0 0
# define PRESERVE_BND_REGS_PREFIX
#else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
- BND1, BND2, BND3. */
-# define REGISTER_SAVE_AREA (8 * 7 + 16 * 4)
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+ BND0, BND1, BND2, BND3 and XMM0 to XMM7. */
+# define REGISTER_SAVE_AREA (8 * 7 + 16 * 4 + 16 * 8)
/* Align bound register save area to 16 bytes. */
# define REGISTER_SAVE_BND0 0
# define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16)
# define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16)
# define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16)
-# define REGISTER_SAVE_RAX (REGISTER_SAVE_BND3 + 16)
+# define REGISTER_SAVE_XMM0 (REGISTER_SAVE_BND3 + 16)
# ifdef HAVE_MPX_SUPPORT
# define PRESERVE_BND_REGS_PREFIX bnd
# else
# define PRESERVE_BND_REGS_PREFIX .byte 0xf2
# endif
#endif
+#define REGISTER_SAVE_XMM1 (REGISTER_SAVE_XMM0 + 16)
+#define REGISTER_SAVE_XMM2 (REGISTER_SAVE_XMM1 + 16)
+#define REGISTER_SAVE_XMM3 (REGISTER_SAVE_XMM2 + 16)
+#define REGISTER_SAVE_XMM4 (REGISTER_SAVE_XMM3 + 16)
+#define REGISTER_SAVE_XMM5 (REGISTER_SAVE_XMM4 + 16)
+#define REGISTER_SAVE_XMM6 (REGISTER_SAVE_XMM5 + 16)
+#define REGISTER_SAVE_XMM7 (REGISTER_SAVE_XMM6 + 16)
+#define REGISTER_SAVE_RAX (REGISTER_SAVE_XMM7 + 16)
#define REGISTER_SAVE_RCX (REGISTER_SAVE_RAX + 8)
#define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8)
#define REGISTER_SAVE_RSI (REGISTER_SAVE_RDX + 8)
@@ -71,6 +81,15 @@ _dl_runtime_resolve:
movq %rdi, REGISTER_SAVE_RDI(%rsp)
movq %r8, REGISTER_SAVE_R8(%rsp)
movq %r9, REGISTER_SAVE_R9(%rsp)
+ # movaps is 1-byte shorter.
+ movaps %xmm0, REGISTER_SAVE_XMM0(%rsp)
+ movaps %xmm1, REGISTER_SAVE_XMM1(%rsp)
+ movaps %xmm2, REGISTER_SAVE_XMM2(%rsp)
+ movaps %xmm3, REGISTER_SAVE_XMM3(%rsp)
+ movaps %xmm4, REGISTER_SAVE_XMM4(%rsp)
+ movaps %xmm5, REGISTER_SAVE_XMM5(%rsp)
+ movaps %xmm6, REGISTER_SAVE_XMM6(%rsp)
+ movaps %xmm7, REGISTER_SAVE_XMM7(%rsp)
#ifndef __ILP32__
# We also have to preserve bound registers. These are nops if
# Intel MPX isn't available or disabled.
@@ -123,6 +142,15 @@ _dl_runtime_resolve:
movq REGISTER_SAVE_RDX(%rsp), %rdx
movq REGISTER_SAVE_RCX(%rsp), %rcx
movq REGISTER_SAVE_RAX(%rsp), %rax
+ # movaps is 1-byte shorter.
+ movaps REGISTER_SAVE_XMM0(%rsp), %xmm0
+ movaps REGISTER_SAVE_XMM1(%rsp), %xmm1
+ movaps REGISTER_SAVE_XMM2(%rsp), %xmm2
+ movaps REGISTER_SAVE_XMM3(%rsp), %xmm3
+ movaps REGISTER_SAVE_XMM4(%rsp), %xmm4
+ movaps REGISTER_SAVE_XMM5(%rsp), %xmm5
+ movaps REGISTER_SAVE_XMM6(%rsp), %xmm6
+ movaps REGISTER_SAVE_XMM7(%rsp), %xmm7
# Adjust stack(PLT did 2 pushes)
addq $(REGISTER_SAVE_AREA + 16), %rsp
cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))