From patchwork Sat Jul 11 23:50:02 2015
X-Patchwork-Submitter: "H.J. Lu"
X-Patchwork-Id: 7648
Date: Sat, 11 Jul 2015 16:50:02 -0700
From: "H.J. Lu"
To: "H.J. Lu"
Cc: Ondřej Bílka , "libc-alpha@sourceware.org"
Subject: [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve
Message-ID: <20150711235002.GA7543@gmail.com>
References: <0EFAB2BDD0F67E4FB6CCC8B9F87D75696A921FC8@IRSMSX101.ger.corp.intel.com>
 <0EFAB2BDD0F67E4FB6CCC8B9F87D75696A9220AE@IRSMSX101.ger.corp.intel.com>
 <0EFAB2BDD0F67E4FB6CCC8B9F87D75696A9235F6@IRSMSX101.ger.corp.intel.com>
 <20150709142827.GA18030@domone> <20150711104654.GA26570@domone>
 <20150711202742.GA9074@gmail.com>
In-Reply-To: <20150711202742.GA9074@gmail.com>
User-Agent: Mutt/1.5.23 (2014-03-12)

On Sat, Jul 11, 2015 at 01:27:42PM -0700, H.J. Lu wrote:
> On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
> > On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
> > > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka wrote:
> > > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
> > > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor wrote:
> > > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor
> > > >> >> wrote:
> > > >> >> > Fixed in the attached patch
> > > >> >> >
> > > >> >>
> > > >> >> I fixed some typos and updated sysdeps/i386/configure for
> > > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
> > > >> >> without on i386 and x86-64.
> > > >> >
> > > >> > Done, all works fine
> > > >> >
> > > >>
> > > >> I checked it in for you.
> > > >>
> > > > These are nice, but you could have the same problem with lazy TLS
> > > > allocation.  I wrote a patch to merge the trampolines, which now
> > > > conflicts.  Could you write a similar patch to solve that?  The
> > > > original purpose was to always save the xmm registers so that we
> > > > could use SSE2 routines, which speeds up lookup time.
> > >
> > > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve?  How
> > > much gain will it give us?
> > >
> > I couldn't measure that without a patch.  The gain now would be big,
> > since we currently use a byte-by-byte loop to check the symbol name,
> > which is slow, especially with C++ name mangling.  Would the following
> > benchmark be good for measuring the speedup, or do I need to measure
> > startup time, which is a bit harder?
> >
> Please try this.
>

We have to use movups instead of movaps due to

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066

H.J.
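A minimal sketch of the kind of lookup benchmark discussed above, under the
assumption that timing dlsym() lookups is a fair proxy for the name-comparison
work done during lazy PLT resolution; the library name (libm.so.6), symbol
list, and iteration count are placeholders, not taken from this thread:

/* lookup-bench.c: assumed micro-benchmark for symbol-lookup cost.
   dlsym goes through the same hash and name-comparison path as lazy
   PLT resolution.  Build with: gcc -O2 lookup-bench.c -ldl  */
#include <dlfcn.h>
#include <stdio.h>
#include <time.h>

int
main (void)
{
  static const char *const syms[]
    = { "sin", "cos", "tan", "exp", "log", "sqrt", "atan2", "hypot" };
  enum { NSYMS = sizeof syms / sizeof syms[0], ROUNDS = 100000 };
  void *h = dlopen ("libm.so.6", RTLD_LAZY);
  if (h == NULL)
    {
      fprintf (stderr, "dlopen: %s\n", dlerror ());
      return 1;
    }

  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (int i = 0; i < ROUNDS; i++)
    for (int j = 0; j < NSYMS; j++)
      if (dlsym (h, syms[j]) == NULL)
        return 1;
  clock_gettime (CLOCK_MONOTONIC, &t1);

  double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
  printf ("%.1f ns per lookup\n", ns / ((double) ROUNDS * NSYMS));
  dlclose (h);
  return 0;
}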
---
 sysdeps/x86_64/dl-trampoline.S | 51 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 7 deletions(-)

diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 678c57f..4a1928d 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -27,26 +27,36 @@
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
 #ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX.  */
-# define REGISTER_SAVE_AREA	(8 * 7)
-# define REGISTER_SAVE_RAX	0
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as XMM0 to
+   XMM7.  */
+# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 8)
+/* Align XMM register save area to 16 bytes.  */
+# define REGISTER_SAVE_XMM0	0
 # define PRESERVE_BND_REGS_PREFIX
 #else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
-   BND1, BND2, BND3.  */
-# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4)
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+   BND0, BND1, BND2, BND3 and XMM0 to XMM7.  */
+# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4 + 16 * 8)
 /* Align bound register save area to 16 bytes.  */
 # define REGISTER_SAVE_BND0	0
 # define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
 # define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
 # define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_BND3 + 16)
+# define REGISTER_SAVE_XMM0	(REGISTER_SAVE_BND3 + 16)
 # ifdef HAVE_MPX_SUPPORT
 #  define PRESERVE_BND_REGS_PREFIX bnd
 # else
 #  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
 # endif
 #endif
+#define REGISTER_SAVE_XMM1	(REGISTER_SAVE_XMM0 + 16)
+#define REGISTER_SAVE_XMM2	(REGISTER_SAVE_XMM1 + 16)
+#define REGISTER_SAVE_XMM3	(REGISTER_SAVE_XMM2 + 16)
+#define REGISTER_SAVE_XMM4	(REGISTER_SAVE_XMM3 + 16)
+#define REGISTER_SAVE_XMM5	(REGISTER_SAVE_XMM4 + 16)
+#define REGISTER_SAVE_XMM6	(REGISTER_SAVE_XMM5 + 16)
+#define REGISTER_SAVE_XMM7	(REGISTER_SAVE_XMM6 + 16)
+#define REGISTER_SAVE_RAX	(REGISTER_SAVE_XMM7 + 16)
 #define REGISTER_SAVE_RCX	(REGISTER_SAVE_RAX + 8)
 #define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
 #define REGISTER_SAVE_RSI	(REGISTER_SAVE_RDX + 8)
@@ -54,6 +64,10 @@
 #define REGISTER_SAVE_R8	(REGISTER_SAVE_RDI + 8)
 #define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
 
+#if (REGISTER_SAVE_AREA % 16) != 8
+# error REGISTER_SAVE_AREA must be an odd multiple of 8
+#endif
+
 	.text
 	.globl _dl_runtime_resolve
 	.type _dl_runtime_resolve, @function
@@ -71,6 +85,20 @@ _dl_runtime_resolve:
 	movq %rdi, REGISTER_SAVE_RDI(%rsp)
 	movq %r8, REGISTER_SAVE_R8(%rsp)
 	movq %r9, REGISTER_SAVE_R9(%rsp)
+	# movaps is 1-byte shorter.
+	# We use movups instead of movaps since __tls_get_addr may be
+	# called with a misaligned stack:
+	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+	# On Intel processors since 2011, movups is as fast as movaps
+	# on aligned addresses.
+	movups %xmm0, REGISTER_SAVE_XMM0(%rsp)
+	movups %xmm1, REGISTER_SAVE_XMM1(%rsp)
+	movups %xmm2, REGISTER_SAVE_XMM2(%rsp)
+	movups %xmm3, REGISTER_SAVE_XMM3(%rsp)
+	movups %xmm4, REGISTER_SAVE_XMM4(%rsp)
+	movups %xmm5, REGISTER_SAVE_XMM5(%rsp)
+	movups %xmm6, REGISTER_SAVE_XMM6(%rsp)
+	movups %xmm7, REGISTER_SAVE_XMM7(%rsp)
 #ifndef __ILP32__
 	# We also have to preserve bound registers.  These are nops if
 	# Intel MPX isn't available or disabled.
@@ -123,6 +151,15 @@ _dl_runtime_resolve:
 	movq REGISTER_SAVE_RDX(%rsp), %rdx
 	movq REGISTER_SAVE_RCX(%rsp), %rcx
 	movq REGISTER_SAVE_RAX(%rsp), %rax
+	# movaps is 1-byte shorter.
+	movups REGISTER_SAVE_XMM0(%rsp), %xmm0
+	movups REGISTER_SAVE_XMM1(%rsp), %xmm1
+	movups REGISTER_SAVE_XMM2(%rsp), %xmm2
+	movups REGISTER_SAVE_XMM3(%rsp), %xmm3
+	movups REGISTER_SAVE_XMM4(%rsp), %xmm4
+	movups REGISTER_SAVE_XMM5(%rsp), %xmm5
+	movups REGISTER_SAVE_XMM6(%rsp), %xmm6
+	movups REGISTER_SAVE_XMM7(%rsp), %xmm7
 	# Adjust stack(PLT did 2 pushes)
 	addq $(REGISTER_SAVE_AREA + 16), %rsp
 	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
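
A rough functional check of what the patch protects, as a hedged sketch: a
lazily bound call whose eight double arguments travel in xmm0-xmm7, so a
clobber of those registers anywhere in the resolver path would show up as a
wrong sum.  The file name sum8.c, library name libsum8.so, and build commands
are made up for illustration and are not part of this patch:

/* sum8.c: hypothetical test that xmm0-xmm7 argument registers survive
   the first (lazily resolved) call through the PLT.  Assumed build:
     gcc -O2 -fPIC -shared -DSHARED -o libsum8.so sum8.c
     gcc -O2 -o tst-sum8 sum8.c -L. -lsum8 -Wl,-rpath,.
   With default lazy binding, the first call to sum8 enters
   _dl_runtime_resolve.  */
#ifdef SHARED
double
sum8 (double a, double b, double c, double d,
      double e, double f, double g, double h)
{
  return a + b + c + d + e + f + g + h;
}
#else
# include <stdio.h>

extern double sum8 (double, double, double, double,
                    double, double, double, double);

int
main (void)
{
  /* All eight arguments are passed in xmm0-xmm7 under the x86-64 ABI.  */
  double r = sum8 (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
  printf ("%s: sum8 = %g\n", r == 36.0 ? "PASS" : "FAIL", r);
  return r == 36.0 ? 0 : 1;
}
#endif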