From patchwork Tue Mar 28 21:10:41 2017
X-Patchwork-Submitter: "H.J. Lu"
X-Patchwork-Id: 19749
From: "H.J. Lu"
Date: Tue, 28 Mar 2017 14:10:41 -0700
Subject: Re: RFC: Should x86-64 support arbitrary calling conventions?
To: Richard Henderson
Cc: "Kreitzer, David L", "Carlos O'Donell", GNU C Library,
 "Joseph S. Myers", Jeff Law, "Maslov, Sergey V"
References: <20170316232737.GD24205@vapier>
 <100440FED7798443A51E0730E25E29E85AAC14B5@fmsmsx117.amr.corp.intel.com>
 <100440FED7798443A51E0730E25E29E85AAD9939@fmsmsx117.amr.corp.intel.com>
 <26492340-9184-75a9-88fa-1369534bb703@twiddle.net>

On Mon, Mar 27, 2017 at 8:15 AM, H.J. Lu wrote:
> On Sun, Mar 26, 2017 at 11:27 PM, Richard Henderson wrote:
>> On 03/25/2017 01:26 AM, H.J. Lu wrote:
>>>
>>> On Thu, Mar 23, 2017 at 4:14 PM, Richard Henderson
>>> wrote:
>>>>
>>>> On 03/24/2017 01:41 AM, H.J. Lu wrote:
>>>>>
>>>>>
>>>>> +# ifdef STATE_SAVE_MASK
>>>>> +	movl $STATE_SAVE_MASK, %eax
>>>>> +	xorl %edx, %edx
>>>>> +	# Clear the XSAVE Header.
>>>>> +	movq $0, (STATE_SAVE_OFFSET + 512)(%rsp)
>>>>> +	movq $0, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
>>>>> +	movq $0, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
>>>>> +	movq $0, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
>>>>> +	movq $0, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
>>>>> +	movq $0, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
>>>>> +	movq $0, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
>>>>> +	movq $0, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
>>>>> # endif
>>>>
>>>>
>>>> You've just cleared %rdx.  Use that instead of 8*4 bytes of immediate
>>>> zeros.
>>>>
>>>> Given that you have to ifdef this code into place, isn't it somewhat
>>>> pointless to hide xsave behind a macro in the next line?
>>>>
>>>>> +	STATE_SAVE STATE_SAVE_OFFSET(%rsp)
>>>>
>>>>
>>>> I think it would be clearer to inline the two save instructions instead.
>>
>>
>> That said...
is there any reason not to use XSAVEC, if supported? >> >> From the description, it avoids saving components for which XINUSE=0, >> whereas I don't see that same language for XSAVE, even though the XSAVE_BV >> field would still be (un)set for not-in-use components. >> > > We can look into XSAVEC after switching to FXSAVE/XSAVE. > Here is the patch with XSAVEC. Tested on Skylake with XSAVEC. From 29d7bb98a34a0e87de72b967774ed6839faee516 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 23 Mar 2017 08:21:52 -0700 Subject: [PATCH] X86-64: Use fxsave/xsave/xsavec in _dl_runtime_resolve In _dl_runtime_resolve, use fxsave/xsave/xsavec to preserve all vector, mask and bound registers. It simplifies _dl_runtime_resolve and supports different calling conventions. However, use xsave can be 10X slower than saving and restoring vector and bound registers individually. --- sysdeps/x86/cpu-features-offsets.sym | 1 + sysdeps/x86/cpu-features.c | 80 +++++++++-- sysdeps/x86/cpu-features.h | 18 ++- sysdeps/x86_64/dl-machine.h | 38 ++--- sysdeps/x86_64/dl-trampoline.S | 87 ++++-------- sysdeps/x86_64/dl-trampoline.h | 266 ++++++++++------------------------- 6 files changed, 195 insertions(+), 295 deletions(-) diff --git a/sysdeps/x86/cpu-features-offsets.sym b/sysdeps/x86/cpu-features-offsets.sym index f6739fa..33dd094 100644 --- a/sysdeps/x86/cpu-features-offsets.sym +++ b/sysdeps/x86/cpu-features-offsets.sym @@ -15,6 +15,7 @@ CPUID_ECX_OFFSET offsetof (struct cpuid_registers, ecx) CPUID_EDX_OFFSET offsetof (struct cpuid_registers, edx) FAMILY_OFFSET offsetof (struct cpu_features, family) MODEL_OFFSET offsetof (struct cpu_features, model) +XSAVE_STATE_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_size) FEATURE_OFFSET offsetof (struct cpu_features, feature) FEATURE_SIZE sizeof (unsigned int) diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 33788ed..9cbbbc2 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -93,6 +93,72 @@ get_common_indeces (struct cpu_features *cpu_features, } } } + + /* For _dl_runtime_resolve, set xsave_state_size to xsave area + size + integer register save size and align it to 64 bytes. */ + if (cpu_features->max_cpuid >= 0xd) + { + unsigned int eax, ebx, ecx, edx; + + __cpuid_count (0xd, 0, eax, ebx, ecx, edx); + if (ebx != 0) + { + cpu_features->xsave_state_size + = (ebx + STATE_SAVE_OFFSET + 63) & -64; + + __cpuid_count (0xd, 1, eax, ebx, ecx, edx); + + /* Check if XSAVEC is available. */ + if ((eax & (1 << 1)) != 0) + { + unsigned int xstate_comp_offsets[32]; + unsigned int xstate_comp_sizes[32]; + unsigned int i; + + xstate_comp_offsets[0] = 0; + xstate_comp_offsets[1] = 160; + xstate_comp_offsets[2] = 576; + xstate_comp_sizes[0] = 160; + xstate_comp_sizes[1] = 256; + + for (i = 2; i < 32; i++) + { + if ((STATE_SAVE_MASK & (1 << i)) != 0) + { + __cpuid_count (0xd, i, eax, ebx, ecx, edx); + xstate_comp_sizes[i] = eax; + } + else + { + ecx = 0; + xstate_comp_sizes[i] = 0; + } + + if (i > 2) + { + xstate_comp_offsets[i] + = (xstate_comp_offsets[i - 1] + + xstate_comp_sizes[i -1]); + if ((ecx & (1 << 1)) != 0) + xstate_comp_offsets[i] + = (xstate_comp_offsets[i] + 63) & -64; + } + } + + + /* Use XSAVEC. 
*/ + unsigned int size + = xstate_comp_offsets[31] + xstate_comp_sizes[31]; + if (size) + { + cpu_features->xsave_state_size + = (size + STATE_SAVE_OFFSET + 63) & -64; + cpu_features->feature[index_arch_XSAVEC_Usable] + |= bit_arch_XSAVEC_Usable; + } + } + } + } } } @@ -224,20 +290,6 @@ init_cpu_features (struct cpu_features *cpu_features) if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load] |= bit_arch_AVX_Fast_Unaligned_Load; - - /* To avoid SSE transition penalty, use _dl_runtime_resolve_slow. - If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt. */ - cpu_features->feature[index_arch_Use_dl_runtime_resolve_slow] - |= bit_arch_Use_dl_runtime_resolve_slow; - if (cpu_features->max_cpuid >= 0xd) - { - unsigned int eax; - - __cpuid_count (0xd, 1, eax, ebx, ecx, edx); - if ((eax & (1 << 2)) != 0) - cpu_features->feature[index_arch_Use_dl_runtime_resolve_opt] - |= bit_arch_Use_dl_runtime_resolve_opt; - } } /* This spells out "AuthenticAMD". */ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h index 95f0fcf..611d08c 100644 --- a/sysdeps/x86/cpu-features.h +++ b/sysdeps/x86/cpu-features.h @@ -37,8 +37,7 @@ #define bit_arch_Prefer_No_VZEROUPPER (1 << 17) #define bit_arch_Fast_Unaligned_Copy (1 << 18) #define bit_arch_Prefer_ERMS (1 << 19) -#define bit_arch_Use_dl_runtime_resolve_opt (1 << 20) -#define bit_arch_Use_dl_runtime_resolve_slow (1 << 21) +#define bit_arch_XSAVEC_Usable (1 << 20) /* CPUID Feature flags. */ @@ -76,6 +75,15 @@ /* The current maximum size of the feature integer bit array. */ #define FEATURE_INDEX_MAX 1 +/* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need + space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be + aligned to 16 bytes for fxsave and 64 bytes for xsave. */ +#define STATE_SAVE_OFFSET (8 * 7 + 8) + +/* Save SSE, AVX, AVX512, mask and bound registers. 
*/ +#define STATE_SAVE_MASK \ + ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7)) + #ifdef __ASSEMBLER__ # include @@ -109,8 +117,6 @@ # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE # define index_arch_Fast_Unaligned_Copy FEATURE_INDEX_1*FEATURE_SIZE # define index_arch_Prefer_ERMS FEATURE_INDEX_1*FEATURE_SIZE -# define index_arch_Use_dl_runtime_resolve_opt FEATURE_INDEX_1*FEATURE_SIZE -# define index_arch_Use_dl_runtime_resolve_slow FEATURE_INDEX_1*FEATURE_SIZE # if defined (_LIBC) && !IS_IN (nonlib) @@ -199,6 +205,7 @@ struct cpu_features } cpuid[COMMON_CPUID_INDEX_MAX]; unsigned int family; unsigned int model; + uintptr_t xsave_state_size; unsigned int feature[FEATURE_INDEX_MAX]; }; @@ -281,8 +288,7 @@ extern const struct cpu_features *__get_cpu_features (void) # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1 # define index_arch_Fast_Unaligned_Copy FEATURE_INDEX_1 # define index_arch_Prefer_ERMS FEATURE_INDEX_1 -# define index_arch_Use_dl_runtime_resolve_opt FEATURE_INDEX_1 -# define index_arch_Use_dl_runtime_resolve_slow FEATURE_INDEX_1 +# define index_arch_XSAVEC_Usable FEATURE_INDEX_1 #endif /* !__ASSEMBLER__ */ diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h index daf4d8c..be0b861 100644 --- a/sysdeps/x86_64/dl-machine.h +++ b/sysdeps/x86_64/dl-machine.h @@ -66,12 +66,9 @@ static inline int __attribute__ ((unused, always_inline)) elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) { Elf64_Addr *got; - extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden; - extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden; - extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden; - extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden; - extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden; - extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden; extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden; extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden; extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden; @@ -120,29 +117,14 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) /* This function will get called to fix up the GOT entry indicated by the offset on the stack, and then jump to the resolved address. */ - if (HAS_ARCH_FEATURE (AVX512F_Usable)) - { - if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt)) - *(ElfW(Addr) *) (got + 2) - = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt; - else - *(ElfW(Addr) *) (got + 2) - = (ElfW(Addr)) &_dl_runtime_resolve_avx512; - } - else if (HAS_ARCH_FEATURE (AVX_Usable)) - { - if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt)) - *(ElfW(Addr) *) (got + 2) - = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt; - else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow)) - *(ElfW(Addr) *) (got + 2) - = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow; - else - *(ElfW(Addr) *) (got + 2) - = (ElfW(Addr)) &_dl_runtime_resolve_avx; - } + if (GLRO(dl_x86_cpu_features).xsave_state_size != 0) + *(ElfW(Addr) *) (got + 2) + = (HAS_ARCH_FEATURE (XSAVEC_Usable) + ? 
(ElfW(Addr)) &_dl_runtime_resolve_xsavec + : (ElfW(Addr)) &_dl_runtime_resolve_xsave); else - *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse; + *(ElfW(Addr) *) (got + 2) + = (ElfW(Addr)) &_dl_runtime_resolve_fxsave; } } diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index c14c61a..a645572 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -34,41 +34,24 @@ # define DL_STACK_ALIGNMENT 8 #endif -#ifndef DL_RUNTIME_UNALIGNED_VEC_SIZE -/* The maximum size in bytes of unaligned vector load and store in the - dynamic linker. Since SSE optimized memory/string functions with - aligned SSE register load and store are used in the dynamic linker, - we must set this to 8 so that _dl_runtime_resolve_sse will align the - stack before calling _dl_fixup. */ -# define DL_RUNTIME_UNALIGNED_VEC_SIZE 8 -#endif - -/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes. */ +/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align + stack to 16 bytes before calling _dl_fixup. */ #define DL_RUNTIME_RESOLVE_REALIGN_STACK \ - (VEC_SIZE > DL_STACK_ALIGNMENT \ - && VEC_SIZE > DL_RUNTIME_UNALIGNED_VEC_SIZE) - -/* Align vector register save area to 16 bytes. */ -#define REGISTER_SAVE_VEC_OFF 0 + (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \ + || 16 > DL_STACK_ALIGNMENT) /* Area on stack to save and restore registers used for parameter passing when calling _dl_fixup. */ #ifdef __ILP32__ -# define REGISTER_SAVE_RAX (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8) # define PRESERVE_BND_REGS_PREFIX #else -/* Align bound register save area to 16 bytes. */ -# define REGISTER_SAVE_BND0 (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8) -# define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16) -# define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16) -# define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16) -# define REGISTER_SAVE_RAX (REGISTER_SAVE_BND3 + 16) # ifdef HAVE_MPX_SUPPORT # define PRESERVE_BND_REGS_PREFIX bnd # else # define PRESERVE_BND_REGS_PREFIX .byte 0xf2 # endif #endif +#define REGISTER_SAVE_RAX 0 #define REGISTER_SAVE_RCX (REGISTER_SAVE_RAX + 8) #define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8) #define REGISTER_SAVE_RSI (REGISTER_SAVE_RDX + 8) @@ -80,68 +63,56 @@ #define VEC_SIZE 64 #define VMOVA vmovdqa64 -#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT -# define VMOV vmovdqa64 -#else -# define VMOV vmovdqu64 -#endif #define VEC(i) zmm##i -#define _dl_runtime_resolve _dl_runtime_resolve_avx512 #define _dl_runtime_profile _dl_runtime_profile_avx512 #include "dl-trampoline.h" -#undef _dl_runtime_resolve #undef _dl_runtime_profile #undef VEC -#undef VMOV #undef VMOVA #undef VEC_SIZE #define VEC_SIZE 32 #define VMOVA vmovdqa -#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT -# define VMOV vmovdqa -#else -# define VMOV vmovdqu -#endif #define VEC(i) ymm##i -#define _dl_runtime_resolve _dl_runtime_resolve_avx -#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx_opt #define _dl_runtime_profile _dl_runtime_profile_avx #include "dl-trampoline.h" -#undef _dl_runtime_resolve -#undef _dl_runtime_resolve_opt #undef _dl_runtime_profile #undef VEC -#undef VMOV #undef VMOVA #undef VEC_SIZE /* movaps/movups is 1-byte shorter. 
*/ #define VEC_SIZE 16 #define VMOVA movaps -#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT -# define VMOV movaps -#else -# define VMOV movups -#endif #define VEC(i) xmm##i -#define _dl_runtime_resolve _dl_runtime_resolve_sse #define _dl_runtime_profile _dl_runtime_profile_sse #undef RESTORE_AVX #include "dl-trampoline.h" -#undef _dl_runtime_resolve #undef _dl_runtime_profile -#undef VMOV +#undef VEC #undef VMOVA +#undef VEC_SIZE -/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt - to preserve the full vector registers with zero upper bits. */ -#define VMOVA vmovdqa -#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT -# define VMOV vmovdqa -#else -# define VMOV vmovdqu -#endif -#define _dl_runtime_resolve _dl_runtime_resolve_sse_vex -#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt +#define USE_FXSAVE +#define STATE_SAVE_ALIGNMENT 16 +#define _dl_runtime_resolve _dl_runtime_resolve_fxsave +#include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef USE_FXSAVE +#undef STATE_SAVE_ALIGNMENT + +#define USE_XSAVE +#define STATE_SAVE_ALIGNMENT 64 +#define _dl_runtime_resolve _dl_runtime_resolve_xsave +#include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef USE_XSAVE +#undef STATE_SAVE_ALIGNMENT + +#define USE_XSAVEC +#define STATE_SAVE_ALIGNMENT 64 +#define _dl_runtime_resolve _dl_runtime_resolve_xsavec #include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef USE_XSAVEC +#undef STATE_SAVE_ALIGNMENT diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h index 8db24c1..dfd7e4b 100644 --- a/sysdeps/x86_64/dl-trampoline.h +++ b/sysdeps/x86_64/dl-trampoline.h @@ -16,140 +16,47 @@ License along with the GNU C Library; if not, see . */ -#undef REGISTER_SAVE_AREA_RAW -#ifdef __ILP32__ -/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to - VEC7. */ -# define REGISTER_SAVE_AREA_RAW (8 * 7 + VEC_SIZE * 8) -#else -/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as - BND0, BND1, BND2, BND3 and VEC0 to VEC7. */ -# define REGISTER_SAVE_AREA_RAW (8 * 7 + 16 * 4 + VEC_SIZE * 8) -#endif + .text +#ifdef _dl_runtime_resolve -#undef REGISTER_SAVE_AREA -#undef LOCAL_STORAGE_AREA -#undef BASE -#if DL_RUNTIME_RESOLVE_REALIGN_STACK -# define REGISTER_SAVE_AREA (REGISTER_SAVE_AREA_RAW + 8) -/* Local stack area before jumping to function address: RBX. */ -# define LOCAL_STORAGE_AREA 8 -# define BASE rbx -# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0 -# error REGISTER_SAVE_AREA must be multples of VEC_SIZE -# endif -#else -# define REGISTER_SAVE_AREA REGISTER_SAVE_AREA_RAW -/* Local stack area before jumping to function address: All saved - registers. */ -# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA -# define BASE rsp -# if (REGISTER_SAVE_AREA % 16) != 8 -# error REGISTER_SAVE_AREA must be odd multples of 8 +# undef REGISTER_SAVE_AREA +# undef LOCAL_STORAGE_AREA +# undef BASE + +# if (STATE_SAVE_ALIGNMENT % 16) != 0 +# error STATE_SAVE_ALIGNMENT must be multples of 16 # endif -#endif - .text -#ifdef _dl_runtime_resolve_opt -/* Use the smallest vector registers to preserve the full YMM/ZMM - registers to avoid SSE transition penalty. */ - -# if VEC_SIZE == 32 -/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero - and preserve %xmm0 - %xmm7 registers with the zero upper bits. Since - there is no SSE transition penalty on AVX512 processors which don't - support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't - provided. 
*/ - .globl _dl_runtime_resolve_avx_slow - .hidden _dl_runtime_resolve_avx_slow - .type _dl_runtime_resolve_avx_slow, @function - .align 16 -_dl_runtime_resolve_avx_slow: - cfi_startproc - cfi_adjust_cfa_offset(16) # Incorporate PLT - vorpd %ymm0, %ymm1, %ymm8 - vorpd %ymm2, %ymm3, %ymm9 - vorpd %ymm4, %ymm5, %ymm10 - vorpd %ymm6, %ymm7, %ymm11 - vorpd %ymm8, %ymm9, %ymm9 - vorpd %ymm10, %ymm11, %ymm10 - vpcmpeqd %xmm8, %xmm8, %xmm8 - vorpd %ymm9, %ymm10, %ymm10 - vptest %ymm10, %ymm8 - # Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any - # %ymm0 - %ymm7 registers aren't zero. - PRESERVE_BND_REGS_PREFIX - jnc _dl_runtime_resolve_avx - # Use vzeroupper to avoid SSE transition penalty. - vzeroupper - # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits - # when the upper 128 bits of %ymm0 - %ymm7 registers are zero. - PRESERVE_BND_REGS_PREFIX - jmp _dl_runtime_resolve_sse_vex - cfi_adjust_cfa_offset(-16) # Restore PLT adjustment - cfi_endproc - .size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow +# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0 +# error STATE_SAVE_OFFSET must be multples of STATE_SAVE_ALIGNMENT # endif -/* Use XGETBV with ECX == 1 to check which bits in vector registers are - non-zero and only preserve the non-zero lower bits with zero upper - bits. */ - .globl _dl_runtime_resolve_opt - .hidden _dl_runtime_resolve_opt - .type _dl_runtime_resolve_opt, @function - .align 16 -_dl_runtime_resolve_opt: - cfi_startproc - cfi_adjust_cfa_offset(16) # Incorporate PLT - pushq %rax - cfi_adjust_cfa_offset(8) - cfi_rel_offset(%rax, 0) - pushq %rcx - cfi_adjust_cfa_offset(8) - cfi_rel_offset(%rcx, 0) - pushq %rdx - cfi_adjust_cfa_offset(8) - cfi_rel_offset(%rdx, 0) - movl $1, %ecx - xgetbv - movl %eax, %r11d - popq %rdx - cfi_adjust_cfa_offset(-8) - cfi_restore (%rdx) - popq %rcx - cfi_adjust_cfa_offset(-8) - cfi_restore (%rcx) - popq %rax - cfi_adjust_cfa_offset(-8) - cfi_restore (%rax) -# if VEC_SIZE == 32 - # For YMM registers, check if YMM state is in use. - andl $bit_YMM_state, %r11d - # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if - # YMM state isn't in use. - PRESERVE_BND_REGS_PREFIX - jz _dl_runtime_resolve_sse_vex -# elif VEC_SIZE == 16 - # For ZMM registers, check if YMM state and ZMM state are in - # use. - andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d - cmpl $bit_YMM_state, %r11d - # Preserve %zmm0 - %zmm7 registers if ZMM state is in use. - PRESERVE_BND_REGS_PREFIX - jg _dl_runtime_resolve_avx512 - # Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if - # ZMM state isn't in use. - PRESERVE_BND_REGS_PREFIX - je _dl_runtime_resolve_avx - # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if - # neither YMM state nor ZMM state are in use. +# if DL_RUNTIME_RESOLVE_REALIGN_STACK +/* Local stack area before jumping to function address: RBX. */ +# define LOCAL_STORAGE_AREA 8 +# define BASE rbx +# ifdef USE_FXSAVE +/* Use fxsave to save XMM registers. */ +# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET) +# if (REGISTER_SAVE_AREA % 16) != 0 +# error REGISTER_SAVE_AREA must be multples of 16 +# endif +# endif # else -# error Unsupported VEC_SIZE! +# ifndef USE_FXSAVE +# error USE_FXSAVE must be defined +# endif +/* Use fxsave to save XMM registers. */ +# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET + 8) +/* Local stack area before jumping to function address: All saved + registers. 
*/ +# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA +# define BASE rsp +# if (REGISTER_SAVE_AREA % 16) != 8 +# error REGISTER_SAVE_AREA must be odd multples of 8 +# endif # endif - cfi_adjust_cfa_offset(-16) # Restore PLT adjustment - cfi_endproc - .size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt -#endif + .globl _dl_runtime_resolve .hidden _dl_runtime_resolve .type _dl_runtime_resolve, @function @@ -157,21 +64,29 @@ _dl_runtime_resolve_opt: cfi_startproc _dl_runtime_resolve: cfi_adjust_cfa_offset(16) # Incorporate PLT -#if DL_RUNTIME_RESOLVE_REALIGN_STACK -# if LOCAL_STORAGE_AREA != 8 -# error LOCAL_STORAGE_AREA must be 8 -# endif +# if DL_RUNTIME_RESOLVE_REALIGN_STACK +# if LOCAL_STORAGE_AREA != 8 +# error LOCAL_STORAGE_AREA must be 8 +# endif pushq %rbx # push subtracts stack by 8. cfi_adjust_cfa_offset(8) cfi_rel_offset(%rbx, 0) mov %RSP_LP, %RBX_LP cfi_def_cfa_register(%rbx) - and $-VEC_SIZE, %RSP_LP -#endif + and $-STATE_SAVE_ALIGNMENT, %RSP_LP +# endif +# ifdef REGISTER_SAVE_AREA sub $REGISTER_SAVE_AREA, %RSP_LP -#if !DL_RUNTIME_RESOLVE_REALIGN_STACK +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK cfi_adjust_cfa_offset(REGISTER_SAVE_AREA) -#endif +# endif +# else +# if IS_IN (rtld) + sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP +# else + sub _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP +# endif +# endif # Preserve registers otherwise clobbered. movq %rax, REGISTER_SAVE_RAX(%rsp) movq %rcx, REGISTER_SAVE_RCX(%rsp) @@ -180,59 +95,42 @@ _dl_runtime_resolve: movq %rdi, REGISTER_SAVE_RDI(%rsp) movq %r8, REGISTER_SAVE_R8(%rsp) movq %r9, REGISTER_SAVE_R9(%rsp) - VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp) - VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp) - VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp) - VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp) - VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp) - VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp) - VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp) - VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp) -#ifndef __ILP32__ - # We also have to preserve bound registers. These are nops if - # Intel MPX isn't available or disabled. -# ifdef HAVE_MPX_SUPPORT - bndmov %bnd0, REGISTER_SAVE_BND0(%rsp) - bndmov %bnd1, REGISTER_SAVE_BND1(%rsp) - bndmov %bnd2, REGISTER_SAVE_BND2(%rsp) - bndmov %bnd3, REGISTER_SAVE_BND3(%rsp) +# ifdef USE_FXSAVE + fxsave STATE_SAVE_OFFSET(%rsp) # else -# if REGISTER_SAVE_BND0 == 0 - .byte 0x66,0x0f,0x1b,0x04,0x24 + movl $STATE_SAVE_MASK, %eax + xorl %edx, %edx + # Clear the XSAVE Header. +# ifdef USE_XSAVE + movq %rdx, (STATE_SAVE_OFFSET + 512)(%rsp) + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp) +# endif + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp) + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp) + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp) + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp) + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp) + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp) +# ifdef USE_XSAVE + xsave STATE_SAVE_OFFSET(%rsp) # else - .byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0 + xsavec STATE_SAVE_OFFSET(%rsp) # endif - .byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1 - .byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2 - .byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3 # endif -#endif # Copy args pushed by PLT in register. 
# %rdi: link_map, %rsi: reloc_index mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP call _dl_fixup # Call resolver. mov %RAX_LP, %R11_LP # Save return value -#ifndef __ILP32__ - # Restore bound registers. These are nops if Intel MPX isn't - # avaiable or disabled. -# ifdef HAVE_MPX_SUPPORT - bndmov REGISTER_SAVE_BND3(%rsp), %bnd3 - bndmov REGISTER_SAVE_BND2(%rsp), %bnd2 - bndmov REGISTER_SAVE_BND1(%rsp), %bnd1 - bndmov REGISTER_SAVE_BND0(%rsp), %bnd0 + # Get register content back. +# ifdef USE_FXSAVE + fxrstor STATE_SAVE_OFFSET(%rsp) # else - .byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3 - .byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2 - .byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1 -# if REGISTER_SAVE_BND0 == 0 - .byte 0x66,0x0f,0x1a,0x04,0x24 -# else - .byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0 -# endif + movl $STATE_SAVE_MASK, %eax + xorl %edx, %edx + xrstor STATE_SAVE_OFFSET(%rsp) # endif -#endif - # Get register content back. movq REGISTER_SAVE_R9(%rsp), %r9 movq REGISTER_SAVE_R8(%rsp), %r8 movq REGISTER_SAVE_RDI(%rsp), %rdi @@ -240,20 +138,12 @@ _dl_runtime_resolve: movq REGISTER_SAVE_RDX(%rsp), %rdx movq REGISTER_SAVE_RCX(%rsp), %rcx movq REGISTER_SAVE_RAX(%rsp), %rax - VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7) -#if DL_RUNTIME_RESOLVE_REALIGN_STACK +# if DL_RUNTIME_RESOLVE_REALIGN_STACK mov %RBX_LP, %RSP_LP cfi_def_cfa_register(%rsp) movq (%rsp), %rbx cfi_restore(%rbx) -#endif +# endif # Adjust stack(PLT did 2 pushes) add $(LOCAL_STORAGE_AREA + 16), %RSP_LP cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16)) @@ -262,11 +152,9 @@ _dl_runtime_resolve: jmp *%r11 # Jump to function address. cfi_endproc .size _dl_runtime_resolve, .-_dl_runtime_resolve +#endif -/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included - twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex. - But we don't need another _dl_runtime_profile for XMM registers. */ #if !defined PROF && defined _dl_runtime_profile # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0 # error LR_VECTOR_OFFSET must be multples of VEC_SIZE -- 2.9.3
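
For anyone who wants to sanity-check the CPUID numbers that get_common_indeces
now relies on, here is a rough standalone sketch of the same leaf-0xD
enumeration.  It is illustrative only: the helper name
xsave_area_size_for_mask and the demo main are made up for this note, and it
leaves out the STATE_SAVE_OFFSET and the final 64-byte rounding that the
glibc code adds on top.

/* Compute the compacted xsave area size for the components in
   STATE_SAVE_MASK from CPUID leaf 0xD, the same way the patch does.
   Compile with GCC or Clang; uses <cpuid.h>.  */
#include <cpuid.h>
#include <stdio.h>

#define STATE_SAVE_MASK \
  ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))

static unsigned int
xsave_area_size_for_mask (unsigned int mask, int *xsavec_usable)
{
  unsigned int eax, ebx, ecx, edx;
  /* Compacted extended state starts right after the 512-byte legacy
     area and the 64-byte XSAVE header.  */
  unsigned int offset = 512 + 64;

  *xsavec_usable = 0;
  if (__get_cpuid_max (0, 0) < 0xd)
    return 0;

  /* No xsave state at all if sub-leaf 0 reports a zero size.  */
  __cpuid_count (0xd, 0, eax, ebx, ecx, edx);
  if (ebx == 0)
    return 0;

  /* CPUID.(EAX=0xD, ECX=1):EAX[1] reports XSAVEC support.  */
  __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
  *xsavec_usable = (eax & (1 << 1)) != 0;

  /* Sum the sizes of the requested extended components, honouring the
     64-byte alignment flag in ECX[1] of each sub-leaf.  */
  for (unsigned int i = 2; i < 32; i++)
    if ((mask & (1u << i)) != 0)
      {
	__cpuid_count (0xd, i, eax, ebx, ecx, edx);
	if ((ecx & (1 << 1)) != 0)
	  offset = (offset + 63) & -64;
	offset += eax;	/* EAX is the size of component i.  */
      }

  return offset;
}

int
main (void)
{
  int xsavec_usable;
  unsigned int size
    = xsave_area_size_for_mask (STATE_SAVE_MASK, &xsavec_usable);
  printf ("XSAVEC %ssupported; compacted size for mask 0x%x: %u bytes\n",
	  xsavec_usable ? "" : "not ", (unsigned int) STATE_SAVE_MASK, size);
  return 0;
}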
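
The header-clearing step Richard commented on at the top of the thread can
also be tried from user space.  Below is a minimal sketch, assuming the
GCC/Clang _xsavec/_xrstor intrinsics (compile with -mxsave -mxsavec) and a
CPU that supports XSAVEC; the buffer size and DEMO_STATE_MASK are chosen only
for this example and are not part of the patch.

/* Save and restore x87/SSE/AVX state with XSAVEC, clearing the 64-byte
   XSAVE header before the buffer's first use, as _dl_runtime_resolve
   does with its movq stores.  */
#include <immintrin.h>
#include <string.h>

#define XSAVE_HEADER_OFFSET 512
#define XSAVE_HEADER_SIZE 64

/* x87 | SSE | AVX: keep the mask small so the static buffer is enough.  */
#define DEMO_STATE_MASK ((1ULL << 0) | (1ULL << 1) | (1ULL << 2))

/* xsave/xsavec require a 64-byte aligned save area.  */
static unsigned char xsave_buf[1024] __attribute__ ((aligned (64)));

void
save_and_restore_vector_state (void)
{
  /* The XSAVE header must not contain stale data the first time the
     buffer is used.  */
  memset (xsave_buf + XSAVE_HEADER_OFFSET, 0, XSAVE_HEADER_SIZE);

  /* Save the requested components in compacted format.  */
  _xsavec (xsave_buf, DEMO_STATE_MASK);

  /* ... code that may clobber the vector registers runs here ...  */

  /* Restore them.  XRSTOR uses the compacted layout because XSAVEC set
     XCOMP_BV[63] in the header.  */
  _xrstor (xsave_buf, DEMO_STATE_MASK);
}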