From patchwork Thu Jul 23 21:53:44 2015
X-Patchwork-Submitter: Andrew Senkevich
X-Patchwork-Id: 7828
From: Andrew Senkevich
Date: Fri, 24 Jul 2015 00:53:44 +0300
Subject: [PATCH] [x86_64] Fixed libmvec AVX512 implementations
To: libc-alpha

Hi,

this patch fixes several libmvec bugs found during testing on new KNL
hardware: the AVX512 IFUNC implementations, the AVX512-to-AVX2 wrapper
implementations, and the KNL expf implementation.

2015-07-24  Andrew Senkevich

        * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
        * sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
        * sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
        version.

Ok for trunk?
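For reference, the selection order that each IFUNC resolver stub in the diff
encodes is roughly the following C sketch (illustrative only: the
cpu_has_avx512dq ()/cpu_has_avx512f () probes and the *_impl names stand in
for the __cpu_features bit tests and the real _ZGVe* symbols, they are not
glibc interfaces).  The underlying bug was that the stubs branched with
"jne 1" and "jnz 3", which the assembler treats as absolute addresses, rather
than the local-label references "1f" and "2f"; the unused "2:" label is
dropped and the remaining labels renumbered.

/* Illustrative sketch of the IFUNC dispatch below -- not glibc code.  */

typedef void (*vec_impl_t) (void);

extern void cos8_skx_impl (void);          /* stands in for _ZGVeN8v_cos_skx */
extern void cos8_knl_impl (void);          /* stands in for _ZGVeN8v_cos_knl */
extern void cos8_avx2_wrapper_impl (void); /* ... _ZGVeN8v_cos_avx2_wrapper  */

extern int cpu_has_avx512dq (void);        /* placeholder feature probes */
extern int cpu_has_avx512f (void);

static vec_impl_t
cos8_ifunc_resolver (void)
{
  if (cpu_has_avx512dq ())
    return cos8_skx_impl;                  /* AVX512DQ usable -> SKX variant */
  if (cpu_has_avx512f ())
    return cos8_knl_impl;                  /* only AVX512F -> KNL variant    */
  return cos8_avx2_wrapper_impl;           /* otherwise wrapper over AVX2    */
}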
---
WBR,
Andrew

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
index ba3b66f..d0f4f27 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_cos)
         .type   _ZGVeN8v_cos, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_cos_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_cos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_cos_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_cos)

 #define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
index 8f837fb..7b7c07d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_exp)
         .type   _ZGVeN8v_exp, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_exp_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_exp_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_exp_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_exp)

 #define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
index 2f9e9d8..76375fd 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_log)
         .type   _ZGVeN8v_log, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_log_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_log_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_log_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_log_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_log)

 #define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
index 3b11511..c1e5e76 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8vv_pow)
         .type   _ZGVeN8vv_pow, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8vv_pow_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vv_pow_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vv_pow_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vv_pow)

 #define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
index ba63102..131f2f4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_sin)
         .type   _ZGVeN8v_sin, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_sin_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_sin_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_sin_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_sin)

 #define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
index 7228ba5..e331090 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8vvv_sincos)
         .type   _ZGVeN8vvv_sincos, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vvv_sincos)

 #define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
index 91564de..0654d3c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_cosf)
         .type   _ZGVeN16v_cosf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_cosf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_cosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_cosf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_cosf)

 #define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
index 3b3489d..62858eb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_expf)
         .type   _ZGVeN16v_expf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_expf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_expf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_expf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_expf)

 #define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
index 8756750..68c57e4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_logf)
         .type   _ZGVeN16v_logf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_logf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_logf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_logf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_logf)

 #define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
index a4ba4fb..3aa9f95 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16vv_powf)
         .type   _ZGVeN16vv_powf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16vv_powf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vv_powf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vv_powf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vv_powf)

 #define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
index 0a1753e..bdcabab 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16vvv_sincosf)
         .type   _ZGVeN16vvv_sincosf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vvv_sincosf)

 #define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
index 7ed637b..3ec78a0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_sinf)
         .type   _ZGVeN16v_sinf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_sinf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_sinf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_sinf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_sinf)

 #define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper
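The wrapper_impl.h changes below fix a different problem: the AVX512-to-AVX2
wrappers called the AVX2 callee on each 256-bit half of the input but
discarded the result of the first call and never reassembled a full 512-bit
result in %zmm0 (the two-argument variant additionally had a truncated
hand-coded encoding for the %zmm1 spill, one byte short of its displacement).
The corrected macros spill the 512-bit argument(s), run the callee on each
half, store both partial results, and reload them as one %zmm0.  A rough C
analogue of the corrected flow, with illustrative names (vec4d, vec8d and
avx2_kernel are not glibc interfaces):

#include <string.h>

typedef struct { double v[4]; } vec4d;   /* one %ymm-sized half              */
typedef struct { double v[8]; } vec8d;   /* one %zmm-sized vector            */

extern vec4d avx2_kernel (vec4d x);      /* placeholder for the AVX2 callee  */

vec8d
avx512_wrapper (vec8d x)
{
  vec4d lo, hi, rlo, rhi;
  vec8d r;

  memcpy (&lo, &x.v[0], sizeof lo);      /* vmovupd (%rsp), %ymm0            */
  memcpy (&hi, &x.v[4], sizeof hi);      /* vmovupd 32(%rsp), %ymm0          */
  rlo = avx2_kernel (lo);                /* first call, low half             */
  rhi = avx2_kernel (hi);                /* second call, high half           */
  memcpy (&r.v[0], &rlo, sizeof rlo);    /* vmovupd %ymm0, 64(%rsp)          */
  memcpy (&r.v[4], &rhi, sizeof rhi);    /* vmovupd %ymm0, 96(%rsp)          */
  return r;                              /* vmovups 64(%rsp), %zmm0          */
}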
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index bd93b8e..5c0ff89 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -194,39 +194,39 @@

 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq     %rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x04
-        .byte 0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x04
-        .byte 0x24
-        call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x44
-        .byte 0x24
-        .byte 0x20
-        call      HIDDEN_JUMPTARGET(\callee)
-        movq      %rbp, %rsp
+        andq      $-64, %rsp
+        subq      $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x04
+        .byte 0x24
+        vmovupd   (%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 64(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x10
+        .byte 0x44
+        .byte 0x24
+        .byte 0x01
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq      %rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
@@ -234,61 +234,50 @@

 /* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512_ff callee
-        pushq     %rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x04
-        .byte 0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x4c
-        .byte 0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x04
-        .byte 0x24
-/* Below is encoding for vmovapd 64(%rsp), %ymm1.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x4c
-        .byte 0x24
-        .byte 0x40
-        call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x44
-        .byte 0x24
-        .byte 0x20
-/* Below is encoding for vmovapd 96(%rsp), %ymm1.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x4c
-        .byte 0x24
-        .byte 0x60
-        call      HIDDEN_JUMPTARGET(\callee)
-        movq      %rbp, %rsp
+        andq      $-64, %rsp
+        subq      $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x04
+        .byte 0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x4c
+        .byte 0x24
+        .byte 0x01
+        vmovupd   (%rsp), %ymm0
+        vmovupd   64(%rsp), %ymm1
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 128(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        vmovupd   96(%rsp), %ymm1
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x10
+        .byte 0x44
+        .byte 0x24
+        .byte 0x02
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq      %rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
@@ -310,61 +299,26 @@
         cfi_rel_offset (%r13, 0)
         subq      $176, %rsp
         movq      %rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
         .byte 0x62
         .byte 0xf1
         .byte 0x7c
         .byte 0x48
-        .byte 0x29
+        .byte 0x11
         .byte 0x04
         .byte 0x24
         movq      %rdi, %r12
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x04
-        .byte 0x24
+        vmovupd   (%rsp), %ymm0
         call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x44
-        .byte 0x24
-        .byte 0x20
+        vmovupd   32(%rsp), %ymm0
         lea       64(%rsp), %rdi
         lea       96(%rsp), %rsi
         call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 64(%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x44
-        .byte 0x24
-        .byte 0x40
-/* Below is encoding for vmovapd 96(%rsp), %ymm1.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x4c
-        .byte 0x24
-        .byte 0x60
-/* Below is encoding for vmovapd %ymm0, 32(%r12).  */
-        .byte 0xc4
-        .byte 0xc1
-        .byte 0x7d
-        .byte 0x29
-        .byte 0x44
-        .byte 0x24
-        .byte 0x20
-/* Below is encoding for vmovapd %ymm1, 32(%r13).  */
-        .byte 0xc4
-        .byte 0xc1
-        .byte 0x7d
-        .byte 0x29
-        .byte 0x4d
-        .byte 0x20
+        vmovupd   64(%rsp), %ymm0
+        vmovupd   96(%rsp), %ymm1
+        vmovupd   %ymm0, 32(%r12)
+        vmovupd   %ymm1, 32(%r13)
+        vzeroupper
         addq      $176, %rsp
         popq      %r13
         cfi_adjust_cfa_offset (-8)
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
index 66bb081..d255d19 100644
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -239,28 +239,39 @@

 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq     %rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x04
-        .byte 0x24
-        vmovaps   (%rsp), %ymm0
-        call      HIDDEN_JUMPTARGET(\callee)
-        vmovaps   32(%rsp), %ymm0
-        call      HIDDEN_JUMPTARGET(\callee)
-        movq      %rbp, %rsp
+        andq      $-64, %rsp
+        subq      $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x04
+        .byte 0x24
+        vmovupd   (%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 64(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x10
+        .byte 0x44
+        .byte 0x24
+        .byte 0x01
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq      %rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
@@ -274,29 +285,41 @@
         movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
-        subq      $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x04
-        .byte 0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x4c
-        .byte 0x24
-        vmovaps   (%rsp), %ymm0
-        vmovaps   64(%rsp), %ymm1
+        subq      $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x04
+        .byte 0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x4c
+        .byte 0x24
+        .byte 0x01
+        vmovups   (%rsp), %ymm0
+        vmovups   64(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
-        vmovaps   32(%rsp), %ymm0
-        vmovaps   96(%rsp), %ymm1
+        vmovups   %ymm0, 128(%rsp)
+        vmovups   32(%rsp), %ymm0
+        vmovups   96(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
+        vmovups   %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x10
+        .byte 0x44
+        .byte 0x24
+        .byte 0x02
         movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq      %rbp
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
index cb807e0..ec69055 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -46,6 +46,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
      The table lookup is skipped if k = 0.
      For low accuracy approximation, exp(r) ~ 1 or 1+r.  */

+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
         movq      %rsp, %rbp