From patchwork Thu Jul 23 21:53:44 2015
X-Patchwork-Submitter: Andrew Senkevich
X-Patchwork-Id: 7828
From: Andrew Senkevich
Date: Fri, 24 Jul 2015 00:53:44 +0300
Subject: [PATCH] [x86_64] Fixed libmvec AVX512 implementations
To: libc-alpha

Hi,

this patch fixes several libmvec bugs found during testing on new KNL
hardware: the AVX512 IFUNC implementations, the AVX512-to-AVX2 wrapper
implementations, and the KNL expf implementation.

2015-07-24  Andrew Senkevich

        * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
        * sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
        * sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
        version.

Ok for trunk?
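For reference, the selection order that each IFUNC resolver stub in the diff
encodes is roughly the following C sketch (illustrative only: the
cpu_has_avx512dq ()/cpu_has_avx512f () probes and the *_impl names stand in
for the __cpu_features bit tests and the real _ZGVe* symbols, they are not
glibc interfaces).  The underlying bug was that the stubs branched with
"jne 1" and "jnz 3", which the assembler treats as absolute addresses, rather
than the local-label references "1f" and "2f"; the unused "2:" label is
dropped and the remaining labels renumbered.

/* Illustrative sketch of the IFUNC dispatch below -- not glibc code.  */

typedef void (*vec_impl_t) (void);

extern void cos8_skx_impl (void);          /* stands in for _ZGVeN8v_cos_skx */
extern void cos8_knl_impl (void);          /* stands in for _ZGVeN8v_cos_knl */
extern void cos8_avx2_wrapper_impl (void); /* ... _ZGVeN8v_cos_avx2_wrapper  */

extern int cpu_has_avx512dq (void);        /* placeholder feature probes */
extern int cpu_has_avx512f (void);

static vec_impl_t
cos8_ifunc_resolver (void)
{
  if (cpu_has_avx512dq ())
    return cos8_skx_impl;                  /* AVX512DQ usable -> SKX variant */
  if (cpu_has_avx512f ())
    return cos8_knl_impl;                  /* only AVX512F -> KNL variant    */
  return cos8_avx2_wrapper_impl;           /* otherwise wrapper over AVX2    */
}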
---
WBR,
Andrew

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
index ba3b66f..d0f4f27 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_cos)
         .type   _ZGVeN8v_cos, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_cos_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_cos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_cos_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_cos)

 #define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
index 8f837fb..7b7c07d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_exp)
         .type   _ZGVeN8v_exp, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_exp_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_exp_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_exp_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_exp)

 #define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
index 2f9e9d8..76375fd 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_log)
         .type   _ZGVeN8v_log, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_log_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_log_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_log_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_log_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_log)

 #define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
index 3b11511..c1e5e76 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8vv_pow)
         .type   _ZGVeN8vv_pow, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8vv_pow_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vv_pow_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vv_pow_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vv_pow)

 #define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
index ba63102..131f2f4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_sin)
         .type   _ZGVeN8v_sin, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_sin_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_sin_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_sin_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_sin)

 #define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
index 7228ba5..e331090 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8vvv_sincos)
         .type   _ZGVeN8vvv_sincos, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vvv_sincos)

 #define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
index 91564de..0654d3c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_cosf)
         .type   _ZGVeN16v_cosf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_cosf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_cosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_cosf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_cosf)

 #define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
index 3b3489d..62858eb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_expf)
         .type   _ZGVeN16v_expf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_expf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_expf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_expf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_expf)

 #define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
index 8756750..68c57e4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_logf)
         .type   _ZGVeN16v_logf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_logf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_logf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_logf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_logf)

 #define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
index a4ba4fb..3aa9f95 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16vv_powf)
         .type   _ZGVeN16vv_powf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16vv_powf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vv_powf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vv_powf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vv_powf)

 #define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
index 0a1753e..bdcabab 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16vvv_sincosf)
         .type   _ZGVeN16vvv_sincosf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vvv_sincosf)

 #define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
index 7ed637b..3ec78a0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_sinf)
         .type   _ZGVeN16v_sinf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_sinf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_sinf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_sinf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_sinf)

 #define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper
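The wrapper_impl.h changes below fix a different problem: the AVX512-to-AVX2
wrappers called the AVX2 callee on each 256-bit half of the input but
discarded the result of the first call and never reassembled a full 512-bit
result in %zmm0 (the two-argument variant additionally had a truncated
hand-coded encoding for the %zmm1 spill, one byte short of its displacement).
The corrected macros spill the 512-bit argument(s), run the callee on each
half, store both partial results, and reload them as one %zmm0.  A rough C
analogue of the corrected flow, with illustrative names (vec4d, vec8d and
avx2_kernel are not glibc interfaces):

#include <string.h>

typedef struct { double v[4]; } vec4d;   /* one %ymm-sized half              */
typedef struct { double v[8]; } vec8d;   /* one %zmm-sized vector            */

extern vec4d avx2_kernel (vec4d x);      /* placeholder for the AVX2 callee  */

vec8d
avx512_wrapper (vec8d x)
{
  vec4d lo, hi, rlo, rhi;
  vec8d r;

  memcpy (&lo, &x.v[0], sizeof lo);      /* vmovupd (%rsp), %ymm0            */
  memcpy (&hi, &x.v[4], sizeof hi);      /* vmovupd 32(%rsp), %ymm0          */
  rlo = avx2_kernel (lo);                /* first call, low half             */
  rhi = avx2_kernel (hi);                /* second call, high half           */
  memcpy (&r.v[0], &rlo, sizeof rlo);    /* vmovupd %ymm0, 64(%rsp)          */
  memcpy (&r.v[4], &rhi, sizeof rhi);    /* vmovupd %ymm0, 96(%rsp)          */
  return r;                              /* vmovups 64(%rsp), %zmm0          */
}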
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index bd93b8e..5c0ff89 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -194,39 +194,39 @@

 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq     %rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x04
-        .byte 0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x04
-        .byte 0x24
-        call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x44
-        .byte 0x24
-        .byte 0x20
-        call      HIDDEN_JUMPTARGET(\callee)
-        movq      %rbp, %rsp
+        andq      $-64, %rsp
+        subq      $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x04
+        .byte 0x24
+        vmovupd   (%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 64(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x10
+        .byte 0x44
+        .byte 0x24
+        .byte 0x01
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq      %rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
@@ -234,61 +234,50 @@

 /* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512_ff callee
-        pushq     %rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x04
-        .byte 0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x4c
-        .byte 0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x04
-        .byte 0x24
-/* Below is encoding for vmovapd 64(%rsp), %ymm1.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x4c
-        .byte 0x24
-        .byte 0x40
-        call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x44
-        .byte 0x24
-        .byte 0x20
-/* Below is encoding for vmovapd 96(%rsp), %ymm1.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x4c
-        .byte 0x24
-        .byte 0x60
-        call      HIDDEN_JUMPTARGET(\callee)
-        movq      %rbp, %rsp
+        andq      $-64, %rsp
+        subq      $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x04
+        .byte 0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x4c
+        .byte 0x24
+        .byte 0x01
+        vmovupd   (%rsp), %ymm0
+        vmovupd   64(%rsp), %ymm1
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 128(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        vmovupd   96(%rsp), %ymm1
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x10
+        .byte 0x44
+        .byte 0x24
+        .byte 0x02
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq      %rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
@@ -310,61 +299,26 @@
         cfi_rel_offset (%r13, 0)
         subq      $176, %rsp
         movq      %rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
         .byte 0x62
         .byte 0xf1
         .byte 0x7c
         .byte 0x48
-        .byte 0x29
+        .byte 0x11
         .byte 0x04
         .byte 0x24
         movq      %rdi, %r12
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x04
-        .byte 0x24
+        vmovupd   (%rsp), %ymm0
         call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x44
-        .byte 0x24
-        .byte 0x20
+        vmovupd   32(%rsp), %ymm0
         lea       64(%rsp), %rdi
         lea       96(%rsp), %rsi
         call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 64(%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x44
-        .byte 0x24
-        .byte 0x40
-/* Below is encoding for vmovapd 96(%rsp), %ymm1.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x4c
-        .byte 0x24
-        .byte 0x60
-/* Below is encoding for vmovapd %ymm0, 32(%r12).  */
-        .byte 0xc4
-        .byte 0xc1
-        .byte 0x7d
-        .byte 0x29
-        .byte 0x44
-        .byte 0x24
-        .byte 0x20
-/* Below is encoding for vmovapd %ymm1, 32(%r13).  */
-        .byte 0xc4
-        .byte 0xc1
-        .byte 0x7d
-        .byte 0x29
-        .byte 0x4d
-        .byte 0x20
+        vmovupd   64(%rsp), %ymm0
+        vmovupd   96(%rsp), %ymm1
+        vmovupd   %ymm0, 32(%r12)
+        vmovupd   %ymm1, 32(%r13)
+        vzeroupper
         addq      $176, %rsp
         popq      %r13
         cfi_adjust_cfa_offset (-8)
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
index 66bb081..d255d19 100644
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -239,28 +239,39 @@

 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq     %rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x04
-        .byte 0x24
-        vmovaps   (%rsp), %ymm0
-        call      HIDDEN_JUMPTARGET(\callee)
-        vmovaps   32(%rsp), %ymm0
-        call      HIDDEN_JUMPTARGET(\callee)
-        movq      %rbp, %rsp
+        andq      $-64, %rsp
+        subq      $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x04
+        .byte 0x24
+        vmovupd   (%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 64(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x10
+        .byte 0x44
+        .byte 0x24
+        .byte 0x01
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq      %rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
@@ -274,29 +285,41 @@
         movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
-        subq      $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x04
-        .byte 0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x4c
-        .byte 0x24
-        vmovaps   (%rsp), %ymm0
-        vmovaps   64(%rsp), %ymm1
+        subq      $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x04
+        .byte 0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x4c
+        .byte 0x24
+        .byte 0x01
+        vmovups   (%rsp), %ymm0
+        vmovups   64(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
-        vmovaps   32(%rsp), %ymm0
-        vmovaps   96(%rsp), %ymm1
+        vmovups   %ymm0, 128(%rsp)
+        vmovups   32(%rsp), %ymm0
+        vmovups   96(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
+        vmovups   %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x10
+        .byte 0x44
+        .byte 0x24
+        .byte 0x02
         movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq      %rbp
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
index cb807e0..ec69055 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -46,6 +46,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
      The table lookup is skipped if k = 0.
      For low accuracy approximation, exp(r) ~ 1 or 1+r.  */

+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
         movq      %rsp, %rbp