From patchwork Wed Aug 23 13:28:50 2017
X-Patchwork-Submitter: "H.J. Lu"
X-Patchwork-Id: 22328
Date: Wed, 23 Aug 2017 06:28:50 -0700
From: "H.J. Lu"
To: GNU C Library
Subject: [PATCH] x86_64: Replace AVX512F .byte sequences with instructions
Message-ID: <20170823132850.GA1651@gmail.com>

Since binutils 2.25 or later is required to build glibc, we can replace
AVX512F .byte sequences with AVX512F instructions.

Tested on x86-64 and x32.  There are no code differences in libmvec.so
and libmvec.a.  I am checking it in now.

H.J.
----
	* sysdeps/x86_64/fpu/svml_d_sincos8_core.S: Replace AVX512F
	.byte sequences with AVX512F instructions.
	* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Likewise.
	* sysdeps/x86_64/fpu/svml_s_sincosf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S:
	Likewise.
---
 ChangeLog                                          | 12 +++
 .../fpu/multiarch/svml_d_sincos8_core_avx512.S     | 19 +----
 .../fpu/multiarch/svml_s_sincosf16_core_avx512.S   | 63 ++--------
 sysdeps/x86_64/fpu/svml_d_sincos8_core.S           | 41 +----------
 sysdeps/x86_64/fpu/svml_d_wrapper_impl.h           | 57 ++--------------
 sysdeps/x86_64/fpu/svml_s_sincosf16_core.S         | 85 ++----------------------
 sysdeps/x86_64/fpu/svml_s_wrapper_impl.h           | 57 ++--------------
 7 files changed, 44 insertions(+), 290 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 43c8880a76..a58de05af6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2017-08-23  H.J. Lu
+
+	* sysdeps/x86_64/fpu/svml_d_sincos8_core.S: Replace AVX512F
+	.byte sequences with AVX512F instructions.
+	* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Likewise.
+	* sysdeps/x86_64/fpu/svml_s_sincosf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S:
+	Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S:
+	Likewise.
+
 2017-08-22  Szabolcs Nagy
 	    Steve Ellcey
 
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
index c9207558c5..3667faa0fa 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -599,24 +599,9 @@ libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx)
 	cfi_def_cfa_register (%rbp)
 	andq	$-64, %rsp
 	subq	$256, %rsp
-	/* Encoding for vmovups %zmm1, 128(%rsp).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x4c
-	.byte	0x24
-	.byte	0x02
+	vmovups	%zmm1, 128(%rsp)
 	lea	(%rsp), %rdi
-	/* Encoding for vmovups %zmm2, 192(%rdi).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x57
-	.byte	0x03
+	vmovups	%zmm2, 192(%rdi)
 	lea	64(%rsp), %rsi
 	call	HIDDEN_JUMPTARGET(\callee)
 	movq	128(%rsp), %rdx
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
index f73ab7de7c..8fa4255d6d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -510,40 +510,11 @@ libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)
 	cfi_def_cfa_register (%rbp)
 	andq	$-64, %rsp
 	subq	$384, %rsp
-	/* Encoding for vmovups %zmm1, 128(%rsp).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x4c
-	.byte	0x24
-	.byte	0x02
+	vmovups	%zmm1, 128(%rsp)
 	lea	(%rsp), %rdi
-	/* Encoding for vmovups %zmm2, 192(%rdi).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x57
-	.byte	0x03
-	/* Encoding for vmovups %zmm3, 256(%rdi).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x5f
-	.byte	0x04
-	/* Encoding for vmovups %zmm4, 320(%rdi).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x67
-	.byte	0x05
+	vmovups	%zmm2, 192(%rdi)
+	vmovups	%zmm3, 256(%rdi)
+	vmovups	%zmm4, 320(%rdi)
 	lea	64(%rsp), %rsi
 	call	HIDDEN_JUMPTARGET(\callee)
 	movq	128(%rsp), %rdx
@@ -661,30 +632,8 @@ libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)
 	leal	-112(%rbp), %esi
 	leal	-176(%rbp), %edi
 	subl	$296, %esp
-	/* Encoding for vmovdqa64 %zmm1, -240(%ebp).  */
-	.byte	0x67
-	.byte	0x62
-	.byte	0xf1
-	.byte	0xfd
-	.byte	0x48
-	.byte	0x7f
-	.byte	0x8d
-	.byte	0x10
-	.byte	0xff
-	.byte	0xff
-	.byte	0xff
-	/* Encoding for vmovdqa64 %zmm2, -304(%ebp).  */
-	.byte	0x67
-	.byte	0x62
-	.byte	0xf1
-	.byte	0xfd
-	.byte	0x48
-	.byte	0x7f
-	.byte	0x95
-	.byte	0xd0
-	.byte	0xfe
-	.byte	0xff
-	.byte	0xff
+	vmovdqa64 %zmm1, -240(%ebp)
+	vmovdqa64 %zmm2, -304(%ebp)
 	call	HIDDEN_JUMPTARGET(\callee)
 	movl	-240(%ebp), %eax
 	vmovss	-176(%ebp), %xmm0
diff --git a/sysdeps/x86_64/fpu/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
index c104539821..cdea30409a 100644
--- a/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
@@ -35,32 +35,10 @@ END (_ZGVeN8vl8l8_sincos)
 	cfi_def_cfa_register (%rbp)
 	andq	$-64, %rsp
 	subq	$320, %rsp
-	/* Encoding for vmovups %zmm0, 256(%rsp).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x44
-	.byte	0x24
-	.byte	0x04
+	vmovups	%zmm0, 256(%rsp)
 	lea	(%rsp), %rdi
-	/* Encoding for vmovups %zmm1, 128(%rdi).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x4f
-	.byte	0x02
-	/* Encoding for vmovups %zmm2, 192(%rdi).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x57
-	.byte	0x03
+	vmovups	%zmm1, 128(%rdi)
+	vmovups	%zmm2, 192(%rdi)
 	lea	64(%rsp), %rsi
 	call	HIDDEN_JUMPTARGET(\callee)
 	vmovdqu	288(%rsp), %ymm0
@@ -142,18 +120,7 @@ END (_ZGVeN8vl8l8_sincos)
 	subl	$280, %esp
 	vmovdqa	%ymm1, -208(%ebp)
 	vmovdqa	%ymm2, -240(%ebp)
-	/* Encoding for vmovapd %zmm0, -304(%ebp).  */
-	.byte	0x67
-	.byte	0x62
-	.byte	0xf1
-	.byte	0xfd
-	.byte	0x48
-	.byte	0x29
-	.byte	0x85
-	.byte	0xd0
-	.byte	0xfe
-	.byte	0xff
-	.byte	0xff
+	vmovapd	%zmm0, -304(%ebp)
 	call	HIDDEN_JUMPTARGET(\callee)
 	leal	32(%r12), %esi
 	vmovupd	-272(%ebp), %ymm0
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index 625eb6642b..39336447ab 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -201,29 +201,14 @@
 	cfi_def_cfa_register (%rbp)
 	andq	$-64, %rsp
 	subq	$128, %rsp
-/* Below is encoding for vmovups %zmm0, (%rsp).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x04
-	.byte	0x24
+	vmovups	%zmm0, (%rsp)
 	vmovupd	(%rsp), %ymm0
 	call	HIDDEN_JUMPTARGET(\callee)
 	vmovupd	%ymm0, 64(%rsp)
 	vmovupd	32(%rsp), %ymm0
 	call	HIDDEN_JUMPTARGET(\callee)
 	vmovupd	%ymm0, 96(%rsp)
-/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x10
-	.byte	0x44
-	.byte	0x24
-	.byte	0x01
+	vmovups	64(%rsp), %zmm0
 	movq	%rbp, %rsp
 	cfi_def_cfa_register (%rsp)
 	popq	%rbp
@@ -241,23 +226,8 @@
 	cfi_def_cfa_register (%rbp)
 	andq	$-64, %rsp
 	subq	$192, %rsp
-/* Below is encoding for vmovups %zmm0, (%rsp).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x04
-	.byte	0x24
-/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x4c
-	.byte	0x24
-	.byte	0x01
+	vmovups	%zmm0, (%rsp)
+	vmovups	%zmm1, 64(%rsp)
 	vmovupd	(%rsp), %ymm0
 	vmovupd	64(%rsp), %ymm1
 	call	HIDDEN_JUMPTARGET(\callee)
@@ -266,15 +236,7 @@
 	vmovupd	96(%rsp), %ymm1
 	call	HIDDEN_JUMPTARGET(\callee)
 	vmovupd	%ymm0, 160(%rsp)
-/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x10
-	.byte	0x44
-	.byte	0x24
-	.byte	0x02
+	vmovups	128(%rsp), %zmm0
 	movq	%rbp, %rsp
 	cfi_def_cfa_register (%rsp)
 	popq	%rbp
@@ -299,14 +261,7 @@
 	cfi_rel_offset (%r13, 0)
 	subq	$176, %rsp
 	movq	%rsi, %r13
-/* Below is encoding for vmovups %zmm0, (%rsp).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x04
-	.byte	0x24
+	vmovups	%zmm0, (%rsp)
 	movq	%rdi, %r12
 	vmovupd	(%rsp), %ymm0
 	call	HIDDEN_JUMPTARGET(\callee)
diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
index d86c91380e..8ebcebb296 100644
--- a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
@@ -35,48 +35,12 @@ END (_ZGVeN16vl4l4_sincosf)
 	cfi_def_cfa_register (%rbp)
 	andq	$-64, %rsp
 	subq	$448, %rsp
-	/* Encoding for vmovups %zmm0, 384(%rsp).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x44
-	.byte	0x24
-	.byte	0x06
+	vmovups	%zmm0, 384(%rsp)
 	lea	(%rsp), %rdi
-	/* Encoding for vmovups %zmm1, 128(%rdi).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x4f
-	.byte	0x02
-	/* Encoding for vmovups %zmm2, 192(%rdi).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x57
-	.byte	0x03
-	/* Encoding for vmovups %zmm3, 256(%rdi).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x5f
-	.byte	0x04
-	/* Encoding for vmovups %zmm4, 320(%rdi).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x67
-	.byte	0x05
+	vmovups	%zmm1, 128(%rdi)
+	vmovups	%zmm2, 192(%rdi)
+	vmovups	%zmm3, 256(%rdi)
+	vmovups	%zmm4, 320(%rdi)
 	lea	64(%rsp), %rsi
 	call	HIDDEN_JUMPTARGET(\callee)
 	vmovdqu	416(%rsp), %ymm0
@@ -204,42 +168,9 @@ END (_ZGVeN16vl4l4_sincosf)
 	.cfi_escape 0x10,0x3,0x2,0x76,0x68
 	movq	%rdi, %rbx
 	subl	$344, %esp
-	/* Encoding for vmovdqa64 %zmm1, -240(%ebp).  */
-	.byte	0x67
-	.byte	0x62
-	.byte	0xf1
-	.byte	0xfd
-	.byte	0x48
-	.byte	0x7f
-	.byte	0x8d
-	.byte	0x10
-	.byte	0xff
-	.byte	0xff
-	.byte	0xff
-	/* Encoding for vmovdqa64 %zmm2, -304(%ebp).  */
-	.byte	0x67
-	.byte	0x62
-	.byte	0xf1
-	.byte	0xfd
-	.byte	0x48
-	.byte	0x7f
-	.byte	0x95
-	.byte	0xd0
-	.byte	0xfe
-	.byte	0xff
-	.byte	0xff
-	/* Encoding for vmovaps %zmm0, -368(%ebp).  */
-	.byte	0x67
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x29
-	.byte	0x85
-	.byte	0x90
-	.byte	0xfe
-	.byte	0xff
-	.byte	0xff
+	vmovdqa64 %zmm1, -240(%ebp)
+	vmovdqa64 %zmm2, -304(%ebp)
+	vmovaps	%zmm0, -368(%ebp)
 	call	HIDDEN_JUMPTARGET(\callee)
 	leal	32(%r12), %esi
 	vmovups	-336(%ebp), %ymm0
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
index cd6d58361c..00b86cd377 100644
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -246,29 +246,14 @@
 	cfi_def_cfa_register (%rbp)
 	andq	$-64, %rsp
 	subq	$128, %rsp
-/* Below is encoding for vmovups %zmm0, (%rsp).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x04
-	.byte	0x24
+	vmovups	%zmm0, (%rsp)
 	vmovupd	(%rsp), %ymm0
 	call	HIDDEN_JUMPTARGET(\callee)
 	vmovupd	%ymm0, 64(%rsp)
 	vmovupd	32(%rsp), %ymm0
 	call	HIDDEN_JUMPTARGET(\callee)
 	vmovupd	%ymm0, 96(%rsp)
-/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x10
-	.byte	0x44
-	.byte	0x24
-	.byte	0x01
+	vmovups	64(%rsp), %zmm0
 	movq	%rbp, %rsp
 	cfi_def_cfa_register (%rsp)
 	popq	%rbp
@@ -286,23 +271,8 @@
 	cfi_def_cfa_register (%rbp)
 	andq	$-64, %rsp
 	subq	$192, %rsp
-/* Below is encoding for vmovups %zmm0, (%rsp).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x04
-	.byte	0x24
-/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x11
-	.byte	0x4c
-	.byte	0x24
-	.byte	0x01
+	vmovups	%zmm0, (%rsp)
+	vmovups	%zmm1, 64(%rsp)
 	vmovups	(%rsp), %ymm0
 	vmovups	64(%rsp), %ymm1
 	call	HIDDEN_JUMPTARGET(\callee)
@@ -311,15 +281,7 @@
 	vmovups	96(%rsp), %ymm1
 	call	HIDDEN_JUMPTARGET(\callee)
 	vmovups	%ymm0, 160(%rsp)
-/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x10
-	.byte	0x44
-	.byte	0x24
-	.byte	0x02
+	vmovups	128(%rsp), %zmm0
 	movq	%rbp, %rsp
 	cfi_def_cfa_register (%rsp)
 	popq	%rbp
@@ -340,14 +302,7 @@
 	pushq	%r13
 	subq	$176, %rsp
 	movq	%rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-	.byte	0x62
-	.byte	0xf1
-	.byte	0x7c
-	.byte	0x48
-	.byte	0x29
-	.byte	0x04
-	.byte	0x24
+	vmovaps	%zmm0, (%rsp)
 	movq	%rdi, %r12
 	vmovaps	(%rsp), %ymm0
 	call	HIDDEN_JUMPTARGET(\callee)
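
As a sanity check, the correspondence between the removed .byte directives and
the new mnemonics can be confirmed by assembling a small file with binutils
2.25 or later and disassembling it.  The file below is only an illustrative
sketch and not part of the patch; the file name and function name are made up,
and the expected byte sequences are the ones quoted in the comments this patch
deletes.

	/* check-avx512-encoding.S -- hypothetical standalone test file.
	   Build and inspect with:
	     as --64 -o check-avx512-encoding.o check-avx512-encoding.S
	     objdump -dw check-avx512-encoding.o
	   Each instruction should disassemble to the bytes the old .byte
	   directives spelled out by hand.  */
	.text
	.globl	check_avx512_encoding
	.type	check_avx512_encoding, @function
check_avx512_encoding:
	/* Was: .byte 0x62, 0xf1, 0x7c, 0x48, 0x11, 0x4c, 0x24, 0x02.  */
	vmovups	%zmm1, 128(%rsp)
	/* Was: .byte 0x62, 0xf1, 0x7c, 0x48, 0x10, 0x44, 0x24, 0x01.  */
	vmovups	64(%rsp), %zmm0
	ret
	.size	check_avx512_encoding, .-check_avx512_encoding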