From patchwork Fri Mar 5 16:53:11 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "H.J. Lu" X-Patchwork-Id: 42276 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id D5F9D3AAA0F1; Fri, 5 Mar 2021 16:53:28 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org D5F9D3AAA0F1 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1614963208; bh=gwoTXozsT+fKHOwpPWiG3/Qer593ddH0a7dXRDMjMx4=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=mEVe8+kzHXBk5tXKY3fZ1zomWyf2bVHzRT1zrIvbAavMK4cK28RtfX+XJC1mVLu8a +xPN6mUSyK9NRI1NGpA+ZwWDx+GPQPFJzJ3lIj1aPJ0Up+sRLDg1sASms81SPSVxas 28LE0r3P1EcLlg1gX1CqgYF4fEEVvTaIOhnrU8SQ= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-pl1-x62f.google.com (mail-pl1-x62f.google.com [IPv6:2607:f8b0:4864:20::62f]) by sourceware.org (Postfix) with ESMTPS id E0DBB38618CC for ; Fri, 5 Mar 2021 16:53:22 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org E0DBB38618CC Received: by mail-pl1-x62f.google.com with SMTP id a24so1677933plm.11 for ; Fri, 05 Mar 2021 08:53:22 -0800 (PST) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=gwoTXozsT+fKHOwpPWiG3/Qer593ddH0a7dXRDMjMx4=; b=CUV6gAULL/QJp9blVynrXtHAw9Xa92dIrOvRItfC0db91qf07oiz1o4peKdiv7jMos XZj/7kjF1I/PbuvVLdQpTdoMkNqFIWu1g1xAlc9H+C4ZWM+I3aLkECpOxY00On41Tkrv tDtwsjjK/0U2v9ooVoFF6We3Y4/HZ2Fng0b9w0KxPTBcyXzJRpi0coQuGlzyKhm4B2Rn 3RlHnufyB5fgFCqEePMVnGfPBGokkXkrL+fZpF00rTYfLpjBJdqNTegjlZ3bD0I6PZSU 1QVBoa5FV2hUSReN3b8Lr1Gi3zxbqrImgHNT5fKjRxuafxuYADvYAM2vtmuN/0PX5vpF bWTw== X-Gm-Message-State: AOAM530QIjYmV2B3eVLYJFTSt66lCezKyqMbmdzeh7NemnYd+WuWkrHx Pqr5phqZ8ishW6oezJkVzndoyhFkbh4= X-Google-Smtp-Source: ABdhPJxgI7QDxK2INOZENVC515PS4IGRsMdQd9FZyJobBFSvSpRDfcDvm/fHY64CFJEfZYdQeaIa0A== X-Received: by 2002:a17:902:b598:b029:e2:daa2:161c with SMTP id a24-20020a170902b598b02900e2daa2161cmr9325406pls.20.1614963200931; Fri, 05 Mar 2021 08:53:20 -0800 (PST) Received: from gnu-cfl-2.localdomain ([172.56.38.48]) by smtp.gmail.com with ESMTPSA id k128sm3005432pfd.137.2021.03.05.08.53.18 for (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Fri, 05 Mar 2021 08:53:18 -0800 (PST) Received: from gnu-tgl-2.localdomain (gnu-tgl-2 [192.168.1.34]) by gnu-cfl-2.localdomain (Postfix) with ESMTPS id 2D0251A03E7 for ; Fri, 5 Mar 2021 08:53:17 -0800 (PST) Received: from gnu-tgl-2.?040none?041 (localhost [IPv6:::1]) by gnu-tgl-2.localdomain (Postfix) with ESMTP id 4347A3003A0 for ; Fri, 5 Mar 2021 08:53:16 -0800 (PST) To: libc-alpha@sourceware.org Subject: [PATCH 3/8] x86-64: Add strcpy family functions with 256-bit EVEX Date: Fri, 5 Mar 2021 08:53:11 -0800 Message-Id: <20210305165316.323467-4-hjl.tools@gmail.com> X-Mailer: git-send-email 2.29.2 In-Reply-To: <20210305165316.323467-1-hjl.tools@gmail.com> References: <20210305165316.323467-1-hjl.tools@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-3034.3 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_SHORT, RCVD_IN_BARRACUDACENTRAL, RCVD_IN_DNSWL_NONE, SCC_5_SHORT_WORD_LINES, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.2 X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: "H.J. Lu via Libc-alpha" From: "H.J. Lu" Reply-To: "H.J. Lu" Errors-To: libc-alpha-bounces@sourceware.org Sender: "Libc-alpha" Update ifunc-strcpy.h to select the function optimized with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at function exit. --- sysdeps/x86_64/multiarch/Makefile | 6 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 + sysdeps/x86_64/multiarch/ifunc-strcpy.h | 13 +- sysdeps/x86_64/multiarch/stpcpy-evex.S | 3 + sysdeps/x86_64/multiarch/stpncpy-evex.S | 4 + sysdeps/x86_64/multiarch/strcat-evex.S | 283 ++++++ sysdeps/x86_64/multiarch/strcpy-evex.S | 1007 ++++++++++++++++++++ sysdeps/x86_64/multiarch/strncat-evex.S | 3 + sysdeps/x86_64/multiarch/strncpy-evex.S | 3 + 9 files changed, 1343 insertions(+), 3 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/stpcpy-evex.S create mode 100644 sysdeps/x86_64/multiarch/stpncpy-evex.S create mode 100644 sysdeps/x86_64/multiarch/strcat-evex.S create mode 100644 sysdeps/x86_64/multiarch/strcpy-evex.S create mode 100644 sysdeps/x86_64/multiarch/strncat-evex.S create mode 100644 sysdeps/x86_64/multiarch/strncpy-evex.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 5ce858823a..46783cd14b 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -43,11 +43,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ memchr-evex \ memrchr-evex \ rawmemchr-evex \ + stpcpy-evex \ + stpncpy-evex \ + strcat-evex \ strchr-evex \ strchrnul-evex \ strcmp-evex \ + strcpy-evex \ strlen-evex \ + strncat-evex \ strncmp-evex \ + strncpy-evex \ strnlen-evex \ strrchr-evex CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 3bf10d3714..74b20d8bd1 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -224,6 +224,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), __stpncpy_avx2) + IFUNC_IMPL_ADD (array, i, stpncpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __stpncpy_evex) IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) @@ -234,6 +238,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), __stpcpy_avx2) + IFUNC_IMPL_ADD (array, i, stpcpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __stpcpy_evex) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) @@ -268,6 +276,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcat, IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2), __strcat_avx2) + IFUNC_IMPL_ADD (array, i, strcat, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strcat_evex) IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3), __strcat_ssse3) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) @@ -330,6 +342,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcpy, IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2), __strcpy_avx2) + IFUNC_IMPL_ADD (array, i, strcpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strcpy_evex) IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) @@ -373,6 +389,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strncat, IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2), __strncat_avx2) + IFUNC_IMPL_ADD (array, i, strncat, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strncat_evex) IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3), __strncat_ssse3) IFUNC_IMPL_ADD (array, i, strncat, 1, @@ -383,6 +403,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strncpy, IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2), __strncpy_avx2) + IFUNC_IMPL_ADD (array, i, strncpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strncpy_evex) IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), __strncpy_ssse3) IFUNC_IMPL_ADD (array, i, strncpy, 1, diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h index 1100cd23c6..f31f436adf 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h @@ -25,16 +25,23 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURE_USABLE_P (cpu_features, AVX2) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S new file mode 100644 index 0000000000..7c6f26cd98 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_evex +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S new file mode 100644 index 0000000000..1570014d1c --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_evex +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S new file mode 100644 index 0000000000..97c3d85b6d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-evex.S @@ -0,0 +1,283 @@ +/* strcat with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# include + +# ifndef STRCAT +# define STRCAT __strcat_evex +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +/* zero register */ +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMM0 ymm17 +# define YMM1 ymm18 + +# define USE_AS_STRCAT + +/* Number of bytes in a vector register */ +# define VEC_SIZE 32 + + .section .text.evex,"ax",@progbits +ENTRY (STRCAT) + mov %rdi, %r9 +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + + xor %eax, %eax + mov %edi, %ecx + and $((VEC_SIZE * 4) - 1), %ecx + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + cmp $(VEC_SIZE * 3), %ecx + ja L(fourth_vector_boundary) + vpcmpb $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %edx + test %edx, %edx + jnz L(exit_null_on_first_vector) + mov %rdi, %rax + and $-VEC_SIZE, %rax + jmp L(align_vec_size_start) +L(fourth_vector_boundary): + mov %rdi, %rax + and $-VEC_SIZE, %rax + vpcmpb $0, (%rax), %YMMZERO, %k0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + kmovd %k0, %edx + and %r10d, %edx + jnz L(exit) + +L(align_vec_size_start): + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 + kmovd %k0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + add $(VEC_SIZE * 4), %rax + kmovd %k4, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + kmovd %k4, %edx + add $(VEC_SIZE * 4), %rax + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + add $(VEC_SIZE * 4), %rax + kmovd %k4, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + add $(VEC_SIZE * 5), %rax + kmovd %k4, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 + add $VEC_SIZE, %rax + kmovd %k0, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 + add $VEC_SIZE, %rax + kmovd %k0, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 + add $VEC_SIZE, %rax + kmovd %k1, %edx + test %edx, %edx + jnz L(exit) + + add $VEC_SIZE, %rax + + .p2align 4 +L(align_four_vec_loop): + VMOVA (%rax), %YMM0 + VMOVA (VEC_SIZE * 2)(%rax), %YMM1 + vpminub VEC_SIZE(%rax), %YMM0, %YMM0 + vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 + vpminub %YMM0, %YMM1, %YMM0 + /* If K0 != 0, there is a null byte. */ + vpcmpb $0, %YMM0, %YMMZERO, %k0 + add $(VEC_SIZE * 4), %rax + ktestd %k0, %k0 + jz L(align_four_vec_loop) + + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 + sub $(VEC_SIZE * 5), %rax + kmovd %k0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 4), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_null_on_first_vector): + bsf %rdx, %rdx + add %rdx, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_second_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $VEC_SIZE, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_third_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 2), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_fourth_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 3), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_fifth_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 4), %rax + + .p2align 4 +L(StartStrcpyPart): + lea (%r9, %rax), %rdi + mov %rsi, %rcx + mov %r9, %rax /* save result */ + +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(ExitZero) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-evex.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S new file mode 100644 index 0000000000..e019cfbeac --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S @@ -0,0 +1,1007 @@ +/* strcpy with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include + +# ifndef STRCPY +# define STRCPY __strcpy_evex +# endif + +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +/* Number of bytes in a vector register */ +# ifndef VEC_SIZE +# define VEC_SIZE 32 +# endif + +# define XMM3 xmm19 + +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 +# define YMM7 ymm23 + +# ifndef USE_AS_STRCAT + +/* zero register */ +# define XMMZERO xmm16 +# define YMMZERO ymm16 + +# define XMM1 xmm17 +# define XMM2 xmm18 + +# define YMM1 ymm17 +# define YMM2 ymm18 + + .section .text.evex,"ax",@progbits +ENTRY (STRCPY) +# ifdef USE_AS_STRNCPY + mov %RDX_LP, %R8_LP + test %R8_LP, %R8_LP + jz L(ExitZero) +# endif + mov %rsi, %rcx +# ifndef USE_AS_STPCPY + mov %rdi, %rax /* save result */ +# endif + +# endif + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + and $((VEC_SIZE * 4) - 1), %ecx + cmp $(VEC_SIZE * 2), %ecx + jbe L(SourceStringAlignmentLessTwoVecSize) + + and $-VEC_SIZE, %rsi + and $(VEC_SIZE - 1), %ecx + + vpcmpb $0, (%rsi), %YMMZERO, %k0 + kmovd %k0, %edx + shr %cl, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + mov $VEC_SIZE, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# else + mov $(VEC_SIZE + 1), %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# endif + jbe L(CopyVecSizeTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyVecSizeTail) + + vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 + kmovd %k1, %edx + +# ifdef USE_AS_STRNCPY + add $VEC_SIZE, %r10 + cmp %r10, %r8 + jbe L(CopyTwoVecSizeCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyTwoVecSize) + + VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ + VMOVU %YMM2, (%rdi) + +/* If source address alignment != destination address alignment */ + .p2align 4 +L(UnalignVecSizeBoth): + sub %rcx, %rdi +# ifdef USE_AS_STRNCPY + add %rcx, %r8 + sbb %rcx, %rcx + or %rcx, %r8 +# endif + mov $VEC_SIZE, %rcx + VMOVA (%rsi, %rcx), %YMM2 + VMOVU %YMM2, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 3), %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM2, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 + vpcmpb $0, %YMM3, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec3) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM3, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 + vpcmpb $0, %YMM4, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec4) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM4, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM2, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 + VMOVU %YMM2, (%rdi, %rcx) + vpcmpb $0, %YMM3, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec3) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM3, (%rdi, %rcx) + mov %rsi, %rdx + lea VEC_SIZE(%rsi, %rcx), %rsi + and $-(VEC_SIZE * 4), %rsi + sub %rsi, %rdx + sub %rdx, %rdi +# ifdef USE_AS_STRNCPY + lea (VEC_SIZE * 8)(%r8, %rdx), %r8 +# endif +L(UnalignedFourVecSizeLoop): + VMOVA (%rsi), %YMM4 + VMOVA VEC_SIZE(%rsi), %YMM5 + VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 + VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 + vpminub %YMM5, %YMM4, %YMM2 + vpminub %YMM7, %YMM6, %YMM3 + vpminub %YMM2, %YMM3, %YMM2 + /* If K7 != 0, there is a null byte. */ + vpcmpb $0, %YMM2, %YMMZERO, %k7 + kmovd %k7, %edx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 4), %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jnz L(UnalignedFourVecSizeLeave) + +L(UnalignedFourVecSizeLoop_start): + add $(VEC_SIZE * 4), %rdi + add $(VEC_SIZE * 4), %rsi + VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) + VMOVA (%rsi), %YMM4 + VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) + VMOVA VEC_SIZE(%rsi), %YMM5 + vpminub %YMM5, %YMM4, %YMM2 + VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) + VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 + VMOVU %YMM7, -VEC_SIZE(%rdi) + VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 + vpminub %YMM7, %YMM6, %YMM3 + vpminub %YMM2, %YMM3, %YMM2 + /* If K7 != 0, there is a null byte. */ + vpcmpb $0, %YMM2, %YMMZERO, %k7 + kmovd %k7, %edx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 4), %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jz L(UnalignedFourVecSizeLoop_start) + +L(UnalignedFourVecSizeLeave): + vpcmpb $0, %YMM4, %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(CopyVecSizeUnaligned_0) + + vpcmpb $0, %YMM5, %YMMZERO, %k2 + kmovd %k2, %ecx + test %ecx, %ecx + jnz L(CopyVecSizeUnaligned_16) + + vpcmpb $0, %YMM6, %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(CopyVecSizeUnaligned_32) + + vpcmpb $0, %YMM7, %YMMZERO, %k4 + kmovd %k4, %ecx + bsf %ecx, %edx + VMOVU %YMM4, (%rdi) + VMOVU %YMM5, VEC_SIZE(%rdi) + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 3)(%rdi, %rdx), %rax +# endif + VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) + add $(VEC_SIZE - 1), %r8 + sub %rdx, %r8 + lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $(VEC_SIZE * 3), %rsi + add $(VEC_SIZE * 3), %rdi + jmp L(CopyVecSizeExit) +# endif + +/* If source address alignment == destination address alignment */ + +L(SourceStringAlignmentLessTwoVecSize): + VMOVU (%rsi), %YMM3 + VMOVU VEC_SIZE(%rsi), %YMM2 + vpcmpb $0, %YMM3, %YMMZERO, %k0 + kmovd %k0, %edx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $VEC_SIZE, %r8 +# else + cmp $(VEC_SIZE + 1), %r8 +# endif + jbe L(CopyVecSizeTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyVecSizeTail1) + + VMOVU %YMM3, (%rdi) + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $(VEC_SIZE * 2), %r8 +# else + cmp $((VEC_SIZE * 2) + 1), %r8 +# endif + jbe L(CopyTwoVecSize1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyTwoVecSize1) + + and $-VEC_SIZE, %rsi + and $(VEC_SIZE - 1), %ecx + jmp L(UnalignVecSizeBoth) + +/*------End of main part with loops---------------------*/ + +/* Case1 */ + +# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) + .p2align 4 +L(CopyVecSize): + add %rcx, %rdi +# endif +L(CopyVecSizeTail): + add %rcx, %rsi +L(CopyVecSizeTail1): + bsf %edx, %edx +L(CopyVecSizeExit): + cmp $32, %edx + jae L(Exit32_63) + cmp $16, %edx + jae L(Exit16_31) + cmp $8, %edx + jae L(Exit8_15) + cmp $4, %edx + jae L(Exit4_7) + cmp $3, %edx + je L(Exit3) + cmp $1, %edx + ja L(Exit2) + je L(Exit1) + movb $0, (%rdi) +# ifdef USE_AS_STPCPY + lea (%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $1, %r8 + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(CopyTwoVecSize1): + add $VEC_SIZE, %rsi + add $VEC_SIZE, %rdi +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $VEC_SIZE, %r8 +# endif + jmp L(CopyVecSizeTail1) + + .p2align 4 +L(CopyTwoVecSize): + bsf %edx, %edx + add %rcx, %rsi + add $VEC_SIZE, %edx + sub %ecx, %edx + jmp L(CopyVecSizeExit) + + .p2align 4 +L(CopyVecSizeUnaligned_0): + bsf %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + VMOVU %YMM4, (%rdi) + add $((VEC_SIZE * 4) - 1), %r8 + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + jmp L(CopyVecSizeExit) +# endif + + .p2align 4 +L(CopyVecSizeUnaligned_16): + bsf %ecx, %edx + VMOVU %YMM4, (%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea VEC_SIZE(%rdi, %rdx), %rax +# endif + VMOVU %YMM5, VEC_SIZE(%rdi) + add $((VEC_SIZE * 3) - 1), %r8 + sub %rdx, %r8 + lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $VEC_SIZE, %rsi + add $VEC_SIZE, %rdi + jmp L(CopyVecSizeExit) +# endif + + .p2align 4 +L(CopyVecSizeUnaligned_32): + bsf %edx, %edx + VMOVU %YMM4, (%rdi) + VMOVU %YMM5, VEC_SIZE(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 2)(%rdi, %rdx), %rax +# endif + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) + add $((VEC_SIZE * 2) - 1), %r8 + sub %rdx, %r8 + lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $(VEC_SIZE * 2), %rsi + add $(VEC_SIZE * 2), %rdi + jmp L(CopyVecSizeExit) +# endif + +# ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + .p2align 4 +L(CopyVecSizeUnalignedVec6): + VMOVU %YMM6, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec5): + VMOVU %YMM5, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec4): + VMOVU %YMM4, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec3): + VMOVU %YMM3, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) +# endif + +/* Case2 */ + + .p2align 4 +L(CopyVecSizeCase2): + add $VEC_SIZE, %r8 + add %rcx, %rdi + add %rcx, %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSizeCase2): + add %rcx, %rsi + bsf %edx, %edx + add $VEC_SIZE, %edx + sub %ecx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +L(CopyVecSizeTailCase2): + add %rcx, %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +L(CopyVecSizeTail1Case2): + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyVecSizeCase2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeCase2) +L(CopyVecSizeCase3): + add $VEC_SIZE, %r8 + add %rcx, %rdi + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSizeCase2OrCase3): + test %rdx, %rdx + jnz L(CopyTwoVecSizeCase2) + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyVecSizeTailCase2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeTailCase2) + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSize1Case2OrCase3): + add $VEC_SIZE, %rdi + add $VEC_SIZE, %rsi + sub $VEC_SIZE, %r8 +L(CopyVecSizeTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeTail1Case2) + jmp L(StrncpyExit) +# endif + +/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ + + .p2align 4 +L(Exit1): + movzwl (%rsi), %edx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $2, %r8 + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit2): + movzwl (%rsi), %ecx + mov %cx, (%rdi) + movb $0, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $3, %r8 + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit3): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $4, %r8 + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit4_7): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov -3(%rsi, %rdx), %ecx + mov %ecx, -3(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit8_15): + mov (%rsi), %rcx + mov -7(%rsi, %rdx), %r9 + mov %rcx, (%rdi) + mov %r9, -7(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit16_31): + VMOVU (%rsi), %XMM2 + VMOVU -15(%rsi, %rdx), %XMM3 + VMOVU %XMM2, (%rdi) + VMOVU %XMM3, -15(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit32_63): + VMOVU (%rsi), %YMM2 + VMOVU -31(%rsi, %rdx), %YMM3 + VMOVU %YMM2, (%rdi) + VMOVU %YMM3, -31(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit1): + movzbl (%rsi), %edx + mov %dl, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 1(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit2): + movzwl (%rsi), %edx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 2(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit3_4): + movzwl (%rsi), %ecx + movzwl -2(%rsi, %r8), %edx + mov %cx, (%rdi) + mov %dx, -2(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit5_8): + mov (%rsi), %ecx + mov -4(%rsi, %r8), %edx + mov %ecx, (%rdi) + mov %edx, -4(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit9_16): + mov (%rsi), %rcx + mov -8(%rsi, %r8), %rdx + mov %rcx, (%rdi) + mov %rdx, -8(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit17_32): + VMOVU (%rsi), %XMM2 + VMOVU -16(%rsi, %r8), %XMM3 + VMOVU %XMM2, (%rdi) + VMOVU %XMM3, -16(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit33_64): + /* 0/32, 31/16 */ + VMOVU (%rsi), %YMM2 + VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 + VMOVU %YMM2, (%rdi) + VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit65): + /* 0/32, 32/32, 64/1 */ + VMOVU (%rsi), %YMM2 + VMOVU 32(%rsi), %YMM3 + mov 64(%rsi), %cl + VMOVU %YMM2, (%rdi) + VMOVU %YMM3, 32(%rdi) + mov %cl, 64(%rdi) +# ifdef USE_AS_STPCPY + lea 65(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 65(%rdi) +# endif + ret + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(Fill1): + mov %dl, (%rdi) + ret + + .p2align 4 +L(Fill2): + mov %dx, (%rdi) + ret + + .p2align 4 +L(Fill3_4): + mov %dx, (%rdi) + mov %dx, -2(%rdi, %r8) + ret + + .p2align 4 +L(Fill5_8): + mov %edx, (%rdi) + mov %edx, -4(%rdi, %r8) + ret + + .p2align 4 +L(Fill9_16): + mov %rdx, (%rdi) + mov %rdx, -8(%rdi, %r8) + ret + + .p2align 4 +L(Fill17_32): + VMOVU %XMMZERO, (%rdi) + VMOVU %XMMZERO, -16(%rdi, %r8) + ret + + .p2align 4 +L(CopyVecSizeUnalignedVec2): + VMOVU %YMM2, (%rdi, %rcx) + + .p2align 4 +L(CopyVecSizeVecExit): + bsf %edx, %edx + add $(VEC_SIZE - 1), %r8 + add %rcx, %rdi +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + + .p2align 4 +L(StrncpyFillTailWithZero): + xor %edx, %edx + sub $VEC_SIZE, %r8 + jbe L(StrncpyFillExit) + + VMOVU %YMMZERO, (%rdi) + add $VEC_SIZE, %rdi + + mov %rdi, %rsi + and $(VEC_SIZE - 1), %esi + sub %rsi, %rdi + add %rsi, %r8 + sub $(VEC_SIZE * 4), %r8 + jb L(StrncpyFillLessFourVecSize) + +L(StrncpyFillLoopVmovdqa): + VMOVA %YMMZERO, (%rdi) + VMOVA %YMMZERO, VEC_SIZE(%rdi) + VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) + VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) + add $(VEC_SIZE * 4), %rdi + sub $(VEC_SIZE * 4), %r8 + jae L(StrncpyFillLoopVmovdqa) + +L(StrncpyFillLessFourVecSize): + add $(VEC_SIZE * 2), %r8 + jl L(StrncpyFillLessTwoVecSize) + VMOVA %YMMZERO, (%rdi) + VMOVA %YMMZERO, VEC_SIZE(%rdi) + add $(VEC_SIZE * 2), %rdi + sub $VEC_SIZE, %r8 + jl L(StrncpyFillExit) + VMOVA %YMMZERO, (%rdi) + add $VEC_SIZE, %rdi + jmp L(Fill) + + .p2align 4 +L(StrncpyFillLessTwoVecSize): + add $VEC_SIZE, %r8 + jl L(StrncpyFillExit) + VMOVA %YMMZERO, (%rdi) + add $VEC_SIZE, %rdi + jmp L(Fill) + + .p2align 4 +L(StrncpyFillExit): + add $VEC_SIZE, %r8 +L(Fill): + cmp $17, %r8d + jae L(Fill17_32) + cmp $9, %r8d + jae L(Fill9_16) + cmp $5, %r8d + jae L(Fill5_8) + cmp $3, %r8d + jae L(Fill3_4) + cmp $1, %r8d + ja L(Fill2) + je L(Fill1) + ret + +/* end of ifndef USE_AS_STRCAT */ +# endif + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %rdx, %rdx + jnz L(UnalignedFourVecSizeLeaveCase2) +L(UnalignedFourVecSizeLeaveCase3): + lea (VEC_SIZE * 4)(%r8), %rcx + and $-VEC_SIZE, %rcx + add $(VEC_SIZE * 3), %r8 + jl L(CopyVecSizeCase3) + VMOVU %YMM4, (%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + VMOVU %YMM5, VEC_SIZE(%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 4)(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (VEC_SIZE * 4)(%rdi) +# endif + ret + + .p2align 4 +L(UnalignedFourVecSizeLeaveCase2): + xor %ecx, %ecx + vpcmpb $0, %YMM4, %YMMZERO, %k1 + kmovd %k1, %edx + add $(VEC_SIZE * 3), %r8 + jle L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec4) +# else + jnz L(CopyVecSize) +# endif + vpcmpb $0, %YMM5, %YMMZERO, %k2 + kmovd %k2, %edx + VMOVU %YMM4, (%rdi) + add $VEC_SIZE, %rcx + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec5) +# else + jnz L(CopyVecSize) +# endif + + vpcmpb $0, %YMM6, %YMMZERO, %k3 + kmovd %k3, %edx + VMOVU %YMM5, VEC_SIZE(%rdi) + add $VEC_SIZE, %rcx + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec6) +# else + jnz L(CopyVecSize) +# endif + + vpcmpb $0, %YMM7, %YMMZERO, %k4 + kmovd %k4, %edx + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) + lea VEC_SIZE(%rdi, %rcx), %rdi + lea VEC_SIZE(%rsi, %rcx), %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) +L(StrncpyExit): + cmp $65, %r8d + je L(StrncpyExit65) + cmp $33, %r8d + jae L(StrncpyExit33_64) + cmp $17, %r8d + jae L(StrncpyExit17_32) + cmp $9, %r8d + jae L(StrncpyExit9_16) + cmp $5, %r8d + jae L(StrncpyExit5_8) + cmp $3, %r8d + jae L(StrncpyExit3_4) + cmp $1, %r8d + ja L(StrncpyExit2) + je L(StrncpyExit1) +# ifdef USE_AS_STPCPY + mov %rdi, %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi) +# endif + ret + + .p2align 4 +L(ExitZero): +# ifndef USE_AS_STRCAT + mov %rdi, %rax +# endif + ret + +# endif + +# ifndef USE_AS_STRCAT +END (STRCPY) +# else +END (STRCAT) +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S new file mode 100644 index 0000000000..8884f02371 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat-evex.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_evex +#include "strcat-evex.S" diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S new file mode 100644 index 0000000000..40e391f0da --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_evex +#include "strcpy-evex.S"