From patchwork Sat May 20 13:56:19 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "H.J. Lu" X-Patchwork-Id: 20516 Received: (qmail 100226 invoked by alias); 20 May 2017 13:56:22 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 100217 invoked by uid 89); 20 May 2017 13:56:21 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-24.4 required=5.0 tests=AWL, BAYES_00, FREEMAIL_FROM, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, RCVD_IN_DNSWL_NONE, SPF_PASS autolearn=ham version=3.3.2 spammy=lived, 2097, ivy, Ivy X-HELO: mail-yb0-f195.google.com X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:date:from:to:subject:message-id:mime-version :content-disposition:user-agent; bh=WRWauWA2WOD5s2yILvGm01f6zm1jGbEdf2yRoiOtiFs=; b=quhU8X51v6iXID9MqD2qZH4BuigqYfiiX8cPcVKL0PuyMX/lOedFOgIuxKwc5I9LHe uyraWdeAPLQAb9LHQW8JFluj/L12ZOocMzK4ScrllNv3ckdKlRIRJ3mlavL8E/DtC+cG 3FgW7mPOqbgEyIEI4TwUmS2+SdSvBYDuVevLtl5gID1Sv1TbXWprpEoQmdMIVhMyhWR0 Xr94VKvDVNdJtb2BYFLe0w0KcFLFHm/vnq6j7b6T9ZM5aGcEKHMLzjvrw5HMooNw3+PD +xkjYkfE7neEvMb1OhiGIWNnzt2A22Kp+IShtNEsBBFU+wx1imCk+hHNoKkizsT4RBeF hDTg== X-Gm-Message-State: AODbwcCu5WifzPcllI7c7I6WMThVCBJrwP672PvwUotBJCHlxh0ImMp3 GbsbuefJ25/9xsue X-Received: by 10.37.209.150 with SMTP id i144mr12258469ybg.121.1495288580897; Sat, 20 May 2017 06:56:20 -0700 (PDT) Date: Sat, 20 May 2017 06:56:19 -0700 From: "H.J. 
Lu" To: GNU C Library Subject: [PATCH] x86-64: Update strlen.S to support wcslen/wcsnlen Message-ID: <20170520135619.GA17481@gmail.com> MIME-Version: 1.0 Content-Disposition: inline User-Agent: Mutt/1.8.0 (2017-02-23) The difference between strlen and wcslen is the element size: a byte vs. a 4-byte int. We can replace pminub and pcmpeqb with pminud and pcmpeqd, and shift the resulting byte offset right by 2 before returning, to turn strlen into wcslen. Tested on Ivy Bridge with benchtests/bench-wcslen.c; the new strlen-based wcslen is as fast as the old wcslen. OK for master? H.J. --- * sysdeps/x86_64/strlen.S (PMINU): New. (PCMPEQ): Likewise. (SHIFT_RETURN): Likewise. (FIND_ZERO): Replace pcmpeqb with PCMPEQ. (strlen): Add SHIFT_RETURN before ret. Replace pcmpeqb and pminub with PCMPEQ and PMINU. * sysdeps/x86_64/wcslen.S: Define AS_WCSLEN and strlen. Include "strlen.S". * sysdeps/x86_64/wcsnlen.S: New file. --- sysdeps/x86_64/strlen.S | 61 +++++++----- sysdeps/x86_64/wcslen.S | 238 +---------------------------------------------- sysdeps/x86_64/wcsnlen.S | 7 ++ 3 files changed, 50 insertions(+), 256 deletions(-) create mode 100644 sysdeps/x86_64/wcsnlen.S diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index 5896e6b9..b5ab117 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -1,4 +1,4 @@ -/* SSE2 version of strlen. +/* SSE2 version of strlen/wcslen. Copyright (C) 2012-2017 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -18,6 +18,16 @@ #include <sysdep.h> +#ifdef AS_WCSLEN +# define PMINU pminud +# define PCMPEQ pcmpeqd +# define SHIFT_RETURN shrq $2, %rax +#else +# define PMINU pminub +# define PCMPEQ pcmpeqb +# define SHIFT_RETURN +#endif + /* Long lived register in strlen(s), strnlen(s, n) are: %xmm3 - zero @@ -32,10 +42,10 @@ ENTRY(strlen) /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. 
*/ #define FIND_ZERO \ - pcmpeqb (%rax), %xmm0; \ - pcmpeqb 16(%rax), %xmm1; \ - pcmpeqb 32(%rax), %xmm2; \ - pcmpeqb 48(%rax), %xmm3; \ + PCMPEQ (%rax), %xmm0; \ + PCMPEQ 16(%rax), %xmm1; \ + PCMPEQ 32(%rax), %xmm2; \ + PCMPEQ 48(%rax), %xmm3; \ pmovmskb %xmm0, %esi; \ pmovmskb %xmm1, %edx; \ pmovmskb %xmm2, %r8d; \ @@ -54,6 +64,9 @@ ENTRY(strlen) xor %rax, %rax ret L(n_nonzero): +# ifdef AS_WCSLEN + shlq $2, %rsi +# endif /* Initialize long lived registers. */ @@ -96,6 +109,7 @@ L(n_nonzero): test %rdx, %rdx; \ je L(lab); \ bsfq %rdx, %rax; \ + SHIFT_RETURN; \ ret #ifdef AS_STRNLEN @@ -104,19 +118,20 @@ L(n_nonzero): #else /* Test first 16 bytes unaligned. */ movdqu (%rax), %xmm4 - pcmpeqb %xmm0, %xmm4 + PCMPEQ %xmm0, %xmm4 pmovmskb %xmm4, %edx test %edx, %edx je L(next48_bytes) bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ + SHIFT_RETURN ret L(next48_bytes): /* Same as FIND_ZERO except we do not check first 16 bytes. */ andq $-16, %rax - pcmpeqb 16(%rax), %xmm1 - pcmpeqb 32(%rax), %xmm2 - pcmpeqb 48(%rax), %xmm3 + PCMPEQ 16(%rax), %xmm1 + PCMPEQ 32(%rax), %xmm2 + PCMPEQ 48(%rax), %xmm3 pmovmskb %xmm1, %edx pmovmskb %xmm2, %r8d pmovmskb %xmm3, %ecx @@ -145,6 +160,7 @@ L(strnlen_ret): test %rdx, %rdx je L(loop_init) bsfq %rdx, %rax + SHIFT_RETURN ret #endif .p2align 4 @@ -161,10 +177,10 @@ L(loop): je L(exit_end) movdqa (%rax), %xmm0 - pminub 16(%rax), %xmm0 - pminub 32(%rax), %xmm0 - pminub 48(%rax), %xmm0 - pcmpeqb %xmm3, %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 pmovmskb %xmm0, %edx testl %edx, %edx jne L(exit) @@ -182,6 +198,7 @@ L(first): bsfq %rdx, %rdx addq %rdx, %rax subq %rdi, %rax + SHIFT_RETURN ret .p2align 4 @@ -192,6 +209,7 @@ L(exit): bsfq %rdx, %rdx addq %rdx, %rax subq %rdi, %rax + SHIFT_RETURN ret #else @@ -201,10 +219,10 @@ L(exit): L(loop): movdqa 64(%rax), %xmm0 - pminub 80(%rax), %xmm0 - pminub 96(%rax), %xmm0 - pminub 112(%rax), %xmm0 - pcmpeqb %xmm3, %xmm0 + PMINU 
80(%rax), %xmm0 + PMINU 96(%rax), %xmm0 + PMINU 112(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 pmovmskb %xmm0, %edx testl %edx, %edx jne L(exit64) @@ -212,10 +230,10 @@ L(loop): subq $-128, %rax movdqa (%rax), %xmm0 - pminub 16(%rax), %xmm0 - pminub 32(%rax), %xmm0 - pminub 48(%rax), %xmm0 - pcmpeqb %xmm3, %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 pmovmskb %xmm0, %edx testl %edx, %edx jne L(exit0) @@ -231,6 +249,7 @@ L(exit0): bsfq %rdx, %rdx addq %rdx, %rax subq %rdi, %rax + SHIFT_RETURN ret #endif diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S index c6081a4..88ecdb2 100644 --- a/sysdeps/x86_64/wcslen.S +++ b/sysdeps/x86_64/wcslen.S @@ -1,238 +1,6 @@ -/* Optimized wcslen for x86-64 with SSE2. - Copyright (C) 2011-2017 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. +#define AS_WCSLEN +#define strlen __wcslen - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - - .text -ENTRY (__wcslen) - cmpl $0, (%rdi) - jz L(exit_tail0) - cmpl $0, 4(%rdi) - jz L(exit_tail1) - cmpl $0, 8(%rdi) - jz L(exit_tail2) - cmpl $0, 12(%rdi) - jz L(exit_tail3) - cmpl $0, 16(%rdi) - jz L(exit_tail4) - cmpl $0, 20(%rdi) - jz L(exit_tail5) - cmpl $0, 24(%rdi) - jz L(exit_tail6) - cmpl $0, 28(%rdi) - jz L(exit_tail7) - - pxor %xmm0, %xmm0 - - lea 32(%rdi), %rax - lea 16(%rdi), %rcx - and $-16, %rax - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - and $-0x40, %rax - - .p2align 4 -L(aligned_64_loop): - movaps (%rax), %xmm0 - movaps 16(%rax), %xmm1 - movaps 32(%rax), %xmm2 - movaps 48(%rax), %xmm6 - - pminub %xmm1, %xmm0 - pminub %xmm6, %xmm2 - pminub %xmm0, %xmm2 - pcmpeqd %xmm3, %xmm2 - pmovmskb %xmm2, %edx - test %edx, 
%edx - lea 64(%rax), %rax - jz L(aligned_64_loop) - - pcmpeqd -64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 48(%rcx), %rcx - jnz L(exit) - - pcmpeqd %xmm1, %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea -16(%rcx), %rcx - jnz L(exit) - - pcmpeqd -32(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea -16(%rcx), %rcx - jnz L(exit) - - pcmpeqd %xmm6, %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea -16(%rcx), %rcx - jnz L(exit) - - jmp L(aligned_64_loop) - - .p2align 4 -L(exit): - sub %rcx, %rax - shr $2, %rax - test %dl, %dl - jz L(exit_high) - - mov %dl, %cl - and $15, %cl - jz L(exit_1) - ret - - .p2align 4 -L(exit_high): - mov %dh, %ch - and $15, %ch - jz L(exit_3) - add $2, %rax - ret - - .p2align 4 -L(exit_1): - add $1, %rax - ret - - .p2align 4 -L(exit_3): - add $3, %rax - ret - - .p2align 4 -L(exit_tail0): - xor %rax, %rax - ret - - .p2align 4 -L(exit_tail1): - mov $1, %rax - ret - - .p2align 4 -L(exit_tail2): - mov $2, %rax - ret - - .p2align 4 -L(exit_tail3): - mov $3, %rax - ret - - .p2align 4 -L(exit_tail4): - mov $4, %rax - ret - - .p2align 4 -L(exit_tail5): - mov $5, %rax - ret - - .p2align 4 -L(exit_tail6): - mov $6, %rax - ret - - .p2align 4 -L(exit_tail7): - mov $7, %rax - ret - -END (__wcslen) +#include "strlen.S" weak_alias(__wcslen, wcslen) diff --git a/sysdeps/x86_64/wcsnlen.S b/sysdeps/x86_64/wcsnlen.S new file mode 100644 index 0000000..968bb69 --- /dev/null +++ b/sysdeps/x86_64/wcsnlen.S @@ -0,0 +1,7 @@ +#define AS_WCSLEN +#define AS_STRNLEN +#define strlen __wcsnlen + +#include "strlen.S" + +weak_alias(__wcsnlen, wcsnlen)