[v2] x86: Unifies 'strlen-evex' and 'strlen-evex512' implementations.
Checks

Context | Check | Description
redhat-pt-bot/TryBot-apply_patch | success | Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 | success | Testing passed
redhat-pt-bot/TryBot-32bit | success | Build for i686
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 | success | Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm | success | Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm | success | Testing passed
Commit Message
This commit uses a common implementation, 'strlen-evex-base.S', for both
'strlen-evex' and 'strlen-evex512'.
The motivation is to reduce the number of implementations to maintain.
This incidentally gives a small performance improvement.
All tests pass on x86.
Benchmarks were taken on SKX.
https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html
Geometric mean for strlen-evex512 over all benchmarks (N=10) was (new/old) 0.939
Geometric mean for wcslen-evex512 over all benchmarks (N=10) was (new/old) 0.965
Code Size Changes:
strlen-evex512.S : +24 bytes
wcslen-evex512.S : +54 bytes
---
sysdeps/x86_64/multiarch/strlen-evex-base.S | 380 ++++++++------------
sysdeps/x86_64/multiarch/strlen-evex.S | 250 +------------
sysdeps/x86_64/multiarch/strnlen-evex512.S | 266 +++++++++++++-
sysdeps/x86_64/multiarch/wcslen-evex512.S | 6 +-
sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 9 +-
5 files changed, 439 insertions(+), 472 deletions(-)
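
For reference, the (new/old) numbers above are geometric means of per-benchmark timing ratios; the sketch below shows that aggregation in C (the ratio values are illustrative, not actual bench-strlen/bench-wcslen output):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Illustrative new_time/old_time ratios; N=10 in the commit message
         presumably refers to the number of benchmark entries averaged.  */
      double ratios[] = { 0.95, 0.92, 0.96, 0.93, 0.94 };
      size_t n = sizeof ratios / sizeof ratios[0];
      double log_sum = 0.0;

      for (size_t i = 0; i < n; i++)
        log_sum += log (ratios[i]);

      /* Values below 1.0 mean the new code is faster on average.  */
      printf ("geometric mean (new/old): %.3f\n", exp (log_sum / n));
      return 0;
    }

Compile with -lm; a result below 1.0, like the 0.939 and 0.965 figures quoted above, indicates a net speedup for the unified implementation.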
Comments
On Thu, Dec 14, 2023 at 4:37 PM Matthew Sterrett
<matthew.sterrett@intel.com> wrote:
>
> This commit uses a common implementation 'strlen-evex-base.S' for both
> 'strlen-evex' and 'strlen-evex512'
>
> The motivation is to reduce the number of implementations to maintain.
> This incidentally gives a small performance improvement.
>
> All tests pass on x86.
>
> Benchmarks were taken on SKX.
> https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html
>
> Geometric mean for strlen-evex512 over all benchmarks (N=10) was (new/old) 0.939
> Geometric mean for wcslen-evex512 over all benchmarks (N=10) was (new/old) 0.965
>
> Code Size Changes:
> strlen-evex512.S : +24 bytes
> wcslen-evex512.S : +54 bytes
> ---
> sysdeps/x86_64/multiarch/strlen-evex-base.S | 380 ++++++++------------
> sysdeps/x86_64/multiarch/strlen-evex.S | 250 +------------
> sysdeps/x86_64/multiarch/strnlen-evex512.S | 266 +++++++++++++-
> sysdeps/x86_64/multiarch/wcslen-evex512.S | 6 +-
> sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 9 +-
> 5 files changed, 439 insertions(+), 472 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> index 7305b24e28..6ea9e85aa0 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -1,5 +1,5 @@
> -/* Placeholder function, not used by any processor at the moment.
> - Copyright (C) 2022-2023 Free Software Foundation, Inc.
> +/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
256/512 bit EVEX.... (the comment should now say 256/512-bit, since this file serves both variants)
> + Copyright (C) 2021-2023 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> The GNU C Library is free software; you can redistribute it and/or
> @@ -16,7 +16,6 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -/* UNUSED. Exists purely as reference implementation. */
>
> #include <isa-level.h>
>
> @@ -26,272 +25,211 @@
>
> # ifdef USE_AS_WCSLEN
> # define VPCMPEQ vpcmpeqd
> +# define VPCMPNEQ vpcmpneqd
> # define VPTESTN vptestnmd
> +# define VPTEST vptestmd
> # define VPMINU vpminud
> # define CHAR_SIZE 4
> +# define CHAR_SIZE_SHIFT_REG(reg) sar $2, %reg
> # else
> # define VPCMPEQ vpcmpeqb
> +# define VPCMPNEQ vpcmpneqb
> # define VPTESTN vptestnmb
> +# define VPTEST vptestmb
> # define VPMINU vpminub
> # define CHAR_SIZE 1
> +# define CHAR_SIZE_SHIFT_REG(reg)
> +
> +# define REG_WIDTH VEC_SIZE
> # endif
>
> -# define PAGE_SIZE 4096
> # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
>
> - .section SECTION(.text),"ax",@progbits
> -/* Aligning entry point to 64 byte, provides better performance for
> - one vector length string. */
> -ENTRY_P2ALIGN (STRLEN, 6)
> -# ifdef USE_AS_STRNLEN
> - /* Check zero length. */
> - test %RSI_LP, %RSI_LP
> - jz L(ret_max)
> -# ifdef __ILP32__
> - /* Clear the upper 32 bits. */
> - movl %esi, %esi
> -# endif
> +# include "reg-macros.h"
> +
> +# if CHAR_PER_VEC == 64
> +
> +# define TAIL_RETURN_LBL first_vec_x2
> +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2)
> +
> +# define FALLTHROUGH_RETURN_LBL first_vec_x3
> +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
> +
> +# else
> +
> +# define TAIL_RETURN_LBL first_vec_x3
> +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3)
> +
> +# define FALLTHROUGH_RETURN_LBL first_vec_x2
> +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
> # endif
>
> +# define XZERO VMM_128(0)
> +# define VZERO VMM(0)
> +# define PAGE_SIZE 4096
> +
> + .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN(STRLEN, 6)
> movl %edi, %eax
> - vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
> - sall $20, %eax
> - cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> - ja L(page_cross)
> -
> - /* Compare [w]char for null, mask bit will be set for match. */
> - VPCMPEQ (%rdi), %VMM(0), %k0
> -# ifdef USE_AS_STRNLEN
> - KMOV %k0, %VRCX
> - /* Store max length in rax. */
> - mov %rsi, %rax
> - /* If rcx is 0, rax will have max length. We can not use VRCX
> - and VRAX here for evex256 because, upper 32 bits may be
> - undefined for ecx and eax. */
> - bsfq %rcx, %rax
> - cmp $CHAR_PER_VEC, %rax
> - ja L(align_more)
> - cmpq %rax, %rsi
> - cmovb %esi, %eax
> -# else
> + vpxorq %XZERO, %XZERO, %XZERO
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + ja L(cross_page_boundary)
> +
> + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
> + null byte. */
> + VPCMPEQ (%rdi), %VZERO, %k0
> KMOV %k0, %VRAX
> test %VRAX, %VRAX
> - jz L(align_more)
> + jz L(aligned_more)
> bsf %VRAX, %VRAX
> -# endif
> ret
>
> - /* At this point vector max length reached. */
> -# ifdef USE_AS_STRNLEN
> - .p2align 4,,3
> -L(ret_max):
> - movq %rsi, %rax
> + .p2align 4,, 8
> +L(first_vec_x4):
> + bsf %VRAX, %VRAX
> + subl %ecx, %edi
> + CHAR_SIZE_SHIFT_REG (edi)
> + leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
> ret
> -# endif
>
> -L(align_more):
> - mov %rdi, %rax
> - /* Align rax to VEC_SIZE. */
> - andq $-VEC_SIZE, %rax
> -# ifdef USE_AS_STRNLEN
> - movq %rdi, %rdx
> - subq %rax, %rdx
> -# ifdef USE_AS_WCSLEN
> - shr $2, %VRDX
> -# endif
> - /* At this point rdx contains [w]chars already compared. */
> - leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
> - /* At this point rdx contains number of w[char] needs to go.
> - Now onwards rdx will keep decrementing with each compare. */
> -# endif
> -
> - /* Loop unroll 4 times for 4 vector loop. */
> - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> - subq $-VEC_SIZE, %rax
> - KMOV %k0, %VRCX
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x1)
>
> -# ifdef USE_AS_STRNLEN
> - subq $CHAR_PER_VEC, %rdx
> - jbe L(ret_max)
> -# endif
>
> - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> - KMOV %k0, %VRCX
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x2)
> + /* Aligned more for strnlen compares remaining length vs 2 *
> + CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
> + going to the loop. */
> + .p2align 4,, 10
> +L(aligned_more):
> + movq %rdi, %rcx
> + andq $(VEC_SIZE * -1), %rdi
> +L(cross_page_continue):
> + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> + rechecking bounds. */
> + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
> + KMOV %k0, %VRAX
> + test %VRAX, %VRAX
> + jnz L(first_vec_x1)
>
> -# ifdef USE_AS_STRNLEN
> - subq $CHAR_PER_VEC, %rdx
> - jbe L(ret_max)
> -# endif
> + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> + KMOV %k0, %VRAX
> + test %VRAX, %VRAX
> + jnz L(first_vec_x2)
>
> - VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> - KMOV %k0, %VRCX
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x3)
> + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> + KMOV %k0, %VRAX
> + test %VRAX, %VRAX
> + jnz L(first_vec_x3)
>
> -# ifdef USE_AS_STRNLEN
> - subq $CHAR_PER_VEC, %rdx
> - jbe L(ret_max)
> -# endif
> + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> + KMOV %k0, %VRAX
> + test %VRAX, %VRAX
> + jnz L(first_vec_x4)
>
> - VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> - KMOV %k0, %VRCX
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x4)
> + subq $(VEC_SIZE * -1), %rdi
>
> -# ifdef USE_AS_STRNLEN
> - subq $CHAR_PER_VEC, %rdx
> - jbe L(ret_max)
> - /* Save pointer before 4 x VEC_SIZE alignment. */
> - movq %rax, %rcx
> +# if CHAR_PER_VEC == 64
> + /* No partial register stalls on processors that we use evex512
> + on and this saves code size. */
> + xorb %dil, %dil
> +# else
> + andq $-(VEC_SIZE * 4), %rdi
> # endif
>
> - /* Align address to VEC_SIZE * 4 for loop. */
> - andq $-(VEC_SIZE * 4), %rax
> -
> -# ifdef USE_AS_STRNLEN
> - subq %rax, %rcx
> -# ifdef USE_AS_WCSLEN
> - shr $2, %VRCX
> -# endif
> - /* rcx contains number of [w]char will be recompared due to
> - alignment fixes. rdx must be incremented by rcx to offset
> - alignment adjustment. */
> - addq %rcx, %rdx
> - /* Need jump as we don't want to add/subtract rdx for first
> - iteration of 4 x VEC_SIZE aligned loop. */
> -# endif
>
> - .p2align 4,,11
> -L(loop):
> - /* VPMINU and VPCMP combination provide better performance as
> - compared to alternative combinations. */
> - VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
> - VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
> - VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
> - VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
>
> + /* Compare 4 * VEC at a time forward. */
> + .p2align 4
> +L(loop_4x_vec):
> + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
> + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
> + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> VPTESTN %VMM(2), %VMM(2), %k0
> - VPTESTN %VMM(4), %VMM(4), %k1
> + VPTESTN %VMM(4), %VMM(4), %k2
>
> - subq $-(VEC_SIZE * 4), %rax
> - KORTEST %k0, %k1
> + subq $-(VEC_SIZE * 4), %rdi
> + KORTEST %k0, %k2
> + jz L(loop_4x_vec)
>
> -# ifndef USE_AS_STRNLEN
> - jz L(loop)
> + VPTESTN %VMM(1), %VMM(1), %k1
> + KMOV %k1, %VRAX
> + test %VRAX, %VRAX
> + jnz L(first_vec_x0)
> +
> + KMOV %k0, %VRAX
> + test %VRAX, %VRAX
> + jnz L(first_vec_x1)
> +
> + VPTESTN %VMM(3), %VMM(3), %k0
> +
> +# if CHAR_PER_VEC == 64
> + KMOV %k0, %VRAX
> + test %VRAX, %VRAX
> + jnz L(first_vec_x2)
> + KMOV %k2, %VRAX
> # else
> - jnz L(loopend)
> - subq $(CHAR_PER_VEC * 4), %rdx
> - ja L(loop)
> - mov %rsi, %rax
> + /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. */
> + kmovd %k2, %edx
> + kmovd %k0, %eax
> + salq $CHAR_PER_VEC, %rdx
> + orq %rdx, %rax
> +# endif
> +
> + /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. */
> + .p2align 4,, 2
> +L(FALLTHROUGH_RETURN_LBL):
> + bsfq %rax, %rax
> + subq %rcx, %rdi
> + CHAR_SIZE_SHIFT_REG (rdi)
> + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
> ret
> -# endif
>
> -L(loopend):
> -
> - VPTESTN %VMM(1), %VMM(1), %k2
> - KMOV %k2, %VRCX
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x1)
> -
> - KMOV %k0, %VRCX
> - /* At this point, if k0 is non zero, null char must be in the
> - second vector. */
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x2)
> -
> - VPTESTN %VMM(3), %VMM(3), %k3
> - KMOV %k3, %VRCX
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x3)
> - /* At this point null [w]char must be in the fourth vector so no
> - need to check. */
> - KMOV %k1, %VRCX
> -
> - /* Fourth, third, second vector terminating are pretty much
> - same, implemented this way to avoid branching and reuse code
> - from pre loop exit condition. */
> -L(ret_vec_x4):
> - bsf %VRCX, %VRCX
> - subq %rdi, %rax
> -# ifdef USE_AS_WCSLEN
> - subq $-(VEC_SIZE * 3), %rax
> - shrq $2, %rax
> - addq %rcx, %rax
> -# else
> - leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
> -# endif
> -# ifdef USE_AS_STRNLEN
> - cmpq %rsi, %rax
> - cmovnb %rsi, %rax
> -# endif
> + .p2align 4,, 8
> +L(first_vec_x0):
> + bsf %VRAX, %VRAX
> + sub %rcx, %rdi
> + CHAR_SIZE_SHIFT_REG (rdi)
> + addq %rdi, %rax
> ret
>
> -L(ret_vec_x3):
> - bsf %VRCX, %VRCX
> - subq %rdi, %rax
> -# ifdef USE_AS_WCSLEN
> - subq $-(VEC_SIZE * 2), %rax
> - shrq $2, %rax
> - addq %rcx, %rax
> -# else
> - leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
> -# endif
> -# ifdef USE_AS_STRNLEN
> - cmpq %rsi, %rax
> - cmovnb %rsi, %rax
> -# endif
> + .p2align 4,, 10
> +L(first_vec_x1):
> + bsf %VRAX, %VRAX
> + sub %rcx, %rdi
> + CHAR_SIZE_SHIFT_REG (rdi)
> + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
> ret
>
> -L(ret_vec_x2):
> - subq $-VEC_SIZE, %rax
> -L(ret_vec_x1):
> - bsf %VRCX, %VRCX
> - subq %rdi, %rax
> -# ifdef USE_AS_WCSLEN
> - shrq $2, %rax
> -# endif
> - addq %rcx, %rax
> -# ifdef USE_AS_STRNLEN
> - cmpq %rsi, %rax
> - cmovnb %rsi, %rax
> -# endif
> + .p2align 4,, 10
> + /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM. */
> +L(TAIL_RETURN_LBL):
> + bsf %VRAX, %VRAX
> + sub %VRCX, %VRDI
> + CHAR_SIZE_SHIFT_REG (VRDI)
> + lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
> ret
>
> -L(page_cross):
> - mov %rdi, %rax
> - movl %edi, %ecx
> - andl $(VEC_SIZE - 1), %ecx
> + .p2align 4,, 8
> +L(cross_page_boundary):
> + movq %rdi, %rcx
> + /* Align data to VEC_SIZE. */
> + andq $-VEC_SIZE, %rdi
> +
> + VPCMPEQ (%rdi), %VZERO, %k0
> +
> + KMOV %k0, %VRAX
> # ifdef USE_AS_WCSLEN
> - sarl $2, %ecx
> -# endif
> - /* ecx contains number of w[char] to be skipped as a result
> - of address alignment. */
> - andq $-VEC_SIZE, %rax
> - VPCMPEQ (%rax), %VMM(0), %k0
> - KMOV %k0, %VRDX
> - /* Ignore number of character for alignment adjustment. */
> - shr %cl, %VRDX
> -# ifdef USE_AS_STRNLEN
> - jnz L(page_cross_end)
> - movl $CHAR_PER_VEC, %eax
> - sub %ecx, %eax
> - cmp %rax, %rsi
> - ja L(align_more)
> + movl %ecx, %edx
> + shrl $2, %edx
> + andl $(CHAR_PER_VEC - 1), %edx
> + shrx %edx, %eax, %eax
> + testl %eax, %eax
> # else
> - jz L(align_more)
> -# endif
> -
> -L(page_cross_end):
> - bsf %VRDX, %VRAX
> -# ifdef USE_AS_STRNLEN
> - cmpq %rsi, %rax
> - cmovnb %esi, %eax
> + shr %cl, %VRAX
> # endif
> + jz L(cross_page_continue)
> + bsf %VRAX, %VRAX
> ret
>
> -END (STRLEN)
> +END(STRLEN)
> #endif
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
> index 364eeffff6..93ad15e356 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex.S
> @@ -1,245 +1,7 @@
> -/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
> - Copyright (C) 2021-2023 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (4)
> -
> -# include <sysdep.h>
> -
> -# ifndef STRLEN
> -# define STRLEN __strlen_evex
> -# endif
> -
> -# ifndef VEC_SIZE
> -# include "x86-evex256-vecs.h"
> -# endif
> -
> -# ifdef USE_AS_WCSLEN
> -# define VPCMPEQ vpcmpeqd
> -# define VPCMPNEQ vpcmpneqd
> -# define VPTESTN vptestnmd
> -# define VPTEST vptestmd
> -# define VPMINU vpminud
> -# define CHAR_SIZE 4
> -# define CHAR_SIZE_SHIFT_REG(reg) sar $2, %reg
> -# else
> -# define VPCMPEQ vpcmpeqb
> -# define VPCMPNEQ vpcmpneqb
> -# define VPTESTN vptestnmb
> -# define VPTEST vptestmb
> -# define VPMINU vpminub
> -# define CHAR_SIZE 1
> -# define CHAR_SIZE_SHIFT_REG(reg)
> -
> -# define REG_WIDTH VEC_SIZE
> -# endif
> -
> -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> -
> -# include "reg-macros.h"
> -
> -# if CHAR_PER_VEC == 64
> -
> -# define TAIL_RETURN_LBL first_vec_x2
> -# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2)
> -
> -# define FALLTHROUGH_RETURN_LBL first_vec_x3
> -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
> -
> -# else
> -
> -# define TAIL_RETURN_LBL first_vec_x3
> -# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3)
> -
> -# define FALLTHROUGH_RETURN_LBL first_vec_x2
> -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
> -# endif
> -
> -# define XZERO VMM_128(0)
> -# define VZERO VMM(0)
> -# define PAGE_SIZE 4096
> -
> - .section SECTION(.text), "ax", @progbits
> -ENTRY_P2ALIGN (STRLEN, 6)
> - movl %edi, %eax
> - vpxorq %XZERO, %XZERO, %XZERO
> - andl $(PAGE_SIZE - 1), %eax
> - cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> - ja L(cross_page_boundary)
> -
> - /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
> - null byte. */
> - VPCMPEQ (%rdi), %VZERO, %k0
> - KMOV %k0, %VRAX
> - test %VRAX, %VRAX
> - jz L(aligned_more)
> - bsf %VRAX, %VRAX
> - ret
> -
> - .p2align 4,, 8
> -L(first_vec_x4):
> - bsf %VRAX, %VRAX
> - subl %ecx, %edi
> - CHAR_SIZE_SHIFT_REG (edi)
> - leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
> - ret
> -
> -
> -
> - /* Aligned more for strnlen compares remaining length vs 2 *
> - CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
> - going to the loop. */
> - .p2align 4,, 10
> -L(aligned_more):
> - movq %rdi, %rcx
> - andq $(VEC_SIZE * -1), %rdi
> -L(cross_page_continue):
> - /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> - rechecking bounds. */
> - VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
> - KMOV %k0, %VRAX
> - test %VRAX, %VRAX
> - jnz L(first_vec_x1)
> -
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> - KMOV %k0, %VRAX
> - test %VRAX, %VRAX
> - jnz L(first_vec_x2)
> -
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> - KMOV %k0, %VRAX
> - test %VRAX, %VRAX
> - jnz L(first_vec_x3)
> -
> - VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> - KMOV %k0, %VRAX
> - test %VRAX, %VRAX
> - jnz L(first_vec_x4)
> -
> - subq $(VEC_SIZE * -1), %rdi
> -
> -# if CHAR_PER_VEC == 64
> - /* No partial register stalls on processors that we use evex512
> - on and this saves code size. */
> - xorb %dil, %dil
> -# else
> - andq $-(VEC_SIZE * 4), %rdi
> -# endif
> -
> -
> -
> - /* Compare 4 * VEC at a time forward. */
> - .p2align 4
> -L(loop_4x_vec):
> - VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
> - VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> - VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
> - VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> - VPTESTN %VMM(2), %VMM(2), %k0
> - VPTESTN %VMM(4), %VMM(4), %k2
> -
> - subq $-(VEC_SIZE * 4), %rdi
> - KORTEST %k0, %k2
> - jz L(loop_4x_vec)
> -
> - VPTESTN %VMM(1), %VMM(1), %k1
> - KMOV %k1, %VRAX
> - test %VRAX, %VRAX
> - jnz L(first_vec_x0)
> -
> - KMOV %k0, %VRAX
> - test %VRAX, %VRAX
> - jnz L(first_vec_x1)
> -
> - VPTESTN %VMM(3), %VMM(3), %k0
> -
> -# if CHAR_PER_VEC == 64
> - KMOV %k0, %VRAX
> - test %VRAX, %VRAX
> - jnz L(first_vec_x2)
> - KMOV %k2, %VRAX
> -# else
> - /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
> - */
> - kmovd %k2, %edx
> - kmovd %k0, %eax
> - salq $CHAR_PER_VEC, %rdx
> - orq %rdx, %rax
> -# endif
> -
> - /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
> - */
> - .p2align 4,, 2
> -L(FALLTHROUGH_RETURN_LBL):
> - bsfq %rax, %rax
> - subq %rcx, %rdi
> - CHAR_SIZE_SHIFT_REG (rdi)
> - leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
> - ret
> -
> - .p2align 4,, 8
> -L(first_vec_x0):
> - bsf %VRAX, %VRAX
> - sub %rcx, %rdi
> - CHAR_SIZE_SHIFT_REG (rdi)
> - addq %rdi, %rax
> - ret
> -
> - .p2align 4,, 10
> -L(first_vec_x1):
> - bsf %VRAX, %VRAX
> - sub %rcx, %rdi
> - CHAR_SIZE_SHIFT_REG (rdi)
> - leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
> - ret
> -
> - .p2align 4,, 10
> - /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
> - */
> -L(TAIL_RETURN_LBL):
> - bsf %VRAX, %VRAX
> - sub %VRCX, %VRDI
> - CHAR_SIZE_SHIFT_REG (VRDI)
> - lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
> - ret
> -
> - .p2align 4,, 8
> -L(cross_page_boundary):
> - movq %rdi, %rcx
> - /* Align data to VEC_SIZE. */
> - andq $-VEC_SIZE, %rdi
> -
> - VPCMPEQ (%rdi), %VZERO, %k0
> -
> - KMOV %k0, %VRAX
> -# ifdef USE_AS_WCSLEN
> - movl %ecx, %edx
> - shrl $2, %edx
> - andl $(CHAR_PER_VEC - 1), %edx
> - shrx %edx, %eax, %eax
> - testl %eax, %eax
> -# else
> - shr %cl, %VRAX
> -# endif
> - jz L(cross_page_continue)
> - bsf %VRAX, %VRAX
> - ret
> -
> -END (STRLEN)
> +#ifndef STRLEN
> +# define STRLEN __strlen_evex
> #endif
> +
> +#include "x86-evex256-vecs.h"
> +#include "reg-macros.h"
> +#include "strlen-evex-base.S"
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> index 0b7f220214..ebf22c259f 100644
> --- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -1,4 +1,264 @@
> -#define STRLEN __strnlen_evex512
> -#define USE_AS_STRNLEN 1
> +/* Placeholder function, not used by any processor at the moment.
> + Copyright (C) 2022-2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
>
> -#include "strlen-evex512.S"
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRNLEN
> +#define STRNLEN __strnlen_evex512
> +#endif
> +
> +#include "x86-evex512-vecs.h"
> +#include "reg-macros.h"
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# include <sysdep.h>
> +
> +# ifdef USE_AS_WCSLEN
> +# define VPCMPEQ vpcmpeqd
> +# define VPTESTN vptestnmd
> +# define VPMINU vpminud
> +# define CHAR_SIZE 4
> +# else
> +# define VPCMPEQ vpcmpeqb
> +# define VPTESTN vptestnmb
> +# define VPMINU vpminub
> +# define CHAR_SIZE 1
> +# endif
> +
> +# define PAGE_SIZE 4096
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +
> + .section SECTION(.text),"ax",@progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> + one vector length string. */
> +ENTRY_P2ALIGN (STRNLEN, 6)
> + /* Check zero length. */
> + test %RSI_LP, %RSI_LP
> + jz L(ret_max)
> +# ifdef __ILP32__
> + /* Clear the upper 32 bits. */
> + movl %esi, %esi
> +# endif
> +
> + movl %edi, %eax
> + vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
> + sall $20, %eax
> + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> + ja L(page_cross)
> +
> + /* Compare [w]char for null, mask bit will be set for match. */
> + VPCMPEQ (%rdi), %VMM(0), %k0
> + KMOV %k0, %VRCX
> + /* Store max length in rax. */
> + mov %rsi, %rax
> + /* If rcx is 0, rax will have max length. We can not use VRCX
> + and VRAX here for evex256 because, upper 32 bits may be
> + undefined for ecx and eax. */
> + bsfq %rcx, %rax
> + cmp $CHAR_PER_VEC, %rax
> + ja L(align_more)
> + cmpq %rax, %rsi
> + cmovb %esi, %eax
> + ret
> +
> + /* At this point vector max length reached. */
> + .p2align 4,,3
> +L(ret_max):
> + movq %rsi, %rax
> + ret
> +
> +L(align_more):
> + mov %rdi, %rax
> + /* Align rax to VEC_SIZE. */
> + andq $-VEC_SIZE, %rax
> + movq %rdi, %rdx
> + subq %rax, %rdx
> +# ifdef USE_AS_WCSLEN
> + shr $2, %VRDX
> +# endif
> + /* At this point rdx contains [w]chars already compared. */
> + leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
> + /* At this point rdx contains number of w[char] needs to go.
> + Now onwards rdx will keep decrementing with each compare. */
> +
> + /* Loop unroll 4 times for 4 vector loop. */
> + VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> + subq $-VEC_SIZE, %rax
> + KMOV %k0, %VRCX
> + test %VRCX, %VRCX
> + jnz L(ret_vec_x1)
> +
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +
> + VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> + KMOV %k0, %VRCX
> + test %VRCX, %VRCX
> + jnz L(ret_vec_x2)
> +
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +
> + VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> + KMOV %k0, %VRCX
> + test %VRCX, %VRCX
> + jnz L(ret_vec_x3)
> +
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +
> + VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> + KMOV %k0, %VRCX
> + test %VRCX, %VRCX
> + jnz L(ret_vec_x4)
> +
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> + /* Save pointer before 4 x VEC_SIZE alignment. */
> + movq %rax, %rcx
> +
> + /* Align address to VEC_SIZE * 4 for loop. */
> + andq $-(VEC_SIZE * 4), %rax
> +
> + subq %rax, %rcx
> +# ifdef USE_AS_WCSLEN
> + shr $2, %VRCX
> +# endif
> + /* rcx contains number of [w]char will be recompared due to
> + alignment fixes. rdx must be incremented by rcx to offset
> + alignment adjustment. */
> + addq %rcx, %rdx
> + /* Need jump as we don't want to add/subtract rdx for first
> + iteration of 4 x VEC_SIZE aligned loop. */
> +
> + .p2align 4,,11
> +L(loop):
> + /* VPMINU and VPCMP combination provide better performance as
> + compared to alternative combinations. */
> + VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
> + VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
> + VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
> + VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
> +
> + VPTESTN %VMM(2), %VMM(2), %k0
> + VPTESTN %VMM(4), %VMM(4), %k1
> +
> + subq $-(VEC_SIZE * 4), %rax
> + KORTEST %k0, %k1
> +
> + jnz L(loopend)
> + subq $(CHAR_PER_VEC * 4), %rdx
> + ja L(loop)
> + mov %rsi, %rax
> + ret
> +
> +L(loopend):
> +
> + VPTESTN %VMM(1), %VMM(1), %k2
> + KMOV %k2, %VRCX
> + test %VRCX, %VRCX
> + jnz L(ret_vec_x1)
> +
> + KMOV %k0, %VRCX
> + /* At this point, if k0 is non zero, null char must be in the
> + second vector. */
> + test %VRCX, %VRCX
> + jnz L(ret_vec_x2)
> +
> + VPTESTN %VMM(3), %VMM(3), %k3
> + KMOV %k3, %VRCX
> + test %VRCX, %VRCX
> + jnz L(ret_vec_x3)
> + /* At this point null [w]char must be in the fourth vector so no
> + need to check. */
> + KMOV %k1, %VRCX
> +
> + /* Fourth, third, second vector terminating are pretty much
> + same, implemented this way to avoid branching and reuse code
> + from pre loop exit condition. */
> +L(ret_vec_x4):
> + bsf %VRCX, %VRCX
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + subq $-(VEC_SIZE * 3), %rax
> + shrq $2, %rax
> + addq %rcx, %rax
> +# else
> + leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
> +# endif
> +
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> + ret
> +
> +L(ret_vec_x3):
> + bsf %VRCX, %VRCX
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + subq $-(VEC_SIZE * 2), %rax
> + shrq $2, %rax
> + addq %rcx, %rax
> +# else
> + leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
> +# endif
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> + ret
> +
> +L(ret_vec_x2):
> + subq $-VEC_SIZE, %rax
> +L(ret_vec_x1):
> + bsf %VRCX, %VRCX
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + shrq $2, %rax
> +# endif
> + addq %rcx, %rax
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> + ret
> +
> +L(page_cross):
> + mov %rdi, %rax
> + movl %edi, %ecx
> + andl $(VEC_SIZE - 1), %ecx
> +# ifdef USE_AS_WCSLEN
> + sarl $2, %ecx
> +# endif
> + /* ecx contains number of w[char] to be skipped as a result
> + of address alignment. */
> + andq $-VEC_SIZE, %rax
> + VPCMPEQ (%rax), %VMM(0), %k0
> + KMOV %k0, %VRDX
> + /* Ignore number of character for alignment adjustment. */
> + shr %cl, %VRDX
> + jnz L(page_cross_end)
> + movl $CHAR_PER_VEC, %eax
> + sub %ecx, %eax
> + cmp %rax, %rsi
> + ja L(align_more)
> +
> +L(page_cross_end):
> + bsf %VRDX, %VRAX
> + cmpq %rsi, %rax
> + cmovnb %esi, %eax
> + ret
> +
> +END (STRNLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> index f59c372b78..aff288a66b 100644
> --- a/sysdeps/x86_64/multiarch/wcslen-evex512.S
> +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> @@ -1,4 +1,8 @@
> -#define STRLEN __wcslen_evex512
> +#ifndef WCSLEN
> +# define WCSLEN __wcslen_evex512
> +#endif
> +
> +#define STRLEN WCSLEN
> #define USE_AS_WCSLEN 1
>
> #include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> index 73dcf2f210..1c37d74fc9 100644
> --- a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> @@ -1,5 +1,8 @@
> -#define STRLEN __wcsnlen_evex512
> +#ifndef WCSNLEN
> +# define WCSNLEN __wcsnlen_evex512
> +#endif
> +
> +#define STRNLEN WCSNLEN
> #define USE_AS_WCSLEN 1
> -#define USE_AS_STRNLEN 1
>
> -#include "strlen-evex512.S"
> +#include "strnlen-evex512.S"
> --
> 2.37.2
>
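
For readers looking at the unified file without the original 256-bit version at hand, here is a minimal C-level sketch of the control flow that strlen-evex-base.S implements for the plain strlen case. It is an illustration only, not glibc code: vec_cmpeq_zero_mask() is a made-up scalar stand-in for the VPCMPEQ-against-zero + KMOV pair, and __builtin_ctzll stands in for bsf.

    /* Illustrative sketch only -- not glibc code.  VEC_SIZE 64 models the
       evex512 build; 32 would model evex256.  CHAR_SIZE is 1 (strlen).  */
    #include <stddef.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE  64

    /* Scalar stand-in for "VPCMPEQ (p), %VZERO, %k0; KMOV %k0, %reg":
       bit i is set when p[i] == '\0'.  Like the vector code, it may read
       bytes past the terminator, but only within one aligned VEC_SIZE
       chunk, so it never strays into the next page when p is aligned.  */
    static uint64_t
    vec_cmpeq_zero_mask (const char *p)
    {
      uint64_t m = 0;
      for (int i = 0; i < VEC_SIZE; i++)
        if (p[i] == '\0')
          m |= (uint64_t) 1 << i;
      return m;
    }

    static size_t
    strlen_sketch (const char *s)
    {
      uint64_t m;

      /* Entry: if the first VEC_SIZE load cannot cross a page boundary,
         check it directly (the andl/cmpl test at the top of the routine).  */
      if (((uintptr_t) s & (PAGE_SIZE - 1)) <= PAGE_SIZE - VEC_SIZE)
        {
          m = vec_cmpeq_zero_mask (s);
          if (m != 0)
            return __builtin_ctzll (m);
        }
      else
        {
          /* cross_page_boundary: load from the aligned address below s and
             shift out the bits for bytes that precede s.  */
          const char *cp = (const char *) ((uintptr_t) s & -(uintptr_t) VEC_SIZE);
          m = vec_cmpeq_zero_mask (cp) >> (s - cp);
          if (m != 0)
            return __builtin_ctzll (m);
        }

      /* aligned_more: check VEC1..VEC4 from the aligned base, one at a time.  */
      const char *base = (const char *) ((uintptr_t) s & -(uintptr_t) VEC_SIZE);
      for (int i = 1; i <= 4; i++)
        {
          m = vec_cmpeq_zero_mask (base + i * VEC_SIZE);
          if (m != 0)
            return (size_t) (base + i * VEC_SIZE - s) + __builtin_ctzll (m);
        }

      /* loop_4x_vec: advance one VEC, round down to a 4 * VEC_SIZE boundary,
         then scan four vectors per iteration until a null byte appears.  */
      const char *p = (const char *)
        (((uintptr_t) base + VEC_SIZE) & -(uintptr_t) (VEC_SIZE * 4));
      for (;;)
        {
          for (int i = 4; i < 8; i++)
            {
              m = vec_cmpeq_zero_mask (p + i * VEC_SIZE);
              if (m != 0)
                return (size_t) (p + i * VEC_SIZE - s) + __builtin_ctzll (m);
            }
          p += VEC_SIZE * 4;
        }
    }

The real routine does the same work with two VPMINU/VPTESTN pairs and a single KORTEST per iteration, so the hot loop takes one branch per 4 * VEC_SIZE bytes; the cross_page_boundary entry exists only to keep the very first, possibly unaligned load from touching the next page. The strnlen variant kept in strnlen-evex512.S layers the length bookkeeping on top of essentially this structure.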