x86: Unifies 'strnlen-evex' and 'strnlen-evex512' implementations.
Checks
Context                                          | Check   | Description
-------------------------------------------------+---------+--------------------------------------------------
redhat-pt-bot/TryBot-32bit                       | success | Build for i686
redhat-pt-bot/TryBot-apply_patch                 | success | Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-arm     | success | Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm     | success | Test passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 | success | Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 | success | Test passed
Commit Message
This commit uses a common implementation, 'strnlen-evex-base.S', for both
'strnlen-evex' and 'strnlen-evex512'.

This patch both reduces the number of implementations and makes some small
optimizations that benefit strnlen-evex and strnlen-evex512.
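Concretely, each variant file is now a thin wrapper that picks its entry-point
name and vector configuration and then includes the shared body; the pattern,
as it appears in the new strnlen-evex512.S in the diff below, is:

    #ifndef STRNLEN
    #define STRNLEN __strnlen_evex512
    #endif

    #include "x86-evex512-vecs.h"
    #include "reg-macros.h"
    #include "strnlen-evex-base.S"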
All tests pass on x86.
Benchmarks were taken on SKX.
https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html
Geometric mean for strnlen-evex over all benchmarks (N=10) was (new/old) 0.881
Geometric mean for strnlen-evex512 over all benchmarks (N=10) was (new/old) 0.953
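For reference, those numbers are geometric means of the per-benchmark new/old
timing ratios; a minimal C sketch of that reduction (the timing arrays here
are placeholders, not the actual bench-strnlen results):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Placeholder timings; the real inputs are the old and new
         bench-strnlen results for each of the N benchmarks.  */
      double old_t[] = { 10.0, 12.0, 9.0 };
      double new_t[] = { 8.8, 10.9, 7.9 };
      size_t n = sizeof old_t / sizeof old_t[0];
      double log_sum = 0.0;

      for (size_t i = 0; i < n; i++)
        log_sum += log (new_t[i] / old_t[i]);

      /* Geometric mean of new/old; below 1.0 means the new version
         is faster.  */
      printf ("geomean (new/old) = %.3f\n", exp (log_sum / n));
      return 0;
    }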
Code Size Changes:
strnlen-evex : +31 bytes
strnlen-evex512 : +156 bytes
---
sysdeps/x86_64/multiarch/strnlen-evex-base.S | 462 +++++++++++++++++++
sysdeps/x86_64/multiarch/strnlen-evex.S | 428 +----------------
sysdeps/x86_64/multiarch/strnlen-evex512.S | 259 +----------
3 files changed, 469 insertions(+), 680 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex-base.S
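One of the small optimizations visible in the new file below is the entry
page-cross check, which shifts the page offset into the top bits of a 32-bit
register (shll $20) instead of masking with PAGE_SIZE - 1. A minimal C model
of why the two forms agree (constants as in strnlen-evex-base.S; the VEC_SIZE
value is picked here purely for illustration):

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE 64   /* 32 for the evex256 build.  */

    /* Mirrors `shll $20, %eax; cmpl $((PAGE_SIZE - VEC_SIZE) << 20); ja`.
       Shifting left by 20 keeps only the low 12 address bits (the page
       offset), and the shift preserves the unsigned ordering of those
       12-bit values, so no separate mask is needed.  */
    static int
    crosses_page_shift (uintptr_t p)
    {
      return ((uint32_t) p << 20)
             > ((uint32_t) (PAGE_SIZE - VEC_SIZE) << 20);
    }

    static int
    crosses_page_mask (uintptr_t p)
    {
      return (p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
    }

    int
    main (void)
    {
      for (uintptr_t p = 0; p < 2 * PAGE_SIZE; p++)
        assert (crosses_page_shift (p) == crosses_page_mask (p));
      return 0;
    }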
Comments
On Fri, Aug 9, 2024 at 2:50 PM Matthew Sterrett
<matthew.sterrett@intel.com> wrote:
>
> This commit uses a common implementation, 'strnlen-evex-base.S', for both
> 'strnlen-evex' and 'strnlen-evex512'.
>
> This patch both reduces the number of implementations and makes some small
> optimizations that benefit strnlen-evex and strnlen-evex512.
>
> All tests pass on x86.
>
> Benchmarks were taken on SKX.
> https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html
>
> Geometric mean for strnlen-evex over all benchmarks (N=10) was (new/old) 0.881
> Geometric mean for strnlen-evex512 over all benchmarks (N=10) was (new/old) 0.953
>
> Code Size Changes:
> strnlen-evex : +31 bytes
> strnlen-evex512 : +156 bytes
> ---
> sysdeps/x86_64/multiarch/strnlen-evex-base.S | 462 +++++++++++++++++++
> sysdeps/x86_64/multiarch/strnlen-evex.S | 428 +----------------
> sysdeps/x86_64/multiarch/strnlen-evex512.S | 259 +----------
> 3 files changed, 469 insertions(+), 680 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex-base.S
>
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex-base.S b/sysdeps/x86_64/multiarch/strnlen-evex-base.S
> new file mode 100644
> index 0000000000..1c2cfdfe06
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex-base.S
> @@ -0,0 +1,462 @@
> +/* strnlen/wcsnlen optimized with 256/512-bit EVEX instructions.
> + Copyright (C) 2022-2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# include <sysdep.h>
> +
> +#ifdef USE_AS_WCSLEN
> +# define VPCMPEQ vpcmpeqd
> +# define VPTESTN vptestnmd
> +# define VPMINU vpminud
> +# define CHAR_SIZE 4
> +#else
> +# define VPCMPEQ vpcmpeqb
> +# define VPTESTN vptestnmb
> +# define VPMINU vpminub
> +# define CHAR_SIZE 1
> +#endif
> +
> +#define XZERO VMM_128(0)
> +#define VZERO VMM(0)
> +#define PAGE_SIZE 4096
> +#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +
> +#if CHAR_PER_VEC == 32
> +# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8)
> +#else
> +# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32)
> +#endif
> +
> +#ifdef USE_AS_WCSLEN
> +/* For wide-character, we care more about limiting code size
> +   than optimally aligning targets, so just cap nop padding
> +   reasonably low. */
> +# define P2ALIGN(...) .p2align 4,, 6
> +# define P2ALIGN_CLAMPED(...) P2ALIGN(__VA_ARGS__)
> +#else
> +# define P2ALIGN(x) .p2align x
> +# define P2ALIGN_CLAMPED(x, y) .p2align x,, y
> +#endif
> +
> + .section SECTION(.text), "ax", @progbits
> +	/* Aligning the entry point to 64 bytes provides better
> +	   performance for one-vector-length strings. */
> +ENTRY_P2ALIGN(STRNLEN, 6)
> + /* rdi is pointer to array, rsi is the upper limit. */
> +
> + /* Check zero length. */
> + test %RSI_LP, %RSI_LP
> + jz L(zero)
> +
> +#ifdef __ILP32__
> + /* Clear the upper 32 bits. */
> + movl %esi, %esi
> +#endif
> +
> + vpxorq %XZERO, %XZERO, %XZERO
> +
> + /* Check that we won't cross a page boundary with our first load. */
> + movl %edi, %eax
> + shll $20, %eax
> + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> + ja L(crosses_page_boundary)
> +
> + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
> + null byte. */
> + VPCMPEQ (%rdi), %VZERO, %k0
> + KMOV %k0, %VRCX
> +
> + /* If src (rcx) is zero, bsf does not change the result. NB:
> + Must use 64-bit bsf here so that upper bits of len are not
> + cleared. */
> + movq %rsi, %rax
> + bsfq %rcx, %rax
> +
> + /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
> + CHAR) and rsi must be > CHAR_PER_VEC. */
> + cmpq $CHAR_PER_VEC, %rax
> + ja L(more_1x_vec)
> +
> + /* Check if first match in bounds. */
> + cmpq %rax, %rsi
> + cmovb %esi, %eax
> + ret
> +
> +#if VEC_SIZE == 32
> + P2ALIGN_CLAMPED(4, 2)
> +L(zero):
> +L(max_0):
> + movl %esi, %eax
> + ret
> +#endif
> +
> + P2ALIGN_CLAMPED(4, 10)
> +L(more_1x_vec):
> +L(cross_page_continue):
> +	/* After this calculation, rax stores the number of elements
> +	   left to be processed.  The complexity comes from the fact that
> +	   some elements get read twice due to alignment, and we need to
> +	   be sure we don't count them twice (else, it would just be rsi -
> +	   CHAR_PER_VEC). */
> +
> +#ifdef USE_AS_WCSLEN
> + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> + overflow. */
> + movq %rdi, %rax
> + andq $(VEC_SIZE * -1), %rdi
> + subq %rdi, %rax
> + sarq $2, %rax
> + leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
> +#else
> + /* Calculate ptr + N - VEC_SIZE, then mask off the low bits,
> + then subtract ptr to get the new aligned limit value. */
> + leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax
> + andq $(VEC_SIZE * -1), %rdi
> + subq %rdi, %rax
> +#endif
> +
> + VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
> +
> +	/* Checking here is faster for 256-bit but not 512-bit. */
> +#if VEC_SIZE == 0
> + KMOV %k0, %VRDX
> + test %VRDX, %VRDX
> + jnz L(last_vec_check)
> +#endif
> +
> + cmpq $(CHAR_PER_VEC * 2), %rax
> + ja L(more_2x_vec)
> +
> +L(last_2x_vec_or_less):
> +
> +	/* Checking here is faster for 512-bit but not 256-bit. */
> +#if VEC_SIZE != 0
> + KMOV %k0, %VRDX
> + test %VRDX, %VRDX
> + jnz L(last_vec_check)
> +#endif
> +
> + /* Check for the end of data. */
> + SUB_SHORT (CHAR_PER_VEC, rax)
> + jbe L(max_0)
> +
> + /* Check the final remaining vector. */
> + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> + KMOV %k0, %VRDX
> + test %VRDX, %VRDX
> +#if VEC_SIZE == 32
> + jz L(max_0)
> +#else
> + jnz L(last_vec_check)
> + P2ALIGN_CLAMPED(4, 2)
> +L(zero):
> +L(max_0):
> + movl %esi, %eax
> + ret
> +
> +#endif
> + P2ALIGN_CLAMPED(4, 4)
> +L(last_vec_check):
> + bsf %VRDX, %VRDX
> + sub %eax, %edx
> + lea (%rsi, %rdx), %eax
> + cmovae %esi, %eax
> + ret
> +
> +
> +#if VEC_SIZE == 32
> + P2ALIGN_CLAMPED(4, 8)
> +#endif
> +L(last_4x_vec_or_less):
> + addl $(CHAR_PER_VEC * -4), %eax
> + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
> +
> +#if VEC_SIZE == 64
> + KMOV %k0, %VRDX
> + test %VRDX, %VRDX
> + jnz L(last_vec_check)
> +#endif
> +
> + subq $(VEC_SIZE * -4), %rdi
> + cmpl $(CHAR_PER_VEC * 2), %eax
> + jbe L(last_2x_vec_or_less)
> +
> + P2ALIGN_CLAMPED(4, 6)
> +L(more_2x_vec):
> + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> + rechecking bounds. */
> +
> +	/* Already checked in the 256-bit case. */
> +#if VEC_SIZE != 0
> + KMOV %k0, %VRDX
> +
> + test %VRDX, %VRDX
> + jnz L(first_vec_x1)
> +#endif
> +
> + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> + KMOV %k0, %VRDX
> +
> + test %VRDX, %VRDX
> + jnz L(first_vec_x2)
> +
> + cmpq $(CHAR_PER_VEC * 4), %rax
> + ja L(more_4x_vec)
> +
> +
> + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> + KMOV %k0, %VRDX
> + addl $(CHAR_PER_VEC * -2), %eax
> + test %VRDX, %VRDX
> + jnz L(last_vec_check)
> +
> + subb $(CHAR_PER_VEC), %al
> + jbe L(max_1)
> +
> + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> + KMOV %k0, %VRDX
> +
> + test %VRDX, %VRDX
> + jnz L(last_vec_check)
> +L(max_1):
> + movl %esi, %eax
> + ret
> +
> +
> + P2ALIGN_CLAMPED(4, 14)
> +L(first_vec_x2):
> +#if VEC_SIZE == 64
> + /* If VEC_SIZE == 64 we can fit logic for full return label in
> + spare bytes before next cache line. */
> + bsf %VRDX, %VRDX
> + sub %eax, %esi
> + leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
> + ret
> + P2ALIGN_CLAMPED(4, 6)
> +#else
> + addl $CHAR_PER_VEC, %esi
> +#endif
> +L(first_vec_x1):
> + bsf %VRDX, %VRDX
> + sub %eax, %esi
> + leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
> + ret
> +
> +#if VEC_SIZE == 64
> + P2ALIGN_CLAMPED(4, 6)
> +L(first_vec_x4):
> +# if VEC_SIZE == 64
> + /* If VEC_SIZE == 64 we can fit logic for full return label in
> + spare bytes before next cache line. */
> + bsf %VRDX, %VRDX
> + sub %eax, %esi
> + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
> + ret
> + P2ALIGN_CLAMPED(4, 6)
> +# else
> + addl $CHAR_PER_VEC, %esi
> +# endif
> +L(first_vec_x3):
> + bsf %VRDX, %VRDX
> + sub %eax, %esi
> + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
> + ret
> +#endif
> +
> + P2ALIGN_CLAMPED(6, 20)
> +L(more_4x_vec):
> + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> + KMOV %k0, %VRDX
> + test %VRDX, %VRDX
> + jnz L(first_vec_x3)
> +
> + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> + KMOV %k0, %VRDX
> + test %VRDX, %VRDX
> + jnz L(first_vec_x4)
> +
> + /* Check if at last VEC_SIZE * 4 length before aligning for the
> + loop. */
> + cmpq $(CHAR_PER_VEC * 8), %rax
> + jbe L(last_4x_vec_or_less)
> +
> +
> + /* Compute number of words checked after aligning. */
> +#ifdef USE_AS_WCSLEN
> + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> + overflow. */
> + leaq (VEC_SIZE * -3)(%rdi), %rdx
> +#else
> + leaq (VEC_SIZE * -3)(%rdi, %rax), %rax
> +#endif
> +
> + subq $(VEC_SIZE * -1), %rdi
> +
> + /* Align data to VEC_SIZE * 4. */
> +#if VEC_SIZE == 64
> +	/* Saves code size.  No evex512 processor has partial register
> +	   stalls.  If that changes, this can be replaced with `andq
> +	   $-(VEC_SIZE * 4), %rdi`. */
> + xorb %dil, %dil
> +#else
> + andq $-(VEC_SIZE * 4), %rdi
> +#endif
> +
> +#ifdef USE_AS_WCSLEN
> + subq %rdi, %rdx
> + sarq $2, %rdx
> + addq %rdx, %rax
> +#else
> + subq %rdi, %rax
> +#endif
> +
> +
> + P2ALIGN(6)
> +L(loop):
> +	/* The VPMINU and VPCMP combination provides better performance
> +	   compared to alternative combinations. */
> + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
> + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
> + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> +
> + VPTESTN %VMM(2), %VMM(2), %k0
> + VPTESTN %VMM(4), %VMM(4), %k1
> +
> + subq $-(VEC_SIZE * 4), %rdi
> + KORTEST %k0, %k1
> +
> + jnz L(loopend)
> + subq $(CHAR_PER_VEC * 4), %rax
> + ja L(loop)
> + mov %rsi, %rax
> + ret
> +
> +
> +#if VEC_SIZE == 32
> + P2ALIGN_CLAMPED(4, 6)
> +L(first_vec_x4):
> +# if VEC_SIZE == 64
> + /* If VEC_SIZE == 64 we can fit logic for full return label in
> + spare bytes before next cache line. */
> + bsf %VRDX, %VRDX
> + sub %eax, %esi
> + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
> + ret
> + P2ALIGN_CLAMPED(4, 6)
> +# else
> + addl $CHAR_PER_VEC, %esi
> +# endif
> +L(first_vec_x3):
> + bsf %VRDX, %VRDX
> + sub %eax, %esi
> + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
> + ret
> +#endif
> +
> +
> + P2ALIGN_CLAMPED(4, 11)
> +L(loopend):
> + /* We found a null terminator in one of the 4 vectors. */
> +
> + /* Check the first vector. */
> + movq %rax, %r8
> + VPTESTN %VMM(1), %VMM(1), %k2
> + KMOV %k2, %VRCX
> + bsf %rcx, %r8
> +
> + cmpq $(CHAR_PER_VEC), %r8
> + jbe L(end_vec)
> +
> + /* Check the second vector. */
> + subq $(CHAR_PER_VEC), %rax
> + movq %rax, %r8
> + KMOV %k0, %VRCX
> + bsf %rcx, %r8
> +
> + cmpq $(CHAR_PER_VEC), %r8
> + jbe L(end_vec)
> +
> + /* Check the third vector. */
> + subq $(CHAR_PER_VEC), %rax
> + movq %rax, %r8
> + VPTESTN %VMM(3), %VMM(3), %k2
> + KMOV %k2, %VRCX
> + bsf %rcx, %r8
> +
> + cmpq $(CHAR_PER_VEC), %r8
> + jbe L(end_vec)
> +
> + /* It is in the fourth vector. */
> + subq $(CHAR_PER_VEC), %rax
> + movq %rax, %r8
> + KMOV %k1, %VRCX
> + bsf %rcx, %r8
> +
> + P2ALIGN_CLAMPED(4, 3)
> +L(end_vec):
> + /* Get the number that has been processed. */
> + movq %rsi, %rcx
> + subq %rax, %rcx
> +
> + /* Add that to the offset we found the null terminator at. */
> + leaq (%r8, %rcx), %rax
> +
> + /* Take the min of that and the limit. */
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> + ret
> +
> + P2ALIGN_CLAMPED(4, 11)
> +L(crosses_page_boundary):
> + /* Align data backwards to VEC_SIZE. */
> + shrl $20, %eax
> + movq %rdi, %rcx
> + andq $-VEC_SIZE, %rcx
> + VPCMPEQ (%rcx), %VZERO, %k0
> +
> + KMOV %k0, %VRCX
> +#ifdef USE_AS_WCSLEN
> + shrl $2, %eax
> + andl $(CHAR_PER_VEC - 1), %eax
> +#endif
> +	/* By this point rax contains the number of CHARs we need to skip. */
> + shrx %VRAX, %VRCX, %VRCX
> +
> + /* Calculates CHAR_PER_VEC - eax and stores in eax. */
> + negl %eax
> + andl $(CHAR_PER_VEC - 1), %eax
> +
> + movq %rsi, %rdx
> + bsf %VRCX, %VRDX
> + cmpq %rax, %rdx
> + ja L(cross_page_continue)
> +
> + /* The vector had a null terminator or we are at the limit. */
> + movl %edx, %eax
> + cmpq %rdx, %rsi
> + cmovb %esi, %eax
> + ret
> +
> +END(STRNLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
> index 91b16830eb..cba585452c 100644
> --- a/sysdeps/x86_64/multiarch/strnlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
> @@ -1,423 +1,7 @@
> -/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
> - Copyright (C) 2022-2024 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <isa-level.h>
> -#include <sysdep.h>
> -
> -#if ISA_SHOULD_BUILD (4)
> -
> -# ifndef VEC_SIZE
> -# include "x86-evex256-vecs.h"
> -# endif
> -
> -
> -# ifndef STRNLEN
> -# define STRNLEN __strnlen_evex
> -# endif
> -
> -# ifdef USE_AS_WCSLEN
> -# define VPCMPEQ vpcmpeqd
> -# define VPCMPNEQ vpcmpneqd
> -# define VPTESTN vptestnmd
> -# define VPTEST vptestmd
> -# define VPMINU vpminud
> -# define CHAR_SIZE 4
> -
> -# else
> -# define VPCMPEQ vpcmpeqb
> -# define VPCMPNEQ vpcmpneqb
> -# define VPTESTN vptestnmb
> -# define VPTEST vptestmb
> -# define VPMINU vpminub
> -# define CHAR_SIZE 1
> -
> -# define REG_WIDTH VEC_SIZE
> -# endif
> -
> -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> -
> -# include "reg-macros.h"
> -
> -# if CHAR_PER_VEC == 32
> -# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8)
> -# else
> -# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32)
> -# endif
> -
> -
> -
> -# if CHAR_PER_VEC == 64
> -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
> -# else
> -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
> -# endif
> -
> -
> -# define XZERO VMM_128(0)
> -# define VZERO VMM(0)
> -# define PAGE_SIZE 4096
> -
> - .section SECTION(.text), "ax", @progbits
> -ENTRY_P2ALIGN (STRNLEN, 6)
> - /* Check zero length. */
> - test %RSI_LP, %RSI_LP
> - jz L(zero)
> -# ifdef __ILP32__
> - /* Clear the upper 32 bits. */
> - movl %esi, %esi
> -# endif
> -
> - movl %edi, %eax
> - vpxorq %XZERO, %XZERO, %XZERO
> - andl $(PAGE_SIZE - 1), %eax
> - cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> - ja L(cross_page_boundary)
> -
> - /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
> - null byte. */
> - VPCMPEQ (%rdi), %VZERO, %k0
> -
> - KMOV %k0, %VRCX
> - movq %rsi, %rax
> -
> - /* If src (rcx) is zero, bsf does not change the result. NB:
> - Must use 64-bit bsf here so that upper bits of len are not
> - cleared. */
> - bsfq %rcx, %rax
> - /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
> - CHAR) and rsi must be > CHAR_PER_VEC. */
> - cmpq $CHAR_PER_VEC, %rax
> - ja L(more_1x_vec)
> - /* Check if first match in bounds. */
> - cmpq %rax, %rsi
> - cmovb %esi, %eax
> - ret
> -
> -
> -# if CHAR_PER_VEC != 32
> - .p2align 4,, 2
> -L(zero):
> -L(max_0):
> - movl %esi, %eax
> - ret
> -# endif
> -
> - /* Aligned more for strnlen compares remaining length vs 2 *
> - CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
> - going to the loop. */
> - .p2align 4,, 10
> -L(more_1x_vec):
> -L(cross_page_continue):
> - /* Compute number of words checked after aligning. */
> -# ifdef USE_AS_WCSLEN
> - /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> - overflow. */
> - movq %rdi, %rax
> - andq $(VEC_SIZE * -1), %rdi
> - subq %rdi, %rax
> - sarq $2, %rax
> - leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
> -# else
> - leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax
> - andq $(VEC_SIZE * -1), %rdi
> - subq %rdi, %rax
> -# endif
> -
> -
> - VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
> -
> - cmpq $(CHAR_PER_VEC * 2), %rax
> - ja L(more_2x_vec)
> -
> -L(last_2x_vec_or_less):
> - KMOV %k0, %VRDX
> - test %VRDX, %VRDX
> - jnz L(last_vec_check)
> -
> - /* Check the end of data. */
> - SUB_SHORT (CHAR_PER_VEC, rax)
> - jbe L(max_0)
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> - KMOV %k0, %VRDX
> - test %VRDX, %VRDX
> - jz L(max_0)
> - /* Best place for LAST_VEC_CHECK if ZMM. */
> - .p2align 4,, 8
> -L(last_vec_check):
> - bsf %VRDX, %VRDX
> - sub %eax, %edx
> - lea (%rsi, %rdx), %eax
> - cmovae %esi, %eax
> - ret
> -
> -# if CHAR_PER_VEC == 32
> - .p2align 4,, 2
> -L(zero):
> -L(max_0):
> - movl %esi, %eax
> - ret
> -# endif
> -
> - .p2align 4,, 8
> -L(last_4x_vec_or_less):
> - addl $(CHAR_PER_VEC * -4), %eax
> - VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
> - subq $(VEC_SIZE * -4), %rdi
> - cmpl $(CHAR_PER_VEC * 2), %eax
> - jbe L(last_2x_vec_or_less)
> -
> - .p2align 4,, 6
> -L(more_2x_vec):
> - /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> - rechecking bounds. */
> -
> - KMOV %k0, %VRDX
> -
> - test %VRDX, %VRDX
> - jnz L(first_vec_x1)
> -
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> - KMOV %k0, %VRDX
> - test %VRDX, %VRDX
> - jnz L(first_vec_x2)
> -
> - cmpq $(CHAR_PER_VEC * 4), %rax
> - ja L(more_4x_vec)
> -
> -
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> - KMOV %k0, %VRDX
> - addl $(CHAR_PER_VEC * -2), %eax
> - test %VRDX, %VRDX
> - jnz L(last_vec_check)
> -
> - subl $(CHAR_PER_VEC), %eax
> - jbe L(max_1)
> -
> - VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> - KMOV %k0, %VRDX
> -
> - test %VRDX, %VRDX
> - jnz L(last_vec_check)
> -L(max_1):
> - movl %esi, %eax
> - ret
> -
> - .p2align 4,, 3
> -L(first_vec_x2):
> -# if VEC_SIZE == 64
> - /* If VEC_SIZE == 64 we can fit logic for full return label in
> - spare bytes before next cache line. */
> - bsf %VRDX, %VRDX
> - sub %eax, %esi
> - leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
> - ret
> - .p2align 4,, 6
> -# else
> - addl $CHAR_PER_VEC, %esi
> -# endif
> -L(first_vec_x1):
> - bsf %VRDX, %VRDX
> - sub %eax, %esi
> - leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
> - ret
> -
> -
> - .p2align 4,, 6
> -L(first_vec_x4):
> -# if VEC_SIZE == 64
> - /* If VEC_SIZE == 64 we can fit logic for full return label in
> - spare bytes before next cache line. */
> - bsf %VRDX, %VRDX
> - sub %eax, %esi
> - leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
> - ret
> - .p2align 4,, 6
> -# else
> - addl $CHAR_PER_VEC, %esi
> -# endif
> -L(first_vec_x3):
> - bsf %VRDX, %VRDX
> - sub %eax, %esi
> - leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
> - ret
> -
> - .p2align 4,, 5
> -L(more_4x_vec):
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> - KMOV %k0, %VRDX
> - test %VRDX, %VRDX
> - jnz L(first_vec_x3)
> -
> - VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> - KMOV %k0, %VRDX
> - test %VRDX, %VRDX
> - jnz L(first_vec_x4)
> -
> - /* Check if at last VEC_SIZE * 4 length before aligning for the
> - loop. */
> - cmpq $(CHAR_PER_VEC * 8), %rax
> - jbe L(last_4x_vec_or_less)
> -
> -
> - /* Compute number of words checked after aligning. */
> -# ifdef USE_AS_WCSLEN
> - /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> - overflow. */
> - leaq (VEC_SIZE * -3)(%rdi), %rdx
> -# else
> - leaq (VEC_SIZE * -3)(%rdi, %rax), %rax
> -# endif
> -
> - subq $(VEC_SIZE * -1), %rdi
> -
> - /* Align data to VEC_SIZE * 4. */
> -# if VEC_SIZE == 64
> - /* Saves code size. No evex512 processor has partial register
> - stalls. If that change this can be replaced with `andq
> - $-(VEC_SIZE * 4), %rdi`. */
> - xorb %dil, %dil
> -# else
> - andq $-(VEC_SIZE * 4), %rdi
> -# endif
> -
> -# ifdef USE_AS_WCSLEN
> - subq %rdi, %rdx
> - sarq $2, %rdx
> - addq %rdx, %rax
> -# else
> - subq %rdi, %rax
> -# endif
> - /* Compare 4 * VEC at a time forward. */
> - .p2align 4,, 11
> -L(loop_4x_vec):
> - VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
> - VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> - VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
> - VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> - VPTESTN %VMM(2), %VMM(2), %k0
> - VPTESTN %VMM(4), %VMM(4), %k2
> - subq $-(VEC_SIZE * 4), %rdi
> - /* Break if at end of length. */
> - subq $(CHAR_PER_VEC * 4), %rax
> - jbe L(loop_len_end)
> -
> -
> - KORTEST %k0, %k2
> - jz L(loop_4x_vec)
> -
> -
> -L(loop_last_4x_vec):
> - movq %rsi, %rcx
> - subq %rax, %rsi
> - VPTESTN %VMM(1), %VMM(1), %k1
> - KMOV %k1, %VRDX
> - test %VRDX, %VRDX
> - jnz L(last_vec_x0)
> -
> - KMOV %k0, %VRDX
> - test %VRDX, %VRDX
> - jnz L(last_vec_x1)
> -
> - VPTESTN %VMM(3), %VMM(3), %k0
> -
> - /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
> - returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
> - individually, for VEC_SIZE == 32 we combine them in a single
> - 64-bit GPR. */
> -# if CHAR_PER_VEC == 64
> - KMOV %k0, %VRDX
> - test %VRDX, %VRDX
> - jnz L(last_vec_x2)
> - KMOV %k2, %VRDX
> -# else
> - /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
> - */
> - kmovd %k2, %edx
> - kmovd %k0, %eax
> - salq $CHAR_PER_VEC, %rdx
> - orq %rax, %rdx
> -# endif
> -
> - /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
> - */
> - bsfq %rdx, %rdx
> - leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
> - cmpq %rax, %rcx
> - cmovb %rcx, %rax
> - ret
> -
> - /* Handle last 4x VEC after loop. All VECs have been loaded. */
> - .p2align 4,, 4
> -L(loop_len_end):
> - KORTEST %k0, %k2
> - jnz L(loop_last_4x_vec)
> - movq %rsi, %rax
> - ret
> -
> -
> -# if CHAR_PER_VEC == 64
> - /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
> - need return label for it. */
> - .p2align 4,, 8
> -L(last_vec_x2):
> - bsf %VRDX, %VRDX
> - leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
> - cmpq %rax, %rcx
> - cmovb %rcx, %rax
> - ret
> -# endif
> -
> -
> - .p2align 4,, 10
> -L(last_vec_x1):
> - addq $CHAR_PER_VEC, %rsi
> -L(last_vec_x0):
> - bsf %VRDX, %VRDX
> - leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
> - cmpq %rax, %rcx
> - cmovb %rcx, %rax
> - ret
> -
> -
> - .p2align 4,, 8
> -L(cross_page_boundary):
> - /* Align data to VEC_SIZE. */
> - movq %rdi, %rcx
> - andq $-VEC_SIZE, %rcx
> - VPCMPEQ (%rcx), %VZERO, %k0
> -
> - KMOV %k0, %VRCX
> -# ifdef USE_AS_WCSLEN
> - shrl $2, %eax
> - andl $(CHAR_PER_VEC - 1), %eax
> -# endif
> - shrx %VRAX, %VRCX, %VRCX
> -
> - negl %eax
> - andl $(CHAR_PER_VEC - 1), %eax
> - movq %rsi, %rdx
> - bsf %VRCX, %VRDX
> - cmpq %rax, %rdx
> - ja L(cross_page_continue)
> - movl %edx, %eax
> - cmpq %rdx, %rsi
> - cmovb %esi, %eax
> - ret
> -END (STRNLEN)
> +#ifndef STRNLEN
> +#define STRNLEN __strnlen_evex
> #endif
> +
> +#include "x86-evex256-vecs.h"
> +#include "reg-macros.h"
> +#include "strnlen-evex-base.S"
> \ No newline at end of file
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> index f8e55883bb..8ef54078f8 100644
> --- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -1,264 +1,7 @@
> -/* Placeholder function, not used by any processor at the moment.
> - Copyright (C) 2022-2024 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> #ifndef STRNLEN
> #define STRNLEN __strnlen_evex512
> #endif
>
> #include "x86-evex512-vecs.h"
> #include "reg-macros.h"
> -
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (4)
> -
> -# include <sysdep.h>
> -
> -# ifdef USE_AS_WCSLEN
> -# define VPCMPEQ vpcmpeqd
> -# define VPTESTN vptestnmd
> -# define VPMINU vpminud
> -# define CHAR_SIZE 4
> -# else
> -# define VPCMPEQ vpcmpeqb
> -# define VPTESTN vptestnmb
> -# define VPMINU vpminub
> -# define CHAR_SIZE 1
> -# endif
> -
> -# define PAGE_SIZE 4096
> -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> -
> - .section SECTION(.text),"ax",@progbits
> -/* Aligning entry point to 64 byte, provides better performance for
> - one vector length string. */
> -ENTRY_P2ALIGN (STRNLEN, 6)
> - /* Check zero length. */
> - test %RSI_LP, %RSI_LP
> - jz L(ret_max)
> -# ifdef __ILP32__
> - /* Clear the upper 32 bits. */
> - movl %esi, %esi
> -# endif
> -
> - movl %edi, %eax
> - vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
> - sall $20, %eax
> - cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> - ja L(page_cross)
> -
> - /* Compare [w]char for null, mask bit will be set for match. */
> - VPCMPEQ (%rdi), %VMM(0), %k0
> - KMOV %k0, %VRCX
> - /* Store max length in rax. */
> - mov %rsi, %rax
> - /* If rcx is 0, rax will have max length. We can not use VRCX
> - and VRAX here for evex256 because, upper 32 bits may be
> - undefined for ecx and eax. */
> - bsfq %rcx, %rax
> - cmp $CHAR_PER_VEC, %rax
> - ja L(align_more)
> - cmpq %rax, %rsi
> - cmovb %esi, %eax
> - ret
> -
> - /* At this point vector max length reached. */
> - .p2align 4,,3
> -L(ret_max):
> - movq %rsi, %rax
> - ret
> -
> -L(align_more):
> - mov %rdi, %rax
> - /* Align rax to VEC_SIZE. */
> - andq $-VEC_SIZE, %rax
> - movq %rdi, %rdx
> - subq %rax, %rdx
> -# ifdef USE_AS_WCSLEN
> - shr $2, %VRDX
> -# endif
> - /* At this point rdx contains [w]chars already compared. */
> - leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
> - /* At this point rdx contains number of w[char] needs to go.
> - Now onwards rdx will keep decrementing with each compare. */
> -
> - /* Loop unroll 4 times for 4 vector loop. */
> - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> - subq $-VEC_SIZE, %rax
> - KMOV %k0, %VRCX
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x1)
> -
> - subq $CHAR_PER_VEC, %rdx
> - jbe L(ret_max)
> -
> - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> - KMOV %k0, %VRCX
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x2)
> -
> - subq $CHAR_PER_VEC, %rdx
> - jbe L(ret_max)
> -
> - VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> - KMOV %k0, %VRCX
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x3)
> -
> - subq $CHAR_PER_VEC, %rdx
> - jbe L(ret_max)
> -
> - VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> - KMOV %k0, %VRCX
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x4)
> -
> - subq $CHAR_PER_VEC, %rdx
> - jbe L(ret_max)
> - /* Save pointer before 4 x VEC_SIZE alignment. */
> - movq %rax, %rcx
> -
> - /* Align address to VEC_SIZE * 4 for loop. */
> - andq $-(VEC_SIZE * 4), %rax
> -
> - subq %rax, %rcx
> -# ifdef USE_AS_WCSLEN
> - shr $2, %VRCX
> -# endif
> - /* rcx contains number of [w]char will be recompared due to
> - alignment fixes. rdx must be incremented by rcx to offset
> - alignment adjustment. */
> - addq %rcx, %rdx
> - /* Need jump as we don't want to add/subtract rdx for first
> - iteration of 4 x VEC_SIZE aligned loop. */
> -
> - .p2align 4,,11
> -L(loop):
> - /* VPMINU and VPCMP combination provide better performance as
> - compared to alternative combinations. */
> - VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
> - VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
> - VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
> - VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
> -
> - VPTESTN %VMM(2), %VMM(2), %k0
> - VPTESTN %VMM(4), %VMM(4), %k1
> -
> - subq $-(VEC_SIZE * 4), %rax
> - KORTEST %k0, %k1
> -
> - jnz L(loopend)
> - subq $(CHAR_PER_VEC * 4), %rdx
> - ja L(loop)
> - mov %rsi, %rax
> - ret
> -
> -L(loopend):
> -
> - VPTESTN %VMM(1), %VMM(1), %k2
> - KMOV %k2, %VRCX
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x1)
> -
> - KMOV %k0, %VRCX
> - /* At this point, if k0 is non zero, null char must be in the
> - second vector. */
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x2)
> -
> - VPTESTN %VMM(3), %VMM(3), %k3
> - KMOV %k3, %VRCX
> - test %VRCX, %VRCX
> - jnz L(ret_vec_x3)
> - /* At this point null [w]char must be in the fourth vector so no
> - need to check. */
> - KMOV %k1, %VRCX
> -
> - /* Fourth, third, second vector terminating are pretty much
> - same, implemented this way to avoid branching and reuse code
> - from pre loop exit condition. */
> -L(ret_vec_x4):
> - bsf %VRCX, %VRCX
> - subq %rdi, %rax
> -# ifdef USE_AS_WCSLEN
> - subq $-(VEC_SIZE * 3), %rax
> - shrq $2, %rax
> - addq %rcx, %rax
> -# else
> - leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
> -# endif
> -
> - cmpq %rsi, %rax
> - cmovnb %rsi, %rax
> - ret
> -
> -L(ret_vec_x3):
> - bsf %VRCX, %VRCX
> - subq %rdi, %rax
> -# ifdef USE_AS_WCSLEN
> - subq $-(VEC_SIZE * 2), %rax
> - shrq $2, %rax
> - addq %rcx, %rax
> -# else
> - leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
> -# endif
> - cmpq %rsi, %rax
> - cmovnb %rsi, %rax
> - ret
> -
> -L(ret_vec_x2):
> - subq $-VEC_SIZE, %rax
> -L(ret_vec_x1):
> - bsf %VRCX, %VRCX
> - subq %rdi, %rax
> -# ifdef USE_AS_WCSLEN
> - shrq $2, %rax
> -# endif
> - addq %rcx, %rax
> - cmpq %rsi, %rax
> - cmovnb %rsi, %rax
> - ret
> -
> -L(page_cross):
> - mov %rdi, %rax
> - movl %edi, %ecx
> - andl $(VEC_SIZE - 1), %ecx
> -# ifdef USE_AS_WCSLEN
> - sarl $2, %ecx
> -# endif
> - /* ecx contains number of w[char] to be skipped as a result
> - of address alignment. */
> - andq $-VEC_SIZE, %rax
> - VPCMPEQ (%rax), %VMM(0), %k0
> - KMOV %k0, %VRDX
> - /* Ignore number of character for alignment adjustment. */
> - shr %cl, %VRDX
> - jnz L(page_cross_end)
> - movl $CHAR_PER_VEC, %eax
> - sub %ecx, %eax
> - cmp %rax, %rsi
> - ja L(align_more)
> -
> -L(page_cross_end):
> - bsf %VRDX, %VRAX
> - cmpq %rsi, %rax
> - cmovnb %esi, %eax
> - ret
> -
> -END (STRNLEN)
> -#endif
> +#include "strnlen-evex-base.S"
> \ No newline at end of file
> --
> 2.37.2
>
LGTM
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > - KMOV %k0, %VRCX
> > - /* Store max length in rax. */
> > - mov %rsi, %rax
> > - /* If rcx is 0, rax will have max length. We can not use VRCX
> > - and VRAX here for evex256 because, upper 32 bits may be
> > - undefined for ecx and eax. */
> > - bsfq %rcx, %rax
> > - cmp $CHAR_PER_VEC, %rax
> > - ja L(align_more)
> > - cmpq %rax, %rsi
> > - cmovb %esi, %eax
> > - ret
> > -
> > - /* At this point vector max length reached. */
> > - .p2align 4,,3
> > -L(ret_max):
> > - movq %rsi, %rax
> > - ret
> > -
> > -L(align_more):
> > - mov %rdi, %rax
> > - /* Align rax to VEC_SIZE. */
> > - andq $-VEC_SIZE, %rax
> > - movq %rdi, %rdx
> > - subq %rax, %rdx
> > -# ifdef USE_AS_WCSLEN
> > - shr $2, %VRDX
> > -# endif
> > - /* At this point rdx contains [w]chars already compared. */
> > - leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
> > - /* At this point rdx contains number of w[char] needs to go.
> > - Now onwards rdx will keep decrementing with each compare. */
> > -
> > - /* Loop unroll 4 times for 4 vector loop. */
> > - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> > - subq $-VEC_SIZE, %rax
> > - KMOV %k0, %VRCX
> > - test %VRCX, %VRCX
> > - jnz L(ret_vec_x1)
> > -
> > - subq $CHAR_PER_VEC, %rdx
> > - jbe L(ret_max)
> > -
> > - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> > - KMOV %k0, %VRCX
> > - test %VRCX, %VRCX
> > - jnz L(ret_vec_x2)
> > -
> > - subq $CHAR_PER_VEC, %rdx
> > - jbe L(ret_max)
> > -
> > - VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> > - KMOV %k0, %VRCX
> > - test %VRCX, %VRCX
> > - jnz L(ret_vec_x3)
> > -
> > - subq $CHAR_PER_VEC, %rdx
> > - jbe L(ret_max)
> > -
> > - VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> > - KMOV %k0, %VRCX
> > - test %VRCX, %VRCX
> > - jnz L(ret_vec_x4)
> > -
> > - subq $CHAR_PER_VEC, %rdx
> > - jbe L(ret_max)
> > - /* Save pointer before 4 x VEC_SIZE alignment. */
> > - movq %rax, %rcx
> > -
> > - /* Align address to VEC_SIZE * 4 for loop. */
> > - andq $-(VEC_SIZE * 4), %rax
> > -
> > - subq %rax, %rcx
> > -# ifdef USE_AS_WCSLEN
> > - shr $2, %VRCX
> > -# endif
> > - /* rcx contains number of [w]char will be recompared due to
> > - alignment fixes. rdx must be incremented by rcx to offset
> > - alignment adjustment. */
> > - addq %rcx, %rdx
> > - /* Need jump as we don't want to add/subtract rdx for first
> > - iteration of 4 x VEC_SIZE aligned loop. */
> > -
> > - .p2align 4,,11
> > -L(loop):
> > - /* VPMINU and VPCMP combination provide better performance as
> > - compared to alternative combinations. */
> > - VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
> > - VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
> > - VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
> > - VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
> > -
> > - VPTESTN %VMM(2), %VMM(2), %k0
> > - VPTESTN %VMM(4), %VMM(4), %k1
> > -
> > - subq $-(VEC_SIZE * 4), %rax
> > - KORTEST %k0, %k1
> > -
> > - jnz L(loopend)
> > - subq $(CHAR_PER_VEC * 4), %rdx
> > - ja L(loop)
> > - mov %rsi, %rax
> > - ret
> > -
> > -L(loopend):
> > -
> > - VPTESTN %VMM(1), %VMM(1), %k2
> > - KMOV %k2, %VRCX
> > - test %VRCX, %VRCX
> > - jnz L(ret_vec_x1)
> > -
> > - KMOV %k0, %VRCX
> > - /* At this point, if k0 is non zero, null char must be in the
> > - second vector. */
> > - test %VRCX, %VRCX
> > - jnz L(ret_vec_x2)
> > -
> > - VPTESTN %VMM(3), %VMM(3), %k3
> > - KMOV %k3, %VRCX
> > - test %VRCX, %VRCX
> > - jnz L(ret_vec_x3)
> > - /* At this point null [w]char must be in the fourth vector so no
> > - need to check. */
> > - KMOV %k1, %VRCX
> > -
> > - /* Fourth, third, second vector terminating are pretty much
> > - same, implemented this way to avoid branching and reuse code
> > - from pre loop exit condition. */
> > -L(ret_vec_x4):
> > - bsf %VRCX, %VRCX
> > - subq %rdi, %rax
> > -# ifdef USE_AS_WCSLEN
> > - subq $-(VEC_SIZE * 3), %rax
> > - shrq $2, %rax
> > - addq %rcx, %rax
> > -# else
> > - leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
> > -# endif
> > -
> > - cmpq %rsi, %rax
> > - cmovnb %rsi, %rax
> > - ret
> > -
> > -L(ret_vec_x3):
> > - bsf %VRCX, %VRCX
> > - subq %rdi, %rax
> > -# ifdef USE_AS_WCSLEN
> > - subq $-(VEC_SIZE * 2), %rax
> > - shrq $2, %rax
> > - addq %rcx, %rax
> > -# else
> > - leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
> > -# endif
> > - cmpq %rsi, %rax
> > - cmovnb %rsi, %rax
> > - ret
> > -
> > -L(ret_vec_x2):
> > - subq $-VEC_SIZE, %rax
> > -L(ret_vec_x1):
> > - bsf %VRCX, %VRCX
> > - subq %rdi, %rax
> > -# ifdef USE_AS_WCSLEN
> > - shrq $2, %rax
> > -# endif
> > - addq %rcx, %rax
> > - cmpq %rsi, %rax
> > - cmovnb %rsi, %rax
> > - ret
> > -
> > -L(page_cross):
> > - mov %rdi, %rax
> > - movl %edi, %ecx
> > - andl $(VEC_SIZE - 1), %ecx
> > -# ifdef USE_AS_WCSLEN
> > - sarl $2, %ecx
> > -# endif
> > - /* ecx contains number of w[char] to be skipped as a result
> > - of address alignment. */
> > - andq $-VEC_SIZE, %rax
> > - VPCMPEQ (%rax), %VMM(0), %k0
> > - KMOV %k0, %VRDX
> > - /* Ignore number of character for alignment adjustment. */
> > - shr %cl, %VRDX
> > - jnz L(page_cross_end)
> > - movl $CHAR_PER_VEC, %eax
> > - sub %ecx, %eax
> > - cmp %rax, %rsi
> > - ja L(align_more)
> > -
> > -L(page_cross_end):
> > - bsf %VRDX, %VRAX
> > - cmpq %rsi, %rax
> > - cmovnb %esi, %eax
> > - ret
> > -
> > -END (STRNLEN)
> > -#endif
> > +#include "strnlen-evex-base.S"
> > \ No newline at end of file
> > --
> > 2.37.2
> >
>
> LGTM
>
> Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
I pushed this to the master branch.
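For orientation before the patch body: the entry path guards its first unaligned vector load with a page-cross test built from a left shift and an unsigned compare. A minimal C sketch of that test (assuming 4 KiB pages; the helper name is illustrative, not from glibc):

    #include <stdint.h>

    /* Mirrors `shll $20, %eax; cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax;
       ja ...`: shifting the address left by 20 in a 32-bit register keeps
       only its low 12 bits (the offset within a 4 KiB page), so the
       unsigned compare asks whether a vec_size-byte load starting at p
       would spill into the next page.  */
    static int
    load_would_cross_page (const void *p, uint32_t vec_size)
    {
      uint32_t shifted = (uint32_t) (uintptr_t) p << 20;
      return shifted > ((4096u - vec_size) << 20);
    }

When the test fires, control goes to L(crosses_page_boundary), which loads from the enclosing aligned vector instead and shifts the resulting mask.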
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex-base.S b/sysdeps/x86_64/multiarch/strnlen-evex-base.S
new file mode 100644
@@ -0,0 +1,462 @@
+/* strnlen/wcsnlen optimized with 256/512-bit EVEX instructions.
+ Copyright (C) 2022-2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+#ifdef USE_AS_WCSLEN
+# define VPCMPEQ vpcmpeqd
+# define VPTESTN vptestnmd
+# define VPMINU vpminud
+# define CHAR_SIZE 4
+#else
+# define VPCMPEQ vpcmpeqb
+# define VPTESTN vptestnmb
+# define VPMINU vpminub
+# define CHAR_SIZE 1
+#endif
+
+#define XZERO VMM_128(0)
+#define VZERO VMM(0)
+#define PAGE_SIZE 4096
+#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+#if CHAR_PER_VEC == 32
+# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8)
+#else
+# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32)
+#endif
+
+#ifdef USE_AS_WCSLEN
+/* For the wide-character version, we care more about limiting code
+   size than about optimally aligning targets, so just cap nop
+   padding reasonably low. */
+# define P2ALIGN(...) .p2align 4,, 6
+# define P2ALIGN_CLAMPED(...) P2ALIGN(__VA_ARGS__)
+#else
+# define P2ALIGN(x) .p2align x
+# define P2ALIGN_CLAMPED(x, y) .p2align x,, y
+#endif
+
+ .section SECTION(.text), "ax", @progbits
+	/* Aligning the entry point to 64 bytes provides better performance
+	   for strings of one vector length. */
+ENTRY_P2ALIGN(STRNLEN, 6)
+	/* rdi is the pointer to the array, rsi is the upper limit. */
+
+ /* Check zero length. */
+ test %RSI_LP, %RSI_LP
+ jz L(zero)
+
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
+#endif
+
+ vpxorq %XZERO, %XZERO, %XZERO
+
+ /* Check that we won't cross a page boundary with our first load. */
+ movl %edi, %eax
+ shll $20, %eax
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+ ja L(crosses_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
+ null byte. */
+ VPCMPEQ (%rdi), %VZERO, %k0
+ KMOV %k0, %VRCX
+
+ /* If src (rcx) is zero, bsf does not change the result. NB:
+ Must use 64-bit bsf here so that upper bits of len are not
+ cleared. */
+ movq %rsi, %rax
+ bsfq %rcx, %rax
+
+ /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
+ CHAR) and rsi must be > CHAR_PER_VEC. */
+ cmpq $CHAR_PER_VEC, %rax
+ ja L(more_1x_vec)
+
+ /* Check if first match in bounds. */
+ cmpq %rax, %rsi
+ cmovb %esi, %eax
+ ret
+
+#if VEC_SIZE == 32
+ P2ALIGN_CLAMPED(4, 2)
+L(zero):
+L(max_0):
+ movl %esi, %eax
+ ret
+#endif
+
+ P2ALIGN_CLAMPED(4, 10)
+L(more_1x_vec):
+L(cross_page_continue):
+	/* After this calculation, rax stores the number of elements
+	   left to be processed.  The complexity comes from the fact that
+	   some elements get read twice due to alignment, and we need to
+	   be sure we don't count them twice (else it would just be rsi -
+	   CHAR_PER_VEC). */
+
+#ifdef USE_AS_WCSLEN
+ /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+ overflow. */
+ movq %rdi, %rax
+ andq $(VEC_SIZE * -1), %rdi
+ subq %rdi, %rax
+ sarq $2, %rax
+ leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
+#else
+	/* Calculate ptr + N - VEC_SIZE, align ptr down to VEC_SIZE, then
+	   subtract the aligned ptr to get the adjusted limit value. */
+ leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax
+ andq $(VEC_SIZE * -1), %rdi
+ subq %rdi, %rax
+#endif
+
+ VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
+
+	/* Checking here is faster for 256-bit but not 512-bit.  */
+#if VEC_SIZE == 0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(last_vec_check)
+#endif
+
+ cmpq $(CHAR_PER_VEC * 2), %rax
+ ja L(more_2x_vec)
+
+L(last_2x_vec_or_less):
+
+	/* Checking here is faster for 512-bit but not 256-bit.  */
+#if VEC_SIZE != 0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(last_vec_check)
+#endif
+
+ /* Check for the end of data. */
+ SUB_SHORT (CHAR_PER_VEC, rax)
+ jbe L(max_0)
+
+ /* Check the final remaining vector. */
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+#if VEC_SIZE == 32
+ jz L(max_0)
+#else
+ jnz L(last_vec_check)
+ P2ALIGN_CLAMPED(4, 2)
+L(zero):
+L(max_0):
+ movl %esi, %eax
+ ret
+
+#endif
+ P2ALIGN_CLAMPED(4, 4)
+L(last_vec_check):
+ bsf %VRDX, %VRDX
+ sub %eax, %edx
+ lea (%rsi, %rdx), %eax
+ cmovae %esi, %eax
+ ret
+
+
+#if VEC_SIZE == 32
+ P2ALIGN_CLAMPED(4, 8)
+#endif
+L(last_4x_vec_or_less):
+ addl $(CHAR_PER_VEC * -4), %eax
+ VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
+
+#if VEC_SIZE == 64
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(last_vec_check)
+#endif
+
+ subq $(VEC_SIZE * -4), %rdi
+ cmpl $(CHAR_PER_VEC * 2), %eax
+ jbe L(last_2x_vec_or_less)
+
+ P2ALIGN_CLAMPED(4, 6)
+L(more_2x_vec):
+ /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+ rechecking bounds. */
+
+ /* Already checked in 256-bit case */
+#if VEC_SIZE != 0
+ KMOV %k0, %VRDX
+
+ test %VRDX, %VRDX
+ jnz L(first_vec_x1)
+#endif
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+
+ test %VRDX, %VRDX
+ jnz L(first_vec_x2)
+
+ cmpq $(CHAR_PER_VEC * 4), %rax
+ ja L(more_4x_vec)
+
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ addl $(CHAR_PER_VEC * -2), %eax
+ test %VRDX, %VRDX
+ jnz L(last_vec_check)
+
+ subb $(CHAR_PER_VEC), %al
+ jbe L(max_1)
+
+ VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+
+ test %VRDX, %VRDX
+ jnz L(last_vec_check)
+L(max_1):
+ movl %esi, %eax
+ ret
+
+
+ P2ALIGN_CLAMPED(4, 14)
+L(first_vec_x2):
+#if VEC_SIZE == 64
+ /* If VEC_SIZE == 64 we can fit logic for full return label in
+ spare bytes before next cache line. */
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
+ ret
+ P2ALIGN_CLAMPED(4, 6)
+#else
+ addl $CHAR_PER_VEC, %esi
+#endif
+L(first_vec_x1):
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
+ ret
+
+#if VEC_SIZE == 64
+ P2ALIGN_CLAMPED(4, 6)
+L(first_vec_x4):
+# if VEC_SIZE == 64
+ /* If VEC_SIZE == 64 we can fit logic for full return label in
+ spare bytes before next cache line. */
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+ ret
+ P2ALIGN_CLAMPED(4, 6)
+# else
+ addl $CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+ ret
+#endif
+
+ P2ALIGN_CLAMPED(6, 20)
+L(more_4x_vec):
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(first_vec_x3)
+
+ VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(first_vec_x4)
+
+ /* Check if at last VEC_SIZE * 4 length before aligning for the
+ loop. */
+ cmpq $(CHAR_PER_VEC * 8), %rax
+ jbe L(last_4x_vec_or_less)
+
+
+ /* Compute number of words checked after aligning. */
+#ifdef USE_AS_WCSLEN
+ /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+ overflow. */
+ leaq (VEC_SIZE * -3)(%rdi), %rdx
+#else
+ leaq (VEC_SIZE * -3)(%rdi, %rax), %rax
+#endif
+
+ subq $(VEC_SIZE * -1), %rdi
+
+ /* Align data to VEC_SIZE * 4. */
+#if VEC_SIZE == 64
+	/* Saves code size.  No evex512 processor has partial register
+	   stalls.  If that changes, this can be replaced with `andq
+	   $-(VEC_SIZE * 4), %rdi`. */
+ xorb %dil, %dil
+#else
+ andq $-(VEC_SIZE * 4), %rdi
+#endif
+
+#ifdef USE_AS_WCSLEN
+ subq %rdi, %rdx
+ sarq $2, %rdx
+ addq %rdx, %rax
+#else
+ subq %rdi, %rax
+#endif
+
+ P2ALIGN(6)
+L(loop):
+ /* VPMINU and VPCMP combination provide better performance as
+ compared to alternative combinations. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
+ VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+ VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
+ VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+
+ VPTESTN %VMM(2), %VMM(2), %k0
+ VPTESTN %VMM(4), %VMM(4), %k1
+
+ subq $-(VEC_SIZE * 4), %rdi
+ KORTEST %k0, %k1
+
+ jnz L(loopend)
+ subq $(CHAR_PER_VEC * 4), %rax
+ ja L(loop)
+ mov %rsi, %rax
+ ret
+
+
+#if VEC_SIZE == 32
+ P2ALIGN_CLAMPED(4, 6)
+L(first_vec_x4):
+# if VEC_SIZE == 64
+ /* If VEC_SIZE == 64 we can fit logic for full return label in
+ spare bytes before next cache line. */
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+ ret
+ P2ALIGN_CLAMPED(4, 6)
+# else
+ addl $CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+ ret
+#endif
+
+
+ P2ALIGN_CLAMPED(4, 11)
+L(loopend):
+ /* We found a null terminator in one of the 4 vectors. */
+
+ /* Check the first vector. */
+ movq %rax, %r8
+ VPTESTN %VMM(1), %VMM(1), %k2
+ KMOV %k2, %VRCX
+ bsf %rcx, %r8
+
+ cmpq $(CHAR_PER_VEC), %r8
+ jbe L(end_vec)
+
+ /* Check the second vector. */
+ subq $(CHAR_PER_VEC), %rax
+ movq %rax, %r8
+ KMOV %k0, %VRCX
+ bsf %rcx, %r8
+
+ cmpq $(CHAR_PER_VEC), %r8
+ jbe L(end_vec)
+
+ /* Check the third vector. */
+ subq $(CHAR_PER_VEC), %rax
+ movq %rax, %r8
+ VPTESTN %VMM(3), %VMM(3), %k2
+ KMOV %k2, %VRCX
+ bsf %rcx, %r8
+
+ cmpq $(CHAR_PER_VEC), %r8
+ jbe L(end_vec)
+
+ /* It is in the fourth vector. */
+ subq $(CHAR_PER_VEC), %rax
+ movq %rax, %r8
+ KMOV %k1, %VRCX
+ bsf %rcx, %r8
+
+ P2ALIGN_CLAMPED(4, 3)
+L(end_vec):
+	/* Get the number of [w]chars processed before the vector with
+	   the null. */
+ movq %rsi, %rcx
+ subq %rax, %rcx
+
+ /* Add that to the offset we found the null terminator at. */
+ leaq (%r8, %rcx), %rax
+
+ /* Take the min of that and the limit. */
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+ ret
+
+ P2ALIGN_CLAMPED(4, 11)
+L(crosses_page_boundary):
+ /* Align data backwards to VEC_SIZE. */
+ shrl $20, %eax
+ movq %rdi, %rcx
+ andq $-VEC_SIZE, %rcx
+ VPCMPEQ (%rcx), %VZERO, %k0
+
+ KMOV %k0, %VRCX
+#ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+ andl $(CHAR_PER_VEC - 1), %eax
+#endif
+	/* By this point rax contains the number of [w]chars we need to
+	   skip. */
+ shrx %VRAX, %VRCX, %VRCX
+
+ /* Calculates CHAR_PER_VEC - eax and stores in eax. */
+ negl %eax
+ andl $(CHAR_PER_VEC - 1), %eax
+
+ movq %rsi, %rdx
+ bsf %VRCX, %VRDX
+ cmpq %rax, %rdx
+ ja L(cross_page_continue)
+
+ /* The vector had a null terminator or we are at the limit. */
+ movl %edx, %eax
+ cmpq %rdx, %rsi
+ cmovb %esi, %eax
+ ret
+
+END(STRNLEN)
+#endif
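A note on the hot loop above: it reads four vectors per iteration and folds them pairwise with VPMINU so that two VPTESTN results, combined by a single KORTEST, cover all four. A rough intrinsics rendering of one VEC_SIZE == 64 iteration (illustrative only; assumes AVX-512BW and a 64-byte-aligned pointer):

    #include <immintrin.h>
    #include <stdint.h>

    /* Models one iteration of L(loop) above for 64-byte vectors.  */
    static int
    null_in_next_4_vecs (const uint8_t *p)
    {
      __m512i v1 = _mm512_load_si512 (p);
      __m512i m12 = _mm512_min_epu8 (v1, _mm512_load_si512 (p + 64));  /* VPMINU */
      __m512i v3 = _mm512_load_si512 (p + 128);
      __m512i m34 = _mm512_min_epu8 (v3, _mm512_load_si512 (p + 192));

      /* VPTESTN sets mask bit i iff byte i ANDed with itself is zero.  */
      __mmask64 k0 = _mm512_testn_epi8_mask (m12, m12);
      __mmask64 k1 = _mm512_testn_epi8_mask (m34, m34);

      return (k0 | k1) != 0;    /* KORTEST %k0, %k1; jnz L(loopend) */
    }

Since the unsigned byte minimum of two vectors is zero exactly where either input byte is zero, one test per pair suffices; L(loopend) then re-tests the individual vectors to locate the null.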
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
--- a/sysdeps/x86_64/multiarch/strnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
@@ -1,423 +1,7 @@
-/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
- Copyright (C) 2022-2024 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <isa-level.h>
-#include <sysdep.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# ifndef VEC_SIZE
-# include "x86-evex256-vecs.h"
-# endif
-
-
-# ifndef STRNLEN
-# define STRNLEN __strnlen_evex
-# endif
-
-# ifdef USE_AS_WCSLEN
-# define VPCMPEQ vpcmpeqd
-# define VPCMPNEQ vpcmpneqd
-# define VPTESTN vptestnmd
-# define VPTEST vptestmd
-# define VPMINU vpminud
-# define CHAR_SIZE 4
-
-# else
-# define VPCMPEQ vpcmpeqb
-# define VPCMPNEQ vpcmpneqb
-# define VPTESTN vptestnmb
-# define VPTEST vptestmb
-# define VPMINU vpminub
-# define CHAR_SIZE 1
-
-# define REG_WIDTH VEC_SIZE
-# endif
-
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-# include "reg-macros.h"
-
-# if CHAR_PER_VEC == 32
-# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8)
-# else
-# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32)
-# endif
-
-
-
-# if CHAR_PER_VEC == 64
-# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
-# else
-# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
-# endif
-
-
-# define XZERO VMM_128(0)
-# define VZERO VMM(0)
-# define PAGE_SIZE 4096
-
- .section SECTION(.text), "ax", @progbits
-ENTRY_P2ALIGN (STRNLEN, 6)
- /* Check zero length. */
- test %RSI_LP, %RSI_LP
- jz L(zero)
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %esi, %esi
-# endif
-
- movl %edi, %eax
- vpxorq %XZERO, %XZERO, %XZERO
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- ja L(cross_page_boundary)
-
- /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
- null byte. */
- VPCMPEQ (%rdi), %VZERO, %k0
-
- KMOV %k0, %VRCX
- movq %rsi, %rax
-
- /* If src (rcx) is zero, bsf does not change the result. NB:
- Must use 64-bit bsf here so that upper bits of len are not
- cleared. */
- bsfq %rcx, %rax
- /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
- CHAR) and rsi must be > CHAR_PER_VEC. */
- cmpq $CHAR_PER_VEC, %rax
- ja L(more_1x_vec)
- /* Check if first match in bounds. */
- cmpq %rax, %rsi
- cmovb %esi, %eax
- ret
-
-
-# if CHAR_PER_VEC != 32
- .p2align 4,, 2
-L(zero):
-L(max_0):
- movl %esi, %eax
- ret
-# endif
-
- /* Aligned more for strnlen compares remaining length vs 2 *
- CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
- going to the loop. */
- .p2align 4,, 10
-L(more_1x_vec):
-L(cross_page_continue):
- /* Compute number of words checked after aligning. */
-# ifdef USE_AS_WCSLEN
- /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
- overflow. */
- movq %rdi, %rax
- andq $(VEC_SIZE * -1), %rdi
- subq %rdi, %rax
- sarq $2, %rax
- leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
-# else
- leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax
- andq $(VEC_SIZE * -1), %rdi
- subq %rdi, %rax
-# endif
-
-
- VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
-
- cmpq $(CHAR_PER_VEC * 2), %rax
- ja L(more_2x_vec)
-
-L(last_2x_vec_or_less):
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jnz L(last_vec_check)
-
- /* Check the end of data. */
- SUB_SHORT (CHAR_PER_VEC, rax)
- jbe L(max_0)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jz L(max_0)
- /* Best place for LAST_VEC_CHECK if ZMM. */
- .p2align 4,, 8
-L(last_vec_check):
- bsf %VRDX, %VRDX
- sub %eax, %edx
- lea (%rsi, %rdx), %eax
- cmovae %esi, %eax
- ret
-
-# if CHAR_PER_VEC == 32
- .p2align 4,, 2
-L(zero):
-L(max_0):
- movl %esi, %eax
- ret
-# endif
-
- .p2align 4,, 8
-L(last_4x_vec_or_less):
- addl $(CHAR_PER_VEC * -4), %eax
- VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
- subq $(VEC_SIZE * -4), %rdi
- cmpl $(CHAR_PER_VEC * 2), %eax
- jbe L(last_2x_vec_or_less)
-
- .p2align 4,, 6
-L(more_2x_vec):
- /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
- rechecking bounds. */
-
- KMOV %k0, %VRDX
-
- test %VRDX, %VRDX
- jnz L(first_vec_x1)
-
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jnz L(first_vec_x2)
-
- cmpq $(CHAR_PER_VEC * 4), %rax
- ja L(more_4x_vec)
-
-
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
- KMOV %k0, %VRDX
- addl $(CHAR_PER_VEC * -2), %eax
- test %VRDX, %VRDX
- jnz L(last_vec_check)
-
- subl $(CHAR_PER_VEC), %eax
- jbe L(max_1)
-
- VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
- KMOV %k0, %VRDX
-
- test %VRDX, %VRDX
- jnz L(last_vec_check)
-L(max_1):
- movl %esi, %eax
- ret
-
- .p2align 4,, 3
-L(first_vec_x2):
-# if VEC_SIZE == 64
- /* If VEC_SIZE == 64 we can fit logic for full return label in
- spare bytes before next cache line. */
- bsf %VRDX, %VRDX
- sub %eax, %esi
- leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
- ret
- .p2align 4,, 6
-# else
- addl $CHAR_PER_VEC, %esi
-# endif
-L(first_vec_x1):
- bsf %VRDX, %VRDX
- sub %eax, %esi
- leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
- ret
-
-
- .p2align 4,, 6
-L(first_vec_x4):
-# if VEC_SIZE == 64
- /* If VEC_SIZE == 64 we can fit logic for full return label in
- spare bytes before next cache line. */
- bsf %VRDX, %VRDX
- sub %eax, %esi
- leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
- ret
- .p2align 4,, 6
-# else
- addl $CHAR_PER_VEC, %esi
-# endif
-L(first_vec_x3):
- bsf %VRDX, %VRDX
- sub %eax, %esi
- leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
- ret
-
- .p2align 4,, 5
-L(more_4x_vec):
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jnz L(first_vec_x3)
-
- VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jnz L(first_vec_x4)
-
- /* Check if at last VEC_SIZE * 4 length before aligning for the
- loop. */
- cmpq $(CHAR_PER_VEC * 8), %rax
- jbe L(last_4x_vec_or_less)
-
-
- /* Compute number of words checked after aligning. */
-# ifdef USE_AS_WCSLEN
- /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
- overflow. */
- leaq (VEC_SIZE * -3)(%rdi), %rdx
-# else
- leaq (VEC_SIZE * -3)(%rdi, %rax), %rax
-# endif
-
- subq $(VEC_SIZE * -1), %rdi
-
- /* Align data to VEC_SIZE * 4. */
-# if VEC_SIZE == 64
- /* Saves code size. No evex512 processor has partial register
- stalls. If that change this can be replaced with `andq
- $-(VEC_SIZE * 4), %rdi`. */
- xorb %dil, %dil
-# else
- andq $-(VEC_SIZE * 4), %rdi
-# endif
-
-# ifdef USE_AS_WCSLEN
- subq %rdi, %rdx
- sarq $2, %rdx
- addq %rdx, %rax
-# else
- subq %rdi, %rax
-# endif
- /* Compare 4 * VEC at a time forward. */
- .p2align 4,, 11
-L(loop_4x_vec):
- VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
- VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
- VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
- VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
- VPTESTN %VMM(2), %VMM(2), %k0
- VPTESTN %VMM(4), %VMM(4), %k2
- subq $-(VEC_SIZE * 4), %rdi
- /* Break if at end of length. */
- subq $(CHAR_PER_VEC * 4), %rax
- jbe L(loop_len_end)
-
-
- KORTEST %k0, %k2
- jz L(loop_4x_vec)
-
-
-L(loop_last_4x_vec):
- movq %rsi, %rcx
- subq %rax, %rsi
- VPTESTN %VMM(1), %VMM(1), %k1
- KMOV %k1, %VRDX
- test %VRDX, %VRDX
- jnz L(last_vec_x0)
-
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jnz L(last_vec_x1)
-
- VPTESTN %VMM(3), %VMM(3), %k0
-
- /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
- returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
- individually, for VEC_SIZE == 32 we combine them in a single
- 64-bit GPR. */
-# if CHAR_PER_VEC == 64
- KMOV %k0, %VRDX
- test %VRDX, %VRDX
- jnz L(last_vec_x2)
- KMOV %k2, %VRDX
-# else
- /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
- */
- kmovd %k2, %edx
- kmovd %k0, %eax
- salq $CHAR_PER_VEC, %rdx
- orq %rax, %rdx
-# endif
-
- /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
- */
- bsfq %rdx, %rdx
- leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
- cmpq %rax, %rcx
- cmovb %rcx, %rax
- ret
-
- /* Handle last 4x VEC after loop. All VECs have been loaded. */
- .p2align 4,, 4
-L(loop_len_end):
- KORTEST %k0, %k2
- jnz L(loop_last_4x_vec)
- movq %rsi, %rax
- ret
-
-
-# if CHAR_PER_VEC == 64
- /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
- need return label for it. */
- .p2align 4,, 8
-L(last_vec_x2):
- bsf %VRDX, %VRDX
- leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
- cmpq %rax, %rcx
- cmovb %rcx, %rax
- ret
-# endif
-
-
- .p2align 4,, 10
-L(last_vec_x1):
- addq $CHAR_PER_VEC, %rsi
-L(last_vec_x0):
- bsf %VRDX, %VRDX
- leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
- cmpq %rax, %rcx
- cmovb %rcx, %rax
- ret
-
-
- .p2align 4,, 8
-L(cross_page_boundary):
- /* Align data to VEC_SIZE. */
- movq %rdi, %rcx
- andq $-VEC_SIZE, %rcx
- VPCMPEQ (%rcx), %VZERO, %k0
-
- KMOV %k0, %VRCX
-# ifdef USE_AS_WCSLEN
- shrl $2, %eax
- andl $(CHAR_PER_VEC - 1), %eax
-# endif
- shrx %VRAX, %VRCX, %VRCX
-
- negl %eax
- andl $(CHAR_PER_VEC - 1), %eax
- movq %rsi, %rdx
- bsf %VRCX, %VRDX
- cmpq %rax, %rdx
- ja L(cross_page_continue)
- movl %edx, %eax
- cmpq %rdx, %rsi
- cmovb %esi, %eax
- ret
-END (STRNLEN)
+#ifndef STRNLEN
+#define STRNLEN __strnlen_evex
#endif
+
+#include "x86-evex256-vecs.h"
+#include "reg-macros.h"
+#include "strnlen-evex-base.S"
\ No newline at end of file
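Both thin wrappers now share the entry sequence in strnlen-evex-base.S. One idiom there worth spelling out is `movq %rsi, %rax; bsfq %rcx, %rax`, which folds "index of first null, or the limit if none" into two instructions. A hypothetical C helper with the same shape (Intel documents the BSF destination as undefined for a zero source, AMD as unchanged; like the comment in the base file, this relies on the unchanged behavior):

    #include <stdint.h>

    static uint64_t
    bsf_or_default (uint64_t mask, uint64_t def)
    {
      uint64_t res = def;
      /* bsf leaves res untouched when mask == 0 on current Intel/AMD
         parts, so def survives as the fallback.  */
      __asm__ ("bsfq %1, %0" : "+r" (res) : "r" (mask) : "cc");
      return res;
    }

This is also why the base file insists on a 64-bit bsf: a 32-bit destination write would zero the upper half of the preloaded length.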
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
index f8e55883bb..8ef54078f8 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -1,264 +1,7 @@
-/* Placeholder function, not used by any processor at the moment.
- Copyright (C) 2022-2024 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
#ifndef STRNLEN
#define STRNLEN __strnlen_evex512
#endif
#include "x86-evex512-vecs.h"
#include "reg-macros.h"
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# include <sysdep.h>
-
-# ifdef USE_AS_WCSLEN
-# define VPCMPEQ vpcmpeqd
-# define VPTESTN vptestnmd
-# define VPMINU vpminud
-# define CHAR_SIZE 4
-# else
-# define VPCMPEQ vpcmpeqb
-# define VPTESTN vptestnmb
-# define VPMINU vpminub
-# define CHAR_SIZE 1
-# endif
-
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
- .section SECTION(.text),"ax",@progbits
-/* Aligning entry point to 64 byte, provides better performance for
- one vector length string. */
-ENTRY_P2ALIGN (STRNLEN, 6)
- /* Check zero length. */
- test %RSI_LP, %RSI_LP
- jz L(ret_max)
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %esi, %esi
-# endif
-
- movl %edi, %eax
- vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
- sall $20, %eax
- cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
- ja L(page_cross)
-
- /* Compare [w]char for null, mask bit will be set for match. */
- VPCMPEQ (%rdi), %VMM(0), %k0
- KMOV %k0, %VRCX
- /* Store max length in rax. */
- mov %rsi, %rax
- /* If rcx is 0, rax will have max length. We can not use VRCX
- and VRAX here for evex256 because, upper 32 bits may be
- undefined for ecx and eax. */
- bsfq %rcx, %rax
- cmp $CHAR_PER_VEC, %rax
- ja L(align_more)
- cmpq %rax, %rsi
- cmovb %esi, %eax
- ret
-
- /* At this point vector max length reached. */
- .p2align 4,,3
-L(ret_max):
- movq %rsi, %rax
- ret
-
-L(align_more):
- mov %rdi, %rax
- /* Align rax to VEC_SIZE. */
- andq $-VEC_SIZE, %rax
- movq %rdi, %rdx
- subq %rax, %rdx
-# ifdef USE_AS_WCSLEN
- shr $2, %VRDX
-# endif
- /* At this point rdx contains [w]chars already compared. */
- leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
- /* At this point rdx contains number of w[char] needs to go.
- Now onwards rdx will keep decrementing with each compare. */
-
- /* Loop unroll 4 times for 4 vector loop. */
- VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
- subq $-VEC_SIZE, %rax
- KMOV %k0, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x1)
-
- subq $CHAR_PER_VEC, %rdx
- jbe L(ret_max)
-
- VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
- KMOV %k0, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x2)
-
- subq $CHAR_PER_VEC, %rdx
- jbe L(ret_max)
-
- VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
- KMOV %k0, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x3)
-
- subq $CHAR_PER_VEC, %rdx
- jbe L(ret_max)
-
- VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
- KMOV %k0, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x4)
-
- subq $CHAR_PER_VEC, %rdx
- jbe L(ret_max)
- /* Save pointer before 4 x VEC_SIZE alignment. */
- movq %rax, %rcx
-
- /* Align address to VEC_SIZE * 4 for loop. */
- andq $-(VEC_SIZE * 4), %rax
-
- subq %rax, %rcx
-# ifdef USE_AS_WCSLEN
- shr $2, %VRCX
-# endif
- /* rcx contains number of [w]char will be recompared due to
- alignment fixes. rdx must be incremented by rcx to offset
- alignment adjustment. */
- addq %rcx, %rdx
- /* Need jump as we don't want to add/subtract rdx for first
- iteration of 4 x VEC_SIZE aligned loop. */
-
- .p2align 4,,11
-L(loop):
- /* VPMINU and VPCMP combination provide better performance as
- compared to alternative combinations. */
- VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
- VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
- VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
- VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
-
- VPTESTN %VMM(2), %VMM(2), %k0
- VPTESTN %VMM(4), %VMM(4), %k1
-
- subq $-(VEC_SIZE * 4), %rax
- KORTEST %k0, %k1
-
- jnz L(loopend)
- subq $(CHAR_PER_VEC * 4), %rdx
- ja L(loop)
- mov %rsi, %rax
- ret
-
-L(loopend):
-
- VPTESTN %VMM(1), %VMM(1), %k2
- KMOV %k2, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x1)
-
- KMOV %k0, %VRCX
- /* At this point, if k0 is non zero, null char must be in the
- second vector. */
- test %VRCX, %VRCX
- jnz L(ret_vec_x2)
-
- VPTESTN %VMM(3), %VMM(3), %k3
- KMOV %k3, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x3)
- /* At this point null [w]char must be in the fourth vector so no
- need to check. */
- KMOV %k1, %VRCX
-
- /* Fourth, third, second vector terminating are pretty much
- same, implemented this way to avoid branching and reuse code
- from pre loop exit condition. */
-L(ret_vec_x4):
- bsf %VRCX, %VRCX
- subq %rdi, %rax
-# ifdef USE_AS_WCSLEN
- subq $-(VEC_SIZE * 3), %rax
- shrq $2, %rax
- addq %rcx, %rax
-# else
- leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
-# endif
-
- cmpq %rsi, %rax
- cmovnb %rsi, %rax
- ret
-
-L(ret_vec_x3):
- bsf %VRCX, %VRCX
- subq %rdi, %rax
-# ifdef USE_AS_WCSLEN
- subq $-(VEC_SIZE * 2), %rax
- shrq $2, %rax
- addq %rcx, %rax
-# else
- leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
-# endif
- cmpq %rsi, %rax
- cmovnb %rsi, %rax
- ret
-
-L(ret_vec_x2):
- subq $-VEC_SIZE, %rax
-L(ret_vec_x1):
- bsf %VRCX, %VRCX
- subq %rdi, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- addq %rcx, %rax
- cmpq %rsi, %rax
- cmovnb %rsi, %rax
- ret
-
-L(page_cross):
- mov %rdi, %rax
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
-# ifdef USE_AS_WCSLEN
- sarl $2, %ecx
-# endif
- /* ecx contains number of w[char] to be skipped as a result
- of address alignment. */
- andq $-VEC_SIZE, %rax
- VPCMPEQ (%rax), %VMM(0), %k0
- KMOV %k0, %VRDX
- /* Ignore number of character for alignment adjustment. */
- shr %cl, %VRDX
- jnz L(page_cross_end)
- movl $CHAR_PER_VEC, %eax
- sub %ecx, %eax
- cmp %rax, %rsi
- ja L(align_more)
-
-L(page_cross_end):
- bsf %VRDX, %VRAX
- cmpq %rsi, %rax
- cmovnb %esi, %eax
- ret
-
-END (STRNLEN)
-#endif
+#include "strnlen-evex-base.S"
\ No newline at end of file
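With the unification, each variant file is reduced to pinning the symbol name and the vector-width header before pulling in the shared body. A hypothetical C rendering of that template pattern (scalar stand-in body; names are illustrative, not glibc's):

    #include <stddef.h>

    #define DEFINE_STRNLEN(NAME, VEC_BYTES)                \
      size_t NAME (const char *s, size_t maxlen)           \
      {                                                    \
        /* Scalar stand-in; VEC_BYTES would select the     \
           load width in the real template.  */            \
        size_t i = 0;                                      \
        while (i < maxlen && s[i] != '\0')                 \
          i++;                                             \
        return i;                                          \
      }

    /* Mirrors strnlen-evex.S (256-bit) and strnlen-evex512.S (512-bit).  */
    DEFINE_STRNLEN (strnlen_evex256_ref, 32)
    DEFINE_STRNLEN (strnlen_evex512_ref, 64)

Keeping the two wrappers as separate translation units preserves the distinct __strnlen_evex and __strnlen_evex512 symbols for ifunc selection while the logic lives in one place.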