From patchwork Fri Dec 15 20:04:05 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Matthew Sterrett <matthew.sterrett@intel.com>
X-Patchwork-Id: 82272
Return-Path: <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>
X-Original-To: patchwork@sourceware.org
Delivered-To: patchwork@sourceware.org
Received: from server2.sourceware.org (localhost [IPv6:::1])
	by sourceware.org (Postfix) with ESMTP id 40CF7384CBAF
	for <patchwork@sourceware.org>; Fri, 15 Dec 2023 20:05:48 +0000 (GMT)
X-Original-To: libc-alpha@sourceware.org
Delivered-To: libc-alpha@sourceware.org
Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.10])
 by sourceware.org (Postfix) with ESMTPS id 8AA1C3858C52
 for <libc-alpha@sourceware.org>; Fri, 15 Dec 2023 20:05:30 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 8AA1C3858C52
Authentication-Results: sourceware.org;
 dmarc=pass (p=none dis=none) header.from=intel.com
Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=intel.com
ARC-Filter: OpenARC Filter v1.0.0 sourceware.org 8AA1C3858C52
Authentication-Results: server2.sourceware.org;
 arc=none smtp.remote-ip=192.198.163.10
ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1702670734; cv=none;
 b=ivfmfrDa5LUVhRzK18+2Jx66E0hQ8C1bCl6+q2+2KnsmW9VoMcaD2+TLksxsOxtrK3ExvAqhHcVjdBVwjr6cncDa5MszZN+Jw+PfN5grWyAuToOPOGVr0KwYPBDIgLIFtsJ58RKX24jlmnyDprUjsEfH+byc33MQifnGnUulZaE=
ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key;
 t=1702670734; c=relaxed/simple;
 bh=jv5YpTQI8bt2d3ITsVoyv/sBgijhSZQxzLfZG9V23+c=;
 h=DKIM-Signature:From:To:Subject:Date:Message-Id:MIME-Version;
 b=bxYdxFIGgYgfkQPuE9AxLXuxvuoqfjfwE4gl6EwQaCDcPqiCYqj43Cq9rPWsvAVtRRBY0S5z9wFu7TsZsKmhTln2JCqTFqEYQMJ16ZpNSk3lg+w6oPnbGFPDcEejPf95US2FIXQ0FqrzcB5U528J8YwPmowhybx37iZsTt/tfxw=
ARC-Authentication-Results: i=1; server2.sourceware.org
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
 d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
 t=1702670731; x=1734206731;
 h=from:to:cc:subject:date:message-id:in-reply-to:
 references:mime-version:content-transfer-encoding;
 bh=jv5YpTQI8bt2d3ITsVoyv/sBgijhSZQxzLfZG9V23+c=;
 b=NrdKEErWm+NoqxtfbwZsAbzrTaAEqmx0RxzbSmtHh27x7kPKwrkbaRl5
 jZKSpdHv7UqbvxFE/nSPRIbn48l6TpWl9I/lPtMyzZ/KhCcZEUMDovkPJ
 DjCKwvGU9gYk3kiK6VWLKnN9656yUmOg0n5iTO8dvyJ7teVO0g3i3QFE7
 CBIIq6k5ampcP6oCeDUCHaE8byaZqNB+UM9tZcc0j04t4bLt+BOfSF9W9
 LbsJ3sf7HkWYpznh92qvjuwcSie0+IqyaePSZK5oQT04tpu5t2HS24XHI
 igQMqF9UkgeeZVipeNSbSUPZ7EzOhoYdHO8gZpow79dXiHKcyBOgHURSX w==;
X-IronPort-AV: E=McAfee;i="6600,9927,10925"; a="2491266"
X-IronPort-AV: E=Sophos;i="6.04,279,1695711600";
   d="scan'208";a="2491266"
Received: from fmsmga008.fm.intel.com ([10.253.24.58])
 by fmvoesa104.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Dec 2023 12:05:30 -0800
X-ExtLoop1: 1
X-IronPort-AV: E=McAfee;i="6600,9927,10925"; a="840777057"
X-IronPort-AV: E=Sophos;i="6.04,279,1695711600"; d="scan'208";a="840777057"
Received: from raghuveer-skx.jf.intel.com ([10.54.74.147])
 by fmsmga008-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Dec 2023 12:05:29 -0800
From: Matthew Sterrett <matthew.sterrett@intel.com>
To: libc-alpha@sourceware.org
Cc: goldstein.w.n@gmail.com, hjl.tools@gmail.com, carlos@systemhalted.org,
 matthewsterrett2@gmail.com, matthew.sterrett@intel.com
Subject: [PATCH v3] x86: Unifies 'strlen-evex' and 'strlen-evex512'
 implementations.
Date: Fri, 15 Dec 2023 12:04:05 -0800
Message-Id: <20231215200405.1932246-1-matthew.sterrett@intel.com>
X-Mailer: git-send-email 2.37.2
In-Reply-To: <20231213195735.3952627-1-matthew.sterrett@intel.com>
References: <20231213195735.3952627-1-matthew.sterrett@intel.com>
MIME-Version: 1.0
X-Spam-Status: No, score=-12.4 required=5.0 tests=BAYES_00, DKIMWL_WL_HIGH,
 DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, GIT_PATCH_0,
 KAM_SHORT,
 SPF_HELO_NONE, SPF_NONE, TXREP,
 T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on
 server2.sourceware.org
X-BeenThere: libc-alpha@sourceware.org
X-Mailman-Version: 2.1.30
Precedence: list
List-Id: Libc-alpha mailing list <libc-alpha.sourceware.org>
List-Unsubscribe: <https://sourceware.org/mailman/options/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=unsubscribe>
List-Archive: <https://sourceware.org/pipermail/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-request@sourceware.org?subject=help>
List-Subscribe: <https://sourceware.org/mailman/listinfo/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=subscribe>
Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org

This commit uses a common implementation, 'strlen-evex-base.S', for both
'strlen-evex' and 'strlen-evex512'.

The motivation is to reduce the number of implementations to maintain.
This incidentally gives a small performance improvement.

All tests pass on x86.

Benchmarks were taken on SKX.
https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html

Geometric mean for strlen-evex512 over all benchmarks (N=10) was (new/old) 0.939
Geometric mean for wcslen-evex512 over all benchmarks (N=10) was (new/old) 0.965

Code Size Changes:
    strlen-evex512.S    :  +24 bytes
    wcslen-evex512.S    :  +54 bytes
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 380 ++++++++------------
 sysdeps/x86_64/multiarch/strlen-evex.S      | 250 +------------
 sysdeps/x86_64/multiarch/strnlen-evex512.S  | 266 +++++++++++++-
 sysdeps/x86_64/multiarch/wcslen-evex512.S   |   6 +-
 sysdeps/x86_64/multiarch/wcsnlen-evex512.S  |   9 +-
 5 files changed, 439 insertions(+), 472 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 7305b24e28..77dc89900a 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -1,5 +1,5 @@
-/* Placeholder function, not used by any processor at the moment.
-   Copyright (C) 2022-2023 Free Software Foundation, Inc.
+/* strlen/wcslen optimized with 256/512-bit EVEX instructions.
+   Copyright (C) 2021-2023 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,7 +16,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-/* UNUSED. Exists purely as reference implementation.  */
 
 #include <isa-level.h>
 
@@ -26,272 +25,211 @@
 
 # ifdef USE_AS_WCSLEN
 #  define VPCMPEQ	vpcmpeqd
+#  define VPCMPNEQ	vpcmpneqd
 #  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPMINU	vpminud
 #  define CHAR_SIZE	4
+#  define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
 # else
 #  define VPCMPEQ	vpcmpeqb
+#  define VPCMPNEQ	vpcmpneqb
 #  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPMINU	vpminub
 #  define CHAR_SIZE	1
+#  define CHAR_SIZE_SHIFT_REG(reg)
+
+#  define REG_WIDTH	VEC_SIZE
 # endif
 
-# define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-	.section SECTION(.text),"ax",@progbits
-/* Aligning entry point to 64 byte, provides better performance for
-   one vector length string.  */
-ENTRY_P2ALIGN (STRLEN, 6)
-# ifdef USE_AS_STRNLEN
-	/* Check zero length.  */
-	test	%RSI_LP, %RSI_LP
-	jz	L(ret_max)
-#  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL	first_vec_x2
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# else
+
+#  define TAIL_RETURN_LBL	first_vec_x3
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
 # endif
 
+# define XZERO	VMM_128(0)
+# define VZERO	VMM(0)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(STRLEN, 6)
 	movl	%edi, %eax
-	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
-	sall	$20, %eax
-	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
-	ja	L(page_cross)
-
-	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMPEQ	(%rdi), %VMM(0), %k0
-# ifdef USE_AS_STRNLEN
-	KMOV	%k0, %VRCX
-	/* Store max length in rax.  */
-	mov	%rsi, %rax
-	/* If rcx is 0, rax will have max length.  We can not use VRCX
-	   and VRAX here for evex256 because, upper 32 bits may be
-	   undefined for ecx and eax.  */
-	bsfq	%rcx, %rax
-	cmp	$CHAR_PER_VEC, %rax
-	ja	L(align_more)
-	cmpq	%rax, %rsi
-	cmovb	%esi, %eax
-# else
+	vpxorq	%XZERO, %XZERO, %XZERO
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each set bit in K0 marks a
+	   null [w]char.  */
+	VPCMPEQ	(%rdi), %VZERO, %k0
 	KMOV	%k0, %VRAX
 	test	%VRAX, %VRAX
-	jz	L(align_more)
+	jz	L(aligned_more)
 	bsf	%VRAX, %VRAX
-# endif
 	ret
 
-	/* At this point vector max length reached.  */
-# ifdef USE_AS_STRNLEN
-	.p2align 4,,3
-L(ret_max):
-	movq	%rsi, %rax
+	.p2align 4,, 8
+L(first_vec_x4):
+	bsf	%VRAX, %VRAX
+	subl	%ecx, %edi
+	CHAR_SIZE_SHIFT_REG (edi)
+	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
 	ret
-# endif
 
-L(align_more):
-	mov	%rdi, %rax
-	/* Align rax to VEC_SIZE.  */
-	andq	$-VEC_SIZE, %rax
-# ifdef USE_AS_STRNLEN
-	movq	%rdi, %rdx
-	subq	%rax, %rdx
-#  ifdef USE_AS_WCSLEN
-	shr	$2, %VRDX
-#  endif
-	/* At this point rdx contains [w]chars already compared.  */
-	leaq	-CHAR_PER_VEC(%rsi, %rdx), %rdx
-	/* At this point rdx contains number of w[char] needs to go.
-	   Now onwards rdx will keep decrementing with each compare.  */
-# endif
-
-	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
-	subq	$-VEC_SIZE, %rax
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x1)
 
-# ifdef USE_AS_STRNLEN
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-# endif
 
-	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x2)
+	/* Save the original pointer in rcx (the return paths subtract
+	   it from rdi to compute the length) and align rdi down to
+	   VEC_SIZE before checking the next four vectors.  */
+	.p2align 4,, 10
+L(aligned_more):
+	movq	%rdi, %rcx
+	andq	$(VEC_SIZE * -1), %rdi
+L(cross_page_continue):
+	/* rdi is VEC_SIZE aligned and rcx holds the original pointer.
+	   Check the next four vectors one at a time.  */
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
 
-# ifdef USE_AS_STRNLEN
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-# endif
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
 
-	VPCMPEQ	(VEC_SIZE * 2)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x3)
 
-# ifdef USE_AS_STRNLEN
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-# endif
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x4)
 
-	VPCMPEQ	(VEC_SIZE * 3)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x4)
+	subq	$(VEC_SIZE * -1), %rdi
 
-# ifdef USE_AS_STRNLEN
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-	/* Save pointer before 4 x VEC_SIZE alignment.  */
-	movq	%rax, %rcx
+# if CHAR_PER_VEC == 64
+	/* No partial register stalls on processors that we use evex512
+	   on and this saves code size.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
 # endif
 
-	/* Align address to VEC_SIZE * 4 for loop.  */
-	andq	$-(VEC_SIZE * 4), %rax
-
-# ifdef USE_AS_STRNLEN
-	subq	%rax, %rcx
-#  ifdef USE_AS_WCSLEN
-	shr	$2, %VRCX
-#  endif
-	/* rcx contains number of [w]char will be recompared due to
-	   alignment fixes.  rdx must be incremented by rcx to offset
-	   alignment adjustment.  */
-	addq	%rcx, %rdx
-	/* Need jump as we don't want to add/subtract rdx for first
-	   iteration of 4 x VEC_SIZE aligned loop.  */
-# endif
 
-	.p2align 4,,11
-L(loop):
-	/* VPMINU and VPCMP combination provide better performance as
-	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
 
+	/* Compare 4 * VEC at a time forward.  */
+	.p2align 4
+L(loop_4x_vec):
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
 	VPTESTN	%VMM(2), %VMM(2), %k0
-	VPTESTN	%VMM(4), %VMM(4), %k1
+	VPTESTN	%VMM(4), %VMM(4), %k2
 
-	subq	$-(VEC_SIZE * 4), %rax
-	KORTEST	%k0, %k1
+	subq	$-(VEC_SIZE * 4), %rdi
+	KORTEST	%k0, %k2
+	jz	L(loop_4x_vec)
 
-# ifndef USE_AS_STRNLEN
-	jz      L(loop)
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x0)
+
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
+
+	VPTESTN	%VMM(3), %VMM(3), %k0
+
+# if CHAR_PER_VEC == 64
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+	KMOV	%k2, %VRAX
 # else
-	jnz	L(loopend)
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	ja	L(loop)
-	mov	%rsi, %rax
+	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.  */
+	kmovd	%k2, %edx
+	kmovd	%k0, %eax
+	salq	$CHAR_PER_VEC, %rdx
+	orq	%rdx, %rax
+# endif
+
+	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.  */
+	.p2align 4,, 2
+L(FALLTHROUGH_RETURN_LBL):
+	bsfq	%rax, %rax
+	subq	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
 	ret
-# endif
 
-L(loopend):
-
-	VPTESTN	%VMM(1), %VMM(1), %k2
-	KMOV	%k2, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x1)
-
-	KMOV	%k0, %VRCX
-	/* At this point, if k0 is non zero, null char must be in the
-	   second vector.  */
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x2)
-
-	VPTESTN	%VMM(3), %VMM(3), %k3
-	KMOV	%k3, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x3)
-	/* At this point null [w]char must be in the fourth vector so no
-	   need to check.  */
-	KMOV	%k1, %VRCX
-
-	/* Fourth, third, second vector terminating are pretty much
-	   same, implemented this way to avoid branching and reuse code
-	   from pre loop exit condition.  */
-L(ret_vec_x4):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	subq	$-(VEC_SIZE * 3), %rax
-	shrq	$2, %rax
-	addq	%rcx, %rax
-# else
-	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
-# endif
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-# endif
+	.p2align 4,, 8
+L(first_vec_x0):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	addq	%rdi, %rax
 	ret
 
-L(ret_vec_x3):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	subq	$-(VEC_SIZE * 2), %rax
-	shrq	$2, %rax
-	addq	%rcx, %rax
-# else
-	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
-# endif
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-# endif
+	.p2align 4,, 10
+L(first_vec_x1):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 	ret
 
-L(ret_vec_x2):
-	subq	$-VEC_SIZE, %rax
-L(ret_vec_x1):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-# endif
-	addq	%rcx, %rax
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-# endif
+	.p2align 4,, 10
+	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.  */
+L(TAIL_RETURN_LBL):
+	bsf	%VRAX, %VRAX
+	sub	%VRCX, %VRDI
+	CHAR_SIZE_SHIFT_REG (VRDI)
+	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
 	ret
 
-L(page_cross):
-	mov	%rdi, %rax
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
+	.p2align 4,, 8
+L(cross_page_boundary):
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+
+	VPCMPEQ	(%rdi), %VZERO, %k0
+
+	KMOV	%k0, %VRAX
 # ifdef USE_AS_WCSLEN
-	sarl	$2, %ecx
-# endif
-	/* ecx contains number of w[char] to be skipped as a result
-	   of address alignment.  */
-	andq	$-VEC_SIZE, %rax
-	VPCMPEQ	(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRDX
-	/* Ignore number of character for alignment adjustment.  */
-	shr	%cl, %VRDX
-# ifdef USE_AS_STRNLEN
-	jnz	L(page_cross_end)
-	movl    $CHAR_PER_VEC, %eax
-	sub     %ecx, %eax
-	cmp	%rax, %rsi
-	ja	L(align_more)
+	movl	%ecx, %edx
+	shrl	$2, %edx
+	andl	$(CHAR_PER_VEC - 1), %edx
+	shrx	%edx, %eax, %eax
+	testl	%eax, %eax
 # else
-	jz	L(align_more)
-# endif
-
-L(page_cross_end):
-	bsf	%VRDX, %VRAX
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%esi, %eax
+	shr	%cl, %VRAX
 # endif
+	jz	L(cross_page_continue)
+	bsf	%VRAX, %VRAX
 	ret
 
-END (STRLEN)
+END(STRLEN)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 364eeffff6..93ad15e356 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -1,245 +1,7 @@
-/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
-   Copyright (C) 2021-2023 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# include <sysdep.h>
-
-# ifndef STRLEN
-#  define STRLEN	__strlen_evex
-# endif
-
-# ifndef VEC_SIZE
-#  include "x86-evex256-vecs.h"
-# endif
-
-# ifdef USE_AS_WCSLEN
-#  define VPCMPEQ	vpcmpeqd
-#  define VPCMPNEQ	vpcmpneqd
-#  define VPTESTN	vptestnmd
-#  define VPTEST	vptestmd
-#  define VPMINU	vpminud
-#  define CHAR_SIZE	4
-#  define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
-# else
-#  define VPCMPEQ	vpcmpeqb
-#  define VPCMPNEQ	vpcmpneqb
-#  define VPTESTN	vptestnmb
-#  define VPTEST	vptestmb
-#  define VPMINU	vpminub
-#  define CHAR_SIZE	1
-#  define CHAR_SIZE_SHIFT_REG(reg)
-
-#  define REG_WIDTH	VEC_SIZE
-# endif
-
-# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
-
-# include "reg-macros.h"
-
-# if CHAR_PER_VEC == 64
-
-#  define TAIL_RETURN_LBL	first_vec_x2
-#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
-
-#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
-#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
-
-# else
-
-#  define TAIL_RETURN_LBL	first_vec_x3
-#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
-
-#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
-#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
-# endif
-
-# define XZERO	VMM_128(0)
-# define VZERO	VMM(0)
-# define PAGE_SIZE	4096
-
-	.section SECTION(.text), "ax", @progbits
-ENTRY_P2ALIGN (STRLEN, 6)
-	movl	%edi, %eax
-	vpxorq	%XZERO, %XZERO, %XZERO
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	ja	L(cross_page_boundary)
-
-	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
-	   null byte.  */
-	VPCMPEQ	(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jz	L(aligned_more)
-	bsf	%VRAX, %VRAX
-	ret
-
-	.p2align 4,, 8
-L(first_vec_x4):
-	bsf	%VRAX, %VRAX
-	subl	%ecx, %edi
-	CHAR_SIZE_SHIFT_REG (edi)
-	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
-	ret
-
-
-
-	/* Aligned more for strnlen compares remaining length vs 2 *
-	   CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
-	   going to the loop.  */
-	.p2align 4,, 10
-L(aligned_more):
-	movq	%rdi, %rcx
-	andq	$(VEC_SIZE * -1), %rdi
-L(cross_page_continue):
-	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
-	   rechecking bounds.  */
-	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x1)
-
-	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x2)
-
-	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x3)
-
-	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x4)
-
-	subq	$(VEC_SIZE * -1), %rdi
-
-# if CHAR_PER_VEC == 64
-	/* No partial register stalls on processors that we use evex512
-	   on and this saves code size.  */
-	xorb	%dil, %dil
-# else
-	andq	$-(VEC_SIZE * 4), %rdi
-# endif
-
-
-
-	/* Compare 4 * VEC at a time forward.  */
-	.p2align 4
-L(loop_4x_vec):
-	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
-	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
-	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
-	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
-	VPTESTN	%VMM(2), %VMM(2), %k0
-	VPTESTN	%VMM(4), %VMM(4), %k2
-
-	subq	$-(VEC_SIZE * 4), %rdi
-	KORTEST %k0, %k2
-	jz	L(loop_4x_vec)
-
-	VPTESTN	%VMM(1), %VMM(1), %k1
-	KMOV	%k1, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x0)
-
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x1)
-
-	VPTESTN	%VMM(3), %VMM(3), %k0
-
-# if CHAR_PER_VEC == 64
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x2)
-	KMOV	%k2, %VRAX
-# else
-	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
-	 */
-	kmovd	%k2, %edx
-	kmovd	%k0, %eax
-	salq	$CHAR_PER_VEC, %rdx
-	orq	%rdx, %rax
-# endif
-
-	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
-	 */
-	.p2align 4,, 2
-L(FALLTHROUGH_RETURN_LBL):
-	bsfq	%rax, %rax
-	subq	%rcx, %rdi
-	CHAR_SIZE_SHIFT_REG (rdi)
-	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
-	ret
-
-	.p2align 4,, 8
-L(first_vec_x0):
-	bsf	%VRAX, %VRAX
-	sub	%rcx, %rdi
-	CHAR_SIZE_SHIFT_REG (rdi)
-	addq	%rdi, %rax
-	ret
-
-	.p2align 4,, 10
-L(first_vec_x1):
-	bsf	%VRAX, %VRAX
-	sub	%rcx, %rdi
-	CHAR_SIZE_SHIFT_REG (rdi)
-	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
-	ret
-
-	.p2align 4,, 10
-	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
-	 */
-L(TAIL_RETURN_LBL):
-	bsf	%VRAX, %VRAX
-	sub	%VRCX, %VRDI
-	CHAR_SIZE_SHIFT_REG (VRDI)
-	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
-	ret
-
-	.p2align 4,, 8
-L(cross_page_boundary):
-	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE.  */
-	andq	$-VEC_SIZE, %rdi
-
-	VPCMPEQ	(%rdi), %VZERO, %k0
-
-	KMOV	%k0, %VRAX
-# ifdef USE_AS_WCSLEN
-	movl	%ecx, %edx
-	shrl	$2, %edx
-	andl	$(CHAR_PER_VEC - 1), %edx
-	shrx	%edx, %eax, %eax
-	testl	%eax, %eax
-# else
-	shr	%cl, %VRAX
-# endif
-	jz	L(cross_page_continue)
-	bsf	%VRAX, %VRAX
-	ret
-
-END (STRLEN)
+#ifndef STRLEN
+# define STRLEN		__strlen_evex
 #endif
+
+#include "x86-evex256-vecs.h"
+#include "reg-macros.h"
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
index 0b7f220214..ebf22c259f 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -1,4 +1,264 @@
-#define STRLEN __strnlen_evex512
-#define USE_AS_STRNLEN 1
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
 
-#include "strlen-evex512.S"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRNLEN
+#define STRNLEN __strnlen_evex512
+#endif
+
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMPEQ	vpcmpeqd
+#  define VPTESTN	vptestnmd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPTESTN	vptestnmb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text),"ax",@progbits
+/* Aligning entry point to 64 byte, provides better performance for
+   one vector length string.  */
+ENTRY_P2ALIGN (STRNLEN, 6)
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(ret_max)
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#  endif
+
+	movl	%edi, %eax
+	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(page_cross)
+
+	/* Compare [w]char for null, mask bit will be set for match.  */
+	VPCMPEQ	(%rdi), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Store max length in rax.  */
+	mov	%rsi, %rax
+	/* If rcx is 0, rax will have max length.  We can not use VRCX
+	   and VRAX here for evex256 because, upper 32 bits may be
+	   undefined for ecx and eax.  */
+	bsfq	%rcx, %rax
+	cmp	$CHAR_PER_VEC, %rax
+	ja	L(align_more)
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+	ret
+
+	/* At this point vector max length reached.  */
+	.p2align 4,,3
+L(ret_max):
+	movq	%rsi, %rax
+	ret
+
+L(align_more):
+	mov	%rdi, %rax
+	/* Align rax to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rax
+	movq	%rdi, %rdx
+	subq	%rax, %rdx
+#  ifdef USE_AS_WCSLEN
+	shr	$2, %VRDX
+#  endif
+	/* At this point rdx contains [w]chars already compared.  */
+	leaq	-CHAR_PER_VEC(%rsi, %rdx), %rdx
+	/* rdx now holds the number of [w]chars still to be checked.
+	   From here on it is decremented after each compare.  */
+
+	/* Loop unroll 4 times for 4 vector loop.  */
+	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
+	subq	$-VEC_SIZE, %rax
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+
+	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+	/* Save pointer before 4 x VEC_SIZE alignment.  */
+	movq	%rax, %rcx
+
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rax
+
+	subq	%rax, %rcx
+#  ifdef USE_AS_WCSLEN
+	shr	$2, %VRCX
+#  endif
+	/* rcx contains the number of [w]chars that will be recompared
+	   because of the alignment fix.  rdx must be incremented by
+	   rcx to compensate for that adjustment.  */
+	addq	%rcx, %rdx
+	/* Fall through into the 4 x VEC_SIZE aligned loop; rdx is only
+	   decremented after each iteration's null check.  */
+
+	.p2align 4,,11
+L(loop):
+	/* VPMINU and VPCMP combination provide better performance as
+	   compared to alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k1
+
+	subq	$-(VEC_SIZE * 4), %rax
+	KORTEST	%k0, %k1
+
+	jnz	L(loopend)
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop)
+	mov	%rsi, %rax
+	ret
+
+L(loopend):
+
+	VPTESTN	%VMM(1), %VMM(1), %k2
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+
+	KMOV	%k0, %VRCX
+	/* At this point, if k0 is non zero, null char must be in the
+	   second vector.  */
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VPTESTN	%VMM(3), %VMM(3), %k3
+	KMOV	%k3, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+	/* At this point null [w]char must be in the fourth vector so no
+	   need to check.  */
+	KMOV	%k1, %VRCX
+
+	/* Fourth, third, second vector terminating are pretty much
+	   same, implemented this way to avoid branching and reuse code
+	   from pre loop exit condition.  */
+L(ret_vec_x4):
+	bsf	%VRCX, %VRCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 3), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+	ret
+
+L(ret_vec_x3):
+	bsf	%VRCX, %VRCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 2), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+	ret
+
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rax
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	addq	%rcx, %rax
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+	ret
+
+L(page_cross):
+	mov	%rdi, %rax
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
+	sarl	$2, %ecx
+# endif
+	/* ecx contains number of w[char] to be skipped as a result
+	   of address alignment.  */
+	andq	$-VEC_SIZE, %rax
+	VPCMPEQ	(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRDX
+	/* Ignore number of character for alignment adjustment.  */
+	shr	%cl, %VRDX
+	jnz	L(page_cross_end)
+	movl    $CHAR_PER_VEC, %eax
+	sub     %ecx, %eax
+	cmp	%rax, %rsi
+	ja	L(align_more)
+
+L(page_cross_end):
+	bsf	%VRDX, %VRAX
+	cmpq	%rsi, %rax
+	cmovnb	%esi, %eax
+	ret
+
+END (STRNLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
index f59c372b78..aff288a66b 100644
--- a/sysdeps/x86_64/multiarch/wcslen-evex512.S
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -1,4 +1,8 @@
-#define STRLEN __wcslen_evex512
+#ifndef WCSLEN
+# define WCSLEN	__wcslen_evex512
+#endif
+
+#define STRLEN WCSLEN
 #define USE_AS_WCSLEN 1
 
 #include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
index 73dcf2f210..1c37d74fc9 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -1,5 +1,8 @@
-#define STRLEN __wcsnlen_evex512
+#ifndef WCSNLEN
+# define WCSNLEN	__wcsnlen_evex512
+#endif
+
+#define STRNLEN	WCSNLEN
 #define USE_AS_WCSLEN 1
-#define USE_AS_STRNLEN 1
 
-#include "strlen-evex512.S"
+#include "strnlen-evex512.S"