From patchwork Fri Apr 23 19:56:24 2021
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Noah Goldstein <goldstein.w.n@gmail.com>
X-Patchwork-Id: 43132
Return-Path: <libc-alpha-bounces@sourceware.org>
X-Original-To: patchwork@sourceware.org
Delivered-To: patchwork@sourceware.org
Received: from server2.sourceware.org (localhost [IPv6:::1])
	by sourceware.org (Postfix) with ESMTP id 7610939CE435;
	Fri, 23 Apr 2021 19:56:38 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 7610939CE435
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org;
	s=default; t=1619207798;
	bh=tB9Y/uQZ2ayk0O+ud0Q5nJDH/h3CGb04xoG/4UENf+I=;
	h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe:
	 List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To:
	 From;
	b=sxFye8w9ftVcEWZc3V4toE+8B7PRcqCtXQXBOdwz11HlW6KxgnNh51wG7W1nHuzJ0
	 xThUoqRVgLv0G/J7+erP6tDQHBkcBz+gLIwUKgOP9aSP73is5kEmVkZxnmalysRFg8
	 ktdhL1t0QG6oHGoBsKJs13Dnd8mJ97fbeE7sBBjM=
X-Original-To: libc-alpha@sourceware.org
Delivered-To: libc-alpha@sourceware.org
Received: from mail-qk1-x735.google.com (mail-qk1-x735.google.com
 [IPv6:2607:f8b0:4864:20::735])
 by sourceware.org (Postfix) with ESMTPS id 9C6B3393C853
 for <libc-alpha@sourceware.org>; Fri, 23 Apr 2021 19:56:33 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org 9C6B3393C853
Received: by mail-qk1-x735.google.com with SMTP id s5so42038649qkj.5
 for <libc-alpha@sourceware.org>; Fri, 23 Apr 2021 12:56:33 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20161025;
 h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
 :references:mime-version:content-transfer-encoding;
 bh=tB9Y/uQZ2ayk0O+ud0Q5nJDH/h3CGb04xoG/4UENf+I=;
 b=iaPZUcQwba+ENbOvxdZ1q5eHULnJzGSsBh/mpqQkwd3YexhLTOb83gi7VuvtDV2xfI
 8aFhek+8qidd1Li7odvjeOymDbqvob0CNZ2RwlHvowzMoptH7uUZaKXmGNw1ecIsIKxG
 cWs94M/I02CVK/S9JQN83NGeYbqJtfvVY/yMYYMKvyMCGa56TiT6lSvrOxjUt9XvgxKZ
 Uxizr7pQCCWbv+yHOtAeNKIF4HXyS9m/YtoNnrgjwqCBNITKPj3vMp/eVKTnPXIoiRHK
 JwmGsKP7l8P4hZY5PC5gFYu8y6wt1K0ovoDELMh87zVr0aQ/5WjdLop49dOpgWeDN6pO
 xWaA==
X-Gm-Message-State: AOAM532zi6wFy809ZFTYm0N5MKOTb5wMFyEDgUCcfgl116i2u2eM/5xV
 Y+DIHDTErbVFIV49O/6d9BrrnwryFp4=
X-Google-Smtp-Source: 
 ABdhPJxp119ZAhGV6Hi6EN7HzBWYtCB5JwmQFFF6BvT5bw0TZE0udWng9ggACwyO7EVUx1fy6TbmQA==
X-Received: by 2002:a37:ec9:: with SMTP id 192mr5820391qko.328.1619207792958;
 Fri, 23 Apr 2021 12:56:32 -0700 (PDT)
Received: from localhost.localdomain
 ([2600:1009:b054:2521:5e92:4ab7:a8bb:883b])
 by smtp.googlemail.com with ESMTPSA id
 u184sm5030995qkd.82.2021.04.23.12.56.32
 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
 Fri, 23 Apr 2021 12:56:32 -0700 (PDT)
To: libc-alpha@sourceware.org
Subject: [PATCH v4 1/2] x86: Optimize strchr-avx2.S
Date: Fri, 23 Apr 2021 15:56:24 -0400
Message-Id: <20210423195625.2871522-1-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.29.2
In-Reply-To: <20210421213951.404588-1-goldstein.w.n@gmail.com>
References: <20210421213951.404588-1-goldstein.w.n@gmail.com>
MIME-Version: 1.0
X-Spam-Status: No, score=-12.6 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0,
 RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS,
 TXREP autolearn=ham autolearn_force=no version=3.4.2
X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on
 server2.sourceware.org
X-BeenThere: libc-alpha@sourceware.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Libc-alpha mailing list <libc-alpha.sourceware.org>
List-Unsubscribe: <https://sourceware.org/mailman/options/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=unsubscribe>
List-Archive: <https://sourceware.org/pipermail/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-request@sourceware.org?subject=help>
List-Subscribe: <https://sourceware.org/mailman/listinfo/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=subscribe>
X-Patchwork-Original-From: Noah Goldstein via Libc-alpha
 <libc-alpha@sourceware.org>
From: Noah Goldstein <goldstein.w.n@gmail.com>
Reply-To: Noah Goldstein <goldstein.w.n@gmail.com>
Errors-To: libc-alpha-bounces@sourceware.org
Sender: "Libc-alpha" <libc-alpha-bounces@sourceware.org>

No bug. This commit optimizes strchr-avx2.S. The optimizations are all
small things such as saving an ALU instruction in the alignment process,
saving a few instructions in the loop return, saving some bytes in the
main loop, and increasing the ILP in the return cases. test-strchr,
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strchr-avx2.S | 286 +++++++++++++++----------
 1 file changed, 169 insertions(+), 117 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 25bec38b5d..413942b96a 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -49,132 +49,144 @@
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRCHR)
-	movl	%edi, %ecx
-# ifndef USE_AS_STRCHRNUL
-	xorl	%edx, %edx
-# endif
-
 	/* Broadcast CHAR to YMM0.	*/
 	vmovd	%esi, %xmm0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	VPBROADCAST	%xmm0, %ymm0
 	vpxor	%xmm9, %xmm9, %xmm9
-	VPBROADCAST %xmm0, %ymm0
 
 	/* Check if we cross page boundary with one vector load.  */
-	andl	$(PAGE_SIZE - 1), %ecx
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
-	ja  L(cross_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 	   null byte.  */
 	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jz	L(more_vecs)
+	jz	L(aligned_more)
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-	addq	%rdi, %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
-
-	.p2align 4
-L(more_vecs):
-	/* Align data for aligned loads in the loop.  */
-	andq	$-VEC_SIZE, %rdi
-L(aligned_more):
-
-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.	*/
-	vmovdqa	VEC_SIZE(%rdi), %ymm8
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	vmovdqa	VEC_SIZE(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jz	L(prep_loop_4x)
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0. As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x4):
 	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(first_vec_x0):
-	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-	addq	%rdi, %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
-# endif
+L(zero):
+	xorl	%eax, %eax
 	VZEROUPPER_RETURN
+# endif
+
 
 	.p2align 4
 L(first_vec_x1):
 	tzcntl	%eax, %eax
-	leaq	VEC_SIZE(%rdi, %rax), %rax
+	incq	%rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x2):
 	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-L(prep_loop_4x):
-	/* Align data to 4 * VEC_SIZE.	*/
-	andq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
+	   on x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vmovdqa	1(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
 
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+	/* Align data to VEC_SIZE * 4 for the loop.  */
+	addq	$(VEC_SIZE * 4 + 1), %rdi
+	andq	$-(VEC_SIZE * 4), %rdi
 	.p2align 4
 L(loop_4x_vec):
 	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
-	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+	vmovdqa	(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 
 	/* Leaves only CHARS matching esi as 0.	 */
 	vpxor	%ymm5, %ymm0, %ymm1
@@ -190,62 +202,102 @@ L(loop_4x_vec):
 	VPMINU	%ymm1, %ymm2, %ymm5
 	VPMINU	%ymm3, %ymm4, %ymm6
 
-	VPMINU	%ymm5, %ymm6, %ymm5
+	VPMINU	%ymm5, %ymm6, %ymm6
 
-	VPCMPEQ %ymm5, %ymm9, %ymm5
-	vpmovmskb %ymm5, %eax
+	VPCMPEQ	%ymm6, %ymm9, %ymm6
+	vpmovmskb %ymm6, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 
-	addq	$(VEC_SIZE * 4), %rdi
-	testl	%eax, %eax
-	jz  L(loop_4x_vec)
 
-	VPCMPEQ %ymm1, %ymm9, %ymm1
+	VPCMPEQ	%ymm1, %ymm9, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x0)
 
-	VPCMPEQ %ymm2, %ymm9, %ymm2
+
+	VPCMPEQ	%ymm5, %ymm9, %ymm2
 	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	%ymm3, %ymm9, %ymm3
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
 
-	VPCMPEQ %ymm3, %ymm9, %ymm3
-	VPCMPEQ %ymm4, %ymm9, %ymm4
-	vpmovmskb %ymm3, %ecx
-	vpmovmskb %ymm4, %eax
-	salq	$32, %rax
-	orq %rcx, %rax
-	tzcntq  %rax, %rax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	.p2align 4
+L(last_vec_x0):
+	tzcntl	%eax, %eax
+	addq	$-(VEC_SIZE * 4), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
 # endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
 	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+	subq	$(VEC_SIZE * 3), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
 
 	/* Cold case for crossing page with first load.	 */
 	.p2align 4
 L(cross_page_boundary):
-	andq	$-VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-
-	vmovdqa	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	movq	%rdi, %rdx
+	/* Align rdi to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
-	/* Remove the leading bits.	 */
-	sarxl	%ecx, %eax, %eax
+	/* Remove the leading bytes. sarxl only uses bits [4:0] of COUNT
+	   so no need to manually mod edx.  */
+	sarxl	%edx, %eax, %eax
 	testl	%eax, %eax
-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-	addq	%rcx, %rdi
-	addq	%rdi, %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	xorl	%ecx, %ecx
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdx, %rax), %CHAR_REG
+	leaq	(%rdx, %rax), %rax
+	cmovne	%rcx, %rax
+# else
+	addq	%rdx, %rax
 # endif
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 END (STRCHR)
 # endif

From patchwork Fri Apr 23 19:56:25 2021
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Noah Goldstein <goldstein.w.n@gmail.com>
X-Patchwork-Id: 43133
Return-Path: <libc-alpha-bounces@sourceware.org>
X-Original-To: patchwork@sourceware.org
Delivered-To: patchwork@sourceware.org
Received: from server2.sourceware.org (localhost [IPv6:::1])
	by sourceware.org (Postfix) with ESMTP id 0AEDD39CE430;
	Fri, 23 Apr 2021 19:56:41 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 0AEDD39CE430
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org;
	s=default; t=1619207801;
	bh=+LKTeSvV96AUFH0pWhBTlkwepqtmeSqH7njRxdqTSow=;
	h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe:
	 List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To:
	 From;
	b=QN/2X5sM2citRU18rVpm98eHfzxyseJnL2mipCkw9UeevCZvJTV2eBFhJFCZosOKi
	 67rWShiiaErZoPFDZPwmmJetc/0eVRFmMoLIt5VeNtnD+uX057SgqlEsWKp9aXBaHf
	 wQazwEpFjSX1q8pbOhiIqadIW0AqQSUKIWJyCE5c=
X-Original-To: libc-alpha@sourceware.org
Delivered-To: libc-alpha@sourceware.org
Received: from mail-qk1-x72d.google.com (mail-qk1-x72d.google.com
 [IPv6:2607:f8b0:4864:20::72d])
 by sourceware.org (Postfix) with ESMTPS id B92F139CE430
 for <libc-alpha@sourceware.org>; Fri, 23 Apr 2021 19:56:35 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org B92F139CE430
Received: by mail-qk1-x72d.google.com with SMTP id t17so22349402qkg.4
 for <libc-alpha@sourceware.org>; Fri, 23 Apr 2021 12:56:35 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20161025;
 h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
 :references:mime-version:content-transfer-encoding;
 bh=+LKTeSvV96AUFH0pWhBTlkwepqtmeSqH7njRxdqTSow=;
 b=Or6mEWFWeDiGokJh8z1qoHi0gX4Vk49ELJogIHKF5N9nd57bGnTz5HAPdVLkRNZI2I
 b9e1oDOSN/G7BwCp89eDG5ELU3pdf/kRKwNZTGtWpIAfSK1VKPyB2n0dLSZ1krEXFOyZ
 3mghR9u2tsNJGomC+mrnHAgfjWbxgD2Mvf4qjRm7l+/EKhVN+TVofGn8gVECmR9NiqKS
 5YfQgUCxwAO4kCdcZJ/4u6uXgHihBPOSutBUf6pD2JZPVcofJCiv65jNBdF2T37WjU8D
 yKnUe2eCAV+MZ2MIzvtwQMw7WbZrOpFzFef6CmOEWLyHyujxpR9earkDfWSUZfs0ON5m
 eHbg==
X-Gm-Message-State: AOAM530RbOaFqTE/4UlpByRAV/2RnsgIc1YVkD7Vq9P3oG6FZI9Z/IBo
 f/86UcvOkoWcxCP+tOp7uYbTprPkva4=
X-Google-Smtp-Source: 
 ABdhPJxfKEQ+YqojVNYqiQGa5lxKMhapcJZhJih/KbO4SH1BIq4pSXeOMOQH55QfJ6XwCrEB9G8z1Q==
X-Received: by 2002:a37:de14:: with SMTP id h20mr5807611qkj.34.1619207794802;
 Fri, 23 Apr 2021 12:56:34 -0700 (PDT)
Received: from localhost.localdomain
 ([2600:1009:b054:2521:5e92:4ab7:a8bb:883b])
 by smtp.googlemail.com with ESMTPSA id
 u184sm5030995qkd.82.2021.04.23.12.56.33
 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
 Fri, 23 Apr 2021 12:56:34 -0700 (PDT)
To: libc-alpha@sourceware.org
Subject: [PATCH v4 2/2] x86: Optimize strchr-evex.S
Date: Fri, 23 Apr 2021 15:56:25 -0400
Message-Id: <20210423195625.2871522-2-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.29.2
In-Reply-To: <20210423195625.2871522-1-goldstein.w.n@gmail.com>
References: <20210421213951.404588-1-goldstein.w.n@gmail.com>
 <20210423195625.2871522-1-goldstein.w.n@gmail.com>
MIME-Version: 1.0
X-Spam-Status: No, score=-12.6 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0,
 RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS,
 TXREP autolearn=ham autolearn_force=no version=3.4.2
X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on
 server2.sourceware.org
X-BeenThere: libc-alpha@sourceware.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Libc-alpha mailing list <libc-alpha.sourceware.org>
List-Unsubscribe: <https://sourceware.org/mailman/options/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=unsubscribe>
List-Archive: <https://sourceware.org/pipermail/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-request@sourceware.org?subject=help>
List-Subscribe: <https://sourceware.org/mailman/listinfo/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=subscribe>
X-Patchwork-Original-From: Noah Goldstein via Libc-alpha
 <libc-alpha@sourceware.org>
From: Noah Goldstein <goldstein.w.n@gmail.com>
Reply-To: Noah Goldstein <goldstein.w.n@gmail.com>
Errors-To: libc-alpha-bounces@sourceware.org
Sender: "Libc-alpha" <libc-alpha-bounces@sourceware.org>

No bug. This commit optimizes strchr-evex.S. The optimizations are
mostly small things such as saving an ALU instruction in the alignment
process and saving a few instructions in the loop return. The one
significant change is saving 2 instructions in the 4x loop. test-strchr,
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++-----------
 1 file changed, 218 insertions(+), 174 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index ddc86a7058..7f9d4ee48d 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -32,13 +32,15 @@
 #  define VPCMP		vpcmpd
 #  define VPMINU	vpminud
 #  define CHAR_REG	esi
-#  define SHIFT_REG	r8d
+#  define SHIFT_REG	ecx
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMP		vpcmpb
 #  define VPMINU	vpminub
 #  define CHAR_REG	sil
-#  define SHIFT_REG	ecx
+#  define SHIFT_REG	edx
+#  define CHAR_SIZE	1
 # endif
 
 # define XMMZERO	xmm16
@@ -56,23 +58,20 @@
 
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 
 	.section .text.evex,"ax",@progbits
 ENTRY (STRCHR)
-	movl	%edi, %ecx
-# ifndef USE_AS_STRCHRNUL
-	xorl	%edx, %edx
-# endif
-
 	/* Broadcast CHAR to YMM0.	*/
-	VPBROADCAST %esi, %YMM0
-
+	VPBROADCAST	%esi, %YMM0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
 
-	/* Check if we cross page boundary with one vector load.  */
-	andl	$(PAGE_SIZE - 1), %ecx
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
-	ja  L(cross_page_boundary)
+	/* Check if we cross page boundary with one vector load.
+	   Otherwise it is safe to use an unaligned load.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
 	   null bytes.  */
@@ -83,251 +82,296 @@ ENTRY (STRCHR)
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	ktestd	%k0, %k0
-	jz	L(more_vecs)
 	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jz	L(aligned_more)
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
 # ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdi, %rax
 # endif
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
 # endif
 	ret
 
-	.p2align 4
-L(more_vecs):
-	/* Align data for aligned loads in the loop.  */
-	andq	$-VEC_SIZE, %rdi
-L(aligned_more):
-
-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.	*/
-	VMOVA	VEC_SIZE(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
-
-	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VMOVA	VEC_SIZE(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	ktestd	%k0, %k0
-	jz	L(prep_loop_4x)
-
-	kmovd	%k0, %eax
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0. As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x3):
 	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
-# endif
+L(zero):
+	xorl	%eax, %eax
 	ret
+# endif
 
 	.p2align 4
-L(first_vec_x0):
+L(first_vec_x4):
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
+	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+	kmovd	%k1, %ecx
+	/* bzhil will not be 0 if first match was null.  */
+	bzhil	%eax, %ecx, %ecx
+	jne	L(zero)
 # else
-	addq	%rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Combine CHAR and null matches.  */
+	kord	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	.p2align 4
 L(first_vec_x1):
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	VEC_SIZE(%rdi, %rax), %rax
-# endif
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	.p2align 4
 L(first_vec_x2):
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
+	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+	kmovd	%k1, %ecx
+	/* bzhil will not be 0 if first match was null.  */
+	bzhil	%eax, %ecx, %ecx
+	jne	L(zero)
 # else
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Combine CHAR and null matches.  */
+	kord	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-L(prep_loop_4x):
-	/* Align data to 4 * VEC_SIZE.	*/
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
+	   data is only aligned to VEC_SIZE. Use two alternating methods
+	   for checking VEC to balance latency and port contention.  */
+
+	/* This method has higher latency but has better port
+	   distribution.  */
+	VMOVA	(VEC_SIZE)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	/* This method has lower latency but has worse port
+	   distribution.  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
+	/* Each bit in K0 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMM0, %k0
+	/* Each bit in K1 represents a null CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	kortestd	%k0, %k1
+	jnz	L(first_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+	/* Each bit in K0 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMM0, %k0
+	/* Each bit in K1 represents a null CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	kortestd	%k0, %k1
+	jnz	L(first_vec_x4)
+
+	/* Align data to VEC_SIZE * 4 for the loop.  */
+	addq	$VEC_SIZE, %rdi
 	andq	$-(VEC_SIZE * 4), %rdi
 
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
+	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
+	   encoding.  */
 	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
 	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
 	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
 	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
 
-	/* Leaves only CHARS matching esi as 0.  */
+	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+	   zero.  */
 	vpxorq	%YMM1, %YMM0, %YMM5
-	vpxorq	%YMM2, %YMM0, %YMM6
+	/* For YMM2 and YMM4 compare not-equal to CHAR and store result in
+	   k register. It's possible to save either 1 or 2 instructions
+	   using the not-equal method for either YMM1 or YMM1 and YMM3
+	   respectively but the bottleneck on p5 makes it not worth it.  */
+	VPCMP	$4, %YMM0, %YMM2, %k2
 	vpxorq	%YMM3, %YMM0, %YMM7
-	vpxorq	%YMM4, %YMM0, %YMM8
-
-	VPMINU	%YMM5, %YMM1, %YMM5
-	VPMINU	%YMM6, %YMM2, %YMM6
-	VPMINU	%YMM7, %YMM3, %YMM7
-	VPMINU	%YMM8, %YMM4, %YMM8
-
-	VPMINU	%YMM5, %YMM6, %YMM1
-	VPMINU	%YMM7, %YMM8, %YMM2
-
-	VPMINU	%YMM1, %YMM2, %YMM1
-
-	/* Each bit in K0 represents a CHAR or a null byte.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-	ktestd	%k0, %k0
+	VPCMP	$4, %YMM0, %YMM4, %k4
+
+	/* Use min to select all zeros (from either xor or end of
+	   string).  */
+	VPMINU	%YMM1, %YMM5, %YMM1
+	VPMINU	%YMM3, %YMM7, %YMM3
+
+	/* Use min + zeromask to select for zeros. k2 and k4 have 0 at
+	   positions that matched CHAR, so zero-masking sets the
+	   corresponding destination elements in YMM2 / YMM4 to zero.
+	 */
+	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
+	VPMINU	%YMM3, %YMM4, %YMM4
+	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
+
+	VPCMP	$0, %YMMZERO, %YMM4, %k1
+	kmovd	%k1, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
 	jz	L(loop_4x_vec)
 
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM5, %k0
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x1)
 
-	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
-	VPCMP	$0, %YMMZERO, %YMM6, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-
-	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
-	VPCMP	$0, %YMMZERO, %YMM7, %k2
-	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
-	VPCMP	$0, %YMMZERO, %YMM8, %k3
+	jnz	L(last_vec_x2)
 
+	VPCMP	$0, %YMMZERO, %YMM3, %k0
+	kmovd	%k0, %eax
+	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
 # ifdef USE_AS_WCSCHR
-	/* NB: Each bit in K2/K3 represents 4-byte element.  */
-	kshiftlw $8, %k3, %k1
+	sall	$8, %ecx
+	orl	%ecx, %eax
+	tzcntl	%eax, %eax
 # else
-	kshiftlq $32, %k3, %k1
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
 # endif
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was CHAR or null.  */
+	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K1 represents a NULL or a mismatch.  */
-	korq	%k1, %k2, %k1
-	kmovq	%k1, %rax
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	ret
+# endif
 
-	tzcntq  %rax, %rax
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was null.  */
+	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(last_vec_x2):
+	tzcntl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Check if match was null.  */
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	/* Cold case for crossing page with first load.	 */
 	.p2align 4
 L(cross_page_boundary):
+	movq	%rdi, %rdx
+	/* Align rdi.  */
 	andq	$-VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-
 	VMOVA	(%rdi), %YMM1
-
 	/* Leaves only CHARS matching esi as 0.  */
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 	VPCMP	$0, %YMMZERO, %YMM2, %k0
 	kmovd	%k0, %eax
-	testl	%eax, %eax
-
+	/* Remove the leading bits.	 */
 # ifdef USE_AS_WCSCHR
+	movl	%edx, %SHIFT_REG
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl    $2, %SHIFT_REG
+	sarl	$2, %SHIFT_REG
+	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
 # endif
-
-	/* Remove the leading bits.	 */
 	sarxl	%SHIFT_REG, %eax, %eax
+	/* If eax is zero continue.  */
 	testl	%eax, %eax
-
-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-	addq	%rcx, %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if match was CHAR or null.  */
+	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+# endif
 # ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+	/* NB: Multiply wchar_t count by 4 to get the number of
+	   bytes.  */
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	addq	%rdx, %rax
 # endif
 	ret