From patchwork Wed Apr 21 21:39:53 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 43064
To: libc-alpha@sourceware.org
Subject: [PATCH v1 2/2] x86: Optimize strchr-evex.S
Date: Wed, 21 Apr 2021 17:39:53 -0400
Message-Id: <20210421213951.404588-2-goldstein.w.n@gmail.com>
In-Reply-To: <20210421213951.404588-1-goldstein.w.n@gmail.com>
References: <20210421213951.404588-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

No bug. This commit optimizes strchr-evex.S. The optimizations are
mostly small things such as saving an ALU in the alignment process and
saving a few instructions in the loop return. The one significant
change is saving 2 instructions in the 4x loop. test-strchr,
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.

Signed-off-by: Noah Goldstein
---
 sysdeps/x86_64/multiarch/strchr-evex.S | 388 ++++++++++++++-----------
 1 file changed, 214 insertions(+), 174 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index ddc86a7058..7cd111e96c 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -24,23 +24,26 @@
 #  define STRCHR	__strchr_evex
 # endif
 
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMP		vpcmpd
 #  define VPMINU	vpminud
 #  define CHAR_REG	esi
-#  define SHIFT_REG	r8d
+#  define SHIFT_REG	ecx
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMP		vpcmpb
 #  define VPMINU	vpminub
 #  define CHAR_REG	sil
-#  define SHIFT_REG	ecx
+#  define SHIFT_REG	edx
+#  define CHAR_SIZE	1
 # endif
+
 # define XMMZERO	xmm16
 # define YMMZERO	ymm16
@@ -56,23 +59,20 @@
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 
 	.section .text.evex,"ax",@progbits
 ENTRY (STRCHR)
-	movl	%edi, %ecx
-# ifndef USE_AS_STRCHRNUL
-	xorl	%edx, %edx
-# endif
-
 	/* Broadcast CHAR to YMM0.	*/
-	VPBROADCAST %esi, %YMM0
-
+	VPBROADCAST	%esi, %YMM0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
 
-	/* Check if we cross page boundary with one vector load.  */
-	andl	$(PAGE_SIZE - 1), %ecx
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
-	ja	L(cross_page_boundary)
+	/* Check if we cross page boundary with one vector load.
+	   Otherwise it is safe to use an unaligned load.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
	   null bytes.  */
@@ -83,251 +83,291 @@ ENTRY (STRCHR)
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	ktestd	%k0, %k0
-	jz	L(more_vecs)
 	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jz	L(aligned_more)
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
 # ifdef USE_AS_WCSCHR
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdi, %rax
 # endif
 # ifndef USE_AS_STRCHRNUL
-	cmp	(%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
 # endif
 	ret
 
-	.p2align 4
-L(more_vecs):
-	/* Align data for aligned loads in the loop.  */
-	andq	$-VEC_SIZE, %rdi
-L(aligned_more):
-
-	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	VMOVA	VEC_SIZE(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
-
-	/* Leaves only CHARS matching esi as 0.	 */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VMOVA	VEC_SIZE(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.	 */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.	 */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.	 */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	ktestd	%k0, %k0
-	jz	L(prep_loop_4x)
-
-	kmovd	%k0, %eax
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0.  As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x3):
 	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
 # ifndef USE_AS_STRCHRNUL
-	cmp	(%rax), %CHAR_REG
-	cmovne	%rdx, %rax
-# endif
+L(zero):
+	xorl	%eax, %eax
 	ret
+# endif
 
 	.p2align 4
-L(first_vec_x0):
+L(first_vec_x4):
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
+	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+	kmovd	%k1, %ecx
+	/* bzhil will not be 0 if first match was null.  */
+	bzhil	%eax, %ecx, %ecx
+	jne	L(zero)
 # else
-	addq	%rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
-	cmp	(%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Combine CHAR and null matches.  */
+	kord	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	.p2align 4
 L(first_vec_x1):
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	VEC_SIZE(%rdi, %rax), %rax
-# endif
 # ifndef USE_AS_STRCHRNUL
-	cmp	(%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	.p2align 4
 L(first_vec_x2):
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
+	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+	kmovd	%k1, %ecx
+	/* bzhil will not be 0 if first match was null.  */
+	bzhil	%eax, %ecx, %ecx
+	jne	L(zero)
 # else
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
-	cmp	(%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Combine CHAR and null matches.  */
+	kord	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-L(prep_loop_4x):
-	/* Align data to 4 * VEC_SIZE.	*/
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  Use two alternating
+	   methods for checking VEC to balance latency and port
+	   contention.  */
+
+	/* This method has higher latency but has better port
+	   distribution.  */
+	VMOVA	(VEC_SIZE)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.	 */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	/* This method has higher latency but has better port
+	   distribution.  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
+	/* Each bit in K0 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMM0, %k0
+	/* Each bit in K1 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	kortestd %k0, %k1
+	jnz	L(first_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.	 */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+	/* Each bit in K0 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMM0, %k0
+	/* Each bit in K1 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	kortestd %k0, %k1
+	jnz	L(first_vec_x4)
+
+	/* Align data to VEC_SIZE * 4 for the loop.  */
+	addq	$VEC_SIZE, %rdi
 	andq	$-(VEC_SIZE * 4), %rdi
 
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
+	/* Check 4x VEC at a time.  No penalty to imm32 offset with evex
+	   encoding.  */
 	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
 	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
 	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
 	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
 
-	/* Leaves only CHARS matching esi as 0.	 */
+	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+	   zero.  */
 	vpxorq	%YMM1, %YMM0, %YMM5
-	vpxorq	%YMM2, %YMM0, %YMM6
+	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
+	   a k register.  It's possible to save either 1 or 2 instructions
+	   by using the cmp not equals method for YMM1, or for YMM1 and
+	   YMM3 respectively, but the bottleneck on p5 makes it not worth
+	   it.  */
+	VPCMP	$4, %YMM0, %YMM2, %k2
 	vpxorq	%YMM3, %YMM0, %YMM7
-	vpxorq	%YMM4, %YMM0, %YMM8
-
-	VPMINU	%YMM5, %YMM1, %YMM5
-	VPMINU	%YMM6, %YMM2, %YMM6
-	VPMINU	%YMM7, %YMM3, %YMM7
-	VPMINU	%YMM8, %YMM4, %YMM8
-
-	VPMINU	%YMM5, %YMM6, %YMM1
-	VPMINU	%YMM7, %YMM8, %YMM2
-
-	VPMINU	%YMM1, %YMM2, %YMM1
-
-	/* Each bit in K0 represents a CHAR or a null byte.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-	ktestd	%k0, %k0
+	VPCMP	$4, %YMM0, %YMM4, %k4
+
+	/* Use min to select all zeros (either from xor or end of
+	   string).  */
+	VPMINU	%YMM1, %YMM5, %YMM1
+	VPMINU	%YMM3, %YMM7, %YMM3
+
+	/* Use min + zeromask to select for zeros.  Since k2 and k4 will
+	   have 0 at positions that matched with CHAR, this will set zero
+	   in the corresponding destination bytes in YMM2 / YMM4.  */
+	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
+	VPMINU	%YMM3, %YMM4, %YMM4
+	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
+
+	VPCMP	$0, %YMMZERO, %YMM4, %k1
+	kmovd	%k1, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
 	jz	L(loop_4x_vec)
 
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM5, %k0
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x1)
 
-	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
-	VPCMP	$0, %YMMZERO, %YMM6, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-
-	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
-	VPCMP	$0, %YMMZERO, %YMM7, %k2
-	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
-	VPCMP	$0, %YMMZERO, %YMM8, %k3
+	jnz	L(last_vec_x2)
 
+	VPCMP	$0, %YMMZERO, %YMM3, %k0
+	kmovd	%k0, %eax
+	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
 # ifdef USE_AS_WCSCHR
-	/* NB: Each bit in K2/K3 represents 4-byte element.  */
-	kshiftlw $8, %k3, %k1
+	sall	$8, %ecx
+	orl	%ecx, %eax
+	tzcntl	%eax, %eax
 # else
-	kshiftlq $32, %k3, %k1
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
 # endif
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was CHAR or null.  */
+	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K1 represents a NULL or a mismatch.  */
-	korq	%k1, %k2, %k1
-	kmovq	%k1, %rax
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	ret
+# endif
 
-	tzcntq	%rax, %rax
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was null.  */
+	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(last_vec_x2):
+	tzcntl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
-	cmp	(%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Check if match was null.  */
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	/* Cold case for crossing page with first load.	 */
 	.p2align 4
 L(cross_page_boundary):
+	movq	%rdi, %rdx
+	/* Align rdi.  */
 	andq	$-VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-
 	VMOVA	(%rdi), %YMM1
-
 	/* Leaves only CHARS matching esi as 0.	 */
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 	VPCMP	$0, %YMMZERO, %YMM2, %k0
 	kmovd	%k0, %eax
-	testl	%eax, %eax
-
+	/* Remove the leading bits.  */
 # ifdef USE_AS_WCSCHR
+	movl	%edx, %SHIFT_REG
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	sarl	$2, %SHIFT_REG
+	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
 # endif
-
-	/* Remove the leading bits.  */
 	sarxl	%SHIFT_REG, %eax, %eax
+	/* If eax is zero continue.  */
 	testl	%eax, %eax
-
-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-	addq	%rcx, %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if match was CHAR or null.  */
+	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+# endif
 # ifdef USE_AS_WCSCHR
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
-	cmp	(%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	addq	%rdx, %rax
 # endif
 	ret
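
For readers who find the assembly dense, the core CHAR/null detection trick the
patch is built around (xor against the broadcast character, then unsigned min
with the original data, then one compare against zero) can be sketched in C
intrinsics roughly as follows.  This is illustrative only and not part of the
patch: the helper name and signature are invented, and it assumes AVX-512BW,
AVX-512VL and BMI1 are available.

#include <immintrin.h>
#include <stddef.h>

/* Hypothetical helper, not from the patch: return the index of the first
   byte in a 32-byte aligned block that is either C or the null terminator,
   or 32 if neither occurs.  */
static inline size_t
first_char_or_null (const char *aligned_ptr, char c)
{
  const __m256i vchar = _mm256_set1_epi8 (c);
  const __m256i data = _mm256_load_si256 ((const __m256i *) aligned_ptr);

  /* XOR zeroes exactly the bytes that equal C.  */
  const __m256i xored = _mm256_xor_si256 (data, vchar);

  /* Unsigned min with the original data is zero wherever the byte was C
     (zero in XORED) or the null terminator (zero in DATA).  */
  const __m256i min = _mm256_min_epu8 (xored, data);

  /* A single compare against zero therefore finds both kinds of match.  */
  const __mmask32 k = _mm256_cmpeq_epi8_mask (min, _mm256_setzero_si256 ());

  return k != 0 ? (size_t) _tzcnt_u32 (k) : 32;
}

The point of the xor + min combination is that one compare detects both the
target character and the end of the string, which is what lets the 4x loop in
the patch fold its checks down and drop instructions per iteration.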