From patchwork Tue Jul 12 19:29:03 2022
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 55989
To: libc-alpha@sourceware.org
Subject: [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S
Date: Tue, 12 Jul 2022 12:29:03 -0700
Message-Id: <20220712192910.351121-3-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20220712192910.351121-1-goldstein.w.n@gmail.com>
References: <20220712192910.351121-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
---
 sysdeps/x86_64/memrchr.S                | 332 +----------------------
 sysdeps/x86_64/multiarch/memrchr-sse2.S | 336 +++++++++++++++++++++++-
 2 files changed, 334 insertions(+), 334 deletions(-)

diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index b0dffd2ae2..385e2c5668 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -17,334 +17,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#define VEC_SIZE	16
-#define PAGE_SIZE	4096
-
-	.text
-ENTRY_P2ALIGN(__memrchr, 6)
-#ifdef __ILP32__
-	/* Clear upper bits.  */
-	mov	%RDX_LP, %RDX_LP
-#endif
-	movd	%esi, %xmm0
-
-	/* Get end pointer.  */
-	leaq	(%rdx, %rdi), %rcx
-
-	punpcklbw %xmm0, %xmm0
-	punpcklwd %xmm0, %xmm0
-	pshufd	$0, %xmm0, %xmm0
-
-	/* Check if we can load 1x VEC without cross a page.  */
-	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
-	jz	L(page_cross)
-
-	/* NB: This load happens regardless of whether rdx (len) is zero. Since
-	   it doesn't cross a page and the standard gurantees any pointer have
-	   at least one-valid byte this load must be safe. For the entire
-	   history of the x86 memrchr implementation this has been possible so
-	   no code "should" be relying on a zero-length check before this load.
-	   The zero-length check is moved to the page cross case because it is
-	   1) pretty cold and including it pushes the hot case len <= VEC_SIZE
-	   into 2-cache lines.  */
-	movups	-(VEC_SIZE)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subq	$VEC_SIZE, %rdx
-	ja	L(more_1x_vec)
-L(ret_vec_x0_test):
-	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
-	   zero.  */
-	bsrl	%eax, %eax
-	jz	L(ret_0)
-	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
-	   if out of bounds.  */
-	addl	%edx, %eax
-	jl	L(zero_0)
-	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
-	   ptr.  */
-	addq	%rdi, %rax
-L(ret_0):
-	ret
-
-	.p2align 4,, 5
-L(ret_vec_x0):
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
-	ret
-
-	.p2align 4,, 2
-L(zero_0):
-	xorl	%eax, %eax
-	ret
-
-
-	.p2align 4,, 8
-L(more_1x_vec):
-	testl	%eax, %eax
-	jnz	L(ret_vec_x0)
-
-	/* Align rcx (pointer to string).  */
-	decq	%rcx
-	andq	$-VEC_SIZE, %rcx
-
-	movq	%rcx, %rdx
-	/* NB: We could consistenyl save 1-byte in this pattern with `movaps
-	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
-	   it adds more frontend uops (even if the moves can be eliminated) and
-	   some percentage of the time actual backend uops.  */
-	movaps	-(VEC_SIZE)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	subq	%rdi, %rdx
-	pmovmskb %xmm1, %eax
-
-	cmpq	$(VEC_SIZE * 2), %rdx
-	ja	L(more_2x_vec)
-L(last_2x_vec):
-	subl	$VEC_SIZE, %edx
-	jbe	L(ret_vec_x0_test)
-
-	testl	%eax, %eax
-	jnz	L(ret_vec_x0)
-
-	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subl	$VEC_SIZE, %edx
-	bsrl	%eax, %eax
-	jz	L(ret_1)
-	addl	%edx, %eax
-	jl	L(zero_0)
-	addq	%rdi, %rax
-L(ret_1):
-	ret
-
-	/* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
-	   causes the hot pause (length <= VEC_SIZE) to span multiple cache
-	   lines. Naturally aligned % 16 to 8-bytes.  */
-L(page_cross):
-	/* Zero length check.  */
-	testq	%rdx, %rdx
-	jz	L(zero_0)
-
-	leaq	-1(%rcx), %r8
-	andq	$-(VEC_SIZE), %r8
-
-	movaps	(%r8), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %esi
-	/* Shift out negative alignment (because we are starting from endptr and
-	   working backwards).  */
-	negl	%ecx
-	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
-	   explicitly.  */
-	andl	$(VEC_SIZE - 1), %ecx
-	shl	%cl, %esi
-	movzwl	%si, %eax
-	leaq	(%rdi, %rdx), %rcx
-	cmpq	%rdi, %r8
-	ja	L(more_1x_vec)
-	subl	$VEC_SIZE, %edx
-	bsrl	%eax, %eax
-	jz	L(ret_2)
-	addl	%edx, %eax
-	jl	L(zero_1)
-	addq	%rdi, %rax
-L(ret_2):
-	ret
-
-	/* Fits in aliging bytes.  */
-L(zero_1):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4,, 5
-L(ret_vec_x1):
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
-	ret
-
-	.p2align 4,, 8
-L(more_2x_vec):
-	testl	%eax, %eax
-	jnz	L(ret_vec_x0)
-
-	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	testl	%eax, %eax
-	jnz	L(ret_vec_x1)
-
-
-	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subq	$(VEC_SIZE * 4), %rdx
-	ja	L(more_4x_vec)
-
-	addl	$(VEC_SIZE), %edx
-	jle	L(ret_vec_x2_test)
-
-L(last_vec):
-	testl	%eax, %eax
-	jnz	L(ret_vec_x2)
-
-	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subl	$(VEC_SIZE), %edx
-	bsrl	%eax, %eax
-	jz	L(ret_3)
-	addl	%edx, %eax
-	jl	L(zero_2)
-	addq	%rdi, %rax
-L(ret_3):
-	ret
-
-	.p2align 4,, 6
-L(ret_vec_x2_test):
-	bsrl	%eax, %eax
-	jz	L(zero_2)
-	addl	%edx, %eax
-	jl	L(zero_2)
-	addq	%rdi, %rax
-	ret
-
-L(zero_2):
-	xorl	%eax, %eax
-	ret
-
-
-	.p2align 4,, 5
-L(ret_vec_x2):
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
-	ret
-
-	.p2align 4,, 5
-L(ret_vec_x3):
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
-	ret
-
-	.p2align 4,, 8
-L(more_4x_vec):
-	testl	%eax, %eax
-	jnz	L(ret_vec_x2)
-
-	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	testl	%eax, %eax
-	jnz	L(ret_vec_x3)
-
-	addq	$-(VEC_SIZE * 4), %rcx
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec)
-
-	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
-	   keeping the code from spilling to the next cache line.  */
-	addq	$(VEC_SIZE * 4 - 1), %rcx
-	andq	$-(VEC_SIZE * 4), %rcx
-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
-	andq	$-(VEC_SIZE * 4), %rdx
-
-	.p2align 4,, 11
-L(loop_4x_vec):
-	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
-	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
-	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
-	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
-	pcmpeqb	%xmm0, %xmm1
-	pcmpeqb	%xmm0, %xmm2
-	pcmpeqb	%xmm0, %xmm3
-	pcmpeqb	%xmm0, %xmm4
-
-	por	%xmm1, %xmm2
-	por	%xmm3, %xmm4
-	por	%xmm2, %xmm4
-
-	pmovmskb %xmm4, %esi
-	testl	%esi, %esi
-	jnz	L(loop_end)
-
-	addq	$-(VEC_SIZE * 4), %rcx
-	cmpq	%rdx, %rcx
-	jne	L(loop_4x_vec)
-
-	subl	%edi, %edx
-
-	/* Ends up being 1-byte nop.  */
-	.p2align 4,, 2
-L(last_4x_vec):
-	movaps	-(VEC_SIZE)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
-
-	testl	%eax, %eax
-	jnz	L(ret_vec_x0)
-
-
-	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	testl	%eax, %eax
-	jnz	L(ret_vec_end)
-
-	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subl	$(VEC_SIZE * 3), %edx
-	ja	L(last_vec)
-	bsrl	%eax, %eax
-	jz	L(ret_4)
-	addl	%edx, %eax
-	jl	L(zero_3)
-	addq	%rdi, %rax
-L(ret_4):
-	ret
-
-	/* Ends up being 1-byte nop.  */
-	.p2align 4,, 3
-L(loop_end):
-	pmovmskb %xmm1, %eax
-	sall	$16, %eax
-	jnz	L(ret_vec_end)
-
-	pmovmskb %xmm2, %eax
-	testl	%eax, %eax
-	jnz	L(ret_vec_end)
-
-	pmovmskb %xmm3, %eax
-	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
-	   then it won't affect the result in esi (VEC4). If ecx is non-zero
-	   then CHAR in VEC3 and bsrq will use that position.  */
-	sall	$16, %eax
-	orl	%esi, %eax
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
-	ret
-
-L(ret_vec_end):
-	bsrl	%eax, %eax
-	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
-	ret
-	/* Use in L(last_4x_vec). In the same cache line. This is just a spare
-	   aligning bytes.  */
-L(zero_3):
-	xorl	%eax, %eax
-	ret
-	/* 2-bytes from next cache line.  */
-END(__memrchr)
+#define MEMRCHR	__memrchr
+#include "multiarch/memrchr-sse2.S"
 weak_alias (__memrchr, memrchr)
diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
index b04202e171..d92a4022dc 100644
--- a/sysdeps/x86_64/multiarch/memrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
@@ -17,10 +17,338 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define __memrchr __memrchr_sse2
+# ifndef MEMRCHR
+#  define MEMRCHR	__memrchr_sse2
+# endif
+#endif
+
+#include <sysdep.h>
+#define VEC_SIZE	16
+#define PAGE_SIZE	4096
 
-# undef weak_alias
-# define weak_alias(__memrchr, memrchr)
+	.text
+ENTRY_P2ALIGN(MEMRCHR, 6)
+#ifdef __ILP32__
+	/* Clear upper bits.  */
+	mov	%RDX_LP, %RDX_LP
 #endif
+	movd	%esi, %xmm0
+
+	/* Get end pointer.  */
+	leaq	(%rdx, %rdi), %rcx
+
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+
+	/* Check if we can load 1x VEC without cross a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	jz	L(page_cross)
+
+	/* NB: This load happens regardless of whether rdx (len) is zero. Since
+	   it doesn't cross a page and the standard gurantees any pointer have
+	   at least one-valid byte this load must be safe. For the entire
+	   history of the x86 memrchr implementation this has been possible so
+	   no code "should" be relying on a zero-length check before this load.
+	   The zero-length check is moved to the page cross case because it is
+	   1) pretty cold and including it pushes the hot case len <= VEC_SIZE
+	   into 2-cache lines.  */
+	movups	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+	   zero.  */
+	bsrl	%eax, %eax
+	jz	L(ret_0)
+	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+	   if out of bounds.  */
+	addl	%edx, %eax
+	jl	L(zero_0)
+	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+	   ptr.  */
+	addq	%rdi, %rax
+L(ret_0):
+	ret
+
+	.p2align 4,, 5
+L(ret_vec_x0):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
+	ret
+
+	.p2align 4,, 2
+L(zero_0):
+	xorl	%eax, %eax
+	ret
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	/* Align rcx (pointer to string).  */
+	decq	%rcx
+	andq	$-VEC_SIZE, %rcx
+
+	movq	%rcx, %rdx
+	/* NB: We could consistenyl save 1-byte in this pattern with `movaps
+	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+	   it adds more frontend uops (even if the moves can be eliminated) and
+	   some percentage of the time actual backend uops.  */
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	subq	%rdi, %rdx
+	pmovmskb %xmm1, %eax
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	subl	$VEC_SIZE, %edx
+	jbe	L(ret_vec_x0_test)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_1)
+	addl	%edx, %eax
+	jl	L(zero_0)
+	addq	%rdi, %rax
+L(ret_1):
+	ret
+
+	/* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
+	   causes the hot pause (length <= VEC_SIZE) to span multiple cache
+	   lines. Naturally aligned % 16 to 8-bytes.  */
+L(page_cross):
+	/* Zero length check.  */
+	testq	%rdx, %rdx
+	jz	L(zero_0)
+
+	leaq	-1(%rcx), %r8
+	andq	$-(VEC_SIZE), %r8
+
+	movaps	(%r8), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	negl	%ecx
+	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+	   explicitly.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	shl	%cl, %esi
+	movzwl	%si, %eax
+	leaq	(%rdi, %rdx), %rcx
+	cmpq	%rdi, %r8
+	ja	L(more_1x_vec)
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_2)
+	addl	%edx, %eax
+	jl	L(zero_1)
+	addq	%rdi, %rax
+L(ret_2):
+	ret
+
+	/* Fits in aliging bytes.  */
+L(zero_1):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4,, 5
+L(ret_vec_x1):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_x1)
+
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	addl	$(VEC_SIZE), %edx
+	jle	L(ret_vec_x2_test)
+
+L(last_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE), %edx
+	bsrl	%eax, %eax
+	jz	L(ret_3)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+L(ret_3):
+	ret
+
+	.p2align 4,, 6
+L(ret_vec_x2_test):
+	bsrl	%eax, %eax
+	jz	L(zero_2)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+	ret
+
+L(zero_2):
+	xorl	%eax, %eax
+	ret
+
+
+	.p2align 4,, 5
+L(ret_vec_x2):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
+
+	.p2align 4,, 5
+L(ret_vec_x3):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x3)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
+	   keeping the code from spilling to the next cache line.  */
+	addq	$(VEC_SIZE * 4 - 1), %rcx
+	andq	$-(VEC_SIZE * 4), %rcx
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
+	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
+	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
+	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm0, %xmm4
+
+	por	%xmm1, %xmm2
+	por	%xmm3, %xmm4
+	por	%xmm2, %xmm4
+
+	pmovmskb %xmm4, %esi
+	testl	%esi, %esi
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	%rdx, %rcx
+	jne	L(loop_4x_vec)
+
+	subl	%edi, %edx
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 2
+L(last_4x_vec):
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+	bsrl	%eax, %eax
+	jz	L(ret_4)
+	addl	%edx, %eax
+	jl	L(zero_3)
+	addq	%rdi, %rax
+L(ret_4):
+	ret
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 3
+L(loop_end):
+	pmovmskb %xmm1, %eax
+	sall	$16, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm2, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm3, %eax
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	sall	$16, %eax
+	orl	%esi, %eax
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-#include "../memrchr.S"
+L(ret_vec_end):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
+	ret
+	/* Use in L(last_4x_vec). In the same cache line. This is just a spare
+	   aligning bytes.  */
+L(zero_3):
+	xorl	%eax, %eax
+	ret
+	/* 2-bytes from next cache line.  */
+END(MEMRCHR)
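
For readers who want a plain-C picture of what the moved code computes: memrchr
returns a pointer to the last occurrence of the byte `c` within the first `n`
bytes of `s`, or NULL if there is none. The sketch below is only a scalar
reference model, not part of this patch, and the helper name is made up for
illustration. The SSE2 version reaches the same answer by broadcasting the byte
with punpcklbw/punpcklwd/pshufd, comparing 16 bytes at a time from the end with
pcmpeqb, turning each compare into a bitmask with pmovmskb, and picking the
highest set bit with bsr.

#include <stddef.h>

/* Scalar reference model of the memrchr contract implemented by the
   SSE2 code above.  Illustration only; not part of the patch.  */
static void *
memrchr_ref (const void *s, int c, size_t n)
{
  const unsigned char *p = (const unsigned char *) s + n;

  /* Walk backwards from the end of the buffer.  */
  while (n--)
    if (*--p == (unsigned char) c)
      return (void *) p;

  /* No match in the first n bytes (covers the n == 0 case too).  */
  return NULL;
}

For n == 0 the reference simply returns NULL, which is why the assembly only
needs its zero-length check on the cold page-cross path: as the comment in the
moved code explains, the unconditional first 16-byte load is safe whenever it
does not cross a page boundary.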