From patchwork Sat Dec 25 03:22:57 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 49253
To: libc-alpha@sourceware.org
Subject: [PATCH v1 2/2] x86: Optimize L(less_vec) case in memcmpeq-evex.S
Date: Fri, 24 Dec 2021 21:22:57 -0600
Message-Id: <20211225032257.2887327-2-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20211225032257.2887327-1-goldstein.w.n@gmail.com>
References: <20211225032257.2887327-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

No bug. The optimizations are twofold:

1) Replace the page-cross and 0/1-length checks in L(less_vec) with
   masked load instructions. In applications this reduces
   branch-misses in the hot [0, 32] case.
2) Change the control flow so that the L(less_vec) case gets the
   fall-through.

Change 2) helps compares in the [0, 32] size range but comes at the
cost of compares in the [33, 64] size range. In profiles of GCC and
Python3, 94%+ and 99%+ of calls respectively fall in the [0, 32]
range, so this appears to be the right tradeoff.

Reviewed-by: H.J. Lu
---
An illustrative C sketch of the masked-load idea follows the patch
below.

 sysdeps/x86_64/multiarch/memcmpeq-evex.S | 170 ++++++-----------------
 1 file changed, 43 insertions(+), 127 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
index f27e732036..b5e1edbdff 100644
--- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
+++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
@@ -39,6 +39,7 @@
 # define MEMCMPEQ	__memcmpeq_evex
 # endif
 
+# define VMOVU_MASK	vmovdqu8
 # define VMOVU	vmovdqu64
 # define VPCMP	vpcmpub
 # define VPTEST	vptestmb
@@ -62,12 +63,39 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
 	movl	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
-	jb	L(less_vec)
+	/* Fall through for [0, VEC_SIZE] as its the hottest.  */
+	ja	L(more_1x_vec)
+
+	/* Create mask of bytes that are guranteed to be valid because
+	   of length (edx). Using masked movs allows us to skip checks for
+	   page crosses/zero size.  */
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k2
+
+	/* Use masked loads as VEC_SIZE could page cross where length
+	   (edx) would not.  */
+	VMOVU_MASK (%rsi), %YMM2{%k2}
+	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
+	kmovd	%k1, %eax
+	ret
 
-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+
+L(last_1x_vec):
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
+	kmovd	%k1, %eax
+L(return_neq0):
+	ret
+
+
+
+	.p2align 4
+L(more_1x_vec):
+	/* From VEC + 1 to 2 * VEC.  */
 	VMOVU	(%rsi), %YMM1
 	/* Use compare not equals to directly check for mismatch.  */
-	VPCMP	$4, (%rdi), %YMM1, %k1
+	VPCMP	$4,(%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_neq0)
@@ -88,13 +116,13 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
 	/* Check third and fourth VEC no matter what.
 	 */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_neq0)
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_neq0)
@@ -132,66 +160,6 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
 	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
 	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %eax
-L(return_neq0):
-	ret
-
-	/* Fits in padding needed to .p2align 5 L(less_vec).  */
-L(last_1x_vec):
-	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
-	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
-	kmovd	%k1, %eax
-	ret
-
-	/* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32
-	   byte aligned.  */
-	.p2align 5
-L(less_vec):
-	/* Check if one or less char. This is necessary for size = 0 but
-	   is also faster for size = 1.  */
-	cmpl	$1, %edx
-	jbe	L(one_or_less)
-
-	/* Check if loading one VEC from either s1 or s2 could cause a
-	   page cross. This can have false positives but is by far the
-	   fastest method.  */
-	movl	%edi, %eax
-	orl	%esi, %eax
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(page_cross_less_vec)
-
-	/* No page cross possible.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMP	$4, (%rdi), %YMM2, %k1
-	kmovd	%k1, %eax
-	/* Result will be zero if s1 and s2 match. Otherwise first set
-	   bit will be first mismatch.  */
-	bzhil	%edx, %eax, %eax
-	ret
-
-	/* Relatively cold but placing close to L(less_vec) for 2 byte
-	   jump encoding.  */
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movzbl	(%rsi), %ecx
-	movzbl	(%rdi), %eax
-	subl	%ecx, %eax
-	/* No ymm register was touched.  */
-	ret
-	/* Within the same 16 byte block is L(one_or_less).  */
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
-L(last_2x_vec):
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
-	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
-	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
-	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
-	VPTEST	%YMM2, %YMM2, %k1
-	kmovd	%k1, %eax
 	ret
 
 	.p2align 4
@@ -211,7 +179,7 @@ L(loop_4x_vec):
 	vpxorq	(%rdi), %YMM1, %YMM1
 
 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
-	vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2
+	vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
 
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
@@ -238,7 +206,7 @@ L(loop_4x_vec):
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
 	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
 	   oring with YMM4. Result is stored in YMM4.  */
-	vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
+	vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
 	cmpl	$(VEC_SIZE * 2), %edi
 	jae	L(8x_last_2x_vec)
 
@@ -256,68 +224,16 @@ L(8x_last_2x_vec):
 L(return_neq2):
 	ret
 
 
-	/* Relatively cold case as page cross are unexpected.  */
-	.p2align 4
-L(page_cross_less_vec):
-	cmpl	$16, %edx
-	jae	L(between_16_31)
-	cmpl	$8, %edx
-	ja	L(between_9_15)
-	cmpl	$4, %edx
-	jb	L(between_2_3)
-	/* From 4 to 8 bytes.  No branch when size == 4.  */
-	movl	(%rdi), %eax
-	subl	(%rsi), %eax
-	movl	-4(%rdi, %rdx), %ecx
-	movl	-4(%rsi, %rdx), %edi
-	subl	%edi, %ecx
-	orl	%ecx, %eax
-	ret
-
-	.p2align 4,, 8
-L(between_16_31):
-	/* From 16 to 31 bytes.  No branch when size == 16.  */
-
-	/* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe.
-	 */
-	vmovdqu	(%rsi), %xmm1
-	vpcmpeqb (%rdi), %xmm1, %xmm1
-	vmovdqu	-16(%rsi, %rdx), %xmm2
-	vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2
-	vpand	%xmm1, %xmm2, %xmm2
-	vpmovmskb %xmm2, %eax
-	notw	%ax
-	/* No ymm register was touched.  */
-	ret
-	.p2align 4,, 8
-L(between_9_15):
-	/* From 9 to 15 bytes.  */
-	movq	(%rdi), %rax
-	subq	(%rsi), %rax
-	movq	-8(%rdi, %rdx), %rcx
-	movq	-8(%rsi, %rdx), %rdi
-	subq	%rdi, %rcx
-	orq	%rcx, %rax
-	/* edx is guranteed to be a non-zero int.  */
-	cmovnz	%edx, %eax
-	ret
-
-	/* Don't align. This is cold and aligning here will cause code
-	   to spill into next cache line.  */
-L(between_2_3):
-	/* From 2 to 3 bytes.  No branch when size == 2.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	subl	%ecx, %eax
-	movzbl	-1(%rdi, %rdx), %ecx
-	/* All machines that support evex will insert a "merging uop"
-	   avoiding any serious partial register stalls.  */
-	subb	-1(%rsi, %rdx), %cl
-	orl	%ecx, %eax
-	/* No ymm register was touched.  */
+L(last_2x_vec):
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
+	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
+	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
+	VPTEST	%YMM2, %YMM2, %k1
+	kmovd	%k1, %eax
 	ret
 
-	/* 4 Bytes from next cache line.  */
+	/* 1 Bytes from next cache line.  */
 END (MEMCMPEQ)
 #endif
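
For readers who want the masked-load idea in a higher-level form, here is a
rough, illustrative C intrinsics sketch of the new [0, VEC_SIZE] fall-through
path. It is not the glibc implementation: the helper name
memcmpeq_small_sketch is hypothetical, the mask is built with a shift instead
of bzhi, and both operands are loaded with masked loads, whereas the assembly
folds the s1 access into the masked VPCMP. It assumes AVX512BW/AVX512VL and
len <= 32 (VEC_SIZE for the YMM variant).

#include <immintrin.h>
#include <stddef.h>

/* Hypothetical helper, for illustration only: compare the first len bytes
   (len in [0, 32]) of s1 and s2, returning 0 iff they are equal, in the
   spirit of the L(less_vec) replacement above.  */
int
memcmpeq_small_sketch (const void *s1, const void *s2, size_t len)
{
  /* Mask with the low len bits set; the assembly builds this with
     movl $-1 + bzhil.  Works for len == 0 and len == 32.  */
  __mmask32 k2 = (__mmask32) ((1ULL << len) - 1);

  /* Masked (zeroing) loads suppress faults on masked-out bytes, so no
     page-cross or zero-length checks are needed.  */
  __m256i v1 = _mm256_maskz_loadu_epi8 (k2, s1);
  __m256i v2 = _mm256_maskz_loadu_epi8 (k2, s2);

  /* Unsigned byte compare-not-equal under the same mask (vpcmpub with
     predicate 4).  Any set bit is a mismatch within the first len bytes;
     __memcmpeq only has to return zero/non-zero, so the raw mask is a
     usable return value.  */
  return (int) _mm256_mask_cmpneq_epu8_mask (k2, v1, v2);
}

Built with something like gcc -O2 -mavx512vl -mavx512bw, this should compile
down to roughly the same bzhi/kmov/masked-load/vpcmpub sequence as the
fall-through path in the patch, though the exact code generation is up to the
compiler.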