From patchwork Wed Oct 27 02:43:18 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 46679
From: Noah Goldstein
To: libc-alpha@sourceware.org
Subject: [PATCH v1 1/6] String: Add __memcmpeq as build target
Date: Tue, 26 Oct 2021 21:43:18 -0500
Message-Id: <20211027024323.1199441-1-goldstein.w.n@gmail.com>

No bug. This commit just adds __memcmpeq as a build target so that
implementations of __memcmpeq that are not just aliases to memcmp can be
supported.

Reviewed-by: H.J. Lu
---
 string/Makefile   |  2 +-
 string/memcmpeq.c | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 string/memcmpeq.c

diff --git a/string/Makefile b/string/Makefile
index 40d6fac133..2199dd30b7 100644
--- a/string/Makefile
+++ b/string/Makefile
@@ -34,7 +34,7 @@ routines := strcat strchr strcmp strcoll strcpy strcspn \
		   strerror _strerror strlen strnlen \
		   strncat strncmp strncpy \
		   strrchr strpbrk strsignal strspn strstr strtok \
-		   strtok_r strxfrm memchr memcmp memmove memset \
+		   strtok_r strxfrm memchr memcmp memcmpeq memmove memset \
		   mempcpy bcopy bzero ffs ffsll stpcpy stpncpy \
		   strcasecmp strncase strcasecmp_l strncase_l \
		   memccpy memcpy wordcopy strsep strcasestr \
diff --git a/string/memcmpeq.c b/string/memcmpeq.c
new file mode 100644
index 0000000000..08726325a8
--- /dev/null
+++ b/string/memcmpeq.c
@@ -0,0 +1,24 @@
+/* Copyright (C) 1991-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* This file is intentionally left empty.  It exists so that both
+   architectures which implement __memcmpeq separately from memcmp and
+   architectures which implement __memcmpeq by having it alias memcmp will
+   build.
+
+   The alias for __memcmpeq to memcmp for the C implementation is in
+   memcmp.c.  */
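For readers unfamiliar with the new symbol: __memcmpeq has the same signature
as memcmp, but callers may only test its return value against zero; the sign
and magnitude of a non-zero result carry no meaning. Any correct memcmp is
therefore also a correct __memcmpeq, which is why the generic build can simply
alias the two. A minimal C sketch of a conforming fallback (the name
__memcmpeq_generic is illustrative only; the actual glibc alias lives in
memcmp.c as noted above):

#include <string.h>

/* Equality-only variant of memcmp: the result may only be compared
   against zero, so deferring to memcmp is always correct.  */
int
__memcmpeq_generic (const void *s1, const void *s2, size_t n)
{
  return memcmp (s1, s2, n);
}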
From patchwork Wed Oct 27 02:43:19 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 46680
From: Noah Goldstein
To: libc-alpha@sourceware.org
Subject: [PATCH v1 2/6] Benchtests: Add benchtests for __memcmpeq
Date: Tue, 26 Oct 2021 21:43:19 -0500
Message-Id: <20211027024323.1199441-2-goldstein.w.n@gmail.com>
In-Reply-To: <20211027024323.1199441-1-goldstein.w.n@gmail.com>

No bug. This commit adds __memcmpeq benchmarks. The benchmarks simply reuse
the existing memcmp benchmarks. This will be useful for testing
implementations of __memcmpeq that do not just alias memcmp.
---
 benchtests/Makefile         |  2 +-
 benchtests/bench-memcmp.c   |  4 +++-
 benchtests/bench-memcmpeq.c | 20 ++++++++++++++++++++
 3 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 benchtests/bench-memcmpeq.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index b690aaf65b..7be0e47c47 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -103,7 +103,7 @@ bench := $(foreach B,$(filter bench-%,${BENCHSET}), ${${B}})
 endif

 # String function benchmarks.
-string-benchset := memccpy memchr memcmp memcpy memmem memmove \
+string-benchset := memccpy memchr memcmp memcmpeq memcpy memmem memmove \
		   mempcpy memset rawmemchr stpcpy stpncpy strcasecmp strcasestr \
		   strcat strchr strchrnul strcmp strcpy strcspn strlen \
		   strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
diff --git a/benchtests/bench-memcmp.c b/benchtests/bench-memcmp.c
index 0d6a93bf29..546b06e1ab 100644
--- a/benchtests/bench-memcmp.c
+++ b/benchtests/bench-memcmp.c
@@ -17,7 +17,9 @@
    <https://www.gnu.org/licenses/>.  */

 #define TEST_MAIN
-#ifdef WIDE
+#ifdef TEST_MEMCMPEQ
+# define TEST_NAME "__memcmpeq"
+#elif defined WIDE
 # define TEST_NAME "wmemcmp"
 #else
 # define TEST_NAME "memcmp"
diff --git a/benchtests/bench-memcmpeq.c b/benchtests/bench-memcmpeq.c
new file mode 100644
index 0000000000..e918d4f77c
--- /dev/null
+++ b/benchtests/bench-memcmpeq.c
@@ -0,0 +1,20 @@
+/* Measure __memcmpeq functions.
+   Copyright (C) 2015-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MEMCMPEQ 1
+#include "bench-memcmp.c"
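Outside the glibc benchtests framework, the effect can also be eyeballed with
a small standalone harness along the following lines, which times memcmp
against __memcmpeq on equal buffers. This is a rough sketch only: it assumes
the installed glibc already exports __memcmpeq, and the buffer size, iteration
count, and explicit extern declaration are arbitrary choices for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

extern int __memcmpeq (const void *, const void *, size_t);

/* Average nanoseconds per call of CMP over ITERS calls on N-byte buffers.  */
static double
ns_per_call (int (*cmp) (const void *, const void *, size_t),
             const void *a, const void *b, size_t n, size_t iters)
{
  struct timespec t0, t1;
  volatile int sink = 0;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (size_t i = 0; i < iters; i++)
    sink ^= cmp (a, b, n);
  clock_gettime (CLOCK_MONOTONIC, &t1);
  (void) sink;
  return ((t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec)) / iters;
}

int
main (void)
{
  size_t n = 256, iters = 1 << 22;
  char *a = malloc (n), *b = malloc (n);
  memset (a, 0x5a, n);
  memset (b, 0x5a, n);	/* Equal buffers: exercises the all-match path.  */
  printf ("memcmp:     %.2f ns/call\n", ns_per_call (memcmp, a, b, n, iters));
  printf ("__memcmpeq: %.2f ns/call\n", ns_per_call (__memcmpeq, a, b, n, iters));
  free (a);
  free (b);
  return 0;
}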
From patchwork Wed Oct 27 02:43:20 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 46681
Received: from localhost.localdomain (mobile-130-126-255-38.near.illinois.edu.
From: Noah Goldstein
To: libc-alpha@sourceware.org
Subject: [PATCH v1 3/6] x86_64: Add support for __memcmpeq using sse2, avx2, and evex
Date: Tue, 26 Oct 2021 21:43:20 -0500
Message-Id: <20211027024323.1199441-3-goldstein.w.n@gmail.com>
In-Reply-To: <20211027024323.1199441-1-goldstein.w.n@gmail.com>

No bug. This commit adds support for __memcmpeq to be implemented separately
from memcmp. Support is added for versions optimized with sse2, avx2, and
evex.

Reviewed-by: H.J. Lu
---
 sysdeps/generic/ifunc-init.h                 |  5 +-
 sysdeps/x86_64/memcmp.S                      |  9 ++--
 sysdeps/x86_64/multiarch/Makefile            |  4 ++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   | 21 +++++++++
 sysdeps/x86_64/multiarch/ifunc-memcmpeq.h    | 49 ++++++++++++++++++++
 sysdeps/x86_64/multiarch/memcmp-sse2.S       |  4 +-
 sysdeps/x86_64/multiarch/memcmp.c            |  3 --
 sysdeps/x86_64/multiarch/memcmpeq-avx2-rtm.S | 12 +++++
 sysdeps/x86_64/multiarch/memcmpeq-avx2.S     | 23 +++++++++
 sysdeps/x86_64/multiarch/memcmpeq-evex.S     | 23 +++++++++
 sysdeps/x86_64/multiarch/memcmpeq-sse2.S     | 23 +++++++++
 sysdeps/x86_64/multiarch/memcmpeq.c          | 35 ++++++++++++++
 12 files changed, 202 insertions(+), 9 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-memcmpeq.h
 create mode 100644 sysdeps/x86_64/multiarch/memcmpeq-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/memcmpeq-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/memcmpeq-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/memcmpeq-sse2.S
 create mode 100644 sysdeps/x86_64/multiarch/memcmpeq.c

diff --git a/sysdeps/generic/ifunc-init.h b/sysdeps/generic/ifunc-init.h
index 7f69485de8..ee8a8289c8 100644
--- a/sysdeps/generic/ifunc-init.h
+++ b/sysdeps/generic/ifunc-init.h
@@ -50,5 +50,8 @@
    '___' as the optimized implementation and
    '_ifunc_selector' as the IFUNC selector.  */
 #define REDIRECT_NAME	EVALUATOR1 (__redirect, SYMBOL_NAME)
-#define OPTIMIZE(name)	EVALUATOR2 (SYMBOL_NAME, name)
 #define IFUNC_SELECTOR	EVALUATOR1 (SYMBOL_NAME, ifunc_selector)
+#define OPTIMIZE1(name)	EVALUATOR1 (SYMBOL_NAME, name)
+#define OPTIMIZE2(name)	EVALUATOR2 (SYMBOL_NAME, name)
+/* Default is to use OPTIMIZE2.
*/ +#define OPTIMIZE(name) OPTIMIZE2(name) diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S index 8a03e572e8..b53f2c0866 100644 --- a/sysdeps/x86_64/memcmp.S +++ b/sysdeps/x86_64/memcmp.S @@ -356,9 +356,10 @@ L(ATR32res): .p2align 4,, 4 END(memcmp) -#undef bcmp +#ifdef USE_AS_MEMCMPEQ +libc_hidden_def (memcmp) +#else +# undef bcmp weak_alias (memcmp, bcmp) -#undef __memcmpeq -strong_alias (memcmp, __memcmpeq) libc_hidden_builtin_def (memcmp) -libc_hidden_def (__memcmpeq) +#endif diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 26be40959c..044778585b 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -7,7 +7,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \ memrchr-sse2 memrchr-avx2 \ memcmp-sse2 \ + memcmpeq-sse2 \ memcmp-avx2-movbe \ + memcmpeq-avx2 \ memcmp-sse4 memcpy-ssse3 \ memmove-ssse3 \ memcpy-ssse3-back \ @@ -42,6 +44,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ memset-avx512-unaligned-erms \ memchr-avx2-rtm \ memcmp-avx2-movbe-rtm \ + memcmpeq-avx2-rtm \ memmove-avx-unaligned-erms-rtm \ memrchr-avx2-rtm \ memset-avx2-unaligned-erms-rtm \ @@ -61,6 +64,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ strrchr-avx2-rtm \ memchr-evex \ memcmp-evex-movbe \ + memcmpeq-evex \ memmove-evex-unaligned-erms \ memrchr-evex \ memset-evex-unaligned-erms \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 39ab10613b..f7f3806d1d 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -38,6 +38,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, size_t i = 0; + /* Support sysdeps/x86_64/multiarch/memcmpeq.c. */ + IFUNC_IMPL (i, name, __memcmpeq, + IFUNC_IMPL_ADD (array, i, __memcmpeq, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (BMI2)), + __memcmpeq_avx2) + IFUNC_IMPL_ADD (array, i, __memcmpeq, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __memcmpeq_avx2_rtm) + IFUNC_IMPL_ADD (array, i, __memcmpeq, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (BMI2)), + __memcmpeq_evex) + IFUNC_IMPL_ADD (array, i, __memcmpeq, 1, __memcmpeq_sse2)) + /* Support sysdeps/x86_64/multiarch/memchr.c. */ IFUNC_IMPL (i, name, memchr, IFUNC_IMPL_ADD (array, i, memchr, diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h b/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h new file mode 100644 index 0000000000..3319a9568a --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h @@ -0,0 +1,49 @@ +/* Common definition for __memcmpeq ifunc selections. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +# include + +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE1 (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE1 (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE1 (avx2); + } + + return OPTIMIZE1 (sse2); +} diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2.S b/sysdeps/x86_64/multiarch/memcmp-sse2.S index 7b30b7ca2e..132d6fb339 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse2.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse2.S @@ -17,7 +17,9 @@ . */ #if IS_IN (libc) -# define memcmp __memcmp_sse2 +# ifndef memcmp +# define memcmp __memcmp_sse2 +# endif # ifdef SHARED # undef libc_hidden_builtin_def diff --git a/sysdeps/x86_64/multiarch/memcmp.c b/sysdeps/x86_64/multiarch/memcmp.c index 7b3409b1dd..fe725f3563 100644 --- a/sysdeps/x86_64/multiarch/memcmp.c +++ b/sysdeps/x86_64/multiarch/memcmp.c @@ -29,9 +29,6 @@ libc_ifunc_redirected (__redirect_memcmp, memcmp, IFUNC_SELECTOR ()); # undef bcmp weak_alias (memcmp, bcmp) -# undef __memcmpeq -strong_alias (memcmp, __memcmpeq) -libc_hidden_def (__memcmpeq) # ifdef SHARED __hidden_ver1 (memcmp, __GI_memcmp, __redirect_memcmp) diff --git a/sysdeps/x86_64/multiarch/memcmpeq-avx2-rtm.S b/sysdeps/x86_64/multiarch/memcmpeq-avx2-rtm.S new file mode 100644 index 0000000000..24b6a0c9ff --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmpeq-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCMP +# define MEMCMP __memcmpeq_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memcmpeq-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memcmpeq-avx2.S b/sysdeps/x86_64/multiarch/memcmpeq-avx2.S new file mode 100644 index 0000000000..0181ea0d8d --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmpeq-avx2.S @@ -0,0 +1,23 @@ +/* __memcmpeq optimized with AVX2. + Copyright (C) 2017-2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#ifndef MEMCMP +# define MEMCMP __memcmpeq_avx2 +#endif + +#include "memcmp-avx2-movbe.S" diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S new file mode 100644 index 0000000000..951e1e9560 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S @@ -0,0 +1,23 @@ +/* __memcmpeq optimized with EVEX. + Copyright (C) 2017-2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef MEMCMP +# define MEMCMP __memcmpeq_evex +#endif + +#include "memcmp-evex-movbe.S" diff --git a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S new file mode 100644 index 0000000000..c488cbbcd9 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S @@ -0,0 +1,23 @@ +/* __memcmpeq optimized with SSE2. + Copyright (C) 2017-2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef memcmp +# define memcmp __memcmpeq_sse2 +#endif +#define USE_AS_MEMCMPEQ 1 +#include "memcmp-sse2.S" diff --git a/sysdeps/x86_64/multiarch/memcmpeq.c b/sysdeps/x86_64/multiarch/memcmpeq.c new file mode 100644 index 0000000000..163e56047e --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmpeq.c @@ -0,0 +1,35 @@ +/* Multiple versions of __memcmpeq. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* Define multiple versions only for the definition in libc. 
*/
+#if IS_IN (libc)
+# define __memcmpeq __redirect___memcmpeq
+# include <string.h>
+# undef __memcmpeq
+
+# define SYMBOL_NAME __memcmpeq
+# include "ifunc-memcmpeq.h"
+
+libc_ifunc_redirected (__redirect___memcmpeq, __memcmpeq, IFUNC_SELECTOR ());
+
+# ifdef SHARED
+__hidden_ver1 (__memcmpeq, __GI___memcmpeq, __redirect___memcmpeq)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (__memcmpeq);
+# endif
+#endif

From patchwork Wed Oct 27 02:43:21 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 46682
Received: from localhost.localdomain (mobile-130-126-255-38.near.illinois.edu.
[130.126.255.38]) by smtp.googlemail.com with ESMTPSA id l6sm12215373ilt.31.2021.10.26.19.43.37 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 26 Oct 2021 19:43:37 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1 4/6] x86_64: Add sse2 optimized __memcmpeq in memcmp-sse2.S Date: Tue, 26 Oct 2021 21:43:21 -0500 Message-Id: <20211027024323.1199441-4-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20211027024323.1199441-1-goldstein.w.n@gmail.com> References: <20211027024323.1199441-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.0 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" No bug. This commit does not modify any of the memcmp implementation. It just adds __memcmpeq ifdefs to skip obvious cases where computing the proper 1/-1 required by memcmp is not needed. Reviewed-by: H.J. Lu --- sysdeps/x86_64/memcmp.S | 55 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S index b53f2c0866..c245383963 100644 --- a/sysdeps/x86_64/memcmp.S +++ b/sysdeps/x86_64/memcmp.S @@ -49,34 +49,63 @@ L(s2b): movzwl (%rdi), %eax movzwl (%rdi, %rsi), %edx subq $2, %r10 +#ifdef USE_AS_MEMCMPEQ + je L(finz1) +#else je L(fin2_7) +#endif addq $2, %rdi cmpl %edx, %eax +#ifdef USE_AS_MEMCMPEQ + jnz L(neq_early) +#else jnz L(fin2_7) +#endif L(s4b): testq $4, %r10 jz L(s8b) movl (%rdi), %eax movl (%rdi, %rsi), %edx subq $4, %r10 +#ifdef USE_AS_MEMCMPEQ + je L(finz1) +#else je L(fin2_7) +#endif addq $4, %rdi cmpl %edx, %eax +#ifdef USE_AS_MEMCMPEQ + jnz L(neq_early) +#else jnz L(fin2_7) +#endif L(s8b): testq $8, %r10 jz L(s16b) movq (%rdi), %rax movq (%rdi, %rsi), %rdx subq $8, %r10 +#ifdef USE_AS_MEMCMPEQ + je L(sub_return8) +#else je L(fin2_7) +#endif addq $8, %rdi cmpq %rdx, %rax +#ifdef USE_AS_MEMCMPEQ + jnz L(neq_early) +#else jnz L(fin2_7) +#endif L(s16b): movdqu (%rdi), %xmm1 movdqu (%rdi, %rsi), %xmm0 pcmpeqb %xmm0, %xmm1 +#ifdef USE_AS_MEMCMPEQ + pmovmskb %xmm1, %eax + subl $0xffff, %eax + ret +#else pmovmskb %xmm1, %edx xorl %eax, %eax subl $0xffff, %edx @@ -86,7 +115,7 @@ L(s16b): movzbl (%rcx), %eax movzbl (%rsi, %rcx), %edx jmp L(finz1) - +#endif .p2align 4,, 4 L(finr1b): movzbl (%rdi), %eax @@ -95,7 +124,15 @@ L(finz1): subl %edx, %eax L(exit): ret - +#ifdef USE_AS_MEMCMPEQ + .p2align 4,, 4 +L(sub_return8): + subq %rdx, %rax + movl %eax, %edx + shrq $32, %rax + orl %edx, %eax + ret +#else .p2align 4,, 4 L(fin2_7): cmpq %rdx, %rax @@ -111,12 +148,17 @@ L(fin2_7): movzbl %dl, %edx subl %edx, %eax ret - +#endif .p2align 4,, 4 L(finz): xorl %eax, %eax ret - +#ifdef USE_AS_MEMCMPEQ + .p2align 4,, 4 +L(neq_early): + movl $1, %eax + ret +#endif /* For blocks bigger than 32 bytes 1. Advance one of the addr pointer to be 16B aligned. 2. 
	   Treat the case of both addr pointers aligned to 16B
@@ -246,11 +288,16 @@ L(mt16):
	.p2align 4,, 4
 L(neq):
+#ifdef USE_AS_MEMCMPEQ
+	movl	$1, %eax
+	ret
+#else
	bsfl	%edx, %ecx
	movzbl	(%rdi, %rcx), %eax
	addq	%rdi, %rsi
	movzbl	(%rsi,%rcx), %edx
	jmp	L(finz1)
+#endif
	.p2align 4,, 4
 L(ATR):

From patchwork Wed Oct 27 02:43:22 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 46683
Received: from localhost.localdomain (mobile-130-126-255-38.near.illinois.edu.
[130.126.255.38]) by smtp.googlemail.com with ESMTPSA id l6sm12215373ilt.31.2021.10.26.19.43.38 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 26 Oct 2021 19:43:38 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1 5/6] x86_64: Add avx2 optimized __memcmpeq in memcmpeq-avx2.S Date: Tue, 26 Oct 2021 21:43:22 -0500 Message-Id: <20211027024323.1199441-5-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20211027024323.1199441-1-goldstein.w.n@gmail.com> References: <20211027024323.1199441-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.0 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" No bug. This commit adds new optimized __memcmpeq implementation for avx2. The primary optimizations are: 1) skipping the logic to find the difference of the first mismatched byte. 2) not updating src/dst addresses as the non-equals logic does not need to be reused by different areas. Reviewed-by: H.J. Lu --- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 2 - sysdeps/x86_64/multiarch/ifunc-memcmpeq.h | 2 +- sysdeps/x86_64/multiarch/memcmpeq-avx2-rtm.S | 4 +- sysdeps/x86_64/multiarch/memcmpeq-avx2.S | 309 ++++++++++++++++++- 4 files changed, 308 insertions(+), 9 deletions(-) diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index f7f3806d1d..535450f52c 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -42,13 +42,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, __memcmpeq, IFUNC_IMPL_ADD (array, i, __memcmpeq, (CPU_FEATURE_USABLE (AVX2) - && CPU_FEATURE_USABLE (MOVBE) && CPU_FEATURE_USABLE (BMI2)), __memcmpeq_avx2) IFUNC_IMPL_ADD (array, i, __memcmpeq, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (BMI2) - && CPU_FEATURE_USABLE (MOVBE) && CPU_FEATURE_USABLE (RTM)), __memcmpeq_avx2_rtm) IFUNC_IMPL_ADD (array, i, __memcmpeq, diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h b/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h index 3319a9568a..e596c5048b 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h @@ -31,10 +31,10 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && CPU_FEATURE_USABLE_P (cpu_features, BMI2) - && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) { if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) return OPTIMIZE1 (evex); diff --git a/sysdeps/x86_64/multiarch/memcmpeq-avx2-rtm.S b/sysdeps/x86_64/multiarch/memcmpeq-avx2-rtm.S index 24b6a0c9ff..3264a4a76c 100644 --- a/sysdeps/x86_64/multiarch/memcmpeq-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memcmpeq-avx2-rtm.S @@ -1,5 +1,5 @@ -#ifndef MEMCMP -# define MEMCMP 
__memcmpeq_avx2_rtm +#ifndef MEMCMPEQ +# define MEMCMPEQ __memcmpeq_avx2_rtm #endif #define ZERO_UPPER_VEC_REGISTERS_RETURN \ diff --git a/sysdeps/x86_64/multiarch/memcmpeq-avx2.S b/sysdeps/x86_64/multiarch/memcmpeq-avx2.S index 0181ea0d8d..0bf59fb8fa 100644 --- a/sysdeps/x86_64/multiarch/memcmpeq-avx2.S +++ b/sysdeps/x86_64/multiarch/memcmpeq-avx2.S @@ -16,8 +16,309 @@ License along with the GNU C Library; if not, see . */ -#ifndef MEMCMP -# define MEMCMP __memcmpeq_avx2 -#endif +#if IS_IN (libc) + +/* __memcmpeq is implemented as: + 1. Use ymm vector compares when possible. The only case where + vector compares is not possible for when size < VEC_SIZE + and loading from either s1 or s2 would cause a page cross. + 2. Use xmm vector compare when size >= 8 bytes. + 3. Optimistically compare up to first 4 * VEC_SIZE one at a + to check for early mismatches. Only do this if its guranteed the + work is not wasted. + 4. If size is 8 * VEC_SIZE or less, unroll the loop. + 5. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. + 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. + 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. + 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ + +# include + +# ifndef MEMCMPEQ +# define MEMCMPEQ __memcmpeq_avx2 +# endif + +# define VPCMPEQ vpcmpeqb + +# ifndef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN (MEMCMPEQ, 6) +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + vpmovmskb %ymm1, %eax + incl %eax + jnz L(return_neq0) + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_1x_vec) + + /* Check second VEC no matter what. */ + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + /* If all 4 VEC where equal eax will be all 1s so incl will overflow + and set zero flag. */ + incl %eax + jnz L(return_neq0) + + /* Less than 4 * VEC. */ + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_2x_vec) + + /* Check third and fourth VEC no matter what. */ + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vpmovmskb %ymm3, %eax + incl %eax + jnz L(return_neq0) + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + vpmovmskb %ymm4, %eax + incl %eax + jnz L(return_neq0) + + /* Go to 4x VEC loop. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + + /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any + branches. */ + + /* Adjust rsi and rdi to avoid indexed address mode. This end up + saving a 16 bytes of code, prevents unlamination, and bottlenecks in + the AGU. */ + addq %rdx, %rsi + vmovdqu -(VEC_SIZE * 4)(%rsi), %ymm1 + vmovdqu -(VEC_SIZE * 3)(%rsi), %ymm2 + addq %rdx, %rdi + + VPCMPEQ -(VEC_SIZE * 4)(%rdi), %ymm1, %ymm1 + VPCMPEQ -(VEC_SIZE * 3)(%rdi), %ymm2, %ymm2 + + vmovdqu -(VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ -(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vmovdqu -VEC_SIZE(%rsi), %ymm4 + VPCMPEQ -VEC_SIZE(%rdi), %ymm4, %ymm4 + + /* Reduce VEC0 - VEC4. 
*/ + vpand %ymm1, %ymm2, %ymm2 + vpand %ymm3, %ymm4, %ymm4 + vpand %ymm2, %ymm4, %ymm4 + vpmovmskb %ymm4, %eax + incl %eax +L(return_neq0): +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN -#include "memcmp-avx2-movbe.S" + /* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32 byte + aligned. */ + .p2align 5 +L(less_vec): + /* Check if one or less char. This is necessary for size = 0 but is + also faster for size = 1. */ + cmpl $1, %edx + jbe L(one_or_less) + + /* Check if loading one VEC from either s1 or s2 could cause a page + cross. This can have false positives but is by far the fastest + method. */ + movl %edi, %eax + orl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(page_cross_less_vec) + + /* No page cross possible. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + incl %eax + /* Result will be zero if s1 and s2 match. Otherwise first set bit + will be first mismatch. */ + bzhil %edx, %eax, %eax + VZEROUPPER_RETURN + + /* Relatively cold but placing close to L(less_vec) for 2 byte jump + encoding. */ + .p2align 4 +L(one_or_less): + jb L(zero) + movzbl (%rsi), %ecx + movzbl (%rdi), %eax + subl %ecx, %eax + /* No ymm register was touched. */ + ret + /* Within the same 16 byte block is L(one_or_less). */ +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(last_1x_vec): + vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1 + VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1 + vpmovmskb %ymm1, %eax + incl %eax + VZEROUPPER_RETURN + + .p2align 4 +L(last_2x_vec): + vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1 + VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1 + vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm2 + VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm2, %ymm2 + vpand %ymm1, %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + incl %eax + VZEROUPPER_RETURN + + .p2align 4 +L(more_8x_vec): + /* Set end of s1 in rdx. */ + leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx + /* rsi stores s2 - s1. This allows loop to only update one pointer. + */ + subq %rdi, %rsi + /* Align s1 pointer. */ + andq $-VEC_SIZE, %rdi + /* Adjust because first 4x vec where check already. */ + subq $-(VEC_SIZE * 4), %rdi + .p2align 4 +L(loop_4x_vec): + /* rsi has s2 - s1 so get correct address by adding s1 (in rdi). */ + vmovdqu (%rsi, %rdi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + + vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + + vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + + vpand %ymm1, %ymm2, %ymm2 + vpand %ymm3, %ymm4, %ymm4 + vpand %ymm2, %ymm4, %ymm4 + vpmovmskb %ymm4, %eax + incl %eax + jnz L(return_neq1) + subq $-(VEC_SIZE * 4), %rdi + /* Check if s1 pointer at end. */ + cmpq %rdx, %rdi + jb L(loop_4x_vec) + + vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 + subq %rdx, %rdi + /* rdi has 4 * VEC_SIZE - remaining length. */ + cmpl $(VEC_SIZE * 3), %edi + jae L(8x_last_1x_vec) + /* Load regardless of branch. */ + vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 + cmpl $(VEC_SIZE * 2), %edi + jae L(8x_last_2x_vec) + /* Check last 4 VEC. 
*/ + vmovdqu VEC_SIZE(%rsi, %rdx), %ymm1 + VPCMPEQ VEC_SIZE(%rdx), %ymm1, %ymm1 + + vmovdqu (%rsi, %rdx), %ymm2 + VPCMPEQ (%rdx), %ymm2, %ymm2 + + vpand %ymm3, %ymm4, %ymm4 + vpand %ymm1, %ymm2, %ymm3 +L(8x_last_2x_vec): + vpand %ymm3, %ymm4, %ymm4 +L(8x_last_1x_vec): + vpmovmskb %ymm4, %eax + /* Restore s1 pointer to rdi. */ + incl %eax +L(return_neq1): + VZEROUPPER_RETURN + + /* Relatively cold case as page cross are unexpected. */ + .p2align 4 +L(page_cross_less_vec): + cmpl $16, %edx + jae L(between_16_31) + cmpl $8, %edx + ja L(between_9_15) + cmpl $4, %edx + jb L(between_2_3) + /* From 4 to 8 bytes. No branch when size == 4. */ + movl (%rdi), %eax + subl (%rsi), %eax + movl -4(%rdi, %rdx), %ecx + movl -4(%rsi, %rdx), %edi + subl %edi, %ecx + orl %ecx, %eax + ret + + .p2align 4,, 8 +L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + + /* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe. + */ + vmovdqu (%rsi), %xmm1 + vpcmpeqb (%rdi), %xmm1, %xmm1 + vmovdqu -16(%rsi, %rdx), %xmm2 + vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2 + vpand %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + notw %ax + /* No ymm register was touched. */ + ret + + .p2align 4,, 8 +L(between_9_15): + /* From 9 to 15 bytes. */ + movq (%rdi), %rax + subq (%rsi), %rax + movq -8(%rdi, %rdx), %rcx + movq -8(%rsi, %rdx), %rdi + subq %rdi, %rcx + orq %rcx, %rax + /* edx is guranteed to be a non-zero int. */ + cmovnz %edx, %eax + ret + + /* Don't align. This is cold and aligning here will cause code + to spill into next cache line. */ +L(between_2_3): + /* From 2 to 3 bytes. No branch when size == 2. */ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx + subl %ecx, %eax + movzbl -1(%rdi, %rdx), %ecx + /* All machines that support evex will insert a "merging uop" + avoiding any serious partial register stalls. */ + subb -1(%rsi, %rdx), %cl + orl %ecx, %eax + /* No ymm register was touched. */ + ret + + /* 2 Bytes from next cache line. 
*/
+END (MEMCMPEQ)
+#endif

From patchwork Wed Oct 27 02:43:23 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 46684
Received: from localhost.localdomain (mobile-130-126-255-38.near.illinois.edu.
[130.126.255.38]) by smtp.googlemail.com with ESMTPSA id l6sm12215373ilt.31.2021.10.26.19.43.39 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 26 Oct 2021 19:43:39 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1 6/6] x86_64: Add evex optimized __memcmpeq in memcmpeq-evex.S Date: Tue, 26 Oct 2021 21:43:23 -0500 Message-Id: <20211027024323.1199441-6-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20211027024323.1199441-1-goldstein.w.n@gmail.com> References: <20211027024323.1199441-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.0 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" No bug. This commit adds new optimized __memcmpeq implementation for evex. The primary optimizations are: 1) skipping the logic to find the difference of the first mismatched byte. 2) not updating src/dst addresses as the non-equals logic does not need to be reused by different areas. Reviewed-by: H.J. Lu --- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 - sysdeps/x86_64/multiarch/ifunc-memcmpeq.h | 1 - sysdeps/x86_64/multiarch/memcmpeq-evex.S | 308 ++++++++++++++++++++- 3 files changed, 304 insertions(+), 6 deletions(-) diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 535450f52c..ea8df9f9b9 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -52,7 +52,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcmpeq, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) - && CPU_FEATURE_USABLE (MOVBE) && CPU_FEATURE_USABLE (BMI2)), __memcmpeq_evex) IFUNC_IMPL_ADD (array, i, __memcmpeq, 1, __memcmpeq_sse2)) diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h b/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h index e596c5048b..2ea38adf05 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h @@ -34,7 +34,6 @@ IFUNC_SELECTOR (void) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) { if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) - && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) return OPTIMIZE1 (evex); diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S index 951e1e9560..f27e732036 100644 --- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S @@ -16,8 +16,308 @@ License along with the GNU C Library; if not, see . */ -#ifndef MEMCMP -# define MEMCMP __memcmpeq_evex -#endif +#if IS_IN (libc) + +/* __memcmpeq is implemented as: + 1. Use ymm vector compares when possible. The only case where + vector compares is not possible for when size < VEC_SIZE + and loading from either s1 or s2 would cause a page cross. + 2. Use xmm vector compare when size >= 8 bytes. 
+ 3. Optimistically compare up to first 4 * VEC_SIZE one at a + to check for early mismatches. Only do this if its guranteed the + work is not wasted. + 4. If size is 8 * VEC_SIZE or less, unroll the loop. + 5. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. + 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. + 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. + 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ + +# include + +# ifndef MEMCMPEQ +# define MEMCMPEQ __memcmpeq_evex +# endif + +# define VMOVU vmovdqu64 +# define VPCMP vpcmpub +# define VPTEST vptestmb + +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + +# define YMM0 ymm16 +# define YMM1 ymm17 +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 + + + .section .text.evex, "ax", @progbits +ENTRY_P2ALIGN (MEMCMPEQ, 6) +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %YMM1 + /* Use compare not equals to directly check for mismatch. */ + VPCMP $4, (%rdi), %YMM1, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_neq0) + + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_1x_vec) + + /* Check second VEC no matter what. */ + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_neq0) + + /* Less than 4 * VEC. */ + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_2x_vec) + + /* Check third and fourth VEC no matter what. */ + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_neq0) + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_neq0) + + /* Go to 4x VEC loop. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + + /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any + branches. */ + + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %YMM1 + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %YMM2 + addq %rdx, %rdi + + /* Wait to load from s1 until addressed adjust due to + unlamination. */ + + /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it + will have some 1s. */ + vpxorq -(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1 + /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while + oring with YMM1. Result is stored in YMM1. */ + vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2 + + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM3 + vpxorq -(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 + /* Or together YMM1, YMM2, and YMM3 into YMM3. */ + VMOVU -(VEC_SIZE)(%rsi, %rdx), %YMM4 + vpxorq -(VEC_SIZE)(%rdi), %YMM4, %YMM4 + + /* Or together YMM2, YMM3, and YMM4 into YMM4. */ + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 -#include "memcmp-evex-movbe.S" + /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %eax +L(return_neq0): + ret + + /* Fits in padding needed to .p2align 5 L(less_vec). */ +L(last_1x_vec): + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1 + VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1 + kmovd %k1, %eax + ret + + /* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32 + byte aligned. */ + .p2align 5 +L(less_vec): + /* Check if one or less char. This is necessary for size = 0 but + is also faster for size = 1. */ + cmpl $1, %edx + jbe L(one_or_less) + + /* Check if loading one VEC from either s1 or s2 could cause a + page cross. 
This can have false positives but is by far the + fastest method. */ + movl %edi, %eax + orl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(page_cross_less_vec) + + /* No page cross possible. */ + VMOVU (%rsi), %YMM2 + VPCMP $4, (%rdi), %YMM2, %k1 + kmovd %k1, %eax + /* Result will be zero if s1 and s2 match. Otherwise first set + bit will be first mismatch. */ + bzhil %edx, %eax, %eax + ret + + /* Relatively cold but placing close to L(less_vec) for 2 byte + jump encoding. */ + .p2align 4 +L(one_or_less): + jb L(zero) + movzbl (%rsi), %ecx + movzbl (%rdi), %eax + subl %ecx, %eax + /* No ymm register was touched. */ + ret + /* Within the same 16 byte block is L(one_or_less). */ +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(last_2x_vec): + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1 + vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1 + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2 + vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2 + VPTEST %YMM2, %YMM2, %k1 + kmovd %k1, %eax + ret + + .p2align 4 +L(more_8x_vec): + /* Set end of s1 in rdx. */ + leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx + /* rsi stores s2 - s1. This allows loop to only update one + pointer. */ + subq %rdi, %rsi + /* Align s1 pointer. */ + andq $-VEC_SIZE, %rdi + /* Adjust because first 4x vec where check already. */ + subq $-(VEC_SIZE * 4), %rdi + .p2align 4 +L(loop_4x_vec): + VMOVU (%rsi, %rdi), %YMM1 + vpxorq (%rdi), %YMM1, %YMM1 + + VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 + vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2 + + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 + + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 + vpxorq (VEC_SIZE * 3)(%rdi), %YMM4, %YMM4 + + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_neq2) + subq $-(VEC_SIZE * 4), %rdi + cmpq %rdx, %rdi + jb L(loop_4x_vec) + + subq %rdx, %rdi + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 + vpxorq (VEC_SIZE * 3)(%rdx), %YMM4, %YMM4 + /* rdi has 4 * VEC_SIZE - remaining length. */ + cmpl $(VEC_SIZE * 3), %edi + jae L(8x_last_1x_vec) + /* Load regardless of branch. */ + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3 + /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while + oring with YMM4. Result is stored in YMM4. */ + vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4 + cmpl $(VEC_SIZE * 2), %edi + jae L(8x_last_2x_vec) + + VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 + vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 + + VMOVU (%rsi, %rdx), %YMM1 + vpxorq (%rdx), %YMM1, %YMM1 + + vpternlogd $0xfe, %YMM1, %YMM2, %YMM4 +L(8x_last_1x_vec): +L(8x_last_2x_vec): + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %eax +L(return_neq2): + ret + + /* Relatively cold case as page cross are unexpected. */ + .p2align 4 +L(page_cross_less_vec): + cmpl $16, %edx + jae L(between_16_31) + cmpl $8, %edx + ja L(between_9_15) + cmpl $4, %edx + jb L(between_2_3) + /* From 4 to 8 bytes. No branch when size == 4. */ + movl (%rdi), %eax + subl (%rsi), %eax + movl -4(%rdi, %rdx), %ecx + movl -4(%rsi, %rdx), %edi + subl %edi, %ecx + orl %ecx, %eax + ret + + .p2align 4,, 8 +L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + + /* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe. + */ + vmovdqu (%rsi), %xmm1 + vpcmpeqb (%rdi), %xmm1, %xmm1 + vmovdqu -16(%rsi, %rdx), %xmm2 + vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2 + vpand %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + notw %ax + /* No ymm register was touched. 
*/ + ret + + .p2align 4,, 8 +L(between_9_15): + /* From 9 to 15 bytes. */ + movq (%rdi), %rax + subq (%rsi), %rax + movq -8(%rdi, %rdx), %rcx + movq -8(%rsi, %rdx), %rdi + subq %rdi, %rcx + orq %rcx, %rax + /* edx is guranteed to be a non-zero int. */ + cmovnz %edx, %eax + ret + + /* Don't align. This is cold and aligning here will cause code + to spill into next cache line. */ +L(between_2_3): + /* From 2 to 3 bytes. No branch when size == 2. */ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx + subl %ecx, %eax + movzbl -1(%rdi, %rdx), %ecx + /* All machines that support evex will insert a "merging uop" + avoiding any serious partial register stalls. */ + subb -1(%rsi, %rdx), %cl + orl %ecx, %eax + /* No ymm register was touched. */ + ret + + /* 4 Bytes from next cache line. */ +END (MEMCMPEQ) +#endif
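The strategy shared by the AVX2 and EVEX versions can be summarized in
portable C. Because __memcmpeq only has to distinguish equal from not-equal,
mismatching blocks can simply be OR-ed together and tested once; there is no
need to locate the first differing byte or produce an ordered result, which is
exactly the work the memcmp paths spend their non-equals logic on. A rough
scalar sketch of that idea (illustrative only; the real implementations use
32-byte vector compares, a page-cross check for short lengths, and unrolled
4x-vector loops):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

int
memcmpeq_sketch (const void *s1, const void *s2, size_t n)
{
  const unsigned char *a = s1, *b = s2;
  uint64_t diff = 0;

  /* Accumulate mismatches 8 bytes at a time; their order never matters.  */
  while (n >= 8)
    {
      uint64_t x, y;
      memcpy (&x, a, 8);	/* unaligned-safe loads */
      memcpy (&y, b, 8);
      diff |= x ^ y;
      a += 8;
      b += 8;
      n -= 8;
    }
  while (n--)
    diff |= (uint64_t) (*a++ ^ *b++);

  /* Any non-zero value is a valid "not equal" return for __memcmpeq.  */
  return diff != 0;
}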