From patchwork Wed Jun 9 20:52:55 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 43796
To: libc-alpha@sourceware.org
Subject: [PATCH v1 1/3] String: Add additional overflow tests for strnlen, memchr, and strncat
Date: Wed, 9 Jun 2021 16:52:55 -0400
Message-Id: <20210609205257.123944-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

This commit adds tests for a bug in the wide character variants of these functions, where an implementation may assume that maxlen for wcsnlen, or n for wmemchr/strncat, will not overflow when multiplied by sizeof(wchar_t).
These tests show the following implementations failing on x86_64:

wcsnlen-sse4_1
wcsnlen-avx2
wmemchr-sse2
wmemchr-avx2

strncat would fail as well if it were on a system that preferred either of the failing wcsnlen implementations, as it relies on wcsnlen.

Signed-off-by: Noah Goldstein
---
 string/test-memchr.c  | 39 ++++++++++++++++++++++++---
 string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++
 string/test-strnlen.c | 33 +++++++++++++++++++++++
 3 files changed, 130 insertions(+), 3 deletions(-)

diff --git a/string/test-memchr.c b/string/test-memchr.c
index 665edc32af..ce964284aa 100644
--- a/string/test-memchr.c
+++ b/string/test-memchr.c
@@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
   CHAR *res = CALL (impl, s, c, n);
   if (res != exp_res)
     {
-      error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     res, exp_res);
+      error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
+	     impl->name, s, c, n, res, exp_res);
       ret = 1;
       return;
     }
@@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
     }
   buf[align + len] = 0;
 
-  if (pos < len)
+  if (pos < MIN(n, len))
     {
       buf[align + pos] = seek_char;
       buf[align + len] = -seek_char;
@@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
     do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  uintptr_t buf_addr = (uintptr_t) buf1;
+
+  for (i = 0; i < 750; ++i)
+    {
+      do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
+      do_test (0, i, 751, i - buf_addr, BIG_CHAR);
+      do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
+      do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
+      do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+      len = 0;
+      for (j = 8 * sizeof(size_t) - 1; j ; --j)
+        {
+          len |= one << j;
+          do_test (0, i, 751, len - i, BIG_CHAR);
+          do_test (0, i, 751, len + i, BIG_CHAR);
+          do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
+
+          do_test (0, i, 751, ~len - i, BIG_CHAR);
+          do_test (0, i, 751, ~len + i, BIG_CHAR);
+          do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
+        }
+    }
+}
+
 static void
 do_random_tests (void)
 {
@@ -221,6 +253,7 @@ test_main (void)
       do_test (page_size / 2 - i, i, i, 1, 0x9B);
 
   do_random_tests ();
+  do_overflow_tests ();
   return ret;
 }

diff --git a/string/test-strncat.c b/string/test-strncat.c
index 2ef917b820..0ab7541d4e 100644
--- a/string/test-strncat.c
+++ b/string/test-strncat.c
@@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
     }
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  CHAR *s1, *s2;
+  uintptr_t s1_addr;
+  s1 = (CHAR *) buf1;
+  s2 = (CHAR *) buf2;
+  s1_addr = (uintptr_t)s1;
+  for (j = 0; j < 200; ++j)
+    s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
+  s2[200] = 0;
+  for (i = 0; i < 750; ++i) {
+    for (j = 0; j < i; ++j)
+      s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
+    s1[i] = '\0';
+
+    FOR_EACH_IMPL (impl, 0)
+      {
+        s2[0] = '\0';
+        do_one_test (impl, s2, s1, SIZE_MAX - i);
+        s2[0] = '\0';
+        do_one_test (impl, s2, s1, i - s1_addr);
+        s2[0] = '\0';
+        do_one_test (impl, s2, s1, -s1_addr - i);
+        s2[0] = '\0';
+        do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
+        s2[0] = '\0';
+        do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
+      }
+
+    len = 0;
+    for (j = 8 * sizeof(size_t) - 1; j ; --j)
+      {
+        len |= one << j;
+        FOR_EACH_IMPL (impl, 0)
+          {
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, len - i);
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, len + i);
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, len - s1_addr - i);
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, len - s1_addr + i);
+
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, ~len - i);
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, ~len + i);
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, ~len - s1_addr - i);
+            s2[0] = '\0';
+            do_one_test (impl, s2, s1, ~len - s1_addr + i);
+          }
+      }
+  }
+}
+
 static void
 do_random_tests (void)
 {
@@ -316,6 +376,7 @@ test_main (void)
     }
 
   do_random_tests ();
+  do_overflow_tests ();
   return ret;
 }

diff --git a/string/test-strnlen.c b/string/test-strnlen.c
index 920f58e97b..f53e09263f 100644
--- a/string/test-strnlen.c
+++ b/string/test-strnlen.c
@@ -89,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
     do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  uintptr_t buf_addr = (uintptr_t) buf1;
+
+  for (i = 0; i < 750; ++i)
+    {
+      do_test (0, i, SIZE_MAX - i, BIG_CHAR);
+      do_test (0, i, i - buf_addr, BIG_CHAR);
+      do_test (0, i, -buf_addr - i, BIG_CHAR);
+      do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
+      do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+      len = 0;
+      for (j = 8 * sizeof(size_t) - 1; j ; --j)
+        {
+          len |= one << j;
+          do_test (0, i, len - i, BIG_CHAR);
+          do_test (0, i, len + i, BIG_CHAR);
+          do_test (0, i, len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, len - buf_addr + i, BIG_CHAR);
+
+          do_test (0, i, ~len - i, BIG_CHAR);
+          do_test (0, i, ~len + i, BIG_CHAR);
+          do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
+        }
+    }
+}
+
 static void
 do_random_tests (void)
 {
@@ -283,6 +315,7 @@ test_main (void)
   do_random_tests ();
   do_page_tests ();
   do_page_2_tests ();
+  do_overflow_tests ();
   return ret;
 }

From patchwork Wed Jun 9 20:52:56 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 43797
To: libc-alpha@sourceware.org
Subject: [PATCH v1 2/3] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2
Date: Wed, 9 Jun 2021 16:52:56 -0400
Message-Id: <20210609205257.123944-2-goldstein.w.n@gmail.com>
In-Reply-To: <20210609205257.123944-1-goldstein.w.n@gmail.com>
References: <20210609205257.123944-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

This commit fixes the bug mentioned in the previous commit. The previous implementations of wmemchr in these files relied on n * sizeof(wchar_t), which was not guaranteed by the standard. The new overflow tests added in the previous commit now pass (as well as all the other tests).
Signed-off-by: Noah Goldstein --- sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++------- sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------ 2 files changed, 98 insertions(+), 37 deletions(-) diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index beff2708de..3ddc4655cf 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -21,9 +21,11 @@ #ifdef USE_AS_WMEMCHR # define MEMCHR wmemchr # define PCMPEQ pcmpeqd +# define CHAR_PER_VEC 4 #else # define MEMCHR memchr # define PCMPEQ pcmpeqb +# define CHAR_PER_VEC 16 #endif /* fast SSE2 version with using pmaxub and 64 byte loop */ @@ -33,15 +35,14 @@ ENTRY(MEMCHR) movd %esi, %xmm1 mov %edi, %ecx +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +#endif #ifdef USE_AS_WMEMCHR test %RDX_LP, %RDX_LP jz L(return_null) - shl $2, %RDX_LP #else -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx -# endif punpcklbw %xmm1, %xmm1 test %RDX_LP, %RDX_LP jz L(return_null) @@ -60,13 +61,16 @@ ENTRY(MEMCHR) test %eax, %eax jnz L(matches_1) - sub $16, %rdx + sub $CHAR_PER_VEC, %rdx jbe L(return_null) add $16, %rdi and $15, %ecx and $-16, %rdi +#ifdef USE_AS_WMEMCHR + shr $2, %ecx +#endif add %rcx, %rdx - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) jmp L(loop_prolog) @@ -77,16 +81,21 @@ L(crosscache): movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 -/* Check if there is a match. */ + /* Check if there is a match. */ pmovmskb %xmm0, %eax -/* Remove the leading bytes. */ + /* Remove the leading bytes. */ sar %cl, %eax test %eax, %eax je L(unaligned_no_match) -/* Check which byte is a match. */ + /* Check which byte is a match. */ bsf %eax, %eax - +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) add %rdi, %rax add %rcx, %rax @@ -94,15 +103,18 @@ L(crosscache): .p2align 4 L(unaligned_no_match): - /* "rcx" is less than 16. 
Calculate "rdx + rcx - 16" by using + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void possible addition overflow. */ neg %rcx add $16, %rcx +#ifdef USE_AS_WMEMCHR + shr $2, %ecx +#endif sub %rcx, %rdx jbe L(return_null) add $16, %rdi - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) .p2align 4 @@ -135,7 +147,7 @@ L(loop_prolog): test $0x3f, %rdi jz L(align64_loop) - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) movdqa (%rdi), %xmm0 @@ -167,11 +179,14 @@ L(loop_prolog): mov %rdi, %rcx and $-64, %rdi and $63, %ecx +#ifdef USE_AS_WMEMCHR + shr $2, %ecx +#endif add %rcx, %rdx .p2align 4 L(align64_loop): - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) movdqa (%rdi), %xmm0 movdqa 16(%rdi), %xmm2 @@ -218,7 +233,7 @@ L(align64_loop): .p2align 4 L(exit_loop): - add $32, %edx + add $(CHAR_PER_VEC * 2), %edx jle L(exit_loop_32) movdqa (%rdi), %xmm0 @@ -238,7 +253,7 @@ L(exit_loop): pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32_1) - sub $16, %edx + sub $CHAR_PER_VEC, %edx jle L(return_null) PCMPEQ 48(%rdi), %xmm1 @@ -250,13 +265,13 @@ L(exit_loop): .p2align 4 L(exit_loop_32): - add $32, %edx + add $(CHAR_PER_VEC * 2), %edx movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches_1) - sub $16, %edx + sub $CHAR_PER_VEC, %edx jbe L(return_null) PCMPEQ 16(%rdi), %xmm1 @@ -293,7 +308,13 @@ L(matches32): .p2align 4 L(matches_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) add %rdi, %rax ret @@ -301,7 +322,13 @@ L(matches_1): .p2align 4 L(matches16_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) lea 16(%rdi, %rax), %rax ret @@ -309,7 +336,13 @@ L(matches16_1): .p2align 4 L(matches32_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + 
shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) lea 32(%rdi, %rax), %rax ret @@ -317,7 +350,13 @@ L(matches32_1): .p2align 4 L(matches48_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) lea 48(%rdi, %rax), %rax ret diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index 0d8758e3e7..afdb956502 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -54,21 +54,19 @@ # define VEC_SIZE 32 # define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section SECTION(.text),"ax",@progbits ENTRY (MEMCHR) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ - test %RDX_LP, %RDX_LP - jz L(null) -# endif -# ifdef USE_AS_WMEMCHR - shl $2, %RDX_LP -# else # ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP # endif + jz L(null) # endif /* Broadcast CHAR to YMMMATCH. */ vmovd %esi, %xmm0 @@ -84,7 +82,7 @@ ENTRY (MEMCHR) vpmovmskb %ymm1, %eax # ifndef USE_AS_RAWMEMCHR /* If length < CHAR_PER_VEC handle special. */ - cmpq $VEC_SIZE, %rdx + cmpq $CHAR_PER_VEC, %rdx jbe L(first_vec_x0) # endif testl %eax, %eax @@ -98,6 +96,10 @@ ENTRY (MEMCHR) L(first_vec_x0): /* Check if first match was before length. */ tzcntl %eax, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx +# endif xorl %ecx, %ecx cmpl %eax, %edx leaq (%rdi, %rax), %rax @@ -110,12 +112,12 @@ L(null): # endif .p2align 4 L(cross_page_boundary): - /* Save pointer before aligning as its original value is necessary - for computer return address if byte is found or adjusting length - if it is not and this is memchr. 
*/ + /* Save pointer before aligning as its original value is + necessary for computer return address if byte is found or + adjusting length if it is not and this is memchr. */ movq %rdi, %rcx - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and - rdi for rawmemchr. */ + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr + and rdi for rawmemchr. */ orq $(VEC_SIZE - 1), %ALGN_PTR_REG VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 vpmovmskb %ymm1, %eax @@ -124,6 +126,10 @@ L(cross_page_boundary): match). */ leaq 1(%ALGN_PTR_REG), %rsi subq %RRAW_PTR_REG, %rsi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi +# endif # endif /* Remove the leading bytes. */ sarxl %ERAW_PTR_REG, %eax, %eax @@ -181,6 +187,10 @@ L(cross_page_continue): orq $(VEC_SIZE - 1), %rdi /* esi is for adjusting length to see if near the end. */ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %esi +# endif # else orq $(VEC_SIZE - 1), %rdi L(cross_page_continue): @@ -213,7 +223,7 @@ L(cross_page_continue): # ifndef USE_AS_RAWMEMCHR /* Check if at last VEC_SIZE * 4 length. */ - subq $(VEC_SIZE * 4), %rdx + subq $(CHAR_PER_VEC * 4), %rdx jbe L(last_4x_vec_or_less_cmpeq) /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust length. */ @@ -221,6 +231,10 @@ L(cross_page_continue): movl %edi, %ecx orq $(VEC_SIZE * 4 - 1), %rdi andl $(VEC_SIZE * 4 - 1), %ecx +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif addq %rcx, %rdx # else /* Align data to VEC_SIZE * 4 - 1 for loop. */ @@ -250,15 +264,19 @@ L(loop_4x_vec): subq $-(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx + subq $(CHAR_PER_VEC * 4), %rdx ja L(loop_4x_vec) - /* Fall through into less than 4 remaining vectors of length case. - */ + /* Fall through into less than 4 remaining vectors of length + case. 
*/ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax .p2align 4 L(last_4x_vec_or_less): +# ifdef USE_AS_WMEMCHR + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx +# endif /* Check if first VEC contained match. */ testl %eax, %eax jnz L(first_vec_x1_check) @@ -355,6 +373,10 @@ L(last_vec_x2_return): L(last_4x_vec_or_less_cmpeq): VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx +# endif subq $-(VEC_SIZE * 4), %rdi /* Check first VEC regardless. */ testl %eax, %eax From patchwork Wed Jun 9 20:52:58 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 43798 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 71C9839BDC0B for ; Wed, 9 Jun 2021 20:56:04 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 71C9839BDC0B DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1623272164; bh=IAECTzLT3jgDtar4kZlImaqIA4oYZRgEGsziETWYkW8=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=EFSrIOIsKJOopqLq5SU/O/4KnI+IDfx/mujMEyleKcbRBd+N43Y7p30SC39OPZXtN 121bl0hupmSX3mHbB1+wuW+acinwDQ4e/SkD9+kioa9h7c1KyXCpXAWF1khOtu04q9 ycVExrHZQQExSPu/mBKT3NNzdUZxoX95ZUWNt8Co= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-qk1-x731.google.com (mail-qk1-x731.google.com [IPv6:2607:f8b0:4864:20::731]) by sourceware.org (Postfix) with ESMTPS id D456E39BDC21 for ; Wed, 9 Jun 2021 20:54:43 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org D456E39BDC21 Received: by mail-qk1-x731.google.com with SMTP id k11so23522899qkk.1 for 
To: libc-alpha@sourceware.org
Subject: [PATCH v1 3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2
Date: Wed, 9 Jun 2021 16:52:58 -0400
Message-Id: <20210609205257.123944-3-goldstein.w.n@gmail.com>
In-Reply-To: <20210609205257.123944-1-goldstein.w.n@gmail.com>
References: <20210609205257.123944-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

This commit fixes the bug mentioned in the previous commit. The previous implementations of wcsnlen in these files relied on maxlen * sizeof(wchar_t), which was not guaranteed by the standard. The new overflow tests added in the previous commit now pass (as well as all the other tests).

Signed-off-by: Noah Goldstein
---
It's possible there is room for a speedup in strnlen-avx2 and strnlen-evex if we check for overflow first and jump to strlen. This allows end pointers to be used instead of tracking length, which will save some ALU / code size.
sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++------- sysdeps/x86_64/strlen.S | 14 ++- 2 files changed, 106 insertions(+), 38 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S index bd2e6ee44a..b282a75613 100644 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S @@ -44,21 +44,21 @@ # define VEC_SIZE 32 # define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section SECTION(.text),"ax",@progbits ENTRY (STRLEN) # ifdef USE_AS_STRNLEN /* Check zero length. */ +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RSI_LP, %RSI_LP +# else test %RSI_LP, %RSI_LP +# endif jz L(zero) /* Store max len in R8_LP before adjusting if using WCSLEN. */ mov %RSI_LP, %R8_LP -# ifdef USE_AS_WCSLEN - shl $2, %RSI_LP -# elif defined __ILP32__ - /* Clear the upper 32 bits. */ - movl %esi, %esi -# endif # endif movl %edi, %eax movq %rdi, %rdx @@ -72,10 +72,10 @@ ENTRY (STRLEN) /* Check the first VEC_SIZE bytes. */ VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + vpmovmskb %ymm1, %eax # ifdef USE_AS_STRNLEN /* If length < VEC_SIZE handle special. */ - cmpq $VEC_SIZE, %rsi + cmpq $CHAR_PER_VEC, %rsi jbe L(first_vec_x0) # endif /* If empty continue to aligned_more. Otherwise return bit @@ -84,6 +84,7 @@ ENTRY (STRLEN) jz L(aligned_more) tzcntl %eax, %eax # ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrl $2, %eax # endif VZEROUPPER_RETURN @@ -97,9 +98,14 @@ L(zero): L(first_vec_x0): /* Set bit for max len so that tzcnt will return min of max len and position of first match. */ +# ifdef USE_AS_WCSLEN + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %esi +# endif btsq %rsi, %rax tzcntl %eax, %eax # ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. 
*/ shrl $2, %eax # endif VZEROUPPER_RETURN @@ -113,14 +119,19 @@ L(first_vec_x1): # ifdef USE_AS_STRNLEN /* Use ecx which was computed earlier to compute correct value. */ +# ifdef USE_AS_WCSLEN + leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax +# else subl $(VEC_SIZE * 4 + 1), %ecx addl %ecx, %eax +# endif # else subl %edx, %edi incl %edi addl %edi, %eax # endif # ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrl $2, %eax # endif VZEROUPPER_RETURN @@ -133,14 +144,19 @@ L(first_vec_x2): # ifdef USE_AS_STRNLEN /* Use ecx which was computed earlier to compute correct value. */ +# ifdef USE_AS_WCSLEN + leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax +# else subl $(VEC_SIZE * 3 + 1), %ecx addl %ecx, %eax +# endif # else subl %edx, %edi addl $(VEC_SIZE + 1), %edi addl %edi, %eax # endif # ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrl $2, %eax # endif VZEROUPPER_RETURN @@ -153,14 +169,19 @@ L(first_vec_x3): # ifdef USE_AS_STRNLEN /* Use ecx which was computed earlier to compute correct value. */ +# ifdef USE_AS_WCSLEN + leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax +# else subl $(VEC_SIZE * 2 + 1), %ecx addl %ecx, %eax +# endif # else subl %edx, %edi addl $(VEC_SIZE * 2 + 1), %edi addl %edi, %eax # endif # ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrl $2, %eax # endif VZEROUPPER_RETURN @@ -173,14 +194,19 @@ L(first_vec_x4): # ifdef USE_AS_STRNLEN /* Use ecx which was computed earlier to compute correct value. */ +# ifdef USE_AS_WCSLEN + leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax +# else subl $(VEC_SIZE + 1), %ecx addl %ecx, %eax +# endif # else subl %edx, %edi addl $(VEC_SIZE * 3 + 1), %edi addl %edi, %eax # endif # ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get wchar_t count. */ shrl $2, %eax # endif VZEROUPPER_RETURN @@ -195,10 +221,14 @@ L(cross_page_continue): /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. 
 	 */
 # ifdef USE_AS_STRNLEN
-	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
-	   it simplies the logic in last_4x_vec_or_less.  */
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+	   because it simplifies the logic in last_4x_vec_or_less.  */
 	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
 	subq	%rdx, %rcx
+# ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+# endif
 # endif
 	/* Load first VEC regardless.  */
 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
@@ -207,34 +237,38 @@ L(cross_page_continue):
 	subq	%rcx, %rsi
 	jb	L(last_4x_vec_or_less)
 # endif
-	vpmovmskb %ymm1, %eax
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)

 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)

 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)

 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x4)

 	/* Align data to VEC_SIZE * 4 - 1.  */
 # ifdef USE_AS_STRNLEN
 	/* Before adjusting length check if at last VEC_SIZE * 4.  */
-	cmpq	$(VEC_SIZE * 4 - 1), %rsi
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
 	jbe	L(last_4x_vec_or_less_load)
 	incq	%rdi
 	movl	%edi, %ecx
 	orq	$(VEC_SIZE * 4 - 1), %rdi
 	andl	$(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+# endif
 	/* Readjust length.  */
 	addq	%rcx, %rsi
 # else
@@ -246,13 +280,13 @@ L(cross_page_continue):
 L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 	/* Break if at end of length.  */
-	subq	$(VEC_SIZE * 4), %rsi
+	subq	$(CHAR_PER_VEC * 4), %rsi
 	jb	L(last_4x_vec_or_less_cmpeq)
 # endif
-	/* Save some code size by microfusing VPMINU with the load. Since
-	   the matches in ymm2/ymm4 can only be returned if there where no
-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
-	 */
+	/* Save some code size by microfusing VPMINU with the load.
+	   Since the matches in ymm2/ymm4 can only be returned if there
+	   were no matches in ymm1/ymm3 respectively there is no issue
+	   with overlap.  */
 	vmovdqa	1(%rdi), %ymm1
 	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
 	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
@@ -260,7 +294,7 @@ L(loop_4x_vec):
 	VPMINU	%ymm2, %ymm4, %ymm5

 	VPCMPEQ	%ymm5, %ymm0, %ymm5
-	vpmovmskb %ymm5, %ecx
+	vpmovmskb	%ymm5, %ecx

 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
@@ -268,27 +302,28 @@ L(loop_4x_vec):

 	VPCMPEQ	%ymm1, %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb	%ymm1, %eax
 	subq	%rdx, %rdi
 	testl	%eax, %eax
 	jnz	L(last_vec_return_x0)

 	VPCMPEQ	%ymm2, %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
+	vpmovmskb	%ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_return_x1)

 	/* Combine last 2 VEC.  */
 	VPCMPEQ	%ymm3, %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	/* rcx has combined result from all 4 VEC. It will only be used if
-	   the first 3 other VEC all did not contain a match.  */
+	vpmovmskb	%ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
 	salq	$32, %rcx
 	orq	%rcx, %rax
 	tzcntq	%rax, %rax
 	subq	$(VEC_SIZE * 2 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -297,15 +332,19 @@ L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 	.p2align 4
 L(last_4x_vec_or_less_load):
-	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
+	 */
 	subq	$-(VEC_SIZE * 4), %rdi
 L(last_4x_vec_or_less_cmpeq):
 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 L(last_4x_vec_or_less):
-
-	vpmovmskb %ymm1, %eax
-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
-	   VEC_SIZE * 4.  */
+# ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+# endif
+	vpmovmskb	%ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2.  This works if esi is off
+	   by VEC_SIZE * 4.  */
 	testl	$(VEC_SIZE * 2), %esi
 	jnz	L(last_4x_vec)
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
 	jb	L(max)

 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb	%ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 	cmpl	%eax, %esi
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
 	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
 	subq	$(VEC_SIZE * 4 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
 	subq	$(VEC_SIZE * 3 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
 	incl	%eax
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -381,14 +424,14 @@ L(last_4x_vec):
 	jnz	L(last_vec_x1)

 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x2)

 	/* Normalize length.  */
 	andl	$(VEC_SIZE * 4 - 1), %esi
 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x3)
@@ -396,7 +439,7 @@ L(last_4x_vec):
 	jb	L(max)

 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb	%ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 	cmpl	%eax, %esi
@@ -405,6 +448,7 @@ L(last_4x_vec):
 	addl	$(VEC_SIZE * 3 + 1), %eax
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -419,6 +463,7 @@ L(last_vec_x1):
 	incl	%eax
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -432,6 +477,7 @@ L(last_vec_x2):
 	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -447,6 +493,7 @@ L(last_vec_x3):
 	addl	$(VEC_SIZE * 2 + 1), %eax
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -455,13 +502,13 @@ L(max_end):
 	VZEROUPPER_RETURN
 # endif

-	/* Cold case for crossing page with first load.	 */
+	/* Cold case for crossing page with first load.  */
 	.p2align 4
 L(cross_page_boundary):
 	/* Align data to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
 	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb	%ymm1, %eax
 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
 	   so no need to manually mod rdx.  */
 	sarxl	%edx, %eax, %eax
@@ -470,6 +517,10 @@ L(cross_page_boundary):
 	jnz	L(cross_page_less_vec)
 	leaq	1(%rdi), %rcx
 	subq	%rdx, %rcx
+# ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %ecx
+# endif
 	/* Check length.  */
 	cmpq	%rsi, %rcx
 	jb	L(cross_page_continue)
@@ -479,6 +530,7 @@ L(cross_page_boundary):
 	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide length by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 # endif
@@ -489,6 +541,10 @@ L(return_vzeroupper):
 	.p2align 4
 L(cross_page_less_vec):
 	tzcntl	%eax, %eax
+# ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+# endif
 	cmpq	%rax, %rsi
 	cmovb	%esi, %eax
 # ifdef USE_AS_WCSLEN
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index d223ea1700..3fc6734910 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -65,12 +65,24 @@ ENTRY(strlen)
 	ret
 L(n_nonzero):
 # ifdef AS_WCSLEN
-	shl	$2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t).  If it would
+   overflow, the only way this program doesn't have undefined behavior
+   is if there is a null terminator in valid memory, so strlen will
+   suffice.  */
+	mov	%RSI_LP, %R10_LP
+	sar	$62, %R10_LP
+	test	%R10_LP, %R10_LP
+	jnz	__wcslen_sse2
+	sal	$2, %RSI_LP
 # endif

 	/* Initialize long lived registers.  */
 	add	%RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
+	jbe	__wcslen_sse2
+# endif
 	mov	%RSI_LP, %R10_LP
 	and	$-64, %R10_LP
 	mov	%RSI_LP, %R11_LP