From patchwork Thu Apr 21 03:14:08 2022
X-Patchwork-Id: 53083
From: Noah Goldstein <goldstein.w.n@gmail.com>
To: libc-alpha@sourceware.org
Subject: [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2
Date: Wed, 20 Apr 2022 22:14:08 -0500
Message-Id: <20220421031410.2142238-2-goldstein.w.n@gmail.com>
In-Reply-To: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
References: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.741

See email for all results. (C sketches illustrating the approach are
appended after the diff.)

Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr

Geometric Mean of N=30 runs.

Geometric Mean of all benchmarks New / Old: 0.741

Benchmarks performance on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

 len, align, pos, seek, max_char, freq, New Time / Old Time
2048, 0, 32, 0, 127, 1, 0.647
2048, 1, 32, 0, 127, 1, 0.621
2048, 0, 64, 0, 127, 1, 0.661
2048, 2, 64, 0, 127, 1, 0.655
2048, 0, 128, 0, 127, 1, 0.69
2048, 3, 128, 0, 127, 1, 0.689
2048, 0, 256, 0, 127, 1, 0.718
2048, 4, 256, 0, 127, 1, 0.718
2048, 0, 512, 0, 127, 1, 0.758
2048, 5, 512, 0, 127, 1, 0.754
2048, 0, 1024, 0, 127, 1, 1.029
2048, 6, 1024, 0, 127, 1, 1.032
2048, 0, 2048, 0, 127, 1, 0.826
2048, 7, 2048, 0, 127, 1, 0.834
2048, 0, 4096, 0, 127, 1, 0.825
2048, 8, 4096, 0, 127, 1, 0.83
256, 1, 64, 0, 127, 1, 0.657
256, 15, 64, 0, 127, 1, 0.657
256, 2, 64, 0, 127, 1, 0.657
256, 30, 64, 0, 127, 1, 0.523
256, 3, 64, 0, 127, 1, 0.657
256, 45, 64, 0, 127, 1, 0.654
256, 4, 64, 0, 127, 1, 0.657
256, 60, 64, 0, 127, 1, 0.526
256, 5, 64, 0, 127, 1, 0.658
256, 75, 64, 0, 127, 1, 0.658
256, 6, 64, 0, 127, 1, 0.655
256, 90, 64, 0, 127, 1, 0.523
256, 7, 64, 0, 127, 1, 0.655
256, 105, 64, 0, 127, 1, 0.654
1, 0, 0, 0, 127, 1, 0.98
2, 0, 1, 0, 127, 1, 0.978
3, 0, 2, 0, 127, 1, 0.975
4, 0, 3, 0, 127, 1, 0.976
5, 0, 4, 0, 127, 1, 0.977
6, 0, 5, 0, 127, 1, 0.981
7, 0, 6, 0, 127, 1, 0.982
8, 0, 7, 0, 127, 1, 0.98
9, 0, 8, 0, 127, 1, 0.978
10, 0, 9, 0, 127, 1, 0.981
11, 0, 10, 0, 127, 1, 0.984
12, 0, 11, 0, 127, 1, 0.982
13, 0, 12, 0, 127, 1, 0.98
14, 0, 13, 0, 127, 1, 0.978
15, 0, 14, 0, 127, 1, 0.979
16, 0, 15, 0, 127, 1, 0.986
17, 0, 16, 0, 127, 1, 0.529
18, 0, 17, 0, 127, 1, 0.566
19, 0, 18, 0, 127, 1, 0.575
20, 0, 19, 0, 127, 1, 0.573
21, 0, 20, 0, 127, 1, 0.579
22, 0, 21, 0, 127, 1, 0.595
23, 0, 22, 0, 127, 1, 0.585
24, 0, 23, 0, 127, 1, 0.586
25, 0, 24, 0, 127, 1, 0.587
26, 0, 25, 0, 127, 1, 0.592
27, 0, 26, 0, 127, 1, 0.595
28, 0, 27, 0, 127, 1, 0.592
29, 0, 28, 0, 127, 1, 0.6
30, 0, 29, 0, 127, 1, 0.598
31, 0, 30, 0, 127, 1, 0.595
32, 0, 31, 0, 127, 1, 0.592
2048, 0, 32, 23, 127, 1, 0.827
2048, 1, 32, 23, 127, 1, 0.826
2048, 0, 64, 23, 127, 1, 0.824
2048, 2, 64, 23, 127, 1, 0.825
2048, 0, 128, 23, 127, 1, 0.829
2048, 3, 128, 23, 127, 1, 0.824
2048, 0, 256, 23, 127, 1, 0.832
2048, 4, 256, 23, 127, 1, 0.825
2048, 0, 512, 23, 127, 1, 0.831
2048, 5, 512, 23, 127, 1, 0.837
2048, 0, 1024, 23, 127, 1, 0.721
2048, 6, 1024, 23, 127, 1, 0.757
2048, 0, 2048, 23, 127, 1, 0.825
2048, 7, 2048, 23, 127, 1, 0.824
2048, 0, 4096, 23, 127, 1, 0.828
2048, 8, 4096, 23, 127, 1, 0.823
256, 1, 64, 23, 127, 1, 0.665
256, 15, 64, 23, 127, 1, 0.661
256, 2, 64, 23, 127, 1, 0.674
256, 30, 64, 23, 127, 1, 0.605
256, 3, 64, 23, 127, 1, 0.668
256, 45, 64, 23, 127, 1, 0.661
256, 4, 64, 23, 127, 1, 0.657
256, 60, 64, 23, 127, 1, 0.594
256, 5, 64, 23, 127, 1, 0.654
256, 75, 64, 23, 127, 1, 0.673
256, 6, 64, 23, 127, 1, 0.688
256, 90, 64, 23, 127, 1, 0.6
256, 7, 64, 23, 127, 1, 0.66
256, 105, 64, 23, 127, 1, 0.654
1, 0, 0, 23, 127, 1, 0.981
2, 0, 1, 23, 127, 1, 0.976
3, 0, 2, 23, 127, 1, 0.983
4, 0, 3, 23, 127, 1, 0.984
5, 0, 4, 23, 127, 1, 0.973
6, 0, 5, 23, 127, 1, 0.987
7, 0, 6, 23, 127, 1, 0.977
8, 0, 7, 23, 127, 1, 0.979
9, 0, 8, 23, 127, 1, 0.981
10, 0, 9, 23, 127, 1, 0.98
11, 0, 10, 23, 127, 1, 0.983
12, 0, 11, 23, 127, 1, 0.98
13, 0, 12, 23, 127, 1, 0.98
14, 0, 13, 23, 127, 1, 0.977
15, 0, 14, 23, 127, 1, 0.982
16, 0, 15, 23, 127, 1, 0.581
17, 0, 16, 23, 127, 1, 0.551
18, 0, 17, 23, 127, 1, 0.555
19, 0, 18, 23, 127, 1, 0.586
20, 0, 19, 23, 127, 1, 0.585
21, 0, 20, 23, 127, 1, 0.582
22, 0, 21, 23, 127, 1, 0.571
23, 0, 22, 23, 127, 1, 0.576
24, 0, 23, 23, 127, 1, 0.581
25, 0, 24, 23, 127, 1, 0.589
26, 0, 25, 23, 127, 1, 0.593
27, 0, 26, 23, 127, 1, 0.595
28, 0, 27, 23, 127, 1, 0.583
29, 0, 28, 23, 127, 1, 0.595
30, 0, 29, 23, 127, 1, 0.58
31, 0, 30, 23, 127, 1, 0.594
32, 0, 31, 23, 127, 1, 0.665
2048, 0, 32, 23, 127, 2, 0.825
2048, 1, 32, 23, 127, 2, 0.818
2048, 0, 64, 23, 127, 2, 0.829
2048, 2, 64, 23, 127, 2, 0.828
2048, 0, 128, 23, 127, 2, 0.823
2048, 3, 128, 23, 127, 2, 0.825
2048, 0, 256, 23, 127, 2, 0.819
2048, 4, 256, 23, 127, 2, 0.828
2048, 0, 512, 23, 127, 2, 0.824
2048, 5, 512, 23, 127, 2, 0.827
2048, 0, 1024, 23, 127, 2, 0.813
2048, 6, 1024, 23, 127, 2, 0.834
2048, 0, 2048, 23, 127, 2, 0.927
2048, 7, 2048, 23, 127, 2, 0.923
2048, 0, 4096, 23, 127, 2, 0.818
2048, 8, 4096, 23, 127, 2, 0.82
256, 1, 64, 23, 127, 2, 0.693
256, 15, 64, 23, 127, 2, 0.686
256, 2, 64, 23, 127, 2, 0.69
256, 30, 64, 23, 127, 2, 0.611
256, 3, 64, 23, 127, 2, 0.692
256, 45, 64, 23, 127, 2, 0.685
256, 4, 64, 23, 127, 2, 0.688
256, 60, 64, 23, 127, 2, 0.6
256, 5, 64, 23, 127, 2, 0.69
256, 75, 64, 23, 127, 2, 0.689
256, 6, 64, 23, 127, 2, 0.688
256, 90, 64, 23, 127, 2, 0.611
256, 7, 64, 23, 127, 2, 0.69
256, 105, 64, 23, 127, 2, 0.686
1, 0, 0, 23, 127, 2, 0.982
2, 0, 1, 23, 127, 2, 0.987
3, 0, 2, 23, 127, 2, 0.978
4, 0, 3, 23, 127, 2, 0.977
5, 0, 4, 23, 127, 2, 0.979
6, 0, 5, 23, 127, 2, 0.985
7, 0, 6, 23, 127, 2, 0.975
8, 0, 7, 23, 127, 2, 0.981
9, 0, 8, 23, 127, 2, 0.984
10, 0, 9, 23, 127, 2, 0.983
11, 0, 10, 23, 127, 2, 0.982
12, 0, 11, 23, 127, 2, 0.976
13, 0, 12, 23, 127, 2, 0.985
14, 0, 13, 23, 127, 2, 0.984
15, 0, 14, 23, 127, 2, 0.98
16, 0, 15, 23, 127, 2, 0.583
17, 0, 16, 23, 127, 2, 0.552
18, 0, 17, 23, 127, 2, 0.564
19, 0, 18, 23, 127, 2, 0.585
20, 0, 19, 23, 127, 2, 0.578
21, 0, 20, 23, 127, 2, 0.578
22, 0, 21, 23, 127, 2, 0.571
23, 0, 22, 23, 127, 2, 0.587
24, 0, 23, 23, 127, 2, 0.589
25, 0, 24, 23, 127, 2, 0.593
26, 0, 25, 23, 127, 2, 0.589
27, 0, 26, 23, 127, 2, 0.588
28, 0, 27, 23, 127, 2, 0.593
29, 0, 28, 23, 127, 2, 0.579
30, 0, 29, 23, 127, 2, 0.572
31, 0, 30, 23, 127, 2, 0.582
32, 0, 31, 23, 127, 2, 0.659
2048, 0, 32, 23, 127, 4, 0.822
2048, 1, 32, 23, 127, 4, 0.818
2048, 0, 64, 23, 127, 4, 0.826
2048, 2, 64, 23, 127, 4, 0.824
2048, 0, 128, 23, 127, 4, 0.833
2048, 3, 128, 23, 127, 4, 0.831
2048, 0, 256, 23, 127, 4, 0.826
2048, 4, 256, 23, 127, 4, 0.831
2048, 0, 512, 23, 127, 4, 0.834
2048, 5, 512, 23, 127, 4, 0.83
2048, 0, 1024, 23, 127, 4, 0.836
2048, 6, 1024, 23, 127, 4, 0.844
2048, 0, 2048, 23, 127, 4, 0.696
2048, 7, 2048, 23, 127, 4, 0.704
2048, 0, 4096, 23, 127, 4, 0.936
2048, 8, 4096, 23, 127, 4, 0.925
256, 1, 64, 23, 127, 4, 0.694
256, 15, 64, 23, 127, 4, 0.69
256, 2, 64, 23, 127, 4, 0.687
256, 30, 64, 23, 127, 4, 0.612
256, 3, 64, 23, 127, 4, 0.685
256, 45, 64, 23, 127, 4, 0.685
256, 4, 64, 23, 127, 4, 0.684
256, 60, 64, 23, 127, 4, 0.606
256, 5, 64, 23, 127, 4, 0.69
256, 75, 64, 23, 127, 4, 0.688
256, 6, 64, 23, 127, 4, 0.69
256, 90, 64, 23, 127, 4, 0.615
256, 7, 64, 23, 127, 4, 0.691
256, 105, 64, 23, 127, 4, 0.688
1, 0, 0, 23, 127, 4, 0.982
2, 0, 1, 23, 127, 4, 0.983
3, 0, 2, 23, 127, 4, 0.981
4, 0, 3, 23, 127, 4, 0.984
5, 0, 4, 23, 127, 4, 0.963
6, 0, 5, 23, 127, 4, 0.978
7, 0, 6, 23, 127, 4, 0.985
8, 0, 7, 23, 127, 4, 0.986
9, 0, 8, 23, 127, 4, 0.978
10, 0, 9, 23, 127, 4, 0.985
11, 0, 10, 23, 127, 4, 0.986
12, 0, 11, 23, 127, 4, 0.983
13, 0, 12, 23, 127, 4, 0.986
14, 0, 13, 23, 127, 4, 0.98
15, 0, 14, 23, 127, 4, 0.979
16, 0, 15, 23, 127, 4, 0.582
17, 0, 16, 23, 127, 4, 0.542
18, 0, 17, 23, 127, 4, 0.564
19, 0, 18, 23, 127, 4, 0.571
20, 0, 19, 23, 127, 4, 0.582
21, 0, 20, 23, 127, 4, 0.573
22, 0, 21, 23, 127, 4, 0.575
23, 0, 22, 23, 127, 4, 0.578
24, 0, 23, 23, 127, 4, 0.58
25, 0, 24, 23, 127, 4, 0.592
26, 0, 25, 23, 127, 4, 0.588
27, 0, 26, 23, 127, 4, 0.574
28, 0, 27, 23, 127, 4, 0.589
29, 0, 28, 23, 127, 4, 0.56
30, 0, 29, 23, 127, 4, 0.587
31, 0, 30, 23, 127, 4, 0.584
32, 0, 31, 23, 127, 4, 0.664
2048, 0, 32, 23, 127, 8, 0.826
2048, 1, 32, 23, 127, 8, 0.821
2048, 0, 64, 23, 127, 8, 0.828
2048, 2, 64, 23, 127, 8, 0.827
2048, 0, 128, 23, 127, 8, 0.833
2048, 3, 128, 23, 127, 8, 0.83
2048, 0, 256, 23, 127, 8, 0.855
2048, 4, 256, 23, 127, 8, 0.849
2048, 0, 512, 23, 127, 8, 0.849
2048, 5, 512, 23, 127, 8, 0.851
2048, 0, 1024, 23, 127, 8, 0.856
2048, 6, 1024, 23, 127, 8, 0.862
2048, 0, 2048, 23, 127, 8, 0.709
2048, 7, 2048, 23, 127, 8, 0.712
2048, 0, 4096, 23, 127, 8, 0.702
2048, 8, 4096, 23, 127, 8, 0.701
256, 1, 64, 23, 127, 8, 0.689
256, 15, 64, 23, 127, 8, 0.688
256, 2, 64, 23, 127, 8, 0.691
256, 30, 64, 23, 127, 8, 0.612
256, 3, 64, 23, 127, 8, 0.688
256, 45, 64, 23, 127, 8, 0.686
256, 4, 64, 23, 127, 8, 0.694
256, 60, 64, 23, 127, 8, 0.609
256, 5, 64, 23, 127, 8, 0.69
256, 75, 64, 23, 127, 8, 0.69
256, 6, 64, 23, 127, 8, 0.691
256, 90, 64, 23, 127, 8, 0.612
256, 7, 64, 23, 127, 8, 0.689
256, 105, 64, 23, 127, 8, 0.688
1, 0, 0, 23, 127, 8, 0.98
2, 0, 1, 23, 127, 8, 0.978
3, 0, 2, 23, 127, 8, 0.98
4, 0, 3, 23, 127, 8, 0.978
5, 0, 4, 23, 127, 8, 0.977
6, 0, 5, 23, 127, 8, 0.984
7, 0, 6, 23, 127, 8, 0.982
8, 0, 7, 23, 127, 8, 0.983
9, 0, 8, 23, 127, 8, 0.987
10, 0, 9, 23, 127, 8, 0.979
11, 0, 10, 23, 127, 8, 0.985
12, 0, 11, 23, 127, 8, 0.981
13, 0, 12, 23, 127, 8, 0.98
14, 0, 13, 23, 127, 8, 0.982
15, 0, 14, 23, 127, 8, 0.981
16, 0, 15, 23, 127, 8, 0.579
17, 0, 16, 23, 127, 8, 0.531
18, 0, 17, 23, 127, 8, 0.577
19, 0, 18, 23, 127, 8, 0.588
20, 0, 19, 23, 127, 8, 0.571
21, 0, 20, 23, 127, 8, 0.576
22, 0, 21, 23, 127, 8, 0.59
23, 0, 22, 23, 127, 8, 0.574
24, 0, 23, 23, 127, 8, 0.583
25, 0, 24, 23, 127, 8, 0.581
26, 0, 25, 23, 127, 8, 0.592
27, 0, 26, 23, 127, 8, 0.586
28, 0, 27, 23, 127, 8, 0.588
29, 0, 28, 23, 127, 8, 0.578
30, 0, 29, 23, 127, 8, 0.573
31, 0, 30, 23, 127, 8, 0.588
32, 0, 31, 23, 127, 8, 0.664
2048, 0, 32, 23, 127, 16, 0.825
2048, 1, 32, 23, 127, 16, 0.823
2048, 0, 64, 23, 127, 16, 0.831
2048, 2, 64, 23, 127, 16, 0.822
2048, 0, 128, 23, 127, 16, 0.831
2048, 3, 128, 23, 127, 16, 0.831
2048, 0, 256, 23, 127, 16, 0.849
2048, 4, 256, 23, 127, 16, 0.85
2048, 0, 512, 23, 127, 16, 0.751
2048, 5, 512, 23, 127, 16, 0.75
2048, 0, 1024, 23, 127, 16, 0.913
2048, 6, 1024, 23, 127, 16, 0.895
2048, 0, 2048, 23, 127, 16, 0.736
2048, 7, 2048, 23, 127, 16, 0.741
2048, 0, 4096, 23, 127, 16, 0.712
2048, 8, 4096, 23, 127, 16, 0.711
256, 1, 64, 23, 127, 16, 0.758
256, 15, 64, 23, 127, 16, 0.692
256, 2, 64, 23, 127, 16, 0.692
256, 30, 64, 23, 127, 16, 0.613
256, 3, 64, 23, 127, 16, 0.69
256, 45, 64, 23, 127, 16, 0.687
256, 4, 64, 23, 127, 16, 0.69
256, 60, 64, 23, 127, 16, 0.604
256, 5, 64, 23, 127, 16, 0.687
256, 75, 64, 23, 127, 16, 0.687
256, 6, 64, 23, 127, 16, 0.69
256, 90, 64, 23, 127, 16, 0.61
256, 7, 64, 23, 127, 16, 0.69
256, 105, 64, 23, 127, 16, 0.685
1, 0, 0, 23, 127, 16, 0.981
2, 0, 1, 23, 127, 16, 0.985
3, 0, 2, 23, 127, 16, 0.985
4, 0, 3, 23, 127, 16, 0.981
5, 0, 4, 23, 127, 16, 0.979
6, 0, 5, 23, 127, 16, 0.986
7, 0, 6, 23, 127, 16, 0.986
8, 0, 7, 23, 127, 16, 0.982
9, 0, 8, 23, 127, 16, 0.982
10, 0, 9, 23, 127, 16, 0.98
11, 0, 10, 23, 127, 16, 0.983
12, 0, 11, 23, 127, 16, 0.982
13, 0, 12, 23, 127, 16, 0.982
14, 0, 13, 23, 127, 16, 0.982
15, 0, 14, 23, 127, 16, 0.982
16, 0, 15, 23, 127, 16, 0.582
17, 0, 16, 23, 127, 16, 0.542
18, 0, 17, 23, 127, 16, 0.554
19, 0, 18, 23, 127, 16, 0.562
20, 0, 19, 23, 127, 16, 0.587
21, 0, 20, 23, 127, 16, 0.584
22, 0, 21, 23, 127, 16, 0.587
23, 0, 22, 23, 127, 16, 0.594
24, 0, 23, 23, 127, 16, 0.581
25, 0, 24, 23, 127, 16, 0.577
26, 0, 25, 23, 127, 16, 0.588
27, 0, 26, 23, 127, 16, 0.589
28, 0, 27, 23, 127, 16, 0.596
29, 0, 28, 23, 127, 16, 0.591
30, 0, 29, 23, 127, 16, 0.585
31, 0, 30, 23, 127, 16, 0.59
32, 0, 31, 23, 127, 16, 0.669

 sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
 sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
 sysdeps/x86_64/strrchr.S                | 505 +++++++++++++++---------
 sysdeps/x86_64/wcsrchr.S                | 268 +------------
 4 files changed, 334 insertions(+), 444 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index db1b44c23c..866396e947 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
 
 # undef weak_alias
 # define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index 78d1ca6553..69d2f3cdb1 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR __wcsrchr_sse2
 #endif
-
 #include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 50d886713e..94449ad806 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,355 @@
 
 #include <sysdep.h>
 
+#ifndef STRRCHR
+# define STRRCHR	strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define PMINU	pminud
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
+# define PMINU	pminub
+#endif
+
+#define PAGE_SIZE	4096
+#define VEC_SIZE	16
+
 	.text
-ENTRY (strrchr)
-	movd	%esi, %xmm1
+ENTRY(STRRCHR)
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	andl	$4095, %eax
-	punpcklbw	%xmm1, %xmm1
-	cmpq	$4032, %rax
-	punpcklwd	%xmm1, %xmm1
-	pshufd	$0, %xmm1, %xmm1
+	andl	$(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+#endif
+	pshufd	$0, %xmm0, %xmm0
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page)
-	movdqu	(%rdi), %xmm0
+
+L(cross_page_continue):
+	movups	(%rdi), %xmm1
 	pxor	%xmm2, %xmm2
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb	%xmm0, %ecx
-	pmovmskb	%xmm3, %edx
-	testq	%rdx, %rdx
-	je	L(next_48_bytes)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rcx, %rax
-	je	L(exit)
-	bsrq	%rax, %rax
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero.  If
+	   search CHAR is zero we are correct.  Either way `andq
+	   $-CHAR_SIZE, %rax` gets the correct result.  */
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret0):
 	ret
 
+	/* Returns for first vec x1/x2 have hard-coded backward search
+	   path for earlier matches.  */
 	.p2align 4
-L(next_48_bytes):
-	movdqu	16(%rdi), %xmm4
-	movdqa	%xmm4, %xmm5
-	movdqu	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm2, %xmm5
-	movdqu	48(%rdi), %xmm0
-	pmovmskb	%xmm5, %edx
-	movdqa	%xmm3, %xmm5
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm5
-	pcmpeqb	%xmm0, %xmm2
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r8d
-	pmovmskb	%xmm5, %eax
-	pmovmskb	%xmm2, %esi
-	salq	$32, %r8
-	salq	$32, %rax
-	pcmpeqb	%xmm1, %xmm0
-	orq	%rdx, %rax
-	movq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	salq	$48, %rdx
-	salq	$16, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
-	pmovmskb	%xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rsi
-	orq	%rdx, %rax
-	je	L(loop_header2)
-	leaq	-1(%rax), %rcx
-	xorq	%rax, %rcx
-	andq	%rcx, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rsi
-	leaq	(%rdi,%rsi), %rax
+L(first_vec_x0_test):
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
-L(loop_header2):
-	testq	%rsi, %rsi
-	movq	%rdi, %rcx
-	je	L(no_c_found)
-L(loop_header):
-	addq	$64, %rdi
-	pxor	%xmm7, %xmm7
-	andq	$-64, %rdi
-	jmp	L(loop_entry)
+L(first_vec_x1):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
 
 	.p2align 4
-L(loop64):
-	testq	%rdx, %rdx
-	cmovne	%rdx, %rsi
-	cmovne	%rdi, %rcx
-	addq	$64, %rdi
-L(loop_entry):
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm6, %xmm6
-	movdqa	48(%rdi), %xmm2
-	movdqa	%xmm3, %xmm0
-	movdqa	16(%rdi), %xmm4
-	pminub	%xmm2, %xmm0
-	movdqa	(%rdi), %xmm5
-	pminub	%xmm4, %xmm0
-	pminub	%xmm5, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %eax
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %r9d
-	movdqa	%xmm4, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %edx
-	movdqa	%xmm3, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm0, %r10d
-	movdqa	%xmm2, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$32, %r10
-	orq	%r10, %rdx
-	pmovmskb	%xmm0, %r8d
-	orq	%r9, %rdx
-	salq	$48, %r8
-	orq	%r8, %rdx
+L(first_vec_x1_test):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
 	testl	%eax, %eax
-	je	L(loop64)
-	pcmpeqb	%xmm6, %xmm4
-	pcmpeqb	%xmm6, %xmm3
-	pcmpeqb	%xmm6, %xmm5
-	pmovmskb	%xmm4, %eax
-	pmovmskb	%xmm3, %r10d
-	pcmpeqb	%xmm6, %xmm2
-	pmovmskb	%xmm5, %r9d
-	salq	$32, %r10
-	salq	$16, %rax
-	pmovmskb	%xmm2, %r8d
-	orq	%r10, %rax
-	orq	%r9, %rax
-	salq	$48, %r8
-	orq	%r8, %rax
-	leaq	-1(%rax), %r8
-	xorq	%rax, %r8
-	andq	%r8, %rdx
-	cmovne	%rdi, %rcx
-	cmovne	%rdx, %rsi
-	bsrq	%rsi, %rsi
-	leaq	(%rcx,%rsi), %rax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+	andq	$-VEC_SIZE, %rdi
+
+	movaps	VEC_SIZE(%rdi), %xmm2
+	pxor	%xmm3, %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pmovmskb %xmm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
+	pxor	%xmm4, %xmm4
+	PCMPEQ	%xmm3, %xmm4
+	pmovmskb %xmm4, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	addq	$VEC_SIZE, %rdi
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_loop):
+	/* Do 2x VEC at a time.  */
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Plain SSE2 has no pminud.  */
+#ifdef NO_PMINU
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef NO_PMINU
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
+	   macro-fuse with `jz`.  */
+	addl	%ecx, %eax
+	jz	L(first_loop)
+
+	/* Check if there is a zero match.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+	/* Check if there was a match in the last iteration.  */
+	subl	%ecx, %eax
+	jnz	L(new_match)
+
+L(first_loop_old_match):
+	PCMPEQ	%xmm0, %xmm2
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	addl	%eax, %ecx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through.  The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	sall	$16, %eax
+	orl	%ecx, %eax
+
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
+	/* Save minimum state for getting the most recent match.  We
+	   can throw out all previous work.  */
 	.p2align 4
-L(no_c_found):
-	movl	$1, %esi
-	xorl	%ecx, %ecx
-	jmp	L(loop_header)
+L(second_loop_match):
+	movq	%rdi, %rsi
+	movaps	%xmm4, %xmm2
+	movaps	%xmm7, %xmm3
 
 	.p2align 4
-L(exit):
-	xorl	%eax, %eax
+L(second_loop):
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+#ifdef NO_PMINU
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef NO_PMINU
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurrence of CHAR.  */
+	addl	%ecx, %eax
+	jz	L(second_loop)
+
+	/* No null term so it must be a new occurrence of CHAR.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+
+	subl	%ecx, %eax
+	jnz	L(second_loop_new_match)
+
+L(second_loop_old_match):
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	sall	$16, %eax
+	orl	%ecx, %eax
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
+L(second_loop_new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(second_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4,, 4
 L(cross_page):
-	movq	%rdi, %rax
-	pxor	%xmm0, %xmm0
-	andq	$-64, %rax
-	movdqu	(%rax), %xmm5
-	movdqa	%xmm5, %xmm6
-	movdqu	16(%rax), %xmm4
-	pcmpeqb	%xmm1, %xmm5
-	pcmpeqb	%xmm0, %xmm6
-	movdqu	32(%rax), %xmm3
-	pmovmskb	%xmm6, %esi
-	movdqa	%xmm4, %xmm6
-	movdqu	48(%rax), %xmm2
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm0, %xmm6
-	pmovmskb	%xmm6, %edx
-	movdqa	%xmm3, %xmm6
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm0, %xmm6
-	pcmpeqb	%xmm2, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r9d
-	pmovmskb	%xmm6, %r8d
-	pmovmskb	%xmm0, %ecx
-	salq	$32, %r9
-	salq	$32, %r8
-	pcmpeqb	%xmm1, %xmm2
-	orq	%r8, %rdx
-	salq	$48, %rcx
-	pmovmskb	%xmm5, %r8d
-	orq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	orq	%rcx, %rdx
-	pmovmskb	%xmm2, %ecx
-	salq	$16, %rsi
-	salq	$48, %rcx
-	orq	%r9, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	movaps	(%rsi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
 	movl	%edi, %ecx
-	subl	%eax, %ecx
-	shrq	%cl, %rdx
-	shrq	%cl, %rsi
-	testq	%rdx, %rdx
-	je	L(loop_header2)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rax, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rax
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	%cl, %edx
+	jz	L(cross_page_continue)
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	sarl	%cl, %eax
+	leal	-1(%rdx), %ecx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret1):
 	ret
-END (strrchr)
+END(STRRCHR)
 
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+	weak_alias (STRRCHR, rindex)
+	libc_hidden_builtin_def (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 61552954de..2b80efc5ef 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -1,4 +1,4 @@
-/* wcsrchr with SSSE3
+/* wcsrchr optimized with SSE2.
    Copyright (C) 2011-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,266 +16,12 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-
-	.text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR	1
+#define NO_PMINU	1
 
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-	punpckldq %xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	punpckldq %xmm1, %xmm1
-	and	$63, %rcx
-	cmp	$48, %rcx
-	ja	L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR	wcsrchr
+#endif
 
-	movdqu	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm3
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm3, %rdx
-	pmovmskb %xmm0, %rax
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqd	%xmm3, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm3
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm3, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqd	%xmm4, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm4
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm4, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm5
-	pcmpeqd	%xmm5, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm5
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm5, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-	.p2align 4
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	$15, %cl
-	jnz	L(find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_value)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov	%rdx, %rcx
-L(prolog_find_zero_1):
-	test	$15, %cl
-	jnz	L(prolog_find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(prolog_find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(prolog_find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_null)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_null)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_second_wchar):
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_third_wchar):
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_fourth_wchar):
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-END (wcsrchr)
+#include "../strrchr.S"
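
---
Editor's note: the C sketches below are not part of the patch. They are
hedged illustrations of the techniques the new assembly uses, written
against SSE2 intrinsics and GCC builtins; every function and variable
name here is invented for the illustration.

Each return path in the new code prunes matches that fall after the
null terminator with the same bit trick (the asm's leal -1 / xorl /
andl sequence): for a terminator mask m, m ^ (m - 1) keeps exactly the
bit positions at or below the first set bit, so one AND drops any CHAR
match past the end of the string and bsr then picks the last survivor.
A minimal sketch for one 16-byte vector:

    #include <emmintrin.h>
    #include <stddef.h>

    static const char *
    last_match_in_vec (const char *s, __m128i needle)
    {
      __m128i v = _mm_loadu_si128 ((const __m128i *) s);
      unsigned int null_mask
        = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, _mm_setzero_si128 ()));
      unsigned int char_mask
        = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, needle));

      /* Keep only match bits at or before the first null.  If there is
         no null in this vector, null_mask - 1 is all-ones and every
         match survives.  */
      char_mask &= null_mask ^ (null_mask - 1);
      if (char_mask == 0)
        return NULL;  /* No valid match in this vector.  */
      /* Highest surviving bit == last occurrence (the asm's bsr).  */
      return s + (31 - __builtin_clz (char_mask));
    }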
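The unrolled main loop works on 2x VEC per iteration: pminub of the two
vectors means one compare-with-zero covers both for the terminator, the
two CHAR compares are OR-ed into a single detector, and the hot path
retires on one fused test-and-branch (the asm's `addl %ecx, %eax; jz`).
A rough C equivalent of the loop structure, same caveats as above (the
real code additionally defers building exact positions to the cold
path):

    static const char *
    unrolled_loop (const char *p, __m128i needle)
    {
      const char *last_base = NULL;   /* Last 32-byte block with a match.  */
      unsigned int last_mask = 0;     /* Its combined match mask.  */

      for (;;)
        {
          __m128i v0 = _mm_load_si128 ((const __m128i *) p);
          __m128i v1 = _mm_load_si128 ((const __m128i *) (p + 16));

          /* A zero byte in either vector is a zero byte in the unsigned
             minimum, so one compare tests both (the pminub trick).  */
          unsigned int null_mask = _mm_movemask_epi8
            (_mm_cmpeq_epi8 (_mm_min_epu8 (v0, v1), _mm_setzero_si128 ()));
          unsigned int char_mask
            = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, needle))
              | ((unsigned int) _mm_movemask_epi8
                 (_mm_cmpeq_epi8 (v1, needle)) << 16);

          if ((null_mask | char_mask) == 0)
            {                         /* Hot path: one branch per 2x VEC.  */
              p += 32;
              continue;
            }
          if (null_mask == 0)
            {                         /* Matches only: remember, go on.  */
              last_base = p;
              last_mask = char_mask;
              p += 32;
              continue;
            }
          /* Hit the terminator.  The pminub mask mixes the two vectors,
             so recompute exact per-vector null positions, then prune
             matches past the first null as before.  */
          unsigned int exact_null
            = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, _mm_setzero_si128 ()))
              | ((unsigned int) _mm_movemask_epi8
                 (_mm_cmpeq_epi8 (v1, _mm_setzero_si128 ())) << 16);
          char_mask &= exact_null ^ (exact_null - 1);
          if (char_mask != 0)
            return p + (31 - __builtin_clz (char_mask));
          if (last_mask != 0)
            return last_base + (31 - __builtin_clz (last_mask));
          return NULL;                /* CHAR never seen before the null.  */
        }
    }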
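One wcsrchr-specific wrinkle the comments mention: the byte-granular
pmovmskb/bsr machinery is shared with strrchr, so for 4-byte characters
the computed address can point at any byte of the matching wchar_t (off
by up to 3). Because a valid wchar_t string is 4-byte aligned, rounding
the address down to the character boundary (the asm's trailing
`andq $-CHAR_SIZE, %rax`) fixes it up. In C terms:

    #include <stdint.h>
    #include <wchar.h>

    /* Round a byte-granular match address down to the wchar_t that
       contains it; valid only because the string is 4-byte aligned.  */
    static wchar_t *
    round_to_wchar (uintptr_t byte_addr)
    {
      return (wchar_t *) (byte_addr & -(uintptr_t) sizeof (wchar_t));
    }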