From patchwork Tue Jul 12 19:29:04 2022
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 55984
To: libc-alpha@sourceware.org
Subject: [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S
Date: Tue, 12 Jul 2022 12:29:04 -0700
Message-Id: <20220712192910.351121-4-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20220712192910.351121-1-goldstein.w.n@gmail.com>
References: <20220712192910.351121-1-goldstein.w.n@gmail.com>
From: Noah Goldstein <goldstein.w.n@gmail.com>

This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Tested by building on x86_64 and x86_32 with and without multiarch.
---
 sysdeps/x86_64/multiarch/strrchr-sse2.S | 358 ++++++++++++++++++++++-
 sysdeps/x86_64/multiarch/wcsrchr-sse2.S |  10 +-
 sysdeps/x86_64/strrchr.S                | 364 +-----------------------
 sysdeps/x86_64/wcsrchr.S                |  11 +-
 4 files changed, 366 insertions(+), 377 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index 866396e947..6ee7a5e33a 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,12 +17,358 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define STRRCHR __strrchr_sse2
+# ifndef STRRCHR
+#  define STRRCHR __strrchr_sse2
+# endif
+#endif
+
+#include <sysdep.h>
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define PMINU	pminud
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
+# define PMINU	pminub
+#endif
+
+#define PAGE_SIZE	4096
+#define VEC_SIZE	16
+
+	.text
+ENTRY(STRRCHR)
+	movd	%esi, %xmm0
+	movq	%rdi, %rax
+	andl	$(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+#endif
+	pshufd	$0, %xmm0, %xmm0
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page)
+
+L(cross_page_continue):
+	movups	(%rdi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero.  If
+	   search CHAR is zero we are correct.  Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret0):
+	ret
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4
+L(first_vec_x0_test):
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x1_test):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+	andq	$-VEC_SIZE, %rdi
+
+	movaps	VEC_SIZE(%rdi), %xmm2
+	pxor	%xmm3, %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pmovmskb %xmm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
+	pxor	%xmm4, %xmm4
+	PCMPEQ	%xmm3, %xmm4
+	pmovmskb %xmm4, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	addq	$VEC_SIZE, %rdi
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_loop):
+	/* Do 2x VEC at a time.  */
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero.  Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
 
-# undef weak_alias
-# define weak_alias(strrchr, rindex)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strrchr)
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
+	   macro-fuse with `jz`.  */
+	addl	%ecx, %eax
+	jz	L(first_loop)
+
+	/* Check if there is zero match.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+	/* Check if there was a match in last iteration.  */
+	subl	%ecx, %eax
+	jnz	L(new_match)
+
+L(first_loop_old_match):
+	PCMPEQ	%xmm0, %xmm2
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	addl	%eax, %ecx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through.  The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	sall	$16, %eax
+	orl	%ecx, %eax
+
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons because we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	/* Save minimum state for getting most recent match.  We can
+	   throw out all previous work.  */
+	.p2align 4
+L(second_loop_match):
+	movq	%rdi, %rsi
+	movaps	%xmm4, %xmm2
+	movaps	%xmm7, %xmm3
+
+	.p2align 4
+L(second_loop):
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero.  Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
 #endif
 
-#include "../strrchr.S"
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurrence of CHAR.  */
+	addl	%ecx, %eax
+	jz	L(second_loop)
+
+	/* No null term so must be new occurrence of CHAR.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+
+	subl	%ecx, %eax
+	jnz	L(second_loop_new_match)
+
+L(second_loop_old_match):
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	sall	$16, %eax
+	orl	%ecx, %eax
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(second_loop_new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons because we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(second_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	movaps	(%rsi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	%cl, %edx
+	jz	L(cross_page_continue)
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	sarl	%cl, %eax
+	leal	-1(%rdx), %ecx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret1):
+	ret
+END(STRRCHR)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index 69d2f3cdb1..d9259720f8 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,6 +17,12 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define STRRCHR __wcsrchr_sse2
+# ifndef STRRCHR
+#  define STRRCHR __wcsrchr_sse2
+# endif
 #endif
-#include "../wcsrchr.S"
+
+#define USE_AS_WCSRCHR 1
+#define NO_PMINU 1
+
+#include "strrchr-sse2.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 4d7ba4ceb2..f39da60454 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -16,363 +16,7 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-
-#include <sysdep.h>
-
-#ifndef STRRCHR
-# define STRRCHR strrchr
-#endif
-
-#ifdef USE_AS_WCSRCHR
-# define PCMPEQ	pcmpeqd
-# define CHAR_SIZE	4
-# define PMINU	pminud
-#else
-# define PCMPEQ	pcmpeqb
-# define CHAR_SIZE	1
-# define PMINU	pminub
-#endif
-
-#define PAGE_SIZE	4096
-#define VEC_SIZE	16
-
-	.text
-ENTRY(STRRCHR)
-	movd	%esi, %xmm0
-	movq	%rdi, %rax
-	andl	$(PAGE_SIZE - 1), %eax
-#ifndef USE_AS_WCSRCHR
-	punpcklbw %xmm0, %xmm0
-	punpcklwd %xmm0, %xmm0
-#endif
-	pshufd	$0, %xmm0, %xmm0
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	ja	L(cross_page)
-
-L(cross_page_continue):
-	movups	(%rdi), %xmm1
-	pxor	%xmm2, %xmm2
-	PCMPEQ	%xmm1, %xmm2
-	pmovmskb %xmm2, %ecx
-	testl	%ecx, %ecx
-	jz	L(aligned_more)
-
-	PCMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	leal	-1(%rcx), %edx
-	xorl	%edx, %ecx
-	andl	%ecx, %eax
-	jz	L(ret0)
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	/* We are off by 3 for wcsrchr if search CHAR is non-zero.  If
-	   search CHAR is zero we are correct.  Either way `andq
-	   -CHAR_SIZE, %rax` gets the correct result.  */
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-L(ret0):
-	ret
-
-	/* Returns for first vec x1/x2 have hard coded backward search
-	   path for earlier matches.  */
-	.p2align 4
-L(first_vec_x0_test):
-	PCMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	testl	%eax, %eax
-	jz	L(ret0)
-	bsrl	%eax, %eax
-	addq	%r8, %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4
-L(first_vec_x1):
-	PCMPEQ	%xmm0, %xmm2
-	pmovmskb %xmm2, %eax
-	leal	-1(%rcx), %edx
-	xorl	%edx, %ecx
-	andl	%ecx, %eax
-	jz	L(first_vec_x0_test)
-	bsrl	%eax, %eax
-	leaq	(VEC_SIZE)(%rdi, %rax), %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4
-L(first_vec_x1_test):
-	PCMPEQ	%xmm0, %xmm2
-	pmovmskb %xmm2, %eax
-	testl	%eax, %eax
-	jz	L(first_vec_x0_test)
-	bsrl	%eax, %eax
-	leaq	(VEC_SIZE)(%rdi, %rax), %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4
-L(first_vec_x2):
-	PCMPEQ	%xmm0, %xmm3
-	pmovmskb %xmm3, %eax
-	leal	-1(%rcx), %edx
-	xorl	%edx, %ecx
-	andl	%ecx, %eax
-	jz	L(first_vec_x1_test)
-	bsrl	%eax, %eax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4
-L(aligned_more):
-	/* Save original pointer if match was in VEC 0.  */
-	movq	%rdi, %r8
-	andq	$-VEC_SIZE, %rdi
-
-	movaps	VEC_SIZE(%rdi), %xmm2
-	pxor	%xmm3, %xmm3
-	PCMPEQ	%xmm2, %xmm3
-	pmovmskb %xmm3, %ecx
-	testl	%ecx, %ecx
-	jnz	L(first_vec_x1)
-
-	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
-	pxor	%xmm4, %xmm4
-	PCMPEQ	%xmm3, %xmm4
-	pmovmskb %xmm4, %ecx
-	testl	%ecx, %ecx
-	jnz	L(first_vec_x2)
-
-	addq	$VEC_SIZE, %rdi
-	/* Save pointer again before realigning.  */
-	movq	%rdi, %rsi
-	andq	$-(VEC_SIZE * 2), %rdi
-	.p2align 4
-L(first_loop):
-	/* Do 2x VEC at a time.  */
-	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
-	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
-	/* Since SSE2 no pminud so wcsrchr needs seperate logic for
-	   detecting zero.  Note if this is found to be a bottleneck it
-	   may be worth adding an SSE4.1 wcsrchr implementation.  */
-#ifdef USE_AS_WCSRCHR
-	movaps	%xmm5, %xmm6
-	pxor	%xmm8, %xmm8
-
-	PCMPEQ	%xmm8, %xmm5
-	PCMPEQ	%xmm4, %xmm8
-	por	%xmm5, %xmm8
-#else
-	movaps	%xmm5, %xmm6
-	PMINU	%xmm4, %xmm5
-#endif
-
-	movaps	%xmm4, %xmm9
-	PCMPEQ	%xmm0, %xmm4
-	PCMPEQ	%xmm0, %xmm6
-	movaps	%xmm6, %xmm7
-	por	%xmm4, %xmm6
-#ifndef USE_AS_WCSRCHR
-	pxor	%xmm8, %xmm8
-	PCMPEQ	%xmm5, %xmm8
-#endif
-	pmovmskb %xmm8, %ecx
-	pmovmskb %xmm6, %eax
-
-	addq	$(VEC_SIZE * 2), %rdi
-	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
-	   macro-fuse with `jz`.  */
-	addl	%ecx, %eax
-	jz	L(first_loop)
-
-	/* Check if there is zero match.  */
-	testl	%ecx, %ecx
-	jz	L(second_loop_match)
-
-	/* Check if there was a match in last iteration.  */
-	subl	%ecx, %eax
-	jnz	L(new_match)
-
-L(first_loop_old_match):
-	PCMPEQ	%xmm0, %xmm2
-	PCMPEQ	%xmm0, %xmm3
-	pmovmskb %xmm2, %ecx
-	pmovmskb %xmm3, %eax
-	addl	%eax, %ecx
-	jz	L(first_vec_x0_test)
-	/* NB: We could move this shift to before the branch and save a
-	   bit of code size / performance on the fall through.  The
-	   branch leads to the null case which generally seems hotter
-	   than char in first 3x VEC.  */
-	sall	$16, %eax
-	orl	%ecx, %eax
-
-	bsrl	%eax, %eax
-	addq	%rsi, %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4
-L(new_match):
-	pxor	%xmm6, %xmm6
-	PCMPEQ	%xmm9, %xmm6
-	pmovmskb %xmm6, %eax
-	sall	$16, %ecx
-	orl	%eax, %ecx
-
-	/* We can't reuse either of the old comparisons as since we mask
-	   of zeros after first zero (instead of using the full
-	   comparison) we can't gurantee no interference between match
-	   after end of string and valid match.  */
-	pmovmskb %xmm4, %eax
-	pmovmskb %xmm7, %edx
-	sall	$16, %edx
-	orl	%edx, %eax
-
-	leal	-1(%ecx), %edx
-	xorl	%edx, %ecx
-	andl	%ecx, %eax
-	jz	L(first_loop_old_match)
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	/* Save minimum state for getting most recent match.  We can
-	   throw out all previous work.  */
-	.p2align 4
-L(second_loop_match):
-	movq	%rdi, %rsi
-	movaps	%xmm4, %xmm2
-	movaps	%xmm7, %xmm3
-
-	.p2align 4
-L(second_loop):
-	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
-	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
-	/* Since SSE2 no pminud so wcsrchr needs seperate logic for
-	   detecting zero.  Note if this is found to be a bottleneck it
-	   may be worth adding an SSE4.1 wcsrchr implementation.  */
-#ifdef USE_AS_WCSRCHR
-	movaps	%xmm5, %xmm6
-	pxor	%xmm8, %xmm8
-
-	PCMPEQ	%xmm8, %xmm5
-	PCMPEQ	%xmm4, %xmm8
-	por	%xmm5, %xmm8
-#else
-	movaps	%xmm5, %xmm6
-	PMINU	%xmm4, %xmm5
-#endif
-
-	movaps	%xmm4, %xmm9
-	PCMPEQ	%xmm0, %xmm4
-	PCMPEQ	%xmm0, %xmm6
-	movaps	%xmm6, %xmm7
-	por	%xmm4, %xmm6
-#ifndef USE_AS_WCSRCHR
-	pxor	%xmm8, %xmm8
-	PCMPEQ	%xmm5, %xmm8
-#endif
-
-	pmovmskb %xmm8, %ecx
-	pmovmskb %xmm6, %eax
-
-	addq	$(VEC_SIZE * 2), %rdi
-	/* Either null term or new occurence of CHAR.  */
-	addl	%ecx, %eax
-	jz	L(second_loop)
-
-	/* No null term so much be new occurence of CHAR.  */
-	testl	%ecx, %ecx
-	jz	L(second_loop_match)
-
-
-	subl	%ecx, %eax
-	jnz	L(second_loop_new_match)
-
-L(second_loop_old_match):
-	pmovmskb %xmm2, %ecx
-	pmovmskb %xmm3, %eax
-	sall	$16, %eax
-	orl	%ecx, %eax
-	bsrl	%eax, %eax
-	addq	%rsi, %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4
-L(second_loop_new_match):
-	pxor	%xmm6, %xmm6
-	PCMPEQ	%xmm9, %xmm6
-	pmovmskb %xmm6, %eax
-	sall	$16, %ecx
-	orl	%eax, %ecx
-
-	/* We can't reuse either of the old comparisons as since we mask
-	   of zeros after first zero (instead of using the full
-	   comparison) we can't gurantee no interference between match
-	   after end of string and valid match.  */
-	pmovmskb %xmm4, %eax
-	pmovmskb %xmm7, %edx
-	sall	$16, %edx
-	orl	%edx, %eax
-
-	leal	-1(%ecx), %edx
-	xorl	%edx, %ecx
-	andl	%ecx, %eax
-	jz	L(second_loop_old_match)
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4,, 4
-L(cross_page):
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rsi
-	movaps	(%rsi), %xmm1
-	pxor	%xmm2, %xmm2
-	PCMPEQ	%xmm1, %xmm2
-	pmovmskb %xmm2, %edx
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	sarl	%cl, %edx
-	jz	L(cross_page_continue)
-	PCMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	sarl	%cl, %eax
-	leal	-1(%rdx), %ecx
-	xorl	%edx, %ecx
-	andl	%ecx, %eax
-	jz	L(ret1)
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-L(ret1):
-	ret
-END(STRRCHR)
-
-#ifndef USE_AS_WCSRCHR
-	weak_alias (STRRCHR, rindex)
-	libc_hidden_builtin_def (STRRCHR)
-#endif
+#define STRRCHR strrchr
+#include "multiarch/strrchr-sse2.S"
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 2b80efc5ef..1d4b1eb21c 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -16,12 +16,5 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-
-#define USE_AS_WCSRCHR 1
-#define NO_PMINU 1
-
-#ifndef STRRCHR
-# define STRRCHR wcsrchr
-#endif
-
-#include "../strrchr.S"
+#define STRRCHR wcsrchr
+#include "multiarch/wcsrchr-sse2.S"
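
---

Note (not part of the patch): for readers following the assembly, below is a rough, illustrative scalar C sketch of the bookkeeping the SSE2 code performs one 16-byte VEC at a time. In the assembly, PCMPEQ plus pmovmskb produce per-VEC bitmasks of match and null positions, and bsrl picks the highest set bit, i.e. the most recent match; the first_loop/second_loop split only changes how much state is carried between iterations. The helper name strrchr_ref is invented here purely for illustration.

/* Illustrative only -- not from the patch.  Scalar sketch of the
   "remember the most recent match until the null terminator" strategy
   that the vectorized loops apply per VEC.  */
#include <stddef.h>

static char *
strrchr_ref (const char *s, int c)
{
  const char *last = NULL;	/* Most recent match seen so far.  */
  for (;; ++s)
    {
      if (*s == (char) c)
	last = s;		/* A later match always wins.  */
      if (*s == '\0')
	return (char *) last;	/* c == '\0' also works: last == s here.  */
    }
}

The two loops in the assembly are, roughly, this same scan split in two: the first loop runs until any match or null is seen, and the second loop then only needs to keep the masks and pointer of the most recent VEC pair that contained a match, discarding all earlier work.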