From patchwork Tue Jul 12 19:29:03 2022
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 55989
To: libc-alpha@sourceware.org
Subject: [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S
Date: Tue, 12 Jul 2022 12:29:03 -0700
Message-Id: <20220712192910.351121-3-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20220712192910.351121-1-goldstein.w.n@gmail.com>
References: <20220712192910.351121-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
---
 sysdeps/x86_64/memrchr.S                | 332 +----------------------
 sysdeps/x86_64/multiarch/memrchr-sse2.S | 336 +++++++++++++++++++++++-
 2 files changed, 334 insertions(+), 334 deletions(-)

diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index b0dffd2ae2..385e2c5668 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -17,334 +17,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#define VEC_SIZE	16
-#define PAGE_SIZE	4096
-
-	.text
-ENTRY_P2ALIGN(__memrchr, 6)
-#ifdef __ILP32__
-	/* Clear upper bits.  */
-	mov	%RDX_LP, %RDX_LP
-#endif
-	movd	%esi, %xmm0
-
-	/* Get end pointer.  */
-	leaq	(%rdx, %rdi), %rcx
-
-	punpcklbw %xmm0, %xmm0
-	punpcklwd %xmm0, %xmm0
-	pshufd	$0, %xmm0, %xmm0
-
-	/* Check if we can load 1x VEC without cross a page.  */
-	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
-	jz	L(page_cross)
-
-	/* NB: This load happens regardless of whether rdx (len) is zero. Since
-	   it doesn't cross a page and the standard gurantees any pointer have
-	   at least one-valid byte this load must be safe. For the entire
-	   history of the x86 memrchr implementation this has been possible so
-	   no code "should" be relying on a zero-length check before this load.
-	   The zero-length check is moved to the page cross case because it is
-	   1) pretty cold and including it pushes the hot case len <= VEC_SIZE
-	   into 2-cache lines.  */
-	movups	-(VEC_SIZE)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subq	$VEC_SIZE, %rdx
-	ja	L(more_1x_vec)
-L(ret_vec_x0_test):
-	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
-	   zero.  */
-	bsrl	%eax, %eax
-	jz	L(ret_0)
-	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
-	   if out of bounds.  */
-	addl	%edx, %eax
-	jl	L(zero_0)
-	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
-	   ptr.  */
-	addq	%rdi, %rax
-L(ret_0):
-	ret
-
-	.p2align 4,, 5
-L(ret_vec_x0):
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
-	ret
-
-	.p2align 4,, 2
-L(zero_0):
-	xorl	%eax, %eax
-	ret
-
-
-	.p2align 4,, 8
-L(more_1x_vec):
-	testl	%eax, %eax
-	jnz	L(ret_vec_x0)
-
-	/* Align rcx (pointer to string).  */
-	decq	%rcx
-	andq	$-VEC_SIZE, %rcx
-
-	movq	%rcx, %rdx
-	/* NB: We could consistenyl save 1-byte in this pattern with `movaps
-	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
-	   it adds more frontend uops (even if the moves can be eliminated) and
-	   some percentage of the time actual backend uops.  */
-	movaps	-(VEC_SIZE)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	subq	%rdi, %rdx
-	pmovmskb %xmm1, %eax
-
-	cmpq	$(VEC_SIZE * 2), %rdx
-	ja	L(more_2x_vec)
-L(last_2x_vec):
-	subl	$VEC_SIZE, %edx
-	jbe	L(ret_vec_x0_test)
-
-	testl	%eax, %eax
-	jnz	L(ret_vec_x0)
-
-	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subl	$VEC_SIZE, %edx
-	bsrl	%eax, %eax
-	jz	L(ret_1)
-	addl	%edx, %eax
-	jl	L(zero_0)
-	addq	%rdi, %rax
-L(ret_1):
-	ret
-
-	/* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
-	   causes the hot pause (length <= VEC_SIZE) to span multiple cache
-	   lines. Naturally aligned % 16 to 8-bytes.  */
-L(page_cross):
-	/* Zero length check.  */
-	testq	%rdx, %rdx
-	jz	L(zero_0)
-
-	leaq	-1(%rcx), %r8
-	andq	$-(VEC_SIZE), %r8
-
-	movaps	(%r8), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %esi
-	/* Shift out negative alignment (because we are starting from endptr and
-	   working backwards).  */
-	negl	%ecx
-	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
-	   explicitly.  */
-	andl	$(VEC_SIZE - 1), %ecx
-	shl	%cl, %esi
-	movzwl	%si, %eax
-	leaq	(%rdi, %rdx), %rcx
-	cmpq	%rdi, %r8
-	ja	L(more_1x_vec)
-	subl	$VEC_SIZE, %edx
-	bsrl	%eax, %eax
-	jz	L(ret_2)
-	addl	%edx, %eax
-	jl	L(zero_1)
-	addq	%rdi, %rax
-L(ret_2):
-	ret
-
-	/* Fits in aliging bytes.  */
-L(zero_1):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4,, 5
-L(ret_vec_x1):
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
-	ret
-
-	.p2align 4,, 8
-L(more_2x_vec):
-	testl	%eax, %eax
-	jnz	L(ret_vec_x0)
-
-	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	testl	%eax, %eax
-	jnz	L(ret_vec_x1)
-
-
-	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subq	$(VEC_SIZE * 4), %rdx
-	ja	L(more_4x_vec)
-
-	addl	$(VEC_SIZE), %edx
-	jle	L(ret_vec_x2_test)
-
-L(last_vec):
-	testl	%eax, %eax
-	jnz	L(ret_vec_x2)
-
-	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subl	$(VEC_SIZE), %edx
-	bsrl	%eax, %eax
-	jz	L(ret_3)
-	addl	%edx, %eax
-	jl	L(zero_2)
-	addq	%rdi, %rax
-L(ret_3):
-	ret
-
-	.p2align 4,, 6
-L(ret_vec_x2_test):
-	bsrl	%eax, %eax
-	jz	L(zero_2)
-	addl	%edx, %eax
-	jl	L(zero_2)
-	addq	%rdi, %rax
-	ret
-
-L(zero_2):
-	xorl	%eax, %eax
-	ret
-
-
-	.p2align 4,, 5
-L(ret_vec_x2):
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
-	ret
-
-	.p2align 4,, 5
-L(ret_vec_x3):
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
-	ret
-
-	.p2align 4,, 8
-L(more_4x_vec):
-	testl	%eax, %eax
-	jnz	L(ret_vec_x2)
-
-	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	testl	%eax, %eax
-	jnz	L(ret_vec_x3)
-
-	addq	$-(VEC_SIZE * 4), %rcx
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec)
-
-	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
-	   keeping the code from spilling to the next cache line.  */
-	addq	$(VEC_SIZE * 4 - 1), %rcx
-	andq	$-(VEC_SIZE * 4), %rcx
-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
-	andq	$-(VEC_SIZE * 4), %rdx
-
-	.p2align 4,, 11
-L(loop_4x_vec):
-	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
-	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
-	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
-	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
-	pcmpeqb	%xmm0, %xmm1
-	pcmpeqb	%xmm0, %xmm2
-	pcmpeqb	%xmm0, %xmm3
-	pcmpeqb	%xmm0, %xmm4
-
-	por	%xmm1, %xmm2
-	por	%xmm3, %xmm4
-	por	%xmm2, %xmm4
-
-	pmovmskb %xmm4, %esi
-	testl	%esi, %esi
-	jnz	L(loop_end)
-
-	addq	$-(VEC_SIZE * 4), %rcx
-	cmpq	%rdx, %rcx
-	jne	L(loop_4x_vec)
-
-	subl	%edi, %edx
-
-	/* Ends up being 1-byte nop.  */
-	.p2align 4,, 2
-L(last_4x_vec):
-	movaps	-(VEC_SIZE)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
-
-	testl	%eax, %eax
-	jnz	L(ret_vec_x0)
-
-
-	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	testl	%eax, %eax
-	jnz	L(ret_vec_end)
-
-	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subl	$(VEC_SIZE * 3), %edx
-	ja	L(last_vec)
-	bsrl	%eax, %eax
-	jz	L(ret_4)
-	addl	%edx, %eax
-	jl	L(zero_3)
-	addq	%rdi, %rax
-L(ret_4):
-	ret
-
-	/* Ends up being 1-byte nop.  */
-	.p2align 4,, 3
-L(loop_end):
-	pmovmskb %xmm1, %eax
-	sall	$16, %eax
-	jnz	L(ret_vec_end)
-
-	pmovmskb %xmm2, %eax
-	testl	%eax, %eax
-	jnz	L(ret_vec_end)
-
-	pmovmskb %xmm3, %eax
-	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
-	   then it won't affect the result in esi (VEC4). If ecx is non-zero
-	   then CHAR in VEC3 and bsrq will use that position.  */
-	sall	$16, %eax
-	orl	%esi, %eax
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
-	ret
-
-L(ret_vec_end):
-	bsrl	%eax, %eax
-	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
-	ret
-	/* Use in L(last_4x_vec). In the same cache line. This is just a spare
-	   aligning bytes.  */
-L(zero_3):
-	xorl	%eax, %eax
-	ret
-	/* 2-bytes from next cache line.  */
-END(__memrchr)
+#define MEMRCHR	__memrchr
+#include "multiarch/memrchr-sse2.S"
 weak_alias (__memrchr, memrchr)
diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
index b04202e171..d92a4022dc 100644
--- a/sysdeps/x86_64/multiarch/memrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
@@ -17,10 +17,338 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define __memrchr __memrchr_sse2
+# ifndef MEMRCHR
+#  define MEMRCHR	__memrchr_sse2
+# endif
+#endif
+
+#include <sysdep.h>
+#define VEC_SIZE	16
+#define PAGE_SIZE	4096
 
-# undef weak_alias
-# define weak_alias(__memrchr, memrchr)
+	.text
+ENTRY_P2ALIGN(MEMRCHR, 6)
+#ifdef __ILP32__
+	/* Clear upper bits.  */
+	mov	%RDX_LP, %RDX_LP
 #endif
+	movd	%esi, %xmm0
+
+	/* Get end pointer.  */
+	leaq	(%rdx, %rdi), %rcx
+
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+
+	/* Check if we can load 1x VEC without cross a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	jz	L(page_cross)
+
+	/* NB: This load happens regardless of whether rdx (len) is zero. Since
+	   it doesn't cross a page and the standard gurantees any pointer have
+	   at least one-valid byte this load must be safe. For the entire
+	   history of the x86 memrchr implementation this has been possible so
+	   no code "should" be relying on a zero-length check before this load.
+	   The zero-length check is moved to the page cross case because it is
+	   1) pretty cold and including it pushes the hot case len <= VEC_SIZE
+	   into 2-cache lines.  */
+	movups	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+	   zero.  */
+	bsrl	%eax, %eax
+	jz	L(ret_0)
+	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+	   if out of bounds.  */
+	addl	%edx, %eax
+	jl	L(zero_0)
+	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+	   ptr.  */
+	addq	%rdi, %rax
+L(ret_0):
+	ret
+
+	.p2align 4,, 5
+L(ret_vec_x0):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
+	ret
+
+	.p2align 4,, 2
+L(zero_0):
+	xorl	%eax, %eax
+	ret
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	/* Align rcx (pointer to string).  */
+	decq	%rcx
+	andq	$-VEC_SIZE, %rcx
+
+	movq	%rcx, %rdx
+	/* NB: We could consistenyl save 1-byte in this pattern with `movaps
+	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+	   it adds more frontend uops (even if the moves can be eliminated) and
+	   some percentage of the time actual backend uops.  */
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	subq	%rdi, %rdx
+	pmovmskb %xmm1, %eax
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	subl	$VEC_SIZE, %edx
+	jbe	L(ret_vec_x0_test)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_1)
+	addl	%edx, %eax
+	jl	L(zero_0)
+	addq	%rdi, %rax
+L(ret_1):
+	ret
+
+	/* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
+	   causes the hot pause (length <= VEC_SIZE) to span multiple cache
+	   lines. Naturally aligned % 16 to 8-bytes.  */
+L(page_cross):
+	/* Zero length check.  */
+	testq	%rdx, %rdx
+	jz	L(zero_0)
+
+	leaq	-1(%rcx), %r8
+	andq	$-(VEC_SIZE), %r8
+
+	movaps	(%r8), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	negl	%ecx
+	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+	   explicitly.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	shl	%cl, %esi
+	movzwl	%si, %eax
+	leaq	(%rdi, %rdx), %rcx
+	cmpq	%rdi, %r8
+	ja	L(more_1x_vec)
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_2)
+	addl	%edx, %eax
+	jl	L(zero_1)
+	addq	%rdi, %rax
+L(ret_2):
+	ret
+
+	/* Fits in aliging bytes.  */
+L(zero_1):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4,, 5
+L(ret_vec_x1):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_x1)
+
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	addl	$(VEC_SIZE), %edx
+	jle	L(ret_vec_x2_test)
+
+L(last_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE), %edx
+	bsrl	%eax, %eax
+	jz	L(ret_3)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+L(ret_3):
+	ret
+
+	.p2align 4,, 6
+L(ret_vec_x2_test):
+	bsrl	%eax, %eax
+	jz	L(zero_2)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+	ret
+
+L(zero_2):
+	xorl	%eax, %eax
+	ret
+
+
+	.p2align 4,, 5
+L(ret_vec_x2):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
+
+	.p2align 4,, 5
+L(ret_vec_x3):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x3)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
+	   keeping the code from spilling to the next cache line.  */
+	addq	$(VEC_SIZE * 4 - 1), %rcx
+	andq	$-(VEC_SIZE * 4), %rcx
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
+	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
+	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
+	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm0, %xmm4
+
+	por	%xmm1, %xmm2
+	por	%xmm3, %xmm4
+	por	%xmm2, %xmm4
+
+	pmovmskb %xmm4, %esi
+	testl	%esi, %esi
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	%rdx, %rcx
+	jne	L(loop_4x_vec)
+
+	subl	%edi, %edx
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 2
+L(last_4x_vec):
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+	bsrl	%eax, %eax
+	jz	L(ret_4)
+	addl	%edx, %eax
+	jl	L(zero_3)
+	addq	%rdi, %rax
+L(ret_4):
+	ret
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 3
+L(loop_end):
+	pmovmskb %xmm1, %eax
+	sall	$16, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm2, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm3, %eax
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	sall	$16, %eax
+	orl	%esi, %eax
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-#include "../memrchr.S"
+L(ret_vec_end):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
+	ret
+	/* Use in L(last_4x_vec). In the same cache line. This is just a spare
+	   aligning bytes.  */
+L(zero_3):
+	xorl	%eax, %eax
+	ret
+	/* 2-bytes from next cache line.  */
+END(MEMRCHR)
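
For readers who want a plain-C picture of what the moved code computes: memrchr
returns a pointer to the last occurrence of the byte `c` within the first `n`
bytes of `s`, or NULL if there is none. The sketch below is only a scalar
reference model, not part of this patch, and the helper name is made up for
illustration. The SSE2 version reaches the same answer by broadcasting the byte
with punpcklbw/punpcklwd/pshufd, comparing 16 bytes at a time from the end with
pcmpeqb, turning each compare into a bitmask with pmovmskb, and picking the
highest set bit with bsr.

#include <stddef.h>

/* Scalar reference model of the memrchr contract implemented by the
   SSE2 code above.  Illustration only; not part of the patch.  */
static void *
memrchr_ref (const void *s, int c, size_t n)
{
  const unsigned char *p = (const unsigned char *) s + n;

  /* Walk backwards from the end of the buffer.  */
  while (n--)
    if (*--p == (unsigned char) c)
      return (void *) p;

  /* No match in the first n bytes (covers the n == 0 case too).  */
  return NULL;
}

For n == 0 the reference simply returns NULL, which is why the assembly only
needs its zero-length check on the cold page-cross path: as the comment in the
moved code explains, the unconditional first 16-byte load is safe whenever it
does not cross a page boundary.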