From patchwork Mon May 3 22:58:31 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 43244
To: libc-alpha@sourceware.org
Subject: [PATCH v3 2/3] x86: Optimize memchr-avx2.S
Date: Mon, 3 May 2021 18:58:31 -0400
Message-Id: <20210503225832.4042137-1-goldstein.w.n@gmail.com>
In-Reply-To: <20210503084435.160548-2-goldstein.w.n@gmail.com>
References: <20210503084435.160548-2-goldstein.w.n@gmail.com>
From: Noah Goldstein <goldstein.w.n@gmail.com>

No bug. This commit optimizes memchr-avx2.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict, and
saving a few instructions in the loop return path. test-memchr,
test-rawmemchr, and test-wmemchr are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++----------
 1 file changed, 247 insertions(+), 178 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 1fcb1c350f..0d8758e3e7 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,8 +26,22 @@
 # ifdef USE_AS_WMEMCHR
 #  define VPCMPEQ	vpcmpeqd
+#  define VPBROADCAST	vpbroadcastd
+#  define CHAR_SIZE	4
 # else
 #  define VPCMPEQ	vpcmpeqb
+#  define VPBROADCAST	vpbroadcastb
+#  define CHAR_SIZE	1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+#  define ERAW_PTR_REG	ecx
+#  define RRAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define ERAW_PTR_REG	edi
+#  define RRAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
 # endif
 
 # ifndef VZEROUPPER
@@ -39,6 +53,7 @@
 # endif
 
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
 	test	%RDX_LP, %RDX_LP
 	jz	L(null)
 # endif
-	movl	%edi, %ecx
-	/* Broadcast CHAR to YMM0.  */
-	vmovd	%esi, %xmm0
 # ifdef USE_AS_WMEMCHR
 	shl	$2, %RDX_LP
-	vpbroadcastd %xmm0, %ymm0
 # else
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
 #  endif
-	vpbroadcastb %xmm0, %ymm0
 # endif
+	/* Broadcast CHAR to YMMMATCH.  */
+	vmovd	%esi, %xmm0
+	VPBROADCAST %xmm0, %ymm0
 	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
 # ifndef USE_AS_RAWMEMCHR
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rdx
-	jbe	L(zero)
-# else
-	jnz	L(first_vec_x0)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(first_vec_x0)
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
 # ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
-	addq	%rcx, %rdx
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax), %rax
+	cmovle	%rcx, %rax
+	VZEROUPPER_RETURN
 
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
+L(null):
+	xorl	%eax, %eax
+	ret
 # endif
-	jmp	L(more_4x_vec)
-
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is necessary
+	   for computing the return address if byte is found or adjusting
+	   length if it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE - 1.  ALGN_PTR_REG is rcx for memchr and
+	   rdi for rawmemchr.  */
+	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Calculate length until end of page (length checked for a
+	   match).  */
+	leaq	1(%ALGN_PTR_REG), %rsi
+	subq	%RRAW_PTR_REG, %rsi
+# endif
 	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
+	sarxl	%ERAW_PTR_REG, %eax, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+	addq	%RRAW_PTR_REG, %rax
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
-	/* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
-	   overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	incq	%rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	/* Check the end of data.  */
-	subq	%rcx, %rdx
-	jbe	L(zero)
-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	addq	$VEC_SIZE, %rdi
-# ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-L(more_4x_vec):
+	.p2align 4
+L(aligned_more):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+	/* Align data to VEC_SIZE - 1.  */
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	orq	$(VEC_SIZE - 1), %rdi
+	/* esi is for adjusting length to see if near the end.  */
+	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length.  If near end handle specially.  */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
 
 # ifndef USE_AS_RAWMEMCHR
+	/* Check if at last VEC_SIZE * 4 length.  */
 	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+	   length.  */
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
 	addq	%rcx, %rdx
+# else
+	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 # endif
 
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
 	vpor	%ymm1, %ymm2, %ymm5
 	vpor	%ymm3, %ymm4, %ymm6
 	vpor	%ymm5, %ymm6, %ymm5
 
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
+	vpmovmskb %ymm5, %ecx
 # ifdef USE_AS_RAWMEMCHR
-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 # else
-	subq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec)
+	testl	%ecx, %ecx
+	jnz	L(loop_4x_vec_end)
 
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %edx
-	jle	L(last_2x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
 
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	/* Fall through into less than 4 remaining vectors of length case.
+	 */
+	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(last_4x_vec_or_less):
+	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(first_vec_x1_check)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
+	/* If remaining length > VEC_SIZE * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jg	L(last_4x_vec)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+L(last_2x_vec):
+	/* If remaining length < VEC_SIZE.  */
+	addl	$VEC_SIZE, %edx
+	jle	L(zero_end)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
-	jnz	L(first_vec_x3_check)
-	xorl	%eax, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+L(zero_end):
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %edx
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
+	jnz	L(last_vec_x1_return)
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
-
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	jnz	L(last_vec_x2_return)
 
-	.p2align 4
-L(first_vec_x0_check):
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	vpmovmskb %ymm3, %eax
+	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
 
 	.p2align 4
 L(first_vec_x1_check):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$VEC_SIZE, %rax
+	/* Adjust length.  */
+	subl	$-(VEC_SIZE * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	incq	%rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+	.p2align 4
+L(set_zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
 
 	.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 4 - 1), %rdi
+# else
+	incq	%rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 3 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE + 1), %rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
 
 	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	jmp	L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
 
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jle	L(last_2x_vec)
 	.p2align 4
-L(null):
-	xorl	%eax, %eax
-	ret
-# endif
+L(last_4x_vec):
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
 
-	.p2align 4
-L(first_vec_x0):
-	tzcntl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
 
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Create mask for possible matches within remaining length.  */
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
 
-	.p2align 4
-L(first_vec_x2):
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* if remaining length <= VEC_SIZE * 3 (Note this is after
+	   remaining length was found to be > VEC_SIZE * 2).  */
+	subl	$VEC_SIZE, %edx
+	jbe	L(zero_end2)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Shift remaining length mask for last VEC.  */
+	shrq	$32, %rcx
+	andl	%ecx, %eax
+	jz	L(zero_end2)
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 2), %rax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
 	addq	%rdi, %rax
+L(zero_end2):
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(4x_vec_end):
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3), %rax
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+# endif
 
 END (MEMCHR)
 #endif
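The heart of the "less strict" page-cross logic above is that a 32-byte
unaligned load is safe whenever it does not straddle a 4096-byte page, so
the entry path can read a full vector even when the caller's length is
smaller, and check the length only after a match is found (with cmovle
rather than a branch, as in L(first_vec_x0)). The following is a minimal C
sketch of that first-vector probe, not the glibc code: the function name,
the intrinsics rendering, and the NULL fallbacks are illustrative
assumptions.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

#define VEC_SIZE  32
#define PAGE_SIZE 4096

/* Probe the first 32 bytes for byte C.  Loading VEC_SIZE bytes is safe
   even when LEN < VEC_SIZE, provided the load stays within one page;
   that is exactly what the relaxed page-cross test checks.  */
static const char *
first_vec_probe (const char *s, int c, size_t len)
{
  if (((uintptr_t) s & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE)
    return NULL;	/* Load would straddle a page: take a slow path.  */

  __m256i match = _mm256_set1_epi8 ((char) c);
  __m256i data = _mm256_loadu_si256 ((const __m256i *) s);
  unsigned int mask
    = (unsigned int) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (data, match));
  if (mask == 0)
    return NULL;	/* No match in the first vector: continue aligned.  */

  unsigned int i = _tzcnt_u32 (mask);
  /* Branchless length check; the assembly uses cmovle for this.  */
  return i < len ? s + i : NULL;
}

Because the page is the unit of memory protection, a load that stays inside
one page can never fault, which is why the bounds check can be deferred
until after the compare instead of gating the load.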
From patchwork Mon May 3 22:58:32 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 43245
To: libc-alpha@sourceware.org
Subject: [PATCH v3 3/3] x86: Optimize memchr-evex.S
Date: Mon, 3 May 2021 18:58:32 -0400
Message-Id: <20210503225832.4042137-2-goldstein.w.n@gmail.com>
In-Reply-To: <20210503225832.4042137-1-goldstein.w.n@gmail.com>
References: <20210503084435.160548-2-goldstein.w.n@gmail.com> <20210503225832.4042137-1-goldstein.w.n@gmail.com>
From: Noah Goldstein <goldstein.w.n@gmail.com>

No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
 1 file changed, 322 insertions(+), 225 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b90..81d5cd6486 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@
 # ifdef USE_AS_WMEMCHR
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP	vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPMINU	vpminud
+#  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP	vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPMINU	vpminub
+#  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
 # endif
 
+# ifdef USE_AS_RAWMEMCHR
+#  define RAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define RAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
+# endif
+
+# define XMMZERO	xmm23
+# define YMMZERO	ymm23
 # define XMMMATCH	xmm16
 # define YMMMATCH	ymm16
 # define YMM1		ymm17
@@ -44,6 +58,8 @@
 # define YMM6		ymm22
 
 # define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
 
 	.section .text.evex,"ax",@progbits
 ENTRY (MEMCHR)
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
 	jz	L(zero)
-# endif
-	movl	%edi, %ecx
-# ifdef USE_AS_WMEMCHR
-	shl	$2, %RDX_LP
-# else
+
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
 	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-
+	VPCMP	$0, (%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 # ifndef USE_AS_RAWMEMCHR
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rdx
-	jbe	L(zero)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	jnz	L(first_vec_x0)
+	addq	%rdi, %rax
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	ret
 
 # ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-	jmp	L(more_4x_vec)
+L(zero):
+	xorl	%eax, %eax
+	ret
 
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	cmovle	%rcx, %rax
+	ret
+# else
+	/* NB: first_vec_x0 is 17 bytes which will leave
+	   cross_page_boundary (which is relatively cold) close enough
+	   to ideal alignment.  So only realign L(cross_page_boundary) if
+	   rawmemchr.  */
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is
+	   necessary for computing the return address if byte is found
+	   or adjusting length if it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE.  ALGN_PTR_REG is rcx for memchr and rdi
+	   for rawmemchr.  */
+	andq	$-VEC_SIZE, %ALGN_PTR_REG
+	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+	kmovd	%k0, %r8d
 # ifdef USE_AS_WMEMCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+	/* NB: Divide shift count by 4 since each bit in K0 represent 4
 	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	sarl	$2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
+	subl	%eax, %esi
 # endif
-	andq	$-VEC_SIZE, %rdi
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	/* Remove the leading bytes.  */
-	sarxl	%SHIFT_REG, %eax, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
 # ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
 # endif
+	/* Remove the leading bytes.  */
+	sarxl	%eax, %r8d, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+	addq	%RAW_PTR_REG, %rax
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
 	ret
 
 	.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
-	/* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
-	   overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Check the end of data.  */
-	subq	%rcx, %rdx
-	jbe	L(zero)
-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-# ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Align data to VEC_SIZE.  */
+L(cross_page_continue):
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	andq	$-VEC_SIZE, %rdi
+	/* esi is for adjusting length to see if near the end.  */
+	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
+# else
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length.  If near end handle specially.  */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+
 # ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
+	/* Check if at last CHAR_PER_VEC * 4 length.  */
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	addq	$VEC_SIZE, %rdi
 
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
+	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
+	 */
+#  ifdef USE_AS_WMEMCHR
+	movl	%edi, %ecx
 	andq	$-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
 	addq	%rcx, %rdx
+#  else
+	addq	%rdi, %rdx
+	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rdx
+#  endif
+# else
+	addq	$VEC_SIZE, %rdi
+	andq	$-(4 * VEC_SIZE), %rdi
 # endif
 
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
+	/* It would be possible to save some instructions using 4x VPCMP
+	   but bottleneck on port 5 makes it not worth it.  */
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+	/* xor will set bytes matching esi to zero.  */
+	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
+	VPCMP	$0, %YMM3, %YMMZERO, %k2
 # ifdef USE_AS_RAWMEMCHR
-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	kortestd %k2, %k3
+	jz	L(loop_4x_vec)
 # else
-	subq	$(VEC_SIZE * 4), %rdx
+	kortestd %k2, %k3
+	jnz	L(loop_4x_vec_end)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+
+	subq	$(CHAR_PER_VEC * 4), %rdx
 	ja	L(loop_4x_vec)
 
+	/* Fall through into less than 4 remaining vectors of length case.
+	 */
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	addq	$(VEC_SIZE * 3), %rdi
+	.p2align 4
 L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %edx
-	jle	L(last_2x_vec)
-
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(first_vec_x1_check)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	/* If remaining length > CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jg	L(last_4x_vec)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+L(last_2x_vec):
+	/* If remaining length < CHAR_PER_VEC.  */
+	addl	$CHAR_PER_VEC, %edx
+	jle	L(zero_end)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+	ret
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x3_check)
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Adjust length.  */
+	subl	$-(CHAR_PER_VEC * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+L(set_zero_end):
 	xorl	%eax, %eax
 	ret
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %edx
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+
+	/* k1 has the NOT of the matches with VEC1.  */
 	kmovd	%k1, %eax
-	testl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+# else
+	incl	%eax
+# endif
+	jnz	L(last_vec_x1_return)
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+	VPCMP	$0, %YMM2, %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	kmovd	%k2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	xorl	%eax, %eax
-	ret
+	jnz	L(last_vec_x3_return)
 
-	.p2align 4
-L(first_vec_x0_check):
+	kmovd	%k3, %eax
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-# endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
 	addq	%rdi, %rax
-	ret
-
-	.p2align 4
-L(first_vec_x2_check):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+#  endif
+# else
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
-
-	.p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdi, %rax
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
 	ret
 
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jle	L(last_2x_vec)
+
 	.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Create mask for possible matches within remaining length.  */
+#  ifdef USE_AS_WMEMCHR
+	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+	bzhil	%edx, %ecx, %ecx
+#  else
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+#  endif
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
+	   remaining length was found to be > CHAR_PER_VEC * 2).  */
+	subl	$CHAR_PER_VEC, %edx
+	jbe	L(zero_end2)
+
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Shift remaining length mask for last VEC.  */
+#  ifdef USE_AS_WMEMCHR
+	shrl	$CHAR_PER_VEC, %ecx
+#  else
+	shrq	$CHAR_PER_VEC, %rcx
+#  endif
+	andl	%ecx, %eax
+	jz	L(zero_end2)
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
 	ret
 
-	.p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	.p2align 4
-L(4x_vec_end):
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
+# endif
 
 END (MEMCHR)
 #endif
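The ILP improvement this patch claims for the 4x loop comes from replacing
four mask-register compares (which all contend for port 5) with one
not-equal VPCMP, two vpxorq, a zero-masked VPMINU, and a single compare of
the min result against zero. The following is a minimal C intrinsics
rendering of one loop iteration, assuming AVX512VL and AVX512BW; the
function name and the load scheme are illustrative, not glibc's.

#include <immintrin.h>

/* One iteration of the 4x loop over the 128 bytes at P: detect a match
   with MATCH in any of the four vectors using only two mask compares.  */
static int
loop_4x_step (const void *p, __m256i match)
{
  const __m256i *v = (const __m256i *) p;
  /* k1 has the NOT of the matches with VEC1 (VPCMP $4 is not-equal);
     it doubles as the zero-mask below.  */
  __mmask32 k1 = _mm256_cmpneq_epi8_mask (_mm256_loadu_si256 (v + 0), match);
  /* A zero byte in x2 / x3 marks a match in VEC2 / VEC3.  */
  __m256i x2 = _mm256_xor_si256 (_mm256_loadu_si256 (v + 1), match);
  __m256i x3 = _mm256_xor_si256 (_mm256_loadu_si256 (v + 2), match);
  /* VEC4 gets a plain compare into k3.  */
  __mmask32 k3 = _mm256_cmpeq_epi8_mask (_mm256_loadu_si256 (v + 3), match);
  /* Zero-masked min: lanes where VEC1 matched (k1 bit clear) are forced
     to zero, so the zero test below covers VEC1, VEC2 and VEC3 at once.  */
  __m256i m = _mm256_maskz_min_epu8 (k1, x2, x3);
  __mmask32 k2 = _mm256_cmpeq_epi8_mask (m, _mm256_setzero_si256 ());
  /* Equivalent of kortestd %k2, %k3 in the assembly.  */
  return (k2 | k3) != 0;
}

The zero-masking is the trick: a match in VEC1 clears the corresponding k1
bit, which zeroes that lane of the min result, so one compare against zero
observes three vectors, and the kortest folds in VEC4.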