From patchwork Thu Apr 22 18:04:02 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 43101 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 746F03896821; Thu, 22 Apr 2021 18:04:14 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 746F03896821 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1619114654; bh=KQqPgBMarYxR5R4y1T5fFFCHkrsg4kCZCF6xPRbSuHI=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=fLr3B8JFf4605Ph0Fxx6cNQ4u4EVKsJ9V5FiOUZoNeaHNBRi6E5tQlIEAEpPsE32a iA7H3L3w/05EtItyn80WrCuo7NgA4jRLzHF9OSqfVE8uau3o94sG7WQ1Qb7CSil4dB pgJFAvVaw9nDmdz1zX/Hq8nvS4hzgsKMKH42IXDk= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-qt1-x82d.google.com (mail-qt1-x82d.google.com [IPv6:2607:f8b0:4864:20::82d]) by sourceware.org (Postfix) with ESMTPS id 872A0382E831 for ; Thu, 22 Apr 2021 18:04:10 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org 872A0382E831 Received: by mail-qt1-x82d.google.com with SMTP id d6so16249794qtx.13 for ; Thu, 22 Apr 2021 11:04:10 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=KQqPgBMarYxR5R4y1T5fFFCHkrsg4kCZCF6xPRbSuHI=; b=grRll29TAXnBs/vhLEOw+4RJDopIN+oYWVLdvJkBLiVj/WVrLcvXc542lmRzjXrO/b BadsHrqCikR/HkaHM9laxGG/1H9fEPFxZvuRDA0/N9nYBlgWt0eLDFi94Ae/GDV4wehE ro8RQtKy4lU/1EYV5A0ePNIOgsyV0mC3rfffkjczhAaju+Ni2B4/r3iWF7kK4fUA1VR6 sjgEzXedlBy0qbm021dqZcpXmndrFH/wRyYeoh6DH9Sws98Pep9mfgPLGtux2FIVDjLG 
i/tSmIQYJpSsmCSfRUI1RtGrfDtI9YeZHxqfOzUqa1ldBEDpTv2cR0XTw72JrXjSV2DI I7fQ== X-Gm-Message-State: AOAM5302zAUsJx8uO8fMbR1GRV1KUGD5W/7fCud8OqXctWS0dbnMy7/9 alpqnP04s5ECRLgsIeteoGSFO6RfSnA= X-Google-Smtp-Source: ABdhPJy7OA3hk0+P05TDpQdseoqhoCtHKBwRai6eaOO9dv/fBsjeM4D0YOKWHh6efGyBksvVDDFU0w== X-Received: by 2002:ac8:548f:: with SMTP id h15mr4354937qtq.29.1619114649714; Thu, 22 Apr 2021 11:04:09 -0700 (PDT) Received: from localhost.localdomain (pool-71-245-178-39.pitbpa.fios.verizon.net. [71.245.178.39]) by smtp.googlemail.com with ESMTPSA id f5sm2699010qkk.12.2021.04.22.11.04.08 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Thu, 22 Apr 2021 11:04:09 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v3 1/2] x86: Optimize strchr-avx2.S Date: Thu, 22 Apr 2021 14:04:02 -0400 Message-Id: <20210422180403.422364-1-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.29.2 In-Reply-To: <20210421213951.404588-1-goldstein.w.n@gmail.com> References: <20210421213951.404588-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.6 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.2 X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces@sourceware.org Sender: "Libc-alpha" No bug. This commit optimizes strchr-avx2.S. The optimizations are all small things such as save an ALU in the alignment process, saving a few instructions in the loop return, saving some bytes in the main loop, and increasing the ILP in the return cases. 
test-strchr, test-strchrnul, test-wcschr, and test-wcschrnul are all passing. Signed-off-by: Noah Goldstein --- sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++---------- 1 file changed, 173 insertions(+), 121 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S index 25bec38b5d..220165d2ba 100644 --- a/sysdeps/x86_64/multiarch/strchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S @@ -49,132 +49,144 @@ .section SECTION(.text),"ax",@progbits ENTRY (STRCHR) - movl %edi, %ecx -# ifndef USE_AS_STRCHRNUL - xorl %edx, %edx -# endif - /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + VPBROADCAST %xmm0, %ymm0 vpxor %xmm9, %xmm9, %xmm9 - VPBROADCAST %xmm0, %ymm0 /* Check if we cross page boundary with one vector load. */ - andl $(PAGE_SIZE - 1), %ecx - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx - ja L(cross_page_boundary) + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. Search for both CHAR and the null byte. */ vmovdqu (%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax + vpmovmskb %ymm1, %eax testl %eax, %eax - jz L(more_vecs) + jz L(aligned_more) tzcntl %eax, %eax - /* Found CHAR or the null byte. */ - addq %rdi, %rax # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero) # endif -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN - - .p2align 4 -L(more_vecs): - /* Align data for aligned loads in the loop. */ - andq $-VEC_SIZE, %rdi -L(aligned_more): - - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. 
*/ - vmovdqa VEC_SIZE(%rdi), %ymm8 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - - vmovdqa VEC_SIZE(%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x1) - - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x2) - - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jz L(prep_loop_4x) + addq %rdi, %rax + VZEROUPPER_RETURN + /* .p2align 5 helps keep performance more consistent if ENTRY() + alignment % 32 was either 16 or 0. As well this makes the + alignment % 32 of the loop_4x_vec fixed which makes tuning it + easier. */ + .p2align 5 +L(first_vec_x4): tzcntl %eax, %eax - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax + addq $(VEC_SIZE * 3 + 1), %rdi # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero) # endif + addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 -L(first_vec_x0): - tzcntl %eax, %eax - /* Found CHAR or the null byte. */ - addq %rdi, %rax # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax -# endif +L(zero): + xorl %eax, %eax VZEROUPPER_RETURN +# endif + .p2align 4 L(first_vec_x1): tzcntl %eax, %eax - leaq VEC_SIZE(%rdi, %rax), %rax + incq %rdi # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero) # endif + addq %rdi, %rax VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): tzcntl %eax, %eax + addq $(VEC_SIZE + 1), %rdi +# ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. 
*/ - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax + cmp (%rdi, %rax), %CHAR_REG + jne L(zero) +# endif + addq %rdi, %rax + VZEROUPPER_RETURN + + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + addq $(VEC_SIZE * 2 + 1), %rdi # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero) # endif + addq %rdi, %rax VZEROUPPER_RETURN -L(prep_loop_4x): - /* Align data to 4 * VEC_SIZE. */ - andq $-(VEC_SIZE * 4), %rdi + .p2align 4 +L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of + instructions as using andq -VEC_SIZE but saves 4 bytes of code on + x4 check. */ + orq $(VEC_SIZE - 1), %rdi +L(cross_page_continue): + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since + data is only aligned to VEC_SIZE. */ + vmovdqa 1(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) + /* Advance past the checked data and align rdi to VEC_SIZE * 4. */ + addq $(VEC_SIZE * 4 + 1), %rdi + andq $-(VEC_SIZE * 4), %rdi .p2align 4 L(loop_4x_vec): /* Compare 4 * VEC at a time forward. 
*/ - vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5 - vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6 - vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7 - vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8 + vmovdqa (%rdi), %ymm5 + vmovdqa (VEC_SIZE)(%rdi), %ymm6 + vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 + vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 /* Leaves only CHARS matching esi as 0. */ vpxor %ymm5, %ymm0, %ymm1 @@ -190,62 +202,102 @@ L(loop_4x_vec): VPMINU %ymm1, %ymm2, %ymm5 VPMINU %ymm3, %ymm4, %ymm6 - VPMINU %ymm5, %ymm6, %ymm5 + VPMINU %ymm5, %ymm6, %ymm6 - VPCMPEQ %ymm5, %ymm9, %ymm5 - vpmovmskb %ymm5, %eax + VPCMPEQ %ymm6, %ymm9, %ymm6 + vpmovmskb %ymm6, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) - addq $(VEC_SIZE * 4), %rdi - testl %eax, %eax - jz L(loop_4x_vec) - VPCMPEQ %ymm1, %ymm9, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ %ymm1, %ymm9, %ymm1 + vpmovmskb %ymm1, %eax testl %eax, %eax - jnz L(first_vec_x0) + jnz L(last_vec_x0) + - VPCMPEQ %ymm2, %ymm9, %ymm2 - vpmovmskb %ymm2, %eax + VPCMPEQ %ymm5, %ymm9, %ymm2 + vpmovmskb %ymm2, %eax testl %eax, %eax - jnz L(first_vec_x1) + jnz L(last_vec_x1) + + VPCMPEQ %ymm3, %ymm9, %ymm3 + vpmovmskb %ymm3, %eax + /* rcx has combined result from all 4 VEC. It will only be used if + the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax + subq $(VEC_SIZE * 2), %rdi +# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero_end) +# endif + addq %rdi, %rax + VZEROUPPER_RETURN + - VPCMPEQ %ymm3, %ymm9, %ymm3 - VPCMPEQ %ymm4, %ymm9, %ymm4 - vpmovmskb %ymm3, %ecx - vpmovmskb %ymm4, %eax - salq $32, %rax - orq %rcx, %rax - tzcntq %rax, %rax - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax + .p2align 4 +L(last_vec_x0): + tzcntl %eax, %eax + addq $-(VEC_SIZE * 4), %rdi # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. 
*/ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero_end) # endif + addq %rdi, %rax VZEROUPPER_RETURN +# ifndef USE_AS_STRCHRNUL +L(zero_end): + xorl %eax, %eax + VZEROUPPER_RETURN +# endif + + .p2align 4 +L(last_vec_x1): + tzcntl %eax, %eax + subq $(VEC_SIZE * 3), %rdi +# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero_end) +# endif + addq %rdi, %rax + VZEROUPPER_RETURN + + /* Cold case for crossing page with first load. */ .p2align 4 L(cross_page_boundary): - andq $-VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - - vmovdqa (%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 + movq %rdi, %rdx + /* Align rdi to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi + vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax - /* Remove the leading bits. */ - sarxl %ecx, %eax, %eax + vpmovmskb %ymm1, %eax + /* Remove the leading bytes. sarxl only uses bits [4:0] of COUNT + so no need to manually mod edx. */ + sarxl %edx, %eax, %eax testl %eax, %eax - jz L(aligned_more) + jz L(cross_page_continue) tzcntl %eax, %eax - addq %rcx, %rdi - addq %rdi, %rax # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + xorl %ecx, %ecx + /* Found CHAR or the null byte. 
*/ + cmp (%rdx, %rax), %CHAR_REG + leaq (%rdx, %rax), %rax + cmovne %rcx, %rax +# else + addq %rdx, %rax # endif - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN END (STRCHR) # endif From patchwork Thu Apr 22 18:04:03 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 43100 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 736B0382E831; Thu, 22 Apr 2021 18:04:17 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 736B0382E831 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1619114657; bh=JVx50/NZ90/shm/+gM8ilXEE7r73vo/UrIyVcdDWIdI=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=wvd05ZO88Archkcdqbi0Qy2ueVF6CAzR0ntSJOzSQH/GmQw0z80O3+tw5TTBLfC7d VnrArkg0RPXDsNtta7KLcdQl1enAJkPgRbNvVt+aBEFu9YCCebBozF+IZK9+S4wrmg KT0Z/xx0KouZ+NsZyuHOENoLVal1KBpGziEZHymQ= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-qk1-x72c.google.com (mail-qk1-x72c.google.com [IPv6:2607:f8b0:4864:20::72c]) by sourceware.org (Postfix) with ESMTPS id 40A82382E831 for ; Thu, 22 Apr 2021 18:04:13 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org 40A82382E831 Received: by mail-qk1-x72c.google.com with SMTP id t17so18756907qkg.4 for ; Thu, 22 Apr 2021 11:04:13 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=JVx50/NZ90/shm/+gM8ilXEE7r73vo/UrIyVcdDWIdI=; b=rp6oDyHjNlSgvviBMcZJOmLzV6rjWN38hqRd0ESY08Di42vxwpnUyMiBVGe+yAbVwi 
r8jYVXScAWFVoBPCvcOGtGtRtcM0HRrFE++TjXyETkOqjA/xLUw74N3++k69uhqtjhSf hzbe7xu3poU261SaV39rkObI/80lYiQ4TGpXhXAQUzBpNVzveHzUURd60I0WfmG7HY8P 9pcdEN27T+HrGv3HDPf9R0GSy4plyS7UrPGk5zq+rXbVGqO91e6cHehS1HkSoBxk1xa0 s+gJqOElMrVOVUUcuKuE04x7D1vfMViSmov7GV57283SXEQSivW3JM+g7z9MpERVQTnz zo3g== X-Gm-Message-State: AOAM530cvLlP0UWcisw+6gQr+cfM6DSRAVXMb2F/9VW/Yy948I5xRlp9 jm466sqq1rQuGdK9fQv+90ytJNdN0DI= X-Google-Smtp-Source: ABdhPJwMrWkSK4ZcZR+lx4clrvRgCOofZmm5fC8jyw/VfA0rTy9OmrHJMNwU4iRB7y8AZIfBizPPDg== X-Received: by 2002:a05:620a:4d5:: with SMTP id 21mr4768086qks.461.1619114652238; Thu, 22 Apr 2021 11:04:12 -0700 (PDT) Received: from localhost.localdomain (pool-71-245-178-39.pitbpa.fios.verizon.net. [71.245.178.39]) by smtp.googlemail.com with ESMTPSA id f5sm2699010qkk.12.2021.04.22.11.04.10 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Thu, 22 Apr 2021 11:04:11 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v3 2/2] x86: Optimize strchr-evex.S Date: Thu, 22 Apr 2021 14:04:03 -0400 Message-Id: <20210422180403.422364-2-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.29.2 In-Reply-To: <20210422180403.422364-1-goldstein.w.n@gmail.com> References: <20210421213951.404588-1-goldstein.w.n@gmail.com> <20210422180403.422364-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.6 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.2 X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces@sourceware.org Sender: "Libc-alpha" No bug. 
This commit optimizes strchr-evex.S. The optimizations are mostly small things such as save an ALU in the alignment process, saving a few instructions in the loop return. The one significant change is saving 2 instructions in the 4x loop. test-strchr, test-strchrnul, test-wcschr, and test-wcschrnul are all passing. Signed-off-by: Noah Goldstein --- sysdeps/x86_64/multiarch/strchr-evex.S | 388 ++++++++++++++----------- 1 file changed, 214 insertions(+), 174 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S index ddc86a7058..7cd111e96c 100644 --- a/sysdeps/x86_64/multiarch/strchr-evex.S +++ b/sysdeps/x86_64/multiarch/strchr-evex.S @@ -24,23 +24,26 @@ # define STRCHR __strchr_evex # endif -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 # ifdef USE_AS_WCSCHR # define VPBROADCAST vpbroadcastd # define VPCMP vpcmpd # define VPMINU vpminud # define CHAR_REG esi -# define SHIFT_REG r8d +# define SHIFT_REG ecx +# define CHAR_SIZE 4 # else # define VPBROADCAST vpbroadcastb # define VPCMP vpcmpb # define VPMINU vpminub # define CHAR_REG sil -# define SHIFT_REG ecx +# define SHIFT_REG edx +# define CHAR_SIZE 1 # endif + # define XMMZERO xmm16 # define YMMZERO ymm16 @@ -56,23 +59,20 @@ # define VEC_SIZE 32 # define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section .text.evex,"ax",@progbits ENTRY (STRCHR) - movl %edi, %ecx -# ifndef USE_AS_STRCHRNUL - xorl %edx, %edx -# endif - /* Broadcast CHAR to YMM0. */ - VPBROADCAST %esi, %YMM0 - + VPBROADCAST %esi, %YMM0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax vpxorq %XMMZERO, %XMMZERO, %XMMZERO - /* Check if we cross page boundary with one vector load. */ - andl $(PAGE_SIZE - 1), %ecx - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx - ja L(cross_page_boundary) + /* Check if we cross page boundary with one vector load. Otherwise + it is safe to use an unaligned load. 
*/ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. Search for both CHAR and the null bytes. */ @@ -83,251 +83,291 @@ ENTRY (STRCHR) VPMINU %YMM2, %YMM1, %YMM2 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ VPCMP $0, %YMMZERO, %YMM2, %k0 - ktestd %k0, %k0 - jz L(more_vecs) kmovd %k0, %eax + testl %eax, %eax + jz L(aligned_more) tzcntl %eax, %eax - /* Found CHAR or the null byte. */ # ifdef USE_AS_WCSCHR /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq (%rdi, %rax, 4), %rax + leaq (%rdi, %rax, CHAR_SIZE), %rax # else addq %rdi, %rax # endif # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. */ + cmp (%rax), %CHAR_REG + jne L(zero) # endif ret - .p2align 4 -L(more_vecs): - /* Align data for aligned loads in the loop. */ - andq $-VEC_SIZE, %rdi -L(aligned_more): - - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - VMOVA VEC_SIZE(%rdi), %YMM1 - addq $VEC_SIZE, %rdi - - /* Leaves only CHARS matching esi as 0. */ - vpxorq %YMM1, %YMM0, %YMM2 - VPMINU %YMM2, %YMM1, %YMM2 - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x0) - - VMOVA VEC_SIZE(%rdi), %YMM1 - /* Leaves only CHARS matching esi as 0. */ - vpxorq %YMM1, %YMM0, %YMM2 - VPMINU %YMM2, %YMM1, %YMM2 - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x1) - - VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 - /* Leaves only CHARS matching esi as 0. */ - vpxorq %YMM1, %YMM0, %YMM2 - VPMINU %YMM2, %YMM1, %YMM2 - /* Each bit in K0 represents a CHAR or a null byte in YMM1. 
*/ - VPCMP $0, %YMMZERO, %YMM2, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x2) - - VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 - /* Leaves only CHARS matching esi as 0. */ - vpxorq %YMM1, %YMM0, %YMM2 - VPMINU %YMM2, %YMM1, %YMM2 - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 - ktestd %k0, %k0 - jz L(prep_loop_4x) - - kmovd %k0, %eax + /* .p2align 5 helps keep performance more consistent if ENTRY() + alignment % 32 was either 16 or 0. As well this makes the + alignment % 32 of the loop_4x_vec fixed which makes tuning it + easier. */ + .p2align 5 +L(first_vec_x3): tzcntl %eax, %eax +# ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. */ -# ifdef USE_AS_WCSCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax -# else - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax + cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero) # endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax -# endif +L(zero): + xorl %eax, %eax ret +# endif .p2align 4 -L(first_vec_x0): +L(first_vec_x4): +# ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ + kmovd %k0, %eax tzcntl %eax, %eax - /* Found CHAR or the null byte. */ -# ifdef USE_AS_WCSCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq (%rdi, %rax, 4), %rax + kmovd %k1, %ecx + /* bzhil will not be 0 if first match was null. */ + bzhil %eax, %ecx, %ecx + jne L(zero) # else - addq %rdi, %rax -# endif -# ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Combine CHAR and null matches. */ + kord %k0, %k1, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax # endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. 
*/ + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ret .p2align 4 L(first_vec_x1): tzcntl %eax, %eax - /* Found CHAR or the null byte. */ -# ifdef USE_AS_WCSCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq VEC_SIZE(%rdi, %rax, 4), %rax -# else - leaq VEC_SIZE(%rdi, %rax), %rax -# endif # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero) + # endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ret .p2align 4 L(first_vec_x2): +# ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ + kmovd %k0, %eax tzcntl %eax, %eax - /* Found CHAR or the null byte. */ -# ifdef USE_AS_WCSCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax + kmovd %k1, %ecx + /* bzhil will not be 0 if first match was null. */ + bzhil %eax, %ecx, %ecx + jne L(zero) # else - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax -# endif -# ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Combine CHAR and null matches. */ + kord %k0, %k1, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax # endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ret -L(prep_loop_4x): - /* Align data to 4 * VEC_SIZE. */ + .p2align 4 +L(aligned_more): + /* Align data to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi +L(cross_page_continue): + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since + data is only aligned to VEC_SIZE. Use two alternating methods for + checking VEC to balance latency and port contention. */ + + /* This method has higher latency but has better port + distribution. */ + VMOVA (VEC_SIZE)(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. 
*/ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + /* This method has higher latency but has better port + distribution. */ + VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a null byte in YMM1. */ + VPCMP $0, %YMM1, %YMMZERO, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x2) + + VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a null byte in YMM1. */ + VPCMP $0, %YMM1, %YMMZERO, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x4) + + /* Align data to VEC_SIZE * 4 for the loop. */ + addq $VEC_SIZE, %rdi andq $-(VEC_SIZE * 4), %rdi .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ + /* Check 4x VEC at a time. No penalty to imm32 offset with evex + encoding. */ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 - /* Leaves only CHARS matching esi as 0. */ + /* For YMM1 and YMM3 use xor to set the CHARs matching esi to zero. */ vpxorq %YMM1, %YMM0, %YMM5 - vpxorq %YMM2, %YMM0, %YMM6 + /* For YMM2 and YMM4 cmp not equals to CHAR and store result in k + register. It's possible to save either 1 or 2 instructions using the cmp not + equals method for either YMM1 or YMM1 and YMM3 respectively but + the bottleneck on p5 makes it not worth it. 
*/ + VPCMP $4, %YMM0, %YMM2, %k2 vpxorq %YMM3, %YMM0, %YMM7 - vpxorq %YMM4, %YMM0, %YMM8 - - VPMINU %YMM5, %YMM1, %YMM5 - VPMINU %YMM6, %YMM2, %YMM6 - VPMINU %YMM7, %YMM3, %YMM7 - VPMINU %YMM8, %YMM4, %YMM8 - - VPMINU %YMM5, %YMM6, %YMM1 - VPMINU %YMM7, %YMM8, %YMM2 - - VPMINU %YMM1, %YMM2, %YMM1 - - /* Each bit in K0 represents a CHAR or a null byte. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - - addq $(VEC_SIZE * 4), %rdi - - ktestd %k0, %k0 + VPCMP $4, %YMM0, %YMM4, %k4 + + /* Use min to select all zeros (either from xor or end of string). */ + VPMINU %YMM1, %YMM5, %YMM1 + VPMINU %YMM3, %YMM7, %YMM3 + + /* Use min + zeromask to select for zeros. k2 and k4 will + have 0 at positions that matched CHAR, which will set zero in + the corresponding destination bytes in YMM2 / YMM4. */ + VPMINU %YMM1, %YMM2, %YMM2{%k2}{z} + VPMINU %YMM3, %YMM4, %YMM4 + VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} + + VPCMP $0, %YMMZERO, %YMM4, %k1 + kmovd %k1, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx jz L(loop_4x_vec) - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM5, %k0 + VPCMP $0, %YMMZERO, %YMM1, %k0 kmovd %k0, %eax testl %eax, %eax - jnz L(first_vec_x0) + jnz L(last_vec_x1) - /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ - VPCMP $0, %YMMZERO, %YMM6, %k1 - kmovd %k1, %eax + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax testl %eax, %eax - jnz L(first_vec_x1) - - /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ - VPCMP $0, %YMMZERO, %YMM7, %k2 - /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ - VPCMP $0, %YMMZERO, %YMM8, %k3 + jnz L(last_vec_x2) + VPCMP $0, %YMMZERO, %YMM3, %k0 + kmovd %k0, %eax + /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ # ifdef USE_AS_WCSCHR - /* NB: Each bit in K2/K3 represents 4-byte element. 
*/ - kshiftlw $8, %k3, %k1 + sall $8, %ecx + orl %ecx, %eax + tzcntl %eax, %eax # else - kshiftlq $32, %k3, %k1 + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax # endif +# ifndef USE_AS_STRCHRNUL + /* Check if match was CHAR or null. */ + cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero_end) +# endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret - /* Each bit in K1 represents a NULL or a mismatch. */ - korq %k1, %k2, %k1 - kmovq %k1, %rax +# ifndef USE_AS_STRCHRNUL +L(zero_end): + xorl %eax, %eax + ret +# endif - tzcntq %rax, %rax -# ifdef USE_AS_WCSCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax -# else - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax + .p2align 4 +L(last_vec_x1): + tzcntl %eax, %eax +# ifndef USE_AS_STRCHRNUL + /* Check if match was null. */ + cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero_end) # endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(last_vec_x2): + tzcntl %eax, %eax # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Check if match was null. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero_end) # endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ret /* Cold case for crossing page with first load. */ .p2align 4 L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi. */ andq $-VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - VMOVA (%rdi), %YMM1 - /* Leaves only CHARS matching esi as 0. */ vpxorq %YMM1, %YMM0, %YMM2 VPMINU %YMM2, %YMM1, %YMM2 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ VPCMP $0, %YMMZERO, %YMM2, %k0 kmovd %k0, %eax - testl %eax, %eax - + /* Remove the leading bits. 
*/ # ifdef USE_AS_WCSCHR + movl %edx, %SHIFT_REG /* NB: Divide shift count by 4 since each bit in K1 represent 4 bytes. */ - movl %ecx, %SHIFT_REG - sarl $2, %SHIFT_REG + sarl $2, %SHIFT_REG + andl $(CHAR_PER_VEC - 1), %SHIFT_REG # endif - - /* Remove the leading bits. */ sarxl %SHIFT_REG, %eax, %eax + /* If eax is zero continue. */ testl %eax, %eax - - jz L(aligned_more) + jz L(cross_page_continue) tzcntl %eax, %eax - addq %rcx, %rdi +# ifndef USE_AS_STRCHRNUL + /* Check to see if match was CHAR or null. */ + cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero_end) +# endif # ifdef USE_AS_WCSCHR /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq (%rdi, %rax, 4), %rax + leaq (%rdx, %rax, CHAR_SIZE), %rax # else - addq %rdi, %rax -# endif -# ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + addq %rdx, %rax # endif ret