From patchwork Thu Apr 21 22:22:01 2022
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 53107
To: libc-alpha@sourceware.org
Subject: [PATCH v2 1/4] benchtests: Improve bench-strrchr
Date: Thu, 21 Apr 2022 17:22:01 -0500
Message-Id: <20220421222204.3093130-1-goldstein.w.n@gmail.com>
In-Reply-To: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
References: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
List-Id: Libc-alpha mailing list
From: Noah Goldstein
Reply-To: Noah Goldstein

1. Use json-lib for printing results.
2. Expose all parameters (before, pos, seek_char, and max_char were not printed).
3. Add benchmarks that test multiple occurrences of seek_char in the string.
--- benchtests/bench-strrchr.c | 124 ++++++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 44 deletions(-) diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c index abdae60c51..ce4307a098 100644 --- a/benchtests/bench-strrchr.c +++ b/benchtests/bench-strrchr.c @@ -23,6 +23,7 @@ # define TEST_NAME "strrchr" #endif #include "bench-string.h" +#include "json-lib.h" #define BIG_CHAR MAX_CHAR @@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c) } static void -do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res) +do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c, + CHAR *exp_res) { CHAR *res = CALL (impl, s, c); size_t i, iters = INNER_LOOP_ITERS8; @@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res) if (res != exp_res) { - error (0, 0, "Wrong result in function %s %p %p", impl->name, - res, exp_res); + error (0, 0, "Wrong result in function %s %p %p", impl->name, res, + exp_res); ret = 1; return; } @@ -73,23 +75,25 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res) CALL (impl, s, c); } TIMING_NOW (stop); - TIMING_DIFF (cur, start, stop); - TIMING_PRINT_MEAN ((double) cur, (double) iters); + json_element_double (json_ctx, (double) cur / (double) iters); } static void -do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char) +do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, + int seek_char, int max_char, size_t freq) /* For wcsrchr: align here means align not in bytes, but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)) len for wcschr here isn't in bytes but it's number of wchar_t symbols. */ { size_t i; + size_t pos_chunk_sz = freq ?
(pos / freq) : pos; + size_t last_pos = len; CHAR *result; CHAR *buf = (CHAR *) buf1; - align &= 7; + align &= (getpagesize () - 1); if ((align + len) * sizeof (CHAR) >= page_size) return; @@ -103,6 +107,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char) if ((i > pos || pos >= len) && buf[align + i] == seek_char) buf[align + i] = seek_char + 10 + (random () & 15); } + + if (pos_chunk_sz == 0 && pos) + pos_chunk_sz = 1; + + for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz) + { + buf[align + i] = seek_char; + last_pos = i; + } + buf[align + len] = 0; if (pos < len) @@ -110,66 +124,88 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char) buf[align + pos] = seek_char; result = (CHAR *) (buf + align + pos); } + else if (last_pos < len) + result = (CHAR *) (buf + align + last_pos); else if (seek_char == 0) result = (CHAR *) (buf + align + len); else result = NULL; - printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR)); + json_element_object_begin (json_ctx); + json_attr_uint (json_ctx, "len", len); + json_attr_uint (json_ctx, "pos", pos); + json_attr_uint (json_ctx, "align", align); + json_attr_uint (json_ctx, "freq", freq); + json_attr_uint (json_ctx, "seek", seek_char); + json_attr_uint (json_ctx, "max_char", max_char); + json_array_begin (json_ctx, "timings"); FOR_EACH_IMPL (impl, 0) - do_one_test (impl, (CHAR *) (buf + align), seek_char, result); + do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result); - putchar ('\n'); + json_array_end (json_ctx); + json_element_object_end (json_ctx); } int test_main (void) { - size_t i; + json_ctx_t json_ctx; + size_t i, j; + int seek; test_init (); + json_init (&json_ctx, 0, stdout); - printf ("%20s", ""); - FOR_EACH_IMPL (impl, 0) - printf ("\t%s", impl->name); - putchar ('\n'); - - for (i = 1; i < 8; ++i) - { - do_test (0, 16 << i, 2048, 23, SMALL_CHAR); - do_test (i, 16 << i, 2048, 23, SMALL_CHAR); - } + json_document_begin (&json_ctx); + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE); - for (i = 1; i < 8; ++i) - { - do_test (i, 64, 256, 23, SMALL_CHAR); - do_test (i, 64, 256, 23, BIG_CHAR); - } + json_attr_object_begin (&json_ctx, "functions"); + json_attr_object_begin (&json_ctx, TEST_NAME); + json_attr_string (&json_ctx, "bench-variant", ""); - for (i = 0; i < 32; ++i) - { - do_test (0, i, i + 1, 23, SMALL_CHAR); - do_test (0, i, i + 1, 23, BIG_CHAR); - } + json_array_begin (&json_ctx, "ifuncs"); + FOR_EACH_IMPL (impl, 0) + json_element_string (&json_ctx, impl->name); + json_array_end (&json_ctx); - for (i = 1; i < 8; ++i) - { - do_test (0, 16 << i, 2048, 0, SMALL_CHAR); - do_test (i, 16 << i, 2048, 0, SMALL_CHAR); - } + json_array_begin (&json_ctx, "results"); - for (i = 1; i < 8; ++i) + for (seek = 0; seek <= 23; seek += 23) { - do_test (i, 64, 256, 0, SMALL_CHAR); - do_test (i, 64, 256, 0, BIG_CHAR); + for (j = 1; j < 32; j += j) + { + for (i = 1; i < 9; ++i) + { + do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j); + do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j); + } + + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j); + do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j); + + do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j); + do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j); + } + + for (i = 0; i < 32; ++i) + { + do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j); + do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j); + } + if (seek == 0) + { + break; + 
} + } } - for (i = 0; i < 32; ++i) - { - do_test (0, i, i + 1, 0, SMALL_CHAR); - do_test (0, i, i + 1, 0, BIG_CHAR); - } + json_array_end (&json_ctx); + json_attr_object_end (&json_ctx); + json_attr_object_end (&json_ctx); + json_document_end (&json_ctx); return ret; } From patchwork Thu Apr 21 22:22:02 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 53108 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 88AA03856DC7 for ; Thu, 21 Apr 2022 22:23:15 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 88AA03856DC7 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1650579795; bh=XgThhXogtdPNBsHVY3n/5/nPzkcR3ugbjSd+sMAlJ8Q=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=ftuTBp3E6ULaQursWYhOpet/u6ID76jwc5PUHqh6Bu1RvqCYC+qFKmc8feLwF/wF7 L4xYsQhs4UlaPaBnKdGX7FvDvQW0AoKUtFiR6Cm2Q8TG4iEgegFyEVJWH7iiCs6X+n mEWjvqp786dH0oYrvsp4/+ey0WrV1oHfXhT0Y8VA= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-qv1-xf2a.google.com (mail-qv1-xf2a.google.com [IPv6:2607:f8b0:4864:20::f2a]) by sourceware.org (Postfix) with ESMTPS id 361A53858C2C for ; Thu, 21 Apr 2022 22:22:13 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 361A53858C2C Received: by mail-qv1-xf2a.google.com with SMTP id e17so4734504qvj.11 for ; Thu, 21 Apr 2022 15:22:13 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=XgThhXogtdPNBsHVY3n/5/nPzkcR3ugbjSd+sMAlJ8Q=; b=wLLeLXmovBuUNWWMvbszTDmUeYk3/UEGBbvZGUggQqr8r6YIYjnUFhcgVMLoEg0HlY OMOYDBDTlVrW+kbUgFEaTaVWx3vRVulJltF5sHR8wxhgA5iMRrCosMQ7hk3ACAJA1zxS LfpiKbiHNvk4sR95rIx5RKzmHCiZ0N2r2tVWf7iQ2IUObscF3adxZSQrWQUiOs6gewUP Xd9gXaTkGNM0CHFk1LgbapE5hGWao7BbkmI/N+S/xmrs6hqXPMNarnbzP9/iNVCZVroG F5wpt/ZmMELCviFz1iG93nE6hHtHhIqX0AsPjkFhe6lwe0cOEYKCdi1Oqo1R65WUclgs Pqjw== X-Gm-Message-State: AOAM5323kKhLD4MA84d+oJ7Dlh0ROs8a8Q14oISGinhltHPj5Xz5+X30 pMTekG5VAlm8tq6TKD8By/oeVUOyHFY= X-Google-Smtp-Source: ABdhPJzjDnWUADYFpk7DQRuOw9haXcViI6P5Q2M0QWoV+NEm4TAEmnmjeQDmrey24VcnkpSBoFDW2A== X-Received: by 2002:a0c:ed46:0:b0:444:3e69:767b with SMTP id v6-20020a0ced46000000b004443e69767bmr1533367qvq.71.1650579732167; Thu, 21 Apr 2022 15:22:12 -0700 (PDT) Received: from localhost.localdomain ([173.245.203.170]) by smtp.googlemail.com with ESMTPSA id l9-20020a05622a174900b002f34421b01csm206337qtk.23.2022.04.21.15.22.11 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Thu, 21 Apr 2022 15:22:11 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v2 2/4] x86: Optimize {str|wcs}rchr-sse2 Date: Thu, 21 Apr 2022 17:22:02 -0500 Message-Id: <20220421222204.3093130-2-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220421222204.3093130-1-goldstein.w.n@gmail.com> References: <20220421031410.2142238-1-goldstein.w.n@gmail.com> <20220421222204.3093130-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-10.5 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_NUMSUBJECT, KAM_SHORT, RCVD_IN_DNSWL_NONE, 
SCC_10_SHORT_WORD_LINES, SCC_5_SHORT_WORD_LINES, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" The new code unrolls the main loop slightly without adding too much overhead and minimizes the comparisons for the search CHAR. Geometric Mean of all benchmarks New / Old: 0.741 See email for all results. Full xcheck passes on x86_64 with and without multiarch enabled. --- sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +- sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +- sysdeps/x86_64/strrchr.S | 510 +++++++++++++++--------- sysdeps/x86_64/wcsrchr.S | 268 +------------ 4 files changed, 339 insertions(+), 444 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S index db1b44c23c..866396e947 100644 --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S @@ -17,7 +17,7 @@ . */ #if IS_IN (libc) -# define strrchr __strrchr_sse2 +# define STRRCHR __strrchr_sse2 # undef weak_alias # define weak_alias(strrchr, rindex) diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S index 78d1ca6553..69d2f3cdb1 100644 --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S @@ -17,7 +17,6 @@ . */ #if IS_IN (libc) -# define wcsrchr __wcsrchr_sse2 +# define STRRCHR __wcsrchr_sse2 #endif - #include "../wcsrchr.S" diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S index 50d886713e..6efb25c880 100644 --- a/sysdeps/x86_64/strrchr.S +++ b/sysdeps/x86_64/strrchr.S @@ -19,210 +19,360 @@ #include +#ifndef STRRCHR +# define STRRCHR strrchr +#endif + +#ifdef USE_AS_WCSRCHR +# define PCMPEQ pcmpeqd +# define CHAR_SIZE 4 +# define PMINU pminud +#else +# define PCMPEQ pcmpeqb +# define CHAR_SIZE 1 +# define PMINU pminub +#endif + +#define PAGE_SIZE 4096 +#define VEC_SIZE 16 + .text -ENTRY (strrchr) - movd %esi, %xmm1 +ENTRY(STRRCHR) + movd %esi, %xmm0 movq %rdi, %rax - andl $4095, %eax - punpcklbw %xmm1, %xmm1 - cmpq $4032, %rax - punpcklwd %xmm1, %xmm1 - pshufd $0, %xmm1, %xmm1 + andl $(PAGE_SIZE - 1), %eax +#ifndef USE_AS_WCSRCHR + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 +#endif + pshufd $0, %xmm0, %xmm0 + cmpl $(PAGE_SIZE - VEC_SIZE), %eax ja L(cross_page) - movdqu (%rdi), %xmm0 + +L(cross_page_continue): + movups (%rdi), %xmm1 pxor %xmm2, %xmm2 - movdqa %xmm0, %xmm3 - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm3 - pmovmskb %xmm0, %ecx - pmovmskb %xmm3, %edx - testq %rdx, %rdx - je L(next_48_bytes) - leaq -1(%rdx), %rax - xorq %rdx, %rax - andq %rcx, %rax - je L(exit) - bsrq %rax, %rax + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %ecx + testl %ecx, %ecx + jz L(aligned_more) + + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + leal -1(%rcx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(ret0) + bsrl %eax, %eax addq %rdi, %rax + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If + search CHAR is zero we are correct. Either way `andq + -CHAR_SIZE, %rax` gets the correct result. 
*/ +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif +L(ret0): ret + /* Returns for first vec x1/x2 have hard coded backward search + path for earlier matches. */ .p2align 4 -L(next_48_bytes): - movdqu 16(%rdi), %xmm4 - movdqa %xmm4, %xmm5 - movdqu 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm4 - pcmpeqb %xmm2, %xmm5 - movdqu 48(%rdi), %xmm0 - pmovmskb %xmm5, %edx - movdqa %xmm3, %xmm5 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm2, %xmm5 - pcmpeqb %xmm0, %xmm2 - salq $16, %rdx - pmovmskb %xmm3, %r8d - pmovmskb %xmm5, %eax - pmovmskb %xmm2, %esi - salq $32, %r8 - salq $32, %rax - pcmpeqb %xmm1, %xmm0 - orq %rdx, %rax - movq %rsi, %rdx - pmovmskb %xmm4, %esi - salq $48, %rdx - salq $16, %rsi - orq %r8, %rsi - orq %rcx, %rsi - pmovmskb %xmm0, %ecx - salq $48, %rcx - orq %rcx, %rsi - orq %rdx, %rax - je L(loop_header2) - leaq -1(%rax), %rcx - xorq %rax, %rcx - andq %rcx, %rsi - je L(exit) - bsrq %rsi, %rsi - leaq (%rdi,%rsi), %rax +L(first_vec_x0_test): + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jz L(ret0) + bsrl %eax, %eax + addq %r8, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif ret .p2align 4 -L(loop_header2): - testq %rsi, %rsi - movq %rdi, %rcx - je L(no_c_found) -L(loop_header): - addq $64, %rdi - pxor %xmm7, %xmm7 - andq $-64, %rdi - jmp L(loop_entry) +L(first_vec_x1): + PCMPEQ %xmm0, %xmm2 + pmovmskb %xmm2, %eax + leal -1(%rcx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(first_vec_x0_test) + bsrl %eax, %eax + leaq (VEC_SIZE)(%rdi, %rax), %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret .p2align 4 -L(loop64): - testq %rdx, %rdx - cmovne %rdx, %rsi - cmovne %rdi, %rcx - addq $64, %rdi -L(loop_entry): - movdqa 32(%rdi), %xmm3 - pxor %xmm6, %xmm6 - movdqa 48(%rdi), %xmm2 - movdqa %xmm3, %xmm0 - movdqa 16(%rdi), %xmm4 - pminub %xmm2, %xmm0 - movdqa (%rdi), %xmm5 - pminub %xmm4, %xmm0 - pminub %xmm5, %xmm0 - pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm0, %eax - movdqa %xmm5, %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %r9d - movdqa %xmm4, %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %edx - movdqa %xmm3, %xmm0 - pcmpeqb %xmm1, %xmm0 - salq $16, %rdx - pmovmskb %xmm0, %r10d - movdqa %xmm2, %xmm0 - pcmpeqb %xmm1, %xmm0 - salq $32, %r10 - orq %r10, %rdx - pmovmskb %xmm0, %r8d - orq %r9, %rdx - salq $48, %r8 - orq %r8, %rdx +L(first_vec_x1_test): + PCMPEQ %xmm0, %xmm2 + pmovmskb %xmm2, %eax testl %eax, %eax - je L(loop64) - pcmpeqb %xmm6, %xmm4 - pcmpeqb %xmm6, %xmm3 - pcmpeqb %xmm6, %xmm5 - pmovmskb %xmm4, %eax - pmovmskb %xmm3, %r10d - pcmpeqb %xmm6, %xmm2 - pmovmskb %xmm5, %r9d - salq $32, %r10 - salq $16, %rax - pmovmskb %xmm2, %r8d - orq %r10, %rax - orq %r9, %rax - salq $48, %r8 - orq %r8, %rax - leaq -1(%rax), %r8 - xorq %rax, %r8 - andq %r8, %rdx - cmovne %rdi, %rcx - cmovne %rdx, %rsi - bsrq %rsi, %rsi - leaq (%rcx,%rsi), %rax + jz L(first_vec_x0_test) + bsrl %eax, %eax + leaq (VEC_SIZE)(%rdi, %rax), %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(first_vec_x2): + PCMPEQ %xmm0, %xmm3 + pmovmskb %xmm3, %eax + leal -1(%rcx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(first_vec_x1_test) + bsrl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(aligned_more): + /* Save original pointer if match was in VEC 0. 
*/ + movq %rdi, %r8 + andq $-VEC_SIZE, %rdi + + movaps VEC_SIZE(%rdi), %xmm2 + pxor %xmm3, %xmm3 + PCMPEQ %xmm2, %xmm3 + pmovmskb %xmm3, %ecx + testl %ecx, %ecx + jnz L(first_vec_x1) + + movaps (VEC_SIZE * 2)(%rdi), %xmm3 + pxor %xmm4, %xmm4 + PCMPEQ %xmm3, %xmm4 + pmovmskb %xmm4, %ecx + testl %ecx, %ecx + jnz L(first_vec_x2) + + addq $VEC_SIZE, %rdi + /* Save pointer again before realigning. */ + movq %rdi, %rsi + andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +L(first_loop): + /* Do 2x VEC at a time. */ + movaps (VEC_SIZE * 2)(%rdi), %xmm4 + movaps (VEC_SIZE * 3)(%rdi), %xmm5 + /* If SSE2 no pminud so wcsrchr needs seperate logic for + detecting zero. Note if this is found to be a bottleneck it + may be worth adding an SSE4.1 wcsrchr implementation. */ +#ifdef USE_AS_WCSRCHR + movaps %xmm5, %xmm6 + pxor %xmm8, %xmm8 + + PCMPEQ %xmm8, %xmm5 + PCMPEQ %xmm4, %xmm8 + por %xmm5, %xmm8 +#else + movaps %xmm5, %xmm6 + PMINU %xmm4, %xmm5 +#endif + + movaps %xmm4, %xmm9 + PCMPEQ %xmm0, %xmm4 + PCMPEQ %xmm0, %xmm6 + movaps %xmm6, %xmm7 + por %xmm4, %xmm6 +#ifndef USE_AS_WCSRCHR + pxor %xmm8, %xmm8 + PCMPEQ %xmm5, %xmm8 +#endif + pmovmskb %xmm8, %ecx + pmovmskb %xmm6, %eax + + addq $(VEC_SIZE * 2), %rdi + /* Use `addl` 1) so we can undo it with `subl` and 2) it can + macro-fuse with `jz`. */ + addl %ecx, %eax + jz L(first_loop) + + /* Check if there is zero match. */ + testl %ecx, %ecx + jz L(second_loop_match) + + /* Check if there was a match in last iteration. */ + subl %ecx, %eax + jnz L(new_match) + +L(first_loop_old_match): + PCMPEQ %xmm0, %xmm2 + PCMPEQ %xmm0, %xmm3 + pmovmskb %xmm2, %ecx + pmovmskb %xmm3, %eax + addl %eax, %ecx + jz L(first_vec_x0_test) + /* NB: We could move this shift to before the branch and save a + bit of code size / performance on the fall through. The + branch leads to the null case which generally seems hotter + than char in first 3x VEC. */ + sall $16, %eax + orl %ecx, %eax + + bsrl %eax, %eax + addq %rsi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(new_match): + pxor %xmm6, %xmm6 + PCMPEQ %xmm9, %xmm6 + pmovmskb %xmm6, %eax + sall $16, %ecx + orl %eax, %ecx + + /* We can't reuse either of the old comparisons as since we mask + of zeros after first zero (instead of using the full + comparison) we can't gurantee no interference between match + after end of string and valid match. */ + pmovmskb %xmm4, %eax + pmovmskb %xmm7, %edx + sall $16, %edx + orl %edx, %eax + + leal -1(%ecx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(first_loop_old_match) + bsrl %eax, %eax + addq %rdi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif ret + /* Save minimum state for getting most recent match. We can + throw out all previous work. */ .p2align 4 -L(no_c_found): - movl $1, %esi - xorl %ecx, %ecx - jmp L(loop_header) +L(second_loop_match): + movq %rdi, %rsi + movaps %xmm4, %xmm2 + movaps %xmm7, %xmm3 .p2align 4 -L(exit): - xorl %eax, %eax +L(second_loop): + movaps (VEC_SIZE * 2)(%rdi), %xmm4 + movaps (VEC_SIZE * 3)(%rdi), %xmm5 + /* If SSE2 no pminud so wcsrchr needs seperate logic for + detecting zero. Note if this is found to be a bottleneck it + may be worth adding an SSE4.1 wcsrchr implementation. 
*/ +#ifdef USE_AS_WCSRCHR + movaps %xmm5, %xmm6 + pxor %xmm8, %xmm8 + + PCMPEQ %xmm8, %xmm5 + PCMPEQ %xmm4, %xmm8 + por %xmm5, %xmm8 +#else + movaps %xmm5, %xmm6 + PMINU %xmm4, %xmm5 +#endif + + movaps %xmm4, %xmm9 + PCMPEQ %xmm0, %xmm4 + PCMPEQ %xmm0, %xmm6 + movaps %xmm6, %xmm7 + por %xmm4, %xmm6 +#ifndef USE_AS_WCSRCHR + pxor %xmm8, %xmm8 + PCMPEQ %xmm5, %xmm8 +#endif + + pmovmskb %xmm8, %ecx + pmovmskb %xmm6, %eax + + addq $(VEC_SIZE * 2), %rdi + /* Either null term or new occurence of CHAR. */ + addl %ecx, %eax + jz L(second_loop) + + /* No null term so much be new occurence of CHAR. */ + testl %ecx, %ecx + jz L(second_loop_match) + + + subl %ecx, %eax + jnz L(second_loop_new_match) + +L(second_loop_old_match): + pmovmskb %xmm2, %ecx + pmovmskb %xmm3, %eax + sall $16, %eax + orl %ecx, %eax + bsrl %eax, %eax + addq %rsi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif ret .p2align 4 +L(second_loop_new_match): + pxor %xmm6, %xmm6 + PCMPEQ %xmm9, %xmm6 + pmovmskb %xmm6, %eax + sall $16, %ecx + orl %eax, %ecx + + /* We can't reuse either of the old comparisons as since we mask + of zeros after first zero (instead of using the full + comparison) we can't gurantee no interference between match + after end of string and valid match. */ + pmovmskb %xmm4, %eax + pmovmskb %xmm7, %edx + sall $16, %edx + orl %edx, %eax + + leal -1(%ecx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(second_loop_old_match) + bsrl %eax, %eax + addq %rdi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4,, 4 L(cross_page): - movq %rdi, %rax - pxor %xmm0, %xmm0 - andq $-64, %rax - movdqu (%rax), %xmm5 - movdqa %xmm5, %xmm6 - movdqu 16(%rax), %xmm4 - pcmpeqb %xmm1, %xmm5 - pcmpeqb %xmm0, %xmm6 - movdqu 32(%rax), %xmm3 - pmovmskb %xmm6, %esi - movdqa %xmm4, %xmm6 - movdqu 48(%rax), %xmm2 - pcmpeqb %xmm1, %xmm4 - pcmpeqb %xmm0, %xmm6 - pmovmskb %xmm6, %edx - movdqa %xmm3, %xmm6 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm0, %xmm6 - pcmpeqb %xmm2, %xmm0 - salq $16, %rdx - pmovmskb %xmm3, %r9d - pmovmskb %xmm6, %r8d - pmovmskb %xmm0, %ecx - salq $32, %r9 - salq $32, %r8 - pcmpeqb %xmm1, %xmm2 - orq %r8, %rdx - salq $48, %rcx - pmovmskb %xmm5, %r8d - orq %rsi, %rdx - pmovmskb %xmm4, %esi - orq %rcx, %rdx - pmovmskb %xmm2, %ecx - salq $16, %rsi - salq $48, %rcx - orq %r9, %rsi - orq %r8, %rsi - orq %rcx, %rsi + movq %rdi, %rsi + andq $-VEC_SIZE, %rsi + movaps (%rsi), %xmm1 + pxor %xmm2, %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %edx movl %edi, %ecx - subl %eax, %ecx - shrq %cl, %rdx - shrq %cl, %rsi - testq %rdx, %rdx - je L(loop_header2) - leaq -1(%rdx), %rax - xorq %rdx, %rax - andq %rax, %rsi - je L(exit) - bsrq %rsi, %rax + andl $(VEC_SIZE - 1), %ecx + sarl %cl, %edx + jz L(cross_page_continue) + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + sarl %cl, %eax + leal -1(%rdx), %ecx + xorl %edx, %ecx + andl %ecx, %eax + jz L(ret1) + bsrl %eax, %eax addq %rdi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif +L(ret1): ret -END (strrchr) +END(STRRCHR) -weak_alias (strrchr, rindex) -libc_hidden_builtin_def (strrchr) +#ifndef USE_AS_WCSRCHR + weak_alias (STRRCHR, rindex) + libc_hidden_builtin_def (STRRCHR) +#endif diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S index 61552954de..2b80efc5ef 100644 --- a/sysdeps/x86_64/wcsrchr.S +++ b/sysdeps/x86_64/wcsrchr.S @@ -1,4 +1,4 @@ -/* wcsrchr with SSSE3 +/* wcsrchr optimized with SSE2. Copyright (C) 2011-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. 
@@ -16,266 +16,12 @@ License along with the GNU C Library; if not, see . */ -#include - .text -ENTRY (wcsrchr) +#define USE_AS_WCSRCHR 1 +#define NO_PMINU 1 - movd %rsi, %xmm1 - mov %rdi, %rcx - punpckldq %xmm1, %xmm1 - pxor %xmm2, %xmm2 - punpckldq %xmm1, %xmm1 - and $63, %rcx - cmp $48, %rcx - ja L(crosscache) +#ifndef STRRCHR +# define STRRCHR wcsrchr +#endif - movdqu (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - add $16, %rdi - - test %rax, %rax - jnz L(unaligned_match1) - - test %rcx, %rcx - jnz L(return_null) - - and $-16, %rdi - xor %r8, %r8 - jmp L(loop) - - .p2align 4 -L(unaligned_match1): - test %rcx, %rcx - jnz L(prolog_find_zero_1) - - mov %rax, %r8 - mov %rdi, %rsi - and $-16, %rdi - jmp L(loop) - - .p2align 4 -L(crosscache): - and $15, %rcx - and $-16, %rdi - pxor %xmm3, %xmm3 - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm3 - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm3, %rdx - pmovmskb %xmm0, %rax - shr %cl, %rdx - shr %cl, %rax - add $16, %rdi - - test %rax, %rax - jnz L(unaligned_match) - - test %rdx, %rdx - jnz L(return_null) - - xor %r8, %r8 - jmp L(loop) - - .p2align 4 -L(unaligned_match): - test %rdx, %rdx - jnz L(prolog_find_zero) - - mov %rax, %r8 - lea (%rdi, %rcx), %rsi - -/* Loop start on aligned string. */ - .p2align 4 -L(loop): - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm3 - pcmpeqd %xmm3, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm3 - pmovmskb %xmm2, %rcx - pmovmskb %xmm3, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm4 - pcmpeqd %xmm4, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm4 - pmovmskb %xmm2, %rcx - pmovmskb %xmm4, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm5 - pcmpeqd %xmm5, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm5 - pmovmskb %xmm2, %rcx - pmovmskb %xmm5, %rax - or %rax, %rcx - jz L(loop) - - .p2align 4 -L(matches): - test %rax, %rax - jnz L(match) -L(return_value): - test %r8, %r8 - jz L(return_null) - mov %r8, %rax - mov %rsi, %rdi - - test $15 << 4, %ah - jnz L(match_fourth_wchar) - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(match): - pmovmskb %xmm2, %rcx - test %rcx, %rcx - jnz L(find_zero) - mov %rax, %r8 - mov %rdi, %rsi - jmp L(loop) - - .p2align 4 -L(find_zero): - test $15, %cl - jnz L(find_zero_in_first_wchar) - test %cl, %cl - jnz L(find_zero_in_second_wchar) - test $15, %ch - jnz L(find_zero_in_third_wchar) - - and $1 << 13 - 1, %rax - jz L(return_value) - - test $15 << 4, %ah - jnz L(match_fourth_wchar) - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(find_zero_in_first_wchar): - test $1, %rax - jz L(return_value) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(find_zero_in_second_wchar): - and $1 << 5 - 1, %rax - jz L(return_value) - - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(find_zero_in_third_wchar): - and $1 << 9 - 1, %rax - jz L(return_value) - - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(prolog_find_zero): - add %rcx, %rdi - mov %rdx, %rcx -L(prolog_find_zero_1): - test $15, %cl - jnz L(prolog_find_zero_in_first_wchar) - test %cl, %cl - jnz L(prolog_find_zero_in_second_wchar) - test $15, 
%ch - jnz L(prolog_find_zero_in_third_wchar) - - and $1 << 13 - 1, %rax - jz L(return_null) - - test $15 << 4, %ah - jnz L(match_fourth_wchar) - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(prolog_find_zero_in_first_wchar): - test $1, %rax - jz L(return_null) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(prolog_find_zero_in_second_wchar): - and $1 << 5 - 1, %rax - jz L(return_null) - - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(prolog_find_zero_in_third_wchar): - and $1 << 9 - 1, %rax - jz L(return_null) - - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(match_second_wchar): - lea -12(%rdi), %rax - ret - - .p2align 4 -L(match_third_wchar): - lea -8(%rdi), %rax - ret - - .p2align 4 -L(match_fourth_wchar): - lea -4(%rdi), %rax - ret - - .p2align 4 -L(return_null): - xor %rax, %rax - ret - -END (wcsrchr) +#include "../strrchr.S" From patchwork Thu Apr 21 22:22:03 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 53109 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 9602C3856DD2 for ; Thu, 21 Apr 2022 22:23:58 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 9602C3856DD2 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1650579838; bh=W1tV8mjCg9rB4gSrfO18/BM6cYUcPuNagbNx4iTAfzw=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=hI16lqH5kvttVeCQ/e0NVnKpJTGRUQmpLuiQLv2NPC2RE4KzqU4FwsQu9/ZS2XyCD wS0AhTS04GenlLRizpDUSLoshxHB7neHyGA4+/jEEJYEhgaYAtMb7wZIbG1Evc9wyY Hy+rU3pU9lLa2PkPgfSH4C0IfhI0EpjxWqxpWlqE= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-qt1-x832.google.com (mail-qt1-x832.google.com [IPv6:2607:f8b0:4864:20::832]) by sourceware.org (Postfix) with ESMTPS id 539A1385842B for ; Thu, 21 Apr 2022 22:22:14 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 539A1385842B Received: by mail-qt1-x832.google.com with SMTP id bz24so4343217qtb.2 for ; Thu, 21 Apr 2022 15:22:14 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=W1tV8mjCg9rB4gSrfO18/BM6cYUcPuNagbNx4iTAfzw=; b=NvWEw953zt9LVf5OLH5tRPsap78rfqCIXid0qufR+Y8rpgXp6XZbUTOFvWMwvOuWoN J7fpuLlKeQNdqy90pel1OWq3aEq6U8xKPAsPLtU1b1yVxQslFdbHbpQHFCwQ5SMdCoXk WA6jmbz/AzCX9KsldZk9w21JRV3audQW8s9FKFIPw0DyVMGOrGE/2/GxybpaGBbU3x6b ueQ+HplQ7u6tYMqUncGSIyEuFbRgi43Iil7sjknikEaD4VvQYWB1ikksT+X7C0/RRSFT km/Z9gbq+mVCC/E1uXUqGo140WA9rR6lL5c0rqtqAkBprqrV4EKaxCrleV/ZcdrxtVqO kjpw== X-Gm-Message-State: AOAM531IW/ahb8vxR/iWcvEc9wGaqccQL49jBOqaPgCkT7Ce9ZSb1R7h ylrygZiqPi9RMQ9zzgUyfexJuDJglUw= X-Google-Smtp-Source: ABdhPJwud+1yJEh6UidK/90egwv/+0PTO1+Sk/Z/ICg3kDR4x7ffBWZRTuBK0XecyOwU417zk7hlqw== X-Received: by 2002:ac8:5a09:0:b0:2f1:ff1d:39df with SMTP id n9-20020ac85a09000000b002f1ff1d39dfmr1190696qta.453.1650579733474; Thu, 21 Apr 2022 15:22:13 -0700 (PDT) Received: from localhost.localdomain ([173.245.203.170]) by 
smtp.googlemail.com with ESMTPSA id l9-20020a05622a174900b002f34421b01csm206337qtk.23.2022.04.21.15.22.12 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Thu, 21 Apr 2022 15:22:13 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v2 3/4] x86: Optimize {str|wcs}rchr-avx2 Date: Thu, 21 Apr 2022 17:22:03 -0500 Message-Id: <20220421222204.3093130-3-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220421222204.3093130-1-goldstein.w.n@gmail.com> References: <20220421031410.2142238-1-goldstein.w.n@gmail.com> <20220421222204.3093130-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-11.6 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_NUMSUBJECT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" The new code unrolls the main loop slightly without adding too much overhead and minimizes the comparisons for the search CHAR. Geometric Mean of all benchmarks New / Old: 0.832 See email for all results. Full xcheck passes on x86_64 with and without multiarch enabled. --- sysdeps/x86_64/multiarch/strrchr-avx2.S | 415 +++++++++++++++--------- 1 file changed, 258 insertions(+), 157 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S index 1df2adfad0..9d1e45defc 100644 --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S @@ -27,9 +27,13 @@ # ifdef USE_AS_WCSRCHR # define VPBROADCAST vpbroadcastd # define VPCMPEQ vpcmpeqd +# define VPMIN vpminud +# define CHAR_SIZE 4 # else # define VPBROADCAST vpbroadcastb # define VPCMPEQ vpcmpeqb +# define VPMIN vpminub +# define CHAR_SIZE 1 # endif # ifndef VZEROUPPER @@ -41,196 +45,293 @@ # endif # define VEC_SIZE 32 +# define PAGE_SIZE 4096 - .section SECTION(.text),"ax",@progbits -ENTRY (STRRCHR) - movd %esi, %xmm4 - movl %edi, %ecx + .section SECTION(.text), "ax", @progbits +ENTRY(STRRCHR) + movd %esi, %xmm7 + movl %edi, %eax /* Broadcast CHAR to YMM4. */ - VPBROADCAST %xmm4, %ymm4 + VPBROADCAST %xmm7, %ymm7 vpxor %xmm0, %xmm0, %xmm0 - /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + /* Shift here instead of `andl` to save code size (saves a fetch + block). */ + sall $20, %eax + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax + ja L(cross_page) +L(page_cross_continue): vmovdqu (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - addq $VEC_SIZE, %rdi + /* Check end of string match. */ + VPCMPEQ %ymm1, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx + testl %ecx, %ecx + jz L(aligned_more) + + /* Only check match with search CHAR if needed. */ + VPCMPEQ %ymm1, %ymm7, %ymm1 + vpmovmskb %ymm1, %eax + /* Check if match before first zero. */ + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret0) + bsrl %eax, %eax + addq %rdi, %rax + /* We are off by 3 for wcsrchr if search CHAR is non-zero. 
If + search CHAR is zero we are correct. Either way `andq + -CHAR_SIZE, %rax` gets the correct result. */ +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif +L(ret0): +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + /* Returns for first vec x1/x2 have hard coded backward search + path for earlier matches. */ + .p2align 4,, 10 +L(first_vec_x1): + VPCMPEQ %ymm2, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jnz L(first_vec_x1_return) + + .p2align 4,, 4 +L(first_vec_x0_test): + VPCMPEQ %ymm1, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + testl %eax, %eax + jz L(ret1) + bsrl %eax, %eax + addq %r8, %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif +L(ret1): + VZEROUPPER_RETURN + .p2align 4,, 10 +L(first_vec_x0_x1_test): + VPCMPEQ %ymm2, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax testl %eax, %eax - jnz L(first_vec) + jz L(first_vec_x0_test) + .p2align 4,, 4 +L(first_vec_x1_return): + bsrl %eax, %eax + leaq 1(%rdi, %rax), %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif + VZEROUPPER_RETURN - testl %ecx, %ecx - jnz L(return_null) - andq $-VEC_SIZE, %rdi - xorl %edx, %edx - jmp L(aligned_loop) + .p2align 4,, 10 +L(first_vec_x2): + VPCMPEQ %ymm3, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(first_vec_x0_x1_test) + bsrl %eax, %eax + leaq (VEC_SIZE + 1)(%rdi, %rax), %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif + VZEROUPPER_RETURN + .p2align 4 -L(first_vec): - /* Check if there is a nul CHAR. */ +L(aligned_more): + /* Save original pointer if match was in VEC 0. */ + movq %rdi, %r8 + + /* Align src. */ + orq $(VEC_SIZE - 1), %rdi + vmovdqu 1(%rdi), %ymm2 + VPCMPEQ %ymm2, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx testl %ecx, %ecx - jnz L(char_and_nul_in_first_vec) + jnz L(first_vec_x1) - /* Remember the match and keep searching. */ - movl %eax, %edx - movq %rdi, %rsi - andq $-VEC_SIZE, %rdi - jmp L(aligned_loop) + vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3 + VPCMPEQ %ymm3, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx + testl %ecx, %ecx + jnz L(first_vec_x2) + /* Save pointer again before realigning. */ + movq %rdi, %rsi + addq $(VEC_SIZE + 1), %rdi + andq $-(VEC_SIZE * 2), %rdi .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %edx - vpmovmskb %ymm3, %eax - shrl %cl, %edx - shrl %cl, %eax - addq $VEC_SIZE, %rdi - - /* Check if there is a CHAR. */ +L(first_aligned_loop): + /* Do 2x VEC at a time. Any more and the cost of finding the + match outweights loop benefit. */ + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 + + VPCMPEQ %ymm4, %ymm7, %ymm6 + VPMIN %ymm4, %ymm5, %ymm8 + VPCMPEQ %ymm5, %ymm7, %ymm10 + vpor %ymm6, %ymm10, %ymm5 + VPCMPEQ %ymm8, %ymm0, %ymm8 + vpor %ymm5, %ymm8, %ymm9 + + vpmovmskb %ymm9, %eax + addq $(VEC_SIZE * 2), %rdi + /* No zero or search CHAR. */ testl %eax, %eax - jnz L(found_char) - - testl %edx, %edx - jnz L(return_null) + jz L(first_aligned_loop) - jmp L(aligned_loop) - - .p2align 4 -L(found_char): - testl %edx, %edx - jnz L(char_and_nul) + /* If no zero CHAR then go to second loop (this allows us to + throw away all prior work). */ + vpmovmskb %ymm8, %ecx + testl %ecx, %ecx + jz L(second_aligned_loop_prep) - /* Remember the match and keep searching. */ - movl %eax, %edx - leaq (%rdi, %rcx), %rsi + /* Search char could be zero so we need to get the true match. 
+ */ + vpmovmskb %ymm5, %eax + testl %eax, %eax + jnz L(first_aligned_loop_return) - .p2align 4 -L(aligned_loop): - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - orl %eax, %ecx - jnz L(char_nor_null) - - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - add $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx + .p2align 4,, 4 +L(first_vec_x1_or_x2): + VPCMPEQ %ymm3, %ymm7, %ymm3 + VPCMPEQ %ymm2, %ymm7, %ymm2 vpmovmskb %ymm3, %eax - orl %eax, %ecx - jnz L(char_nor_null) - - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - orl %eax, %ecx - jnz L(char_nor_null) - - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - orl %eax, %ecx - jz L(aligned_loop) - - .p2align 4 -L(char_nor_null): - /* Find a CHAR or a nul CHAR in a loop. */ - testl %eax, %eax - jnz L(match) -L(return_value): - testl %edx, %edx - jz L(return_null) - movl %edx, %eax - movq %rsi, %rdi + vpmovmskb %ymm2, %edx + /* Use add for macro-fusion. */ + addq %rax, %rdx + jz L(first_vec_x0_test) + /* NB: We could move this shift to before the branch and save a + bit of code size / performance on the fall through. The + branch leads to the null case which generally seems hotter + than char in first 3x VEC. */ + salq $32, %rax + addq %rdx, %rax + bsrq %rax, %rax + leaq 1(%rsi, %rax), %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif + VZEROUPPER_RETURN + .p2align 4,, 8 +L(first_aligned_loop_return): + VPCMPEQ %ymm4, %ymm0, %ymm4 + vpmovmskb %ymm4, %edx + salq $32, %rcx + orq %rdx, %rcx + + vpmovmskb %ymm10, %eax + vpmovmskb %ymm6, %edx + salq $32, %rax + orq %rdx, %rax + blsmskq %rcx, %rcx + andq %rcx, %rax + jz L(first_vec_x1_or_x2) + + bsrq %rax, %rax + leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax # ifdef USE_AS_WCSRCHR - /* Keep the first bit for each matching CHAR for bsr. */ - andl $0x11111111, %eax + andq $-CHAR_SIZE, %rax # endif - bsrl %eax, %eax - leaq -VEC_SIZE(%rdi, %rax), %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN + VZEROUPPER_RETURN + /* Search char cannot be zero. */ .p2align 4 -L(match): - /* Find a CHAR. Check if there is a nul CHAR. */ - vpmovmskb %ymm2, %ecx - testl %ecx, %ecx - jnz L(find_nul) - - /* Remember the match and keep searching. */ - movl %eax, %edx +L(second_aligned_loop_set_furthest_match): + /* Save VEC and pointer from most recent match. */ +L(second_aligned_loop_prep): movq %rdi, %rsi - jmp L(aligned_loop) + vmovdqu %ymm6, %ymm2 + vmovdqu %ymm10, %ymm3 .p2align 4 -L(find_nul): -# ifdef USE_AS_WCSRCHR - /* Keep the first bit for each matching CHAR for bsr. */ - andl $0x11111111, %ecx - andl $0x11111111, %eax -# endif - /* Mask out any matching bits after the nul CHAR. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax +L(second_aligned_loop): + /* Search 2x at at time. */ + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 + + VPCMPEQ %ymm4, %ymm7, %ymm6 + VPMIN %ymm4, %ymm5, %ymm1 + VPCMPEQ %ymm5, %ymm7, %ymm10 + vpor %ymm6, %ymm10, %ymm5 + VPCMPEQ %ymm1, %ymm0, %ymm1 + vpor %ymm5, %ymm1, %ymm9 + + vpmovmskb %ymm9, %eax + addq $(VEC_SIZE * 2), %rdi testl %eax, %eax - /* If there is no CHAR here, return the remembered one. 
*/ - jz L(return_value) - bsrl %eax, %eax - leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER_RETURN - - .p2align 4 -L(char_and_nul): - /* Find both a CHAR and a nul CHAR. */ - addq %rcx, %rdi - movl %edx, %ecx -L(char_and_nul_in_first_vec): -# ifdef USE_AS_WCSRCHR - /* Keep the first bit for each matching CHAR for bsr. */ - andl $0x11111111, %ecx - andl $0x11111111, %eax -# endif - /* Mask out any matching bits after the nul CHAR. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax + jz L(second_aligned_loop) + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jz L(second_aligned_loop_set_furthest_match) + vpmovmskb %ymm5, %eax testl %eax, %eax - /* Return null pointer if the nul CHAR comes first. */ - jz L(return_null) - bsrl %eax, %eax - leaq -VEC_SIZE(%rdi, %rax), %rax + jnz L(return_new_match) + + /* This is the hot patch. We know CHAR is inbounds and that + ymm3/ymm2 have latest match. */ + .p2align 4,, 4 +L(return_old_match): + vpmovmskb %ymm3, %eax + vpmovmskb %ymm2, %edx + salq $32, %rax + orq %rdx, %rax + bsrq %rax, %rax + /* Search char cannot be zero so safe to just use lea for + wcsrchr. */ + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax VZEROUPPER_RETURN - .p2align 4 -L(return_null): - xorl %eax, %eax + /* Last iteration also potentially has a match. */ + .p2align 4,, 8 +L(return_new_match): + VPCMPEQ %ymm4, %ymm0, %ymm4 + vpmovmskb %ymm4, %edx + salq $32, %rcx + orq %rdx, %rcx + + vpmovmskb %ymm10, %eax + vpmovmskb %ymm6, %edx + salq $32, %rax + orq %rdx, %rax + blsmskq %rcx, %rcx + andq %rcx, %rax + jz L(return_old_match) + bsrq %rax, %rax + /* Search char cannot be zero so safe to just use lea for + wcsrchr. */ + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax VZEROUPPER_RETURN -END (STRRCHR) + .p2align 4,, 4 +L(cross_page): + movq %rdi, %rsi + andq $-VEC_SIZE, %rsi + vmovdqu (%rsi), %ymm1 + VPCMPEQ %ymm1, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx + shrxl %edi, %ecx, %ecx + testl %ecx, %ecx + jz L(page_cross_continue) + VPCMPEQ %ymm1, %ymm7, %ymm1 + vpmovmskb %ymm1, %eax + shrxl %edi, %eax, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret2) + bsrl %eax, %eax + addq %rdi, %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif +L(ret2): + VZEROUPPER_RETURN +END(STRRCHR) #endif From patchwork Thu Apr 21 22:22:04 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 53110 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 8B64E3857373 for ; Thu, 21 Apr 2022 22:24:46 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 8B64E3857373 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1650579886; bh=LphmPfdfGWesFXe/+iI7LXW2bXittt5AXnyG1zBZ/x4=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=BVskKhcBuJEfa/F7gjef1+B3w+gVX/+nY9bxmcz0fzORVZY8OfAJn72GcEsI63nYK scPSTVcCRKS6AlIg5K0IS/Z2a/H88k5KsyrElqAkWX0Gwnxwa09+2rnSqkFGS+J/SO IqNy8NNZHdO6RM2fBznebo94TDp8lt9SpiwFwP0k= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-qv1-xf2f.google.com (mail-qv1-xf2f.google.com [IPv6:2607:f8b0:4864:20::f2f]) by sourceware.org (Postfix) with ESMTPS id C4687385AC22 for ; Thu, 21 Apr 2022 22:22:15 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 
To: libc-alpha@sourceware.org
Subject: [PATCH v2 4/4] x86: Optimize {str|wcs}rchr-evex
Date: Thu, 21 Apr 2022 17:22:04 -0500
Message-Id: <20220421222204.3093130-4-goldstein.w.n@gmail.com>
In-Reply-To: <20220421222204.3093130-1-goldstein.w.n@gmail.com>
References: <20220421031410.2142238-1-goldstein.w.n@gmail.com> <20220421222204.3093130-1-goldstein.w.n@gmail.com>
From: Noah Goldstein
Reply-To: Noah Goldstein

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
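As a rough illustration of the strategy (a scalar model, not the patch
itself): the hot loop only answers "does this vector contain the null
terminator or the search CHAR?", remembers the most recent vector that
contained CHAR, and defers locating the exact byte until the terminator
has been found.  In C, with made-up names (BLOCK stands in for VEC_SIZE)
and the c == 0 case omitted for brevity, the idea looks roughly like:

    #include <stddef.h>

    enum { BLOCK = 16 };	/* Stand-in for VEC_SIZE.  */

    /* Illustrative model only: the real code derives has_zero/has_char
       from vector compare masks rather than a byte loop.  */
    static char *
    model_strrchr (const char *s, int c_in)
    {
      char c = (char) c_in;
      const char *block = s;	      /* Current block of BLOCK chars.  */
      const char *match_block = NULL; /* Most recent block containing c.  */

      for (;;)
	{
	  int has_zero = 0, has_char = 0;
	  for (int i = 0; i < BLOCK; i++)
	    {
	      if (block[i] == c)
		has_char = 1;
	      if (block[i] == '\0')
		{
		  has_zero = 1;
		  break;	/* Nothing after the terminator counts.  */
		}
	    }
	  if (has_char)
	    match_block = block; /* Remember it, do not locate the byte yet.  */
	  if (has_zero)
	    break;
	  block += BLOCK;
	}

      if (match_block == NULL)
	return NULL;

      /* Only once the terminator is known is the last match in the
	 remembered block pinpointed (the assembly uses bsr on a mask).  */
      char *ret = NULL;
      for (int i = 0; i < BLOCK && match_block[i] != '\0'; i++)
	if (match_block[i] == c)
	  ret = (char *) &match_block[i];
      return ret;
    }

The payoff is that the common-case loop iterations do no match
bookkeeping at all; only iterations that actually see CHAR or the
terminator pay for it.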
--- sysdeps/x86_64/multiarch/strrchr-evex.S | 441 ++++++++++++++---------- 1 file changed, 259 insertions(+), 182 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S index adeddaed32..5cf9a8315b 100644 --- a/sysdeps/x86_64/multiarch/strrchr-evex.S +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S @@ -24,242 +24,319 @@ # define STRRCHR __strrchr_evex # endif -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 # ifdef USE_AS_WCSRCHR +# define SHIFT_REG esi + +# define kunpck kunpckbw +# define kmov_2x kmovd +# define maskz_2x ecx +# define maskm_2x eax +# define CHAR_SIZE 4 +# define VPMIN vpminud +# define VPTESTN vptestnmd # define VPBROADCAST vpbroadcastd -# define VPCMP vpcmpd -# define SHIFT_REG r8d +# define VPCMP vpcmpd # else +# define SHIFT_REG edi + +# define kunpck kunpckdq +# define kmov_2x kmovq +# define maskz_2x rcx +# define maskm_2x rax + +# define CHAR_SIZE 1 +# define VPMIN vpminub +# define VPTESTN vptestnmb # define VPBROADCAST vpbroadcastb -# define VPCMP vpcmpb -# define SHIFT_REG ecx +# define VPCMP vpcmpb # endif # define XMMZERO xmm16 # define YMMZERO ymm16 # define YMMMATCH ymm17 -# define YMM1 ymm18 +# define YMMSAVE ymm18 + +# define YMM1 ymm19 +# define YMM2 ymm20 +# define YMM3 ymm21 +# define YMM4 ymm22 +# define YMM5 ymm23 +# define YMM6 ymm24 +# define YMM7 ymm25 +# define YMM8 ymm26 -# define VEC_SIZE 32 - .section .text.evex,"ax",@progbits -ENTRY (STRRCHR) - movl %edi, %ecx +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + .section .text.evex, "ax", @progbits +ENTRY(STRRCHR) + movl %edi, %eax /* Broadcast CHAR to YMMMATCH. */ VPBROADCAST %esi, %YMMMATCH - vpxorq %XMMZERO, %XMMZERO, %XMMZERO - - /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(cross_page_boundary) +L(page_cross_continue): VMOVU (%rdi), %YMM1 - - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 + VPTESTN %YMM1, %YMM1, %k0 kmovd %k0, %ecx - kmovd %k1, %eax - - addq $VEC_SIZE, %rdi - - testl %eax, %eax - jnz L(first_vec) - testl %ecx, %ecx - jnz L(return_null) - - andq $-VEC_SIZE, %rdi - xorl %edx, %edx - jmp L(aligned_loop) - - .p2align 4 -L(first_vec): - /* Check if there is a null byte. */ - testl %ecx, %ecx - jnz L(char_and_nul_in_first_vec) - - /* Remember the match and keep searching. */ - movl %eax, %edx - movq %rdi, %rsi - andq $-VEC_SIZE, %rdi - jmp L(aligned_loop) - - .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - + jz L(aligned_more) + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k1, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret0) + bsrl %eax, %eax # ifdef USE_AS_WCSRCHR - /* NB: Divide shift count by 4 since each bit in K1 represent 4 - bytes. */ - movl %ecx, %SHIFT_REG - sarl $2, %SHIFT_REG + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax # endif +L(ret0): + ret - VMOVA (%rdi), %YMM1 - - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ + /* Returns for first vec x1/x2/x3 have hard coded backward + search path for earlier matches. 
*/ + .p2align 4,, 6 +L(first_vec_x1): + VPCMP $0, %YMMMATCH, %YMM2, %k1 + kmovd %k1, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jnz L(first_vec_x1_return) + .p2align 4,, 4 +L(first_vec_x0_test): VPCMP $0, %YMMMATCH, %YMM1, %k1 - kmovd %k0, %edx kmovd %k1, %eax - - shrxl %SHIFT_REG, %edx, %edx - shrxl %SHIFT_REG, %eax, %eax - addq $VEC_SIZE, %rdi - - /* Check if there is a CHAR. */ testl %eax, %eax - jnz L(found_char) - - testl %edx, %edx - jnz L(return_null) - - jmp L(aligned_loop) - - .p2align 4 -L(found_char): - testl %edx, %edx - jnz L(char_and_nul) - - /* Remember the match and keep searching. */ - movl %eax, %edx - leaq (%rdi, %rcx), %rsi + jz L(ret1) + bsrl %eax, %eax +# ifdef USE_AS_WCSRCHR + leaq (%rsi, %rax, CHAR_SIZE), %rax +# else + addq %rsi, %rax +# endif +L(ret1): + ret - .p2align 4 -L(aligned_loop): - VMOVA (%rdi), %YMM1 - addq $VEC_SIZE, %rdi + .p2align 4,, 10 +L(first_vec_x1_or_x2): + VPCMP $0, %YMM3, %YMMMATCH, %k3 + VPCMP $0, %YMM2, %YMMMATCH, %k2 + kortestd %k2, %k3 + jz L(first_vec_x0_test) + + kunpck %k2, %k3, %k3 + kmovq %k3, %rax + bsrq %rax, %rax + leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax + ret - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 - kmovd %k0, %ecx + .p2align 4,, 6 +L(first_vec_x3): + VPCMP $0, %YMMMATCH, %YMM4, %k1 kmovd %k1, %eax - orl %eax, %ecx - jnz L(char_nor_null) + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(first_vec_x1_or_x2) + bsrl %eax, %eax + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret - VMOVA (%rdi), %YMM1 - add $VEC_SIZE, %rdi + .p2align 4,, 6 +L(first_vec_x0_x1_test): + VPCMP $0, %YMMMATCH, %YMM2, %k1 + kmovd %k1, %eax + testl %eax, %eax + jz L(first_vec_x0_test) + .p2align 4,, 4 +L(first_vec_x1_return): + bsrl %eax, %eax + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 - kmovd %k0, %ecx + .p2align 4,, 10 +L(first_vec_x2): + VPCMP $0, %YMMMATCH, %YMM3, %k1 kmovd %k1, %eax - orl %eax, %ecx - jnz L(char_nor_null) + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(first_vec_x0_x1_test) + bsrl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret - VMOVA (%rdi), %YMM1 - addq $VEC_SIZE, %rdi - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 + .p2align 4 +L(aligned_more): + /* Need to keep original pointer incase YMM1 has last match. */ + movq %rdi, %rsi + andq $-VEC_SIZE, %rdi + VMOVU VEC_SIZE(%rdi), %YMM2 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %ecx - kmovd %k1, %eax - orl %eax, %ecx - jnz L(char_nor_null) + testl %ecx, %ecx + jnz L(first_vec_x1) - VMOVA (%rdi), %YMM1 - addq $VEC_SIZE, %rdi + VMOVU (VEC_SIZE * 2)(%rdi), %YMM3 + VPTESTN %YMM3, %YMM3, %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(first_vec_x2) - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 + VMOVU (VEC_SIZE * 3)(%rdi), %YMM4 + VPTESTN %YMM4, %YMM4, %k0 kmovd %k0, %ecx - kmovd %k1, %eax - orl %eax, %ecx - jz L(aligned_loop) + movq %rdi, %r8 + testl %ecx, %ecx + jnz L(first_vec_x3) + andq $-(VEC_SIZE * 2), %rdi .p2align 4 -L(char_nor_null): - /* Find a CHAR or a null byte in a loop. 
*/ +L(first_aligned_loop): + /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee + they don't store a match. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM5 + VMOVA (VEC_SIZE * 5)(%rdi), %YMM6 + + VPCMP $0, %YMM5, %YMMMATCH, %k2 + vpxord %YMM6, %YMMMATCH, %YMM7 + + VPMIN %YMM5, %YMM6, %YMM8 + VPMIN %YMM8, %YMM7, %YMM7 + + VPTESTN %YMM7, %YMM7, %k1 + subq $(VEC_SIZE * -2), %rdi + kortestd %k1, %k2 + jz L(first_aligned_loop) + + VPCMP $0, %YMM6, %YMMMATCH, %k3 + VPTESTN %YMM8, %YMM8, %k1 + ktestd %k1, %k1 + jz L(second_aligned_loop_prep) + + kortestd %k2, %k3 + jnz L(return_first_aligned_loop) + + .p2align 4,, 6 +L(first_vec_x1_or_x2_or_x3): + VPCMP $0, %YMM4, %YMMMATCH, %k4 + kmovd %k4, %eax testl %eax, %eax - jnz L(match) -L(return_value): - testl %edx, %edx - jz L(return_null) - movl %edx, %eax - movq %rsi, %rdi + jz L(first_vec_x1_or_x2) bsrl %eax, %eax -# ifdef USE_AS_WCSRCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq -VEC_SIZE(%rdi, %rax, 4), %rax -# else - leaq -VEC_SIZE(%rdi, %rax), %rax -# endif + leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax ret - .p2align 4 -L(match): - /* Find a CHAR. Check if there is a null byte. */ - kmovd %k0, %ecx - testl %ecx, %ecx - jnz L(find_nul) + .p2align 4,, 8 +L(return_first_aligned_loop): + VPTESTN %YMM5, %YMM5, %k0 + kunpck %k0, %k1, %k0 + kmov_2x %k0, %maskz_2x + + blsmsk %maskz_2x, %maskz_2x + kunpck %k2, %k3, %k3 + kmov_2x %k3, %maskm_2x + and %maskz_2x, %maskm_2x + jz L(first_vec_x1_or_x2_or_x3) + + bsr %maskm_2x, %maskm_2x + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret - /* Remember the match and keep searching. */ - movl %eax, %edx + .p2align 4 + /* We can throw away the work done for the first 4x checks here + as we have a later match. This is the 'fast' path persay. + */ +L(second_aligned_loop_prep): +L(second_aligned_loop_set_furthest_match): movq %rdi, %rsi - jmp L(aligned_loop) + kunpck %k2, %k3, %k4 .p2align 4 -L(find_nul): - /* Mask out any matching bits after the null byte. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax - testl %eax, %eax - /* If there is no CHAR here, return the remembered one. */ - jz L(return_value) - bsrl %eax, %eax +L(second_aligned_loop): + VMOVU (VEC_SIZE * 4)(%rdi), %YMM1 + VMOVU (VEC_SIZE * 5)(%rdi), %YMM2 + + VPCMP $0, %YMM1, %YMMMATCH, %k2 + vpxord %YMM2, %YMMMATCH, %YMM3 + + VPMIN %YMM1, %YMM2, %YMM4 + VPMIN %YMM3, %YMM4, %YMM3 + + VPTESTN %YMM3, %YMM3, %k1 + subq $(VEC_SIZE * -2), %rdi + kortestd %k1, %k2 + jz L(second_aligned_loop) + + VPCMP $0, %YMM2, %YMMMATCH, %k3 + VPTESTN %YMM4, %YMM4, %k1 + ktestd %k1, %k1 + jz L(second_aligned_loop_set_furthest_match) + + kortestd %k2, %k3 + /* branch here because there is a significant advantage interms + of output dependency chance in using edx. */ + jnz L(return_new_match) +L(return_old_match): + kmovq %k4, %rax + bsrq %rax, %rax + leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax + ret + +L(return_new_match): + VPTESTN %YMM1, %YMM1, %k0 + kunpck %k0, %k1, %k0 + kmov_2x %k0, %maskz_2x + + blsmsk %maskz_2x, %maskz_2x + kunpck %k2, %k3, %k3 + kmov_2x %k3, %maskm_2x + and %maskz_2x, %maskm_2x + jz L(return_old_match) + + bsr %maskm_2x, %maskm_2x + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + /* This block is horribly aligned (% 16 == 15). This is + intentional. The L(cross_page_boundary) block is exactly + 32-bytes of code size. Ultimately this is a cold case so + save the code size by leaving misaligned. 
*/ +L(cross_page_boundary): + xorq %rdi, %rax + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1 + VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %ecx # ifdef USE_AS_WCSRCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq -VEC_SIZE(%rdi, %rax, 4), %rax -# else - leaq -VEC_SIZE(%rdi, %rax), %rax + movl %edi, %esi + andl $(VEC_SIZE - 1), %esi + shrl $2, %esi # endif - ret + shrxl %SHIFT_REG, %ecx, %ecx - .p2align 4 -L(char_and_nul): - /* Find both a CHAR and a null byte. */ - addq %rcx, %rdi - movl %edx, %ecx -L(char_and_nul_in_first_vec): - /* Mask out any matching bits after the null byte. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax - testl %eax, %eax - /* Return null pointer if the null byte comes first. */ - jz L(return_null) + testl %ecx, %ecx + jz L(page_cross_continue) + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k1, %eax + shrxl %SHIFT_REG, %eax, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret3) bsrl %eax, %eax # ifdef USE_AS_WCSRCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq -VEC_SIZE(%rdi, %rax, 4), %rax + leaq (%rdi, %rax, CHAR_SIZE), %rax # else - leaq -VEC_SIZE(%rdi, %rax), %rax + addq %rdi, %rax # endif +L(ret3): ret - .p2align 4 -L(return_null): - xorl %eax, %eax - ret - -END (STRRCHR) +END(STRRCHR) #endif
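
A closing note on a pattern shared by the return paths in all four
implementations above: the CHAR-match mask is ANDed with a "first
terminator and everything below it" mask before the bsr (the
`leal -1(%rcx), %edx; xorl %edx, %ecx; andl %ecx, %eax` sequence in the
SSE2 version, `blsmskl`/`blsmskq` plus `andl`/`andq` in the AVX2 and
EVEX versions).  A rough C equivalent of that bit trick, with invented
names, is:

    #include <stdint.h>

    /* char_mask: bit i set if byte i of the vector equals the search CHAR.
       zero_mask: bit i set if byte i of the vector is the null terminator.
       Returns the byte index of the last valid match, or -1 if none.  */
    static int
    last_match_before_null (uint32_t char_mask, uint32_t zero_mask)
    {
      /* blsmsk: set every bit up to and including the lowest set bit of
	 zero_mask; if zero_mask is 0 this is all-ones, so every match in
	 the vector is still valid.  */
      uint32_t valid = zero_mask ^ (zero_mask - 1);

      char_mask &= valid;	/* Drop matches past the terminator.  */
      if (char_mask == 0)
	return -1;

      /* bsr: index of the highest remaining bit, i.e. the last match.  */
      return 31 - __builtin_clz (char_mask);
    }

With the terminator folded into the mask this way, a single bsr yields
the last valid occurrence without a separate pass over the matches.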