From patchwork Thu Apr 21 03:14:06 2022
From: Noah Goldstein
To: libc-alpha@sourceware.org
Subject: [PATCH v1 1/5] benchtests: Improve bench-strrchr
Date: Wed, 20 Apr 2022 22:14:06 -0500
Message-Id: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
1. Use json-lib for printing results.
2. Expose all parameters (before, pos, seek_char, and max_char were
   not printed).
3. Add benchmarks that test multiple occurrences of seek_char in the
   string.
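[Editor's note: as a quick illustration of item 3, here is a sketch of
how the new freq parameter spreads extra copies of seek_char through
the test buffer. It paraphrases the do_test () change in the diff
below; seed_occurrences is a hypothetical helper name, not part of the
patch.]

#include <stddef.h>

/* One extra occurrence of seek_char roughly every pos/freq
   characters, so a higher freq forces the implementation under test
   to update its "last match" more often.  */
static void
seed_occurrences (char *buf, size_t align, size_t len, size_t pos,
                  int seek_char, size_t freq)
{
  size_t chunk = freq ? pos / freq : pos;
  if (chunk == 0 && pos)
    chunk = 1;
  for (size_t i = chunk; i < pos && i < len; i += chunk)
    buf[align + i] = seek_char;
}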
---
 benchtests/bench-strrchr.c | 126 ++++++++++++++++++++++++-------------
 1 file changed, 82 insertions(+), 44 deletions(-)

diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
index abdae60c51..cceea77e1b 100644
--- a/benchtests/bench-strrchr.c
+++ b/benchtests/bench-strrchr.c
@@ -23,6 +23,7 @@
 # define TEST_NAME "strrchr"
 #endif
 #include "bench-string.h"
+#include "json-lib.h"

 #define BIG_CHAR MAX_CHAR

@@ -53,7 +54,8 @@ SIMPLE_STRRCHR (const CHAR *s, int c)
 }

 static void
-do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c,
+	     CHAR *exp_res)
 {
   CHAR *res = CALL (impl, s, c);
   size_t i, iters = INNER_LOOP_ITERS8;
@@ -61,8 +63,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)

   if (res != exp_res)
     {
-      error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     res, exp_res);
+      error (0, 0, "Wrong result in function %s %p %p", impl->name, res,
+	     exp_res);
       ret = 1;
       return;
     }
@@ -72,24 +74,28 @@ do_one_test (impl_t *impl, const CHAR *s, int c, CHAR *exp_res)
     {
      CALL (impl, s, c);
    }
-  TIMING_NOW (stop);
+  TIMING_NOW (stop);

   TIMING_DIFF (cur, start, stop);

-  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+  json_element_double (json_ctx, (double) cur / (double) iters);
+  return;
 }

 static void
-do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
+do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
+	 int seek_char, int max_char, size_t freq)
 /* For wcsrchr: align here means align not in bytes, but in wchar_ts, in
    bytes it will equal to align * (sizeof (wchar_t))
    len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
 {
   size_t i;
+  size_t pos_chunk_sz = freq ? (pos / freq) : pos;
+  size_t last_pos = len;
   CHAR *result;
   CHAR *buf = (CHAR *) buf1;

-  align &= 7;
+  align &= (getpagesize () - 1);
   if ((align + len) * sizeof (CHAR) >= page_size)
     return;

@@ -103,6 +109,16 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
       if ((i > pos || pos >= len) && buf[align + i] == seek_char)
 	buf[align + i] = seek_char + 10 + (random () & 15);
     }
+
+  if (pos_chunk_sz == 0 && pos)
+    pos_chunk_sz = 1;
+
+  for (i = pos_chunk_sz; i < pos && i < len; i += pos_chunk_sz)
+    {
+      buf[align + i] = seek_char;
+      last_pos = i;
+    }
+
   buf[align + len] = 0;

   if (pos < len)
@@ -110,66 +126,88 @@
     {
       buf[align + pos] = seek_char;
       result = (CHAR *) (buf + align + pos);
     }
+  else if (last_pos < len)
+    result = (CHAR *) (buf + align + last_pos);
   else if (seek_char == 0)
     result = (CHAR *) (buf + align + len);
   else
     result = NULL;

-  printf ("Length %4zd, alignment in bytes %2zd:", len, align * sizeof (CHAR));
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "len", len);
+  json_attr_uint (json_ctx, "pos", pos);
+  json_attr_uint (json_ctx, "align", align);
+  json_attr_uint (json_ctx, "freq", freq);
+  json_attr_uint (json_ctx, "seek", seek_char);
+  json_attr_uint (json_ctx, "max_char", max_char);
+  json_array_begin (json_ctx, "timings");

   FOR_EACH_IMPL (impl, 0)
-    do_one_test (impl, (CHAR *) (buf + align), seek_char, result);
+    do_one_test (json_ctx, impl, (CHAR *) (buf + align), seek_char, result);

-  putchar ('\n');
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
 }

 int
 test_main (void)
 {
-  size_t i;
+  json_ctx_t json_ctx;
+  size_t i, j;
+  int seek;

   test_init ();
+  json_init (&json_ctx, 0, stdout);

-  printf ("%20s", "");
-  FOR_EACH_IMPL (impl, 0)
-    printf ("\t%s", impl->name);
-  putchar ('\n');
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);

-  for (i = 1; i < 8; ++i)
-    {
-      do_test (0, 16 << i, 2048, 23, SMALL_CHAR);
-      do_test (i, 16 << i, 2048, 23, SMALL_CHAR);
-    }
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "");

-  for (i = 1; i < 8; ++i)
-    {
-      do_test (i, 64, 256, 23, SMALL_CHAR);
-      do_test (i, 64, 256, 23, BIG_CHAR);
-    }
-
-  for (i = 0; i < 32; ++i)
-    {
-      do_test (0, i, i + 1, 23, SMALL_CHAR);
-      do_test (0, i, i + 1, 23, BIG_CHAR);
-    }
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);

-  for (i = 1; i < 8; ++i)
-    {
-      do_test (0, 16 << i, 2048, 0, SMALL_CHAR);
-      do_test (i, 16 << i, 2048, 0, SMALL_CHAR);
-    }
+  json_array_begin (&json_ctx, "results");

-  for (i = 1; i < 8; ++i)
+  for (seek = 0; seek <= 23; seek += 23)
     {
-      do_test (i, 64, 256, 0, SMALL_CHAR);
-      do_test (i, 64, 256, 0, BIG_CHAR);
+      for (j = 1; j < 32; j += j)
+	{
+	  for (i = 1; i < 9; ++i)
+	    {
+	      do_test (&json_ctx, 0, 16 << i, 2048, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i, 16 << i, 2048, seek, SMALL_CHAR, j);
+	    }
+
+	  for (i = 1; i < 8; ++i)
+	    {
+	      do_test (&json_ctx, i, 64, 256, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i, 64, 256, seek, BIG_CHAR, j);
+
+	      do_test (&json_ctx, i * 15, 64, 256, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, i * 15, 64, 256, seek, BIG_CHAR, j);
+	    }
+
+	  for (i = 0; i < 32; ++i)
+	    {
+	      do_test (&json_ctx, 0, i, i + 1, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, 0, i, i + 1, seek, BIG_CHAR, j);
+	    }
+	  if (seek == 0)
+	    {
+	      break;
+	    }
+	}
     }

-  for (i = 0; i < 32; ++i)
-    {
-      do_test (0, i, i + 1, 0, SMALL_CHAR);
-      do_test (0, i, i + 1, 0, BIG_CHAR);
-    }
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);

   return ret;
 }
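[Editor's note: the SSE2 and AVX2 rewrites in the following patches
lean on one bit trick worth spelling out. After pcmpeqb/pmovmskb
produce one bitmask of search-CHAR matches and one of null-terminator
matches, zero_mask ^ (zero_mask - 1) selects every bit up to and
including the first terminator bit, so a single AND discards matches
that lie past the end of the string. A minimal C sketch of the idea,
as an illustration rather than the library's code:]

#include <stdint.h>

/* Given per-byte masks from one vector of compares, return the index
   of the last search-CHAR match that is not past the first null
   byte, or -1 if there is none.  */
static inline int
last_valid_match (uint32_t char_mask, uint32_t zero_mask)
{
  /* zero_mask ^ (zero_mask - 1) has every bit through the lowest set
     bit of zero_mask; if zero_mask is 0 it is all-ones, keeping all
     matches.  A match at the terminator itself survives, which is
     exactly right when the search CHAR is '\0'.  */
  uint32_t valid = char_mask & (zero_mask ^ (zero_mask - 1));
  if (valid == 0)
    return -1;
  return 31 - __builtin_clz (valid);	/* same result as bsr.  */
}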
From patchwork Thu Apr 21 03:14:08 2022
From: Noah Goldstein
To: libc-alpha@sourceware.org
Subject: [PATCH v1 2/5] x86: Optimize {str|wcs}rchr-sse2
Date: Wed, 20 Apr 2022 22:14:08 -0500
Message-Id: <20220421031410.2142238-2-goldstein.w.n@gmail.com>
In-Reply-To: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
References: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
X-Spam-Status: No, score=-10.6 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0,
 KAM_NUMSUBJECT, KAM_SHORT, RCVD_IN_DNSWL_NONE,
SCC_10_SHORT_WORD_LINES, SCC_5_SHORT_WORD_LINES, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" The new code unrolls the main loop slightly without adding too much overhead and minimizes the comparisons for the search CHAR. Geometric Mean of all benchmarks New / Old: 0.741 See email for all results. Full xcheck passes on x86_64 with and without multiarch enabled. --- Results For: strrchr Geometric Mean of N=30 runs. Geometric Mean of all benchmarks New / Old: 0.741 Benchmarks performance on Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html len, align, pos, seek, max_char, freq, New Time / Old Time 2048, 0, 32, 0, 127, 1, 0.647 2048, 1, 32, 0, 127, 1, 0.621 2048, 0, 64, 0, 127, 1, 0.661 2048, 2, 64, 0, 127, 1, 0.655 2048, 0, 128, 0, 127, 1, 0.69 2048, 3, 128, 0, 127, 1, 0.689 2048, 0, 256, 0, 127, 1, 0.718 2048, 4, 256, 0, 127, 1, 0.718 2048, 0, 512, 0, 127, 1, 0.758 2048, 5, 512, 0, 127, 1, 0.754 2048, 0, 1024, 0, 127, 1, 1.029 2048, 6, 1024, 0, 127, 1, 1.032 2048, 0, 2048, 0, 127, 1, 0.826 2048, 7, 2048, 0, 127, 1, 0.834 2048, 0, 4096, 0, 127, 1, 0.825 2048, 8, 4096, 0, 127, 1, 0.83 256, 1, 64, 0, 127, 1, 0.657 256, 15, 64, 0, 127, 1, 0.657 256, 2, 64, 0, 127, 1, 0.657 256, 30, 64, 0, 127, 1, 0.523 256, 3, 64, 0, 127, 1, 0.657 256, 45, 64, 0, 127, 1, 0.654 256, 4, 64, 0, 127, 1, 0.657 256, 60, 64, 0, 127, 1, 0.526 256, 5, 64, 0, 127, 1, 0.658 256, 75, 64, 0, 127, 1, 0.658 256, 6, 64, 0, 127, 1, 0.655 256, 90, 64, 0, 127, 1, 0.523 256, 7, 64, 0, 127, 1, 0.655 256, 105, 64, 0, 127, 1, 0.654 1, 0, 0, 0, 127, 1, 0.98 2, 0, 1, 0, 127, 1, 0.978 3, 0, 2, 0, 127, 1, 0.975 4, 0, 3, 0, 127, 1, 0.976 5, 0, 4, 0, 127, 1, 0.977 6, 0, 5, 0, 127, 1, 0.981 7, 0, 6, 0, 127, 1, 0.982 8, 0, 7, 0, 127, 1, 0.98 9, 0, 8, 0, 127, 1, 0.978 10, 0, 9, 0, 127, 1, 0.981 11, 0, 10, 0, 127, 1, 0.984 12, 0, 11, 0, 127, 1, 0.982 13, 0, 12, 0, 127, 1, 0.98 14, 0, 13, 0, 127, 1, 0.978 15, 0, 14, 0, 127, 1, 0.979 16, 0, 15, 0, 127, 1, 0.986 17, 0, 16, 0, 127, 1, 0.529 18, 0, 17, 0, 127, 1, 0.566 19, 0, 18, 0, 127, 1, 0.575 20, 0, 19, 0, 127, 1, 0.573 21, 0, 20, 0, 127, 1, 0.579 22, 0, 21, 0, 127, 1, 0.595 23, 0, 22, 0, 127, 1, 0.585 24, 0, 23, 0, 127, 1, 0.586 25, 0, 24, 0, 127, 1, 0.587 26, 0, 25, 0, 127, 1, 0.592 27, 0, 26, 0, 127, 1, 0.595 28, 0, 27, 0, 127, 1, 0.592 29, 0, 28, 0, 127, 1, 0.6 30, 0, 29, 0, 127, 1, 0.598 31, 0, 30, 0, 127, 1, 0.595 32, 0, 31, 0, 127, 1, 0.592 2048, 0, 32, 23, 127, 1, 0.827 2048, 1, 32, 23, 127, 1, 0.826 2048, 0, 64, 23, 127, 1, 0.824 2048, 2, 64, 23, 127, 1, 0.825 2048, 0, 128, 23, 127, 1, 0.829 2048, 3, 128, 23, 127, 1, 0.824 2048, 0, 256, 23, 127, 1, 0.832 2048, 4, 256, 23, 127, 1, 0.825 2048, 0, 512, 23, 127, 1, 0.831 2048, 5, 512, 23, 127, 1, 0.837 2048, 0, 1024, 23, 127, 1, 0.721 2048, 6, 1024, 23, 127, 1, 0.757 2048, 0, 2048, 23, 127, 1, 0.825 2048, 7, 2048, 23, 127, 1, 0.824 2048, 0, 4096, 23, 127, 1, 0.828 2048, 8, 4096, 23, 127, 1, 0.823 256, 1, 64, 23, 127, 1, 0.665 256, 15, 64, 23, 127, 1, 0.661 256, 
2, 64, 23, 127, 1, 0.674 256, 30, 64, 23, 127, 1, 0.605 256, 3, 64, 23, 127, 1, 0.668 256, 45, 64, 23, 127, 1, 0.661 256, 4, 64, 23, 127, 1, 0.657 256, 60, 64, 23, 127, 1, 0.594 256, 5, 64, 23, 127, 1, 0.654 256, 75, 64, 23, 127, 1, 0.673 256, 6, 64, 23, 127, 1, 0.688 256, 90, 64, 23, 127, 1, 0.6 256, 7, 64, 23, 127, 1, 0.66 256, 105, 64, 23, 127, 1, 0.654 1, 0, 0, 23, 127, 1, 0.981 2, 0, 1, 23, 127, 1, 0.976 3, 0, 2, 23, 127, 1, 0.983 4, 0, 3, 23, 127, 1, 0.984 5, 0, 4, 23, 127, 1, 0.973 6, 0, 5, 23, 127, 1, 0.987 7, 0, 6, 23, 127, 1, 0.977 8, 0, 7, 23, 127, 1, 0.979 9, 0, 8, 23, 127, 1, 0.981 10, 0, 9, 23, 127, 1, 0.98 11, 0, 10, 23, 127, 1, 0.983 12, 0, 11, 23, 127, 1, 0.98 13, 0, 12, 23, 127, 1, 0.98 14, 0, 13, 23, 127, 1, 0.977 15, 0, 14, 23, 127, 1, 0.982 16, 0, 15, 23, 127, 1, 0.581 17, 0, 16, 23, 127, 1, 0.551 18, 0, 17, 23, 127, 1, 0.555 19, 0, 18, 23, 127, 1, 0.586 20, 0, 19, 23, 127, 1, 0.585 21, 0, 20, 23, 127, 1, 0.582 22, 0, 21, 23, 127, 1, 0.571 23, 0, 22, 23, 127, 1, 0.576 24, 0, 23, 23, 127, 1, 0.581 25, 0, 24, 23, 127, 1, 0.589 26, 0, 25, 23, 127, 1, 0.593 27, 0, 26, 23, 127, 1, 0.595 28, 0, 27, 23, 127, 1, 0.583 29, 0, 28, 23, 127, 1, 0.595 30, 0, 29, 23, 127, 1, 0.58 31, 0, 30, 23, 127, 1, 0.594 32, 0, 31, 23, 127, 1, 0.665 2048, 0, 32, 23, 127, 2, 0.825 2048, 1, 32, 23, 127, 2, 0.818 2048, 0, 64, 23, 127, 2, 0.829 2048, 2, 64, 23, 127, 2, 0.828 2048, 0, 128, 23, 127, 2, 0.823 2048, 3, 128, 23, 127, 2, 0.825 2048, 0, 256, 23, 127, 2, 0.819 2048, 4, 256, 23, 127, 2, 0.828 2048, 0, 512, 23, 127, 2, 0.824 2048, 5, 512, 23, 127, 2, 0.827 2048, 0, 1024, 23, 127, 2, 0.813 2048, 6, 1024, 23, 127, 2, 0.834 2048, 0, 2048, 23, 127, 2, 0.927 2048, 7, 2048, 23, 127, 2, 0.923 2048, 0, 4096, 23, 127, 2, 0.818 2048, 8, 4096, 23, 127, 2, 0.82 256, 1, 64, 23, 127, 2, 0.693 256, 15, 64, 23, 127, 2, 0.686 256, 2, 64, 23, 127, 2, 0.69 256, 30, 64, 23, 127, 2, 0.611 256, 3, 64, 23, 127, 2, 0.692 256, 45, 64, 23, 127, 2, 0.685 256, 4, 64, 23, 127, 2, 0.688 256, 60, 64, 23, 127, 2, 0.6 256, 5, 64, 23, 127, 2, 0.69 256, 75, 64, 23, 127, 2, 0.689 256, 6, 64, 23, 127, 2, 0.688 256, 90, 64, 23, 127, 2, 0.611 256, 7, 64, 23, 127, 2, 0.69 256, 105, 64, 23, 127, 2, 0.686 1, 0, 0, 23, 127, 2, 0.982 2, 0, 1, 23, 127, 2, 0.987 3, 0, 2, 23, 127, 2, 0.978 4, 0, 3, 23, 127, 2, 0.977 5, 0, 4, 23, 127, 2, 0.979 6, 0, 5, 23, 127, 2, 0.985 7, 0, 6, 23, 127, 2, 0.975 8, 0, 7, 23, 127, 2, 0.981 9, 0, 8, 23, 127, 2, 0.984 10, 0, 9, 23, 127, 2, 0.983 11, 0, 10, 23, 127, 2, 0.982 12, 0, 11, 23, 127, 2, 0.976 13, 0, 12, 23, 127, 2, 0.985 14, 0, 13, 23, 127, 2, 0.984 15, 0, 14, 23, 127, 2, 0.98 16, 0, 15, 23, 127, 2, 0.583 17, 0, 16, 23, 127, 2, 0.552 18, 0, 17, 23, 127, 2, 0.564 19, 0, 18, 23, 127, 2, 0.585 20, 0, 19, 23, 127, 2, 0.578 21, 0, 20, 23, 127, 2, 0.578 22, 0, 21, 23, 127, 2, 0.571 23, 0, 22, 23, 127, 2, 0.587 24, 0, 23, 23, 127, 2, 0.589 25, 0, 24, 23, 127, 2, 0.593 26, 0, 25, 23, 127, 2, 0.589 27, 0, 26, 23, 127, 2, 0.588 28, 0, 27, 23, 127, 2, 0.593 29, 0, 28, 23, 127, 2, 0.579 30, 0, 29, 23, 127, 2, 0.572 31, 0, 30, 23, 127, 2, 0.582 32, 0, 31, 23, 127, 2, 0.659 2048, 0, 32, 23, 127, 4, 0.822 2048, 1, 32, 23, 127, 4, 0.818 2048, 0, 64, 23, 127, 4, 0.826 2048, 2, 64, 23, 127, 4, 0.824 2048, 0, 128, 23, 127, 4, 0.833 2048, 3, 128, 23, 127, 4, 0.831 2048, 0, 256, 23, 127, 4, 0.826 2048, 4, 256, 23, 127, 4, 0.831 2048, 0, 512, 23, 127, 4, 0.834 2048, 5, 512, 23, 127, 4, 0.83 2048, 0, 1024, 23, 127, 4, 0.836 2048, 6, 1024, 23, 127, 4, 0.844 2048, 0, 2048, 23, 127, 4, 0.696 2048, 7, 2048, 23, 127, 4, 0.704 
2048, 0, 4096, 23, 127, 4, 0.936 2048, 8, 4096, 23, 127, 4, 0.925 256, 1, 64, 23, 127, 4, 0.694 256, 15, 64, 23, 127, 4, 0.69 256, 2, 64, 23, 127, 4, 0.687 256, 30, 64, 23, 127, 4, 0.612 256, 3, 64, 23, 127, 4, 0.685 256, 45, 64, 23, 127, 4, 0.685 256, 4, 64, 23, 127, 4, 0.684 256, 60, 64, 23, 127, 4, 0.606 256, 5, 64, 23, 127, 4, 0.69 256, 75, 64, 23, 127, 4, 0.688 256, 6, 64, 23, 127, 4, 0.69 256, 90, 64, 23, 127, 4, 0.615 256, 7, 64, 23, 127, 4, 0.691 256, 105, 64, 23, 127, 4, 0.688 1, 0, 0, 23, 127, 4, 0.982 2, 0, 1, 23, 127, 4, 0.983 3, 0, 2, 23, 127, 4, 0.981 4, 0, 3, 23, 127, 4, 0.984 5, 0, 4, 23, 127, 4, 0.963 6, 0, 5, 23, 127, 4, 0.978 7, 0, 6, 23, 127, 4, 0.985 8, 0, 7, 23, 127, 4, 0.986 9, 0, 8, 23, 127, 4, 0.978 10, 0, 9, 23, 127, 4, 0.985 11, 0, 10, 23, 127, 4, 0.986 12, 0, 11, 23, 127, 4, 0.983 13, 0, 12, 23, 127, 4, 0.986 14, 0, 13, 23, 127, 4, 0.98 15, 0, 14, 23, 127, 4, 0.979 16, 0, 15, 23, 127, 4, 0.582 17, 0, 16, 23, 127, 4, 0.542 18, 0, 17, 23, 127, 4, 0.564 19, 0, 18, 23, 127, 4, 0.571 20, 0, 19, 23, 127, 4, 0.582 21, 0, 20, 23, 127, 4, 0.573 22, 0, 21, 23, 127, 4, 0.575 23, 0, 22, 23, 127, 4, 0.578 24, 0, 23, 23, 127, 4, 0.58 25, 0, 24, 23, 127, 4, 0.592 26, 0, 25, 23, 127, 4, 0.588 27, 0, 26, 23, 127, 4, 0.574 28, 0, 27, 23, 127, 4, 0.589 29, 0, 28, 23, 127, 4, 0.56 30, 0, 29, 23, 127, 4, 0.587 31, 0, 30, 23, 127, 4, 0.584 32, 0, 31, 23, 127, 4, 0.664 2048, 0, 32, 23, 127, 8, 0.826 2048, 1, 32, 23, 127, 8, 0.821 2048, 0, 64, 23, 127, 8, 0.828 2048, 2, 64, 23, 127, 8, 0.827 2048, 0, 128, 23, 127, 8, 0.833 2048, 3, 128, 23, 127, 8, 0.83 2048, 0, 256, 23, 127, 8, 0.855 2048, 4, 256, 23, 127, 8, 0.849 2048, 0, 512, 23, 127, 8, 0.849 2048, 5, 512, 23, 127, 8, 0.851 2048, 0, 1024, 23, 127, 8, 0.856 2048, 6, 1024, 23, 127, 8, 0.862 2048, 0, 2048, 23, 127, 8, 0.709 2048, 7, 2048, 23, 127, 8, 0.712 2048, 0, 4096, 23, 127, 8, 0.702 2048, 8, 4096, 23, 127, 8, 0.701 256, 1, 64, 23, 127, 8, 0.689 256, 15, 64, 23, 127, 8, 0.688 256, 2, 64, 23, 127, 8, 0.691 256, 30, 64, 23, 127, 8, 0.612 256, 3, 64, 23, 127, 8, 0.688 256, 45, 64, 23, 127, 8, 0.686 256, 4, 64, 23, 127, 8, 0.694 256, 60, 64, 23, 127, 8, 0.609 256, 5, 64, 23, 127, 8, 0.69 256, 75, 64, 23, 127, 8, 0.69 256, 6, 64, 23, 127, 8, 0.691 256, 90, 64, 23, 127, 8, 0.612 256, 7, 64, 23, 127, 8, 0.689 256, 105, 64, 23, 127, 8, 0.688 1, 0, 0, 23, 127, 8, 0.98 2, 0, 1, 23, 127, 8, 0.978 3, 0, 2, 23, 127, 8, 0.98 4, 0, 3, 23, 127, 8, 0.978 5, 0, 4, 23, 127, 8, 0.977 6, 0, 5, 23, 127, 8, 0.984 7, 0, 6, 23, 127, 8, 0.982 8, 0, 7, 23, 127, 8, 0.983 9, 0, 8, 23, 127, 8, 0.987 10, 0, 9, 23, 127, 8, 0.979 11, 0, 10, 23, 127, 8, 0.985 12, 0, 11, 23, 127, 8, 0.981 13, 0, 12, 23, 127, 8, 0.98 14, 0, 13, 23, 127, 8, 0.982 15, 0, 14, 23, 127, 8, 0.981 16, 0, 15, 23, 127, 8, 0.579 17, 0, 16, 23, 127, 8, 0.531 18, 0, 17, 23, 127, 8, 0.577 19, 0, 18, 23, 127, 8, 0.588 20, 0, 19, 23, 127, 8, 0.571 21, 0, 20, 23, 127, 8, 0.576 22, 0, 21, 23, 127, 8, 0.59 23, 0, 22, 23, 127, 8, 0.574 24, 0, 23, 23, 127, 8, 0.583 25, 0, 24, 23, 127, 8, 0.581 26, 0, 25, 23, 127, 8, 0.592 27, 0, 26, 23, 127, 8, 0.586 28, 0, 27, 23, 127, 8, 0.588 29, 0, 28, 23, 127, 8, 0.578 30, 0, 29, 23, 127, 8, 0.573 31, 0, 30, 23, 127, 8, 0.588 32, 0, 31, 23, 127, 8, 0.664 2048, 0, 32, 23, 127, 16, 0.825 2048, 1, 32, 23, 127, 16, 0.823 2048, 0, 64, 23, 127, 16, 0.831 2048, 2, 64, 23, 127, 16, 0.822 2048, 0, 128, 23, 127, 16, 0.831 2048, 3, 128, 23, 127, 16, 0.831 2048, 0, 256, 23, 127, 16, 0.849 2048, 4, 256, 23, 127, 16, 0.85 2048, 0, 512, 23, 127, 16, 0.751 2048, 5, 512, 23, 
127, 16, 0.75 2048, 0, 1024, 23, 127, 16, 0.913 2048, 6, 1024, 23, 127, 16, 0.895 2048, 0, 2048, 23, 127, 16, 0.736 2048, 7, 2048, 23, 127, 16, 0.741 2048, 0, 4096, 23, 127, 16, 0.712 2048, 8, 4096, 23, 127, 16, 0.711 256, 1, 64, 23, 127, 16, 0.758 256, 15, 64, 23, 127, 16, 0.692 256, 2, 64, 23, 127, 16, 0.692 256, 30, 64, 23, 127, 16, 0.613 256, 3, 64, 23, 127, 16, 0.69 256, 45, 64, 23, 127, 16, 0.687 256, 4, 64, 23, 127, 16, 0.69 256, 60, 64, 23, 127, 16, 0.604 256, 5, 64, 23, 127, 16, 0.687 256, 75, 64, 23, 127, 16, 0.687 256, 6, 64, 23, 127, 16, 0.69 256, 90, 64, 23, 127, 16, 0.61 256, 7, 64, 23, 127, 16, 0.69 256, 105, 64, 23, 127, 16, 0.685 1, 0, 0, 23, 127, 16, 0.981 2, 0, 1, 23, 127, 16, 0.985 3, 0, 2, 23, 127, 16, 0.985 4, 0, 3, 23, 127, 16, 0.981 5, 0, 4, 23, 127, 16, 0.979 6, 0, 5, 23, 127, 16, 0.986 7, 0, 6, 23, 127, 16, 0.986 8, 0, 7, 23, 127, 16, 0.982 9, 0, 8, 23, 127, 16, 0.982 10, 0, 9, 23, 127, 16, 0.98 11, 0, 10, 23, 127, 16, 0.983 12, 0, 11, 23, 127, 16, 0.982 13, 0, 12, 23, 127, 16, 0.982 14, 0, 13, 23, 127, 16, 0.982 15, 0, 14, 23, 127, 16, 0.982 16, 0, 15, 23, 127, 16, 0.582 17, 0, 16, 23, 127, 16, 0.542 18, 0, 17, 23, 127, 16, 0.554 19, 0, 18, 23, 127, 16, 0.562 20, 0, 19, 23, 127, 16, 0.587 21, 0, 20, 23, 127, 16, 0.584 22, 0, 21, 23, 127, 16, 0.587 23, 0, 22, 23, 127, 16, 0.594 24, 0, 23, 23, 127, 16, 0.581 25, 0, 24, 23, 127, 16, 0.577 26, 0, 25, 23, 127, 16, 0.588 27, 0, 26, 23, 127, 16, 0.589 28, 0, 27, 23, 127, 16, 0.596 29, 0, 28, 23, 127, 16, 0.591 30, 0, 29, 23, 127, 16, 0.585 31, 0, 30, 23, 127, 16, 0.59 32, 0, 31, 23, 127, 16, 0.669 sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +- sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +- sysdeps/x86_64/strrchr.S | 505 +++++++++++++++--------- sysdeps/x86_64/wcsrchr.S | 268 +------------ 4 files changed, 334 insertions(+), 444 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S index db1b44c23c..866396e947 100644 --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S @@ -17,7 +17,7 @@ . */ #if IS_IN (libc) -# define strrchr __strrchr_sse2 +# define STRRCHR __strrchr_sse2 # undef weak_alias # define weak_alias(strrchr, rindex) diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S index 78d1ca6553..69d2f3cdb1 100644 --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S @@ -17,7 +17,6 @@ . 
*/ #if IS_IN (libc) -# define wcsrchr __wcsrchr_sse2 +# define STRRCHR __wcsrchr_sse2 #endif - #include "../wcsrchr.S" diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S index 50d886713e..94449ad806 100644 --- a/sysdeps/x86_64/strrchr.S +++ b/sysdeps/x86_64/strrchr.S @@ -19,210 +19,355 @@ #include +#ifndef STRRCHR +# define STRRCHR strrchr +#endif + +#ifdef USE_AS_WCSRCHR +# define PCMPEQ pcmpeqd +# define CHAR_SIZE 4 +# define PMINU pminud +#else +# define PCMPEQ pcmpeqb +# define CHAR_SIZE 1 +# define PMINU pminub +#endif + +#define PAGE_SIZE 4096 +#define VEC_SIZE 16 + .text -ENTRY (strrchr) - movd %esi, %xmm1 +ENTRY(STRRCHR) + movd %esi, %xmm0 movq %rdi, %rax - andl $4095, %eax - punpcklbw %xmm1, %xmm1 - cmpq $4032, %rax - punpcklwd %xmm1, %xmm1 - pshufd $0, %xmm1, %xmm1 + andl $(PAGE_SIZE - 1), %eax +#ifndef USE_AS_WCSRCHR + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 +#endif + pshufd $0, %xmm0, %xmm0 + cmpl $(PAGE_SIZE - VEC_SIZE), %eax ja L(cross_page) - movdqu (%rdi), %xmm0 + +L(cross_page_continue): + movups (%rdi), %xmm1 pxor %xmm2, %xmm2 - movdqa %xmm0, %xmm3 - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm3 - pmovmskb %xmm0, %ecx - pmovmskb %xmm3, %edx - testq %rdx, %rdx - je L(next_48_bytes) - leaq -1(%rdx), %rax - xorq %rdx, %rax - andq %rcx, %rax - je L(exit) - bsrq %rax, %rax + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %ecx + testl %ecx, %ecx + jz L(aligned_more) + + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + leal -1(%rcx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(ret0) + bsrl %eax, %eax addq %rdi, %rax + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If + search CHAR is zero we are correct. Either way `andq + -CHAR_SIZE, %rax` gets the correct result. */ +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif +L(ret0): ret + /* Returns for first vec x1/x2 have hard coded backward search + path for earlier matches. 
*/ .p2align 4 -L(next_48_bytes): - movdqu 16(%rdi), %xmm4 - movdqa %xmm4, %xmm5 - movdqu 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm4 - pcmpeqb %xmm2, %xmm5 - movdqu 48(%rdi), %xmm0 - pmovmskb %xmm5, %edx - movdqa %xmm3, %xmm5 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm2, %xmm5 - pcmpeqb %xmm0, %xmm2 - salq $16, %rdx - pmovmskb %xmm3, %r8d - pmovmskb %xmm5, %eax - pmovmskb %xmm2, %esi - salq $32, %r8 - salq $32, %rax - pcmpeqb %xmm1, %xmm0 - orq %rdx, %rax - movq %rsi, %rdx - pmovmskb %xmm4, %esi - salq $48, %rdx - salq $16, %rsi - orq %r8, %rsi - orq %rcx, %rsi - pmovmskb %xmm0, %ecx - salq $48, %rcx - orq %rcx, %rsi - orq %rdx, %rax - je L(loop_header2) - leaq -1(%rax), %rcx - xorq %rax, %rcx - andq %rcx, %rsi - je L(exit) - bsrq %rsi, %rsi - leaq (%rdi,%rsi), %rax +L(first_vec_x0_test): + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jz L(ret0) + bsrl %eax, %eax + addq %r8, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif ret .p2align 4 -L(loop_header2): - testq %rsi, %rsi - movq %rdi, %rcx - je L(no_c_found) -L(loop_header): - addq $64, %rdi - pxor %xmm7, %xmm7 - andq $-64, %rdi - jmp L(loop_entry) +L(first_vec_x1): + PCMPEQ %xmm0, %xmm2 + pmovmskb %xmm2, %eax + leal -1(%rcx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(first_vec_x0_test) + bsrl %eax, %eax + leaq (VEC_SIZE)(%rdi, %rax), %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret .p2align 4 -L(loop64): - testq %rdx, %rdx - cmovne %rdx, %rsi - cmovne %rdi, %rcx - addq $64, %rdi -L(loop_entry): - movdqa 32(%rdi), %xmm3 - pxor %xmm6, %xmm6 - movdqa 48(%rdi), %xmm2 - movdqa %xmm3, %xmm0 - movdqa 16(%rdi), %xmm4 - pminub %xmm2, %xmm0 - movdqa (%rdi), %xmm5 - pminub %xmm4, %xmm0 - pminub %xmm5, %xmm0 - pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm0, %eax - movdqa %xmm5, %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %r9d - movdqa %xmm4, %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %edx - movdqa %xmm3, %xmm0 - pcmpeqb %xmm1, %xmm0 - salq $16, %rdx - pmovmskb %xmm0, %r10d - movdqa %xmm2, %xmm0 - pcmpeqb %xmm1, %xmm0 - salq $32, %r10 - orq %r10, %rdx - pmovmskb %xmm0, %r8d - orq %r9, %rdx - salq $48, %r8 - orq %r8, %rdx +L(first_vec_x1_test): + PCMPEQ %xmm0, %xmm2 + pmovmskb %xmm2, %eax testl %eax, %eax - je L(loop64) - pcmpeqb %xmm6, %xmm4 - pcmpeqb %xmm6, %xmm3 - pcmpeqb %xmm6, %xmm5 - pmovmskb %xmm4, %eax - pmovmskb %xmm3, %r10d - pcmpeqb %xmm6, %xmm2 - pmovmskb %xmm5, %r9d - salq $32, %r10 - salq $16, %rax - pmovmskb %xmm2, %r8d - orq %r10, %rax - orq %r9, %rax - salq $48, %r8 - orq %r8, %rax - leaq -1(%rax), %r8 - xorq %rax, %r8 - andq %r8, %rdx - cmovne %rdi, %rcx - cmovne %rdx, %rsi - bsrq %rsi, %rsi - leaq (%rcx,%rsi), %rax + jz L(first_vec_x0_test) + bsrl %eax, %eax + leaq (VEC_SIZE)(%rdi, %rax), %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(first_vec_x2): + PCMPEQ %xmm0, %xmm3 + pmovmskb %xmm3, %eax + leal -1(%rcx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(first_vec_x1_test) + bsrl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(aligned_more): + /* Save original pointer if match was in VEC 0. 
*/ + movq %rdi, %r8 + andq $-VEC_SIZE, %rdi + + movaps VEC_SIZE(%rdi), %xmm2 + pxor %xmm3, %xmm3 + PCMPEQ %xmm2, %xmm3 + pmovmskb %xmm3, %ecx + testl %ecx, %ecx + jnz L(first_vec_x1) + + movaps (VEC_SIZE * 2)(%rdi), %xmm3 + pxor %xmm4, %xmm4 + PCMPEQ %xmm3, %xmm4 + pmovmskb %xmm4, %ecx + testl %ecx, %ecx + jnz L(first_vec_x2) + + addq $VEC_SIZE, %rdi + /* Save pointer again before realigning. */ + movq %rdi, %rsi + andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +L(first_loop): + /* Do 2x VEC at a time. */ + movaps (VEC_SIZE * 2)(%rdi), %xmm4 + movaps (VEC_SIZE * 3)(%rdi), %xmm5 + /* If SSE2 no pminud. */ +#ifdef NO_PMINU + movaps %xmm5, %xmm6 + pxor %xmm8, %xmm8 + + PCMPEQ %xmm8, %xmm5 + PCMPEQ %xmm4, %xmm8 + por %xmm5, %xmm8 +#else + movaps %xmm5, %xmm6 + PMINU %xmm4, %xmm5 +#endif + + movaps %xmm4, %xmm9 + PCMPEQ %xmm0, %xmm4 + PCMPEQ %xmm0, %xmm6 + movaps %xmm6, %xmm7 + por %xmm4, %xmm6 +#ifndef NO_PMINU + pxor %xmm8, %xmm8 + PCMPEQ %xmm5, %xmm8 +#endif + pmovmskb %xmm8, %ecx + pmovmskb %xmm6, %eax + + addq $(VEC_SIZE * 2), %rdi + /* Use `addl` 1) so we can undo it with `subl` and 2) it can + macro-fuse with `jz`. */ + addl %ecx, %eax + jz L(first_loop) + + /* Check if there is zero match. */ + testl %ecx, %ecx + jz L(second_loop_match) + + /* Check if there was a match in last iteration. */ + subl %ecx, %eax + jnz L(new_match) + +L(first_loop_old_match): + PCMPEQ %xmm0, %xmm2 + PCMPEQ %xmm0, %xmm3 + pmovmskb %xmm2, %ecx + pmovmskb %xmm3, %eax + addl %eax, %ecx + jz L(first_vec_x0_test) + /* NB: We could move this shift to before the branch and save a + bit of code size / performance on the fall through. The + branch leads to the null case which generally seems hotter + than char in first 3x VEC. */ + sall $16, %eax + orl %ecx, %eax + + bsrl %eax, %eax + addq %rsi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(new_match): + pxor %xmm6, %xmm6 + PCMPEQ %xmm9, %xmm6 + pmovmskb %xmm6, %eax + sall $16, %ecx + orl %eax, %ecx + + /* We can't reuse either of the old comparisons as since we mask + of zeros after first zero (instead of using the full + comparison) we can't gurantee no interference between match + after end of string and valid match. */ + pmovmskb %xmm4, %eax + pmovmskb %xmm7, %edx + sall $16, %edx + orl %edx, %eax + + leal -1(%ecx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(first_loop_old_match) + bsrl %eax, %eax + addq %rdi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif ret + /* Save minimum state for getting most recent match. We can + throw out all previous work. */ .p2align 4 -L(no_c_found): - movl $1, %esi - xorl %ecx, %ecx - jmp L(loop_header) +L(second_loop_match): + movq %rdi, %rsi + movaps %xmm4, %xmm2 + movaps %xmm7, %xmm3 .p2align 4 -L(exit): - xorl %eax, %eax +L(second_loop): + movaps (VEC_SIZE * 2)(%rdi), %xmm4 + movaps (VEC_SIZE * 3)(%rdi), %xmm5 +#ifdef NO_PMINU + movaps %xmm5, %xmm6 + pxor %xmm8, %xmm8 + + PCMPEQ %xmm8, %xmm5 + PCMPEQ %xmm4, %xmm8 + por %xmm5, %xmm8 +#else + movaps %xmm5, %xmm6 + PMINU %xmm4, %xmm5 +#endif + + movaps %xmm4, %xmm9 + PCMPEQ %xmm0, %xmm4 + PCMPEQ %xmm0, %xmm6 + movaps %xmm6, %xmm7 + por %xmm4, %xmm6 +#ifndef NO_PMINU + pxor %xmm8, %xmm8 + PCMPEQ %xmm5, %xmm8 +#endif + + pmovmskb %xmm8, %ecx + pmovmskb %xmm6, %eax + + addq $(VEC_SIZE * 2), %rdi + /* Either null term or new occurence of CHAR. */ + addl %ecx, %eax + jz L(second_loop) + + /* No null term so much be new occurence of CHAR. 
*/ + testl %ecx, %ecx + jz L(second_loop_match) + + + subl %ecx, %eax + jnz L(second_loop_new_match) + +L(second_loop_old_match): + pmovmskb %xmm2, %ecx + pmovmskb %xmm3, %eax + sall $16, %eax + orl %ecx, %eax + bsrl %eax, %eax + addq %rsi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif ret .p2align 4 +L(second_loop_new_match): + pxor %xmm6, %xmm6 + PCMPEQ %xmm9, %xmm6 + pmovmskb %xmm6, %eax + sall $16, %ecx + orl %eax, %ecx + + /* We can't reuse either of the old comparisons as since we mask + of zeros after first zero (instead of using the full + comparison) we can't gurantee no interference between match + after end of string and valid match. */ + pmovmskb %xmm4, %eax + pmovmskb %xmm7, %edx + sall $16, %edx + orl %edx, %eax + + leal -1(%ecx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(second_loop_old_match) + bsrl %eax, %eax + addq %rdi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4,, 4 L(cross_page): - movq %rdi, %rax - pxor %xmm0, %xmm0 - andq $-64, %rax - movdqu (%rax), %xmm5 - movdqa %xmm5, %xmm6 - movdqu 16(%rax), %xmm4 - pcmpeqb %xmm1, %xmm5 - pcmpeqb %xmm0, %xmm6 - movdqu 32(%rax), %xmm3 - pmovmskb %xmm6, %esi - movdqa %xmm4, %xmm6 - movdqu 48(%rax), %xmm2 - pcmpeqb %xmm1, %xmm4 - pcmpeqb %xmm0, %xmm6 - pmovmskb %xmm6, %edx - movdqa %xmm3, %xmm6 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm0, %xmm6 - pcmpeqb %xmm2, %xmm0 - salq $16, %rdx - pmovmskb %xmm3, %r9d - pmovmskb %xmm6, %r8d - pmovmskb %xmm0, %ecx - salq $32, %r9 - salq $32, %r8 - pcmpeqb %xmm1, %xmm2 - orq %r8, %rdx - salq $48, %rcx - pmovmskb %xmm5, %r8d - orq %rsi, %rdx - pmovmskb %xmm4, %esi - orq %rcx, %rdx - pmovmskb %xmm2, %ecx - salq $16, %rsi - salq $48, %rcx - orq %r9, %rsi - orq %r8, %rsi - orq %rcx, %rsi + movq %rdi, %rsi + andq $-VEC_SIZE, %rsi + movaps (%rsi), %xmm1 + pxor %xmm2, %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %edx movl %edi, %ecx - subl %eax, %ecx - shrq %cl, %rdx - shrq %cl, %rsi - testq %rdx, %rdx - je L(loop_header2) - leaq -1(%rdx), %rax - xorq %rdx, %rax - andq %rax, %rsi - je L(exit) - bsrq %rsi, %rax + andl $(VEC_SIZE - 1), %ecx + sarl %cl, %edx + jz L(cross_page_continue) + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + sarl %cl, %eax + leal -1(%rdx), %ecx + xorl %edx, %ecx + andl %ecx, %eax + jz L(ret1) + bsrl %eax, %eax addq %rdi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif +L(ret1): ret -END (strrchr) +END(STRRCHR) -weak_alias (strrchr, rindex) -libc_hidden_builtin_def (strrchr) +#ifndef USE_AS_WCSRCHR + weak_alias (STRRCHR, rindex) + libc_hidden_builtin_def (STRRCHR) +#endif diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S index 61552954de..2b80efc5ef 100644 --- a/sysdeps/x86_64/wcsrchr.S +++ b/sysdeps/x86_64/wcsrchr.S @@ -1,4 +1,4 @@ -/* wcsrchr with SSSE3 +/* wcsrchr optimized with SSE2. Copyright (C) 2011-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,266 +16,12 @@ License along with the GNU C Library; if not, see . 
*/ -#include - .text -ENTRY (wcsrchr) +#define USE_AS_WCSRCHR 1 +#define NO_PMINU 1 - movd %rsi, %xmm1 - mov %rdi, %rcx - punpckldq %xmm1, %xmm1 - pxor %xmm2, %xmm2 - punpckldq %xmm1, %xmm1 - and $63, %rcx - cmp $48, %rcx - ja L(crosscache) +#ifndef STRRCHR +# define STRRCHR wcsrchr +#endif - movdqu (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - add $16, %rdi - - test %rax, %rax - jnz L(unaligned_match1) - - test %rcx, %rcx - jnz L(return_null) - - and $-16, %rdi - xor %r8, %r8 - jmp L(loop) - - .p2align 4 -L(unaligned_match1): - test %rcx, %rcx - jnz L(prolog_find_zero_1) - - mov %rax, %r8 - mov %rdi, %rsi - and $-16, %rdi - jmp L(loop) - - .p2align 4 -L(crosscache): - and $15, %rcx - and $-16, %rdi - pxor %xmm3, %xmm3 - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm3 - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm3, %rdx - pmovmskb %xmm0, %rax - shr %cl, %rdx - shr %cl, %rax - add $16, %rdi - - test %rax, %rax - jnz L(unaligned_match) - - test %rdx, %rdx - jnz L(return_null) - - xor %r8, %r8 - jmp L(loop) - - .p2align 4 -L(unaligned_match): - test %rdx, %rdx - jnz L(prolog_find_zero) - - mov %rax, %r8 - lea (%rdi, %rcx), %rsi - -/* Loop start on aligned string. */ - .p2align 4 -L(loop): - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm3 - pcmpeqd %xmm3, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm3 - pmovmskb %xmm2, %rcx - pmovmskb %xmm3, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm4 - pcmpeqd %xmm4, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm4 - pmovmskb %xmm2, %rcx - pmovmskb %xmm4, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm5 - pcmpeqd %xmm5, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm5 - pmovmskb %xmm2, %rcx - pmovmskb %xmm5, %rax - or %rax, %rcx - jz L(loop) - - .p2align 4 -L(matches): - test %rax, %rax - jnz L(match) -L(return_value): - test %r8, %r8 - jz L(return_null) - mov %r8, %rax - mov %rsi, %rdi - - test $15 << 4, %ah - jnz L(match_fourth_wchar) - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(match): - pmovmskb %xmm2, %rcx - test %rcx, %rcx - jnz L(find_zero) - mov %rax, %r8 - mov %rdi, %rsi - jmp L(loop) - - .p2align 4 -L(find_zero): - test $15, %cl - jnz L(find_zero_in_first_wchar) - test %cl, %cl - jnz L(find_zero_in_second_wchar) - test $15, %ch - jnz L(find_zero_in_third_wchar) - - and $1 << 13 - 1, %rax - jz L(return_value) - - test $15 << 4, %ah - jnz L(match_fourth_wchar) - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(find_zero_in_first_wchar): - test $1, %rax - jz L(return_value) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(find_zero_in_second_wchar): - and $1 << 5 - 1, %rax - jz L(return_value) - - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(find_zero_in_third_wchar): - and $1 << 9 - 1, %rax - jz L(return_value) - - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(prolog_find_zero): - add %rcx, %rdi - mov %rdx, %rcx -L(prolog_find_zero_1): - test $15, %cl - jnz L(prolog_find_zero_in_first_wchar) - test %cl, %cl - jnz L(prolog_find_zero_in_second_wchar) - test $15, %ch - jnz L(prolog_find_zero_in_third_wchar) - - and $1 << 13 - 1, %rax - 
jz L(return_null) - - test $15 << 4, %ah - jnz L(match_fourth_wchar) - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(prolog_find_zero_in_first_wchar): - test $1, %rax - jz L(return_null) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(prolog_find_zero_in_second_wchar): - and $1 << 5 - 1, %rax - jz L(return_null) - - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(prolog_find_zero_in_third_wchar): - and $1 << 9 - 1, %rax - jz L(return_null) - - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(match_second_wchar): - lea -12(%rdi), %rax - ret - - .p2align 4 -L(match_third_wchar): - lea -8(%rdi), %rax - ret - - .p2align 4 -L(match_fourth_wchar): - lea -4(%rdi), %rax - ret - - .p2align 4 -L(return_null): - xor %rax, %rax - ret - -END (wcsrchr) +#include "../strrchr.S" From patchwork Thu Apr 21 03:14:09 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 53084 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id A01B03857368 for ; Thu, 21 Apr 2022 03:16:13 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A01B03857368 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1650510973; bh=4j/qcAY+RV3UesuhXjnvCKbQ/vwHMsiCRivaEXvZvsg=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=AHQaGhGmd1cmBfDzMNOaE/vIg07LKswcphdbsdoJeDGdRPF+FcPihOqiREjrkQlCX ihWvbucGjnvjK1SLvg1EJtu9IlfTUVPLBjRn9RahMm/SB8FMVbOWk756s2Y/HTJ3PG q4dzGuKMi9y1sTOvI3IDfeMhksgll2Ca5rfrtFzI= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-pg1-x52c.google.com (mail-pg1-x52c.google.com [IPv6:2607:f8b0:4864:20::52c]) by sourceware.org (Postfix) with ESMTPS id 887F4385736E for ; Thu, 21 Apr 2022 03:14:42 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 887F4385736E Received: by mail-pg1-x52c.google.com with SMTP id x191so3518336pgd.4 for ; Wed, 20 Apr 2022 20:14:42 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=4j/qcAY+RV3UesuhXjnvCKbQ/vwHMsiCRivaEXvZvsg=; b=eGnVXmpiebOZtTAnV2uO9pPy/JMU0mYD3BQk3aM64YKBrTTqzWx1qxLoBRSBM5pnaN hTPbmg2Ze/LaNVcOxTmRyJl58Jp94p9RMicNlAZmBFs4VeVkEV2wsPmmKZfrRLu7AVPu 2YpdnbJzT9eTYek2AZ9JK+ijjKB/ZYYxDH8H2epTQhqOXpv9ZymgTMNquaw7z1CguUvS 49EuLsQ6NjJN+zZipGx2XfFuG9pZsTTNzwMDmnFSY4kfaMVasva+40bOjDI8tUgdcLwI dzan6IPpaN2W0C2w4FTo9Zb1WnPDC55uhhSl0CLhdBTgUc3uWDuB59PrRicEuLkq22rs l2vw== X-Gm-Message-State: AOAM530KM5uTH4CrAaAWZo5PpU/wVVEb5E50Z65/ALJuCJmKFzshN/46 wGxHbHwyNA0U5d4z/ZmMq1hfxCUyCTw= X-Google-Smtp-Source: ABdhPJyIPcJMD5BtPcCmpf1U+vTwX6HMEwts0CxmqlSdCpqLFhvxBZGqjEvCfW9KKUY5mI0crYfN5w== X-Received: by 2002:a05:6a00:174f:b0:4fd:aed5:b5e4 with SMTP id j15-20020a056a00174f00b004fdaed5b5e4mr26559806pfc.39.1650510881442; Wed, 20 Apr 2022 20:14:41 -0700 (PDT) Received: from localhost.localdomain ([64.145.94.63]) by smtp.googlemail.com with ESMTPSA id 
n59-20020a17090a5ac100b001cd498dc153sm1424022pji.3.2022.04.20.20.14.40
 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
 Wed, 20 Apr 2022 20:14:41 -0700 (PDT)
To: libc-alpha@sourceware.org
Subject: [PATCH v1 3/5] x86: Add wcsrchr optimized with SSE4_1 in
 wcsrchr-sse4_1.S
Date: Wed, 20 Apr 2022 22:14:09 -0500
Message-Id: <20220421031410.2142238-3-goldstein.w.n@gmail.com>
In-Reply-To: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
References: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

wcsrchr-sse2 can't use `pminud`, which can speed up the main loop:

len, align, pos, seek, max_char, freq, New Time / Old Time
256, 1, 64, 23, 1273, 1, 1.082
256, 1, 64, 23, 2147483647, 1, 1.076
256, 15, 64, 23, 1273, 1, 1.061
256, 15, 64, 23, 2147483647, 1, 1.075
256, 2, 64, 23, 1273, 1, 1.108
256, 2, 64, 23, 2147483647, 1, 1.109
256, 30, 64, 23, 1273, 1, 1.072
256, 30, 64, 23, 2147483647, 1, 1.077
256, 3, 64, 23, 1273, 1, 1.108
256, 3, 64, 23, 2147483647, 1, 1.103
256, 45, 64, 23, 1273, 1, 1.076
256, 45, 64, 23, 2147483647, 1, 1.079
256, 4, 64, 23, 1273, 1, 1.119
256, 4, 64, 23, 2147483647, 1, 1.112
256, 60, 64, 23, 1273, 1, 1.117
256, 60, 64, 23, 2147483647, 1, 1.112
256, 5, 64, 23, 1273, 1, 1.21
256, 5, 64, 23, 2147483647, 1, 1.194
256, 75, 64, 23, 1273, 1, 1.055
256, 75, 64, 23, 2147483647, 1, 1.045
256, 6, 64, 23, 1273, 1, 1.264
256, 6, 64, 23, 2147483647, 1, 1.3
256, 90, 64, 23, 1273, 1, 1.022
256, 90, 64, 23, 2147483647, 1, 1.026
256, 7, 64, 23, 1273, 1, 1.316
256, 7, 64, 23, 2147483647, 1, 1.325

Overall this leads to a 5% performance improvement in the benchmark
suite.

Full xcheck passes on x86_64 with and without multiarch enabled.
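[Editor's note: a short intrinsics sketch of the difference this patch
exploits, as an illustration of the NO_PMINU paths in the shared
strrchr.S rather than the actual assembly. Without SSE4.1's pminud,
finding a zero wchar_t across two vectors takes two compares plus an
OR; with it, one unsigned min folds both vectors so a single compare
suffices:]

#include <smmintrin.h>	/* SSE4.1 (pulls in the SSE2 intrinsics too).  */

/* SSE2 path (NO_PMINU in the patch): compare each vector against
   zero, then OR the two result masks.  */
static inline __m128i
zero_lanes_sse2 (__m128i a, __m128i b)
{
  __m128i z = _mm_setzero_si128 ();
  return _mm_or_si128 (_mm_cmpeq_epi32 (a, z), _mm_cmpeq_epi32 (b, z));
}

/* SSE4.1 path: a lane of pminud (a, b) is zero iff that lane is zero
   in a or in b, so one compare covers both vectors.  */
static inline __m128i
zero_lanes_sse4_1 (__m128i a, __m128i b)
{
  return _mm_cmpeq_epi32 (_mm_min_epu32 (a, b), _mm_setzero_si128 ());
}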
--- sysdeps/x86_64/multiarch/Makefile | 1 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 +++ sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S | 21 +++++++++++++++++++++ sysdeps/x86_64/multiarch/wcsrchr.c | 3 ++- 4 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 0400ea332b..5ad7bc8c25 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -154,6 +154,7 @@ sysdep_routines += \ wcsrchr-avx2-rtm \ wcsrchr-evex \ wcsrchr-sse2 \ + wcsrchr-sse4_1 \ wmemchr-avx2 \ wmemchr-avx2-rtm \ wmemchr-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index a8afcf81bb..1cbb6938c8 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -685,6 +685,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wcsrchr_evex) + IFUNC_IMPL_ADD (array, i, wcsrchr, + CPU_FEATURE_USABLE (SSE4_1), + __wcsrchr_sse4_1) IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2)) /* Support sysdeps/x86_64/multiarch/wcscmp.c. */ diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S b/sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S new file mode 100644 index 0000000000..34b92d28eb --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S @@ -0,0 +1,21 @@ +/* wcsrchr optimized with SSE4. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
 */

+#define USE_AS_WCSRCHR 1
+#define STRRCHR __wcsrchr_sse4_1
+#include "../strrchr.S"

diff --git a/sysdeps/x86_64/multiarch/wcsrchr.c b/sysdeps/x86_64/multiarch/wcsrchr.c
index 8b30c06f2e..eb18038eec 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr.c
+++ b/sysdeps/x86_64/multiarch/wcsrchr.c
@@ -23,7 +23,8 @@
 # undef wcsrchr

 # define SYMBOL_NAME wcsrchr
-# include "ifunc-avx2.h"
+
+# include "ifunc-wcslen.h"

 libc_ifunc_redirected (__redirect_wcsrchr, wcsrchr, IFUNC_SELECTOR ());
 #endif

From patchwork Thu Apr 21 03:14:11 2022
From: Noah Goldstein
To: libc-alpha@sourceware.org
Subject: [PATCH v1 4/5] x86: Optimize {str|wcs}rchr-avx2
Date: Wed, 20 Apr 2022 22:14:11 -0500
Message-Id: <20220421031410.2142238-4-goldstein.w.n@gmail.com>
In-Reply-To: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
References: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
X-Spam-Status: No,
score=-11.6 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_NUMSUBJECT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" The new code unrolls the main loop slightly without adding too much overhead and minimizes the comparisons for the search CHAR. Geometric Mean of all benchmarks New / Old: 0.832 See email for all results. Full xcheck passes on x86_64 with and without multiarch enabled. --- Results For: strrchr Geometric Mean of N=30 runs. Geometric Mean of all benchmarks New / Old: 0.832 Benchmarks performance on Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html len, align, pos, seek, max_char, freq, New Time / Old Time 2048, 0, 32, 0, 127, 1, 0.673 2048, 1, 32, 0, 127, 1, 0.68 2048, 0, 64, 0, 127, 1, 0.566 2048, 2, 64, 0, 127, 1, 0.574 2048, 0, 128, 0, 127, 1, 0.976 2048, 3, 128, 0, 127, 1, 0.967 2048, 0, 256, 0, 127, 1, 0.931 2048, 4, 256, 0, 127, 1, 0.921 2048, 0, 512, 0, 127, 1, 0.792 2048, 5, 512, 0, 127, 1, 0.78 2048, 0, 1024, 0, 127, 1, 0.733 2048, 6, 1024, 0, 127, 1, 0.729 2048, 0, 2048, 0, 127, 1, 0.795 2048, 7, 2048, 0, 127, 1, 0.805 2048, 0, 4096, 0, 127, 1, 0.803 2048, 8, 4096, 0, 127, 1, 0.794 256, 1, 64, 0, 127, 1, 0.584 256, 15, 64, 0, 127, 1, 0.587 256, 2, 64, 0, 127, 1, 0.586 256, 30, 64, 0, 127, 1, 0.592 256, 3, 64, 0, 127, 1, 0.586 256, 45, 64, 0, 127, 1, 0.505 256, 4, 64, 0, 127, 1, 0.59 256, 60, 64, 0, 127, 1, 0.501 256, 5, 64, 0, 127, 1, 0.595 256, 75, 64, 0, 127, 1, 0.588 256, 6, 64, 0, 127, 1, 0.593 256, 90, 64, 0, 127, 1, 0.594 256, 7, 64, 0, 127, 1, 0.596 256, 105, 64, 0, 127, 1, 0.506 1, 0, 0, 0, 127, 1, 0.872 2, 0, 1, 0, 127, 1, 0.861 3, 0, 2, 0, 127, 1, 0.862 4, 0, 3, 0, 127, 1, 0.884 5, 0, 4, 0, 127, 1, 0.869 6, 0, 5, 0, 127, 1, 0.861 7, 0, 6, 0, 127, 1, 0.865 8, 0, 7, 0, 127, 1, 0.884 9, 0, 8, 0, 127, 1, 0.862 10, 0, 9, 0, 127, 1, 0.889 11, 0, 10, 0, 127, 1, 0.9 12, 0, 11, 0, 127, 1, 0.897 13, 0, 12, 0, 127, 1, 0.909 14, 0, 13, 0, 127, 1, 0.885 15, 0, 14, 0, 127, 1, 0.929 16, 0, 15, 0, 127, 1, 0.871 17, 0, 16, 0, 127, 1, 0.875 18, 0, 17, 0, 127, 1, 0.878 19, 0, 18, 0, 127, 1, 0.889 20, 0, 19, 0, 127, 1, 0.89 21, 0, 20, 0, 127, 1, 0.901 22, 0, 21, 0, 127, 1, 0.91 23, 0, 22, 0, 127, 1, 0.912 24, 0, 23, 0, 127, 1, 0.907 25, 0, 24, 0, 127, 1, 0.947 26, 0, 25, 0, 127, 1, 0.904 27, 0, 26, 0, 127, 1, 0.921 28, 0, 27, 0, 127, 1, 0.899 29, 0, 28, 0, 127, 1, 0.923 30, 0, 29, 0, 127, 1, 0.918 31, 0, 30, 0, 127, 1, 0.943 32, 0, 31, 0, 127, 1, 0.914 2048, 0, 32, 23, 127, 1, 0.815 2048, 1, 32, 23, 127, 1, 0.829 2048, 0, 64, 23, 127, 1, 0.884 2048, 2, 64, 23, 127, 1, 0.882 2048, 0, 128, 23, 127, 1, 0.884 2048, 3, 128, 23, 127, 1, 0.851 2048, 0, 256, 23, 127, 1, 0.843 2048, 4, 256, 23, 127, 1, 0.867 2048, 0, 512, 23, 127, 1, 0.746 2048, 5, 512, 23, 127, 1, 0.863 2048, 0, 1024, 23, 127, 1, 0.662 2048, 6, 1024, 23, 127, 1, 0.683 2048, 0, 2048, 23, 127, 1, 0.852 2048, 7, 2048, 23, 127, 1, 0.837 2048, 0, 4096, 23, 
127, 1, 0.837 2048, 8, 4096, 23, 127, 1, 0.829 256, 1, 64, 23, 127, 1, 0.934 256, 15, 64, 23, 127, 1, 0.936 256, 2, 64, 23, 127, 1, 0.931 256, 30, 64, 23, 127, 1, 0.938 256, 3, 64, 23, 127, 1, 0.927 256, 45, 64, 23, 127, 1, 0.863 256, 4, 64, 23, 127, 1, 0.939 256, 60, 64, 23, 127, 1, 0.871 256, 5, 64, 23, 127, 1, 0.94 256, 75, 64, 23, 127, 1, 0.933 256, 6, 64, 23, 127, 1, 0.915 256, 90, 64, 23, 127, 1, 0.934 256, 7, 64, 23, 127, 1, 0.938 256, 105, 64, 23, 127, 1, 0.871 1, 0, 0, 23, 127, 1, 0.865 2, 0, 1, 23, 127, 1, 0.87 3, 0, 2, 23, 127, 1, 0.882 4, 0, 3, 23, 127, 1, 0.901 5, 0, 4, 23, 127, 1, 0.879 6, 0, 5, 23, 127, 1, 0.934 7, 0, 6, 23, 127, 1, 0.874 8, 0, 7, 23, 127, 1, 0.895 9, 0, 8, 23, 127, 1, 0.873 10, 0, 9, 23, 127, 1, 0.861 11, 0, 10, 23, 127, 1, 0.865 12, 0, 11, 23, 127, 1, 0.875 13, 0, 12, 23, 127, 1, 0.878 14, 0, 13, 23, 127, 1, 0.86 15, 0, 14, 23, 127, 1, 0.889 16, 0, 15, 23, 127, 1, 0.875 17, 0, 16, 23, 127, 1, 0.911 18, 0, 17, 23, 127, 1, 0.891 19, 0, 18, 23, 127, 1, 0.921 20, 0, 19, 23, 127, 1, 0.898 21, 0, 20, 23, 127, 1, 0.895 22, 0, 21, 23, 127, 1, 0.906 23, 0, 22, 23, 127, 1, 0.911 24, 0, 23, 23, 127, 1, 0.877 25, 0, 24, 23, 127, 1, 0.9 26, 0, 25, 23, 127, 1, 0.911 27, 0, 26, 23, 127, 1, 0.926 28, 0, 27, 23, 127, 1, 0.918 29, 0, 28, 23, 127, 1, 0.952 30, 0, 29, 23, 127, 1, 0.943 31, 0, 30, 23, 127, 1, 0.934 32, 0, 31, 23, 127, 1, 0.8 2048, 0, 32, 23, 127, 2, 0.872 2048, 1, 32, 23, 127, 2, 0.819 2048, 0, 64, 23, 127, 2, 0.815 2048, 2, 64, 23, 127, 2, 0.805 2048, 0, 128, 23, 127, 2, 0.884 2048, 3, 128, 23, 127, 2, 0.852 2048, 0, 256, 23, 127, 2, 0.873 2048, 4, 256, 23, 127, 2, 0.871 2048, 0, 512, 23, 127, 2, 0.654 2048, 5, 512, 23, 127, 2, 0.762 2048, 0, 1024, 23, 127, 2, 0.646 2048, 6, 1024, 23, 127, 2, 0.665 2048, 0, 2048, 23, 127, 2, 0.678 2048, 7, 2048, 23, 127, 2, 0.675 2048, 0, 4096, 23, 127, 2, 0.849 2048, 8, 4096, 23, 127, 2, 0.835 256, 1, 64, 23, 127, 2, 0.917 256, 15, 64, 23, 127, 2, 0.915 256, 2, 64, 23, 127, 2, 0.911 256, 30, 64, 23, 127, 2, 0.907 256, 3, 64, 23, 127, 2, 0.9 256, 45, 64, 23, 127, 2, 0.816 256, 4, 64, 23, 127, 2, 0.912 256, 60, 64, 23, 127, 2, 0.81 256, 5, 64, 23, 127, 2, 0.904 256, 75, 64, 23, 127, 2, 0.911 256, 6, 64, 23, 127, 2, 0.898 256, 90, 64, 23, 127, 2, 0.912 256, 7, 64, 23, 127, 2, 0.909 256, 105, 64, 23, 127, 2, 0.81 1, 0, 0, 23, 127, 2, 0.858 2, 0, 1, 23, 127, 2, 0.89 3, 0, 2, 23, 127, 2, 0.877 4, 0, 3, 23, 127, 2, 0.863 5, 0, 4, 23, 127, 2, 0.863 6, 0, 5, 23, 127, 2, 0.889 7, 0, 6, 23, 127, 2, 0.898 8, 0, 7, 23, 127, 2, 0.885 9, 0, 8, 23, 127, 2, 0.863 10, 0, 9, 23, 127, 2, 0.902 11, 0, 10, 23, 127, 2, 0.865 12, 0, 11, 23, 127, 2, 0.864 13, 0, 12, 23, 127, 2, 0.87 14, 0, 13, 23, 127, 2, 0.862 15, 0, 14, 23, 127, 2, 0.861 16, 0, 15, 23, 127, 2, 0.859 17, 0, 16, 23, 127, 2, 0.87 18, 0, 17, 23, 127, 2, 0.892 19, 0, 18, 23, 127, 2, 0.874 20, 0, 19, 23, 127, 2, 0.866 21, 0, 20, 23, 127, 2, 0.877 22, 0, 21, 23, 127, 2, 0.868 23, 0, 22, 23, 127, 2, 0.884 24, 0, 23, 23, 127, 2, 0.881 25, 0, 24, 23, 127, 2, 0.872 26, 0, 25, 23, 127, 2, 0.866 27, 0, 26, 23, 127, 2, 0.881 28, 0, 27, 23, 127, 2, 0.93 29, 0, 28, 23, 127, 2, 0.886 30, 0, 29, 23, 127, 2, 0.869 31, 0, 30, 23, 127, 2, 0.869 32, 0, 31, 23, 127, 2, 0.667 2048, 0, 32, 23, 127, 4, 0.858 2048, 1, 32, 23, 127, 4, 0.858 2048, 0, 64, 23, 127, 4, 0.838 2048, 2, 64, 23, 127, 4, 0.834 2048, 0, 128, 23, 127, 4, 0.85 2048, 3, 128, 23, 127, 4, 0.762 2048, 0, 256, 23, 127, 4, 0.874 2048, 4, 256, 23, 127, 4, 0.796 2048, 0, 512, 23, 127, 4, 0.691 2048, 5, 512, 23, 127, 4, 0.755 2048, 0, 1024, 23, 
127, 4, 0.676 2048, 6, 1024, 23, 127, 4, 0.661 2048, 0, 2048, 23, 127, 4, 0.678 2048, 7, 2048, 23, 127, 4, 0.678 2048, 0, 4096, 23, 127, 4, 0.676 2048, 8, 4096, 23, 127, 4, 0.677 256, 1, 64, 23, 127, 4, 0.875 256, 15, 64, 23, 127, 4, 0.877 256, 2, 64, 23, 127, 4, 0.875 256, 30, 64, 23, 127, 4, 0.875 256, 3, 64, 23, 127, 4, 0.878 256, 45, 64, 23, 127, 4, 0.829 256, 4, 64, 23, 127, 4, 0.876 256, 60, 64, 23, 127, 4, 0.807 256, 5, 64, 23, 127, 4, 0.874 256, 75, 64, 23, 127, 4, 0.872 256, 6, 64, 23, 127, 4, 0.874 256, 90, 64, 23, 127, 4, 0.874 256, 7, 64, 23, 127, 4, 0.873 256, 105, 64, 23, 127, 4, 0.826 1, 0, 0, 23, 127, 4, 0.863 2, 0, 1, 23, 127, 4, 0.861 3, 0, 2, 23, 127, 4, 0.863 4, 0, 3, 23, 127, 4, 0.867 5, 0, 4, 23, 127, 4, 0.866 6, 0, 5, 23, 127, 4, 0.873 7, 0, 6, 23, 127, 4, 0.873 8, 0, 7, 23, 127, 4, 0.866 9, 0, 8, 23, 127, 4, 0.861 10, 0, 9, 23, 127, 4, 0.861 11, 0, 10, 23, 127, 4, 0.857 12, 0, 11, 23, 127, 4, 0.864 13, 0, 12, 23, 127, 4, 0.86 14, 0, 13, 23, 127, 4, 0.859 15, 0, 14, 23, 127, 4, 0.854 16, 0, 15, 23, 127, 4, 0.857 17, 0, 16, 23, 127, 4, 0.881 18, 0, 17, 23, 127, 4, 0.863 19, 0, 18, 23, 127, 4, 0.86 20, 0, 19, 23, 127, 4, 0.906 21, 0, 20, 23, 127, 4, 0.924 22, 0, 21, 23, 127, 4, 0.885 23, 0, 22, 23, 127, 4, 0.861 24, 0, 23, 23, 127, 4, 0.907 25, 0, 24, 23, 127, 4, 0.909 26, 0, 25, 23, 127, 4, 0.863 27, 0, 26, 23, 127, 4, 0.862 28, 0, 27, 23, 127, 4, 0.887 29, 0, 28, 23, 127, 4, 0.879 30, 0, 29, 23, 127, 4, 0.932 31, 0, 30, 23, 127, 4, 0.895 32, 0, 31, 23, 127, 4, 0.666 2048, 0, 32, 23, 127, 8, 0.865 2048, 1, 32, 23, 127, 8, 0.892 2048, 0, 64, 23, 127, 8, 0.85 2048, 2, 64, 23, 127, 8, 0.834 2048, 0, 128, 23, 127, 8, 0.823 2048, 3, 128, 23, 127, 8, 0.809 2048, 0, 256, 23, 127, 8, 0.84 2048, 4, 256, 23, 127, 8, 0.738 2048, 0, 512, 23, 127, 8, 0.656 2048, 5, 512, 23, 127, 8, 0.644 2048, 0, 1024, 23, 127, 8, 0.705 2048, 6, 1024, 23, 127, 8, 0.708 2048, 0, 2048, 23, 127, 8, 0.701 2048, 7, 2048, 23, 127, 8, 0.7 2048, 0, 4096, 23, 127, 8, 0.68 2048, 8, 4096, 23, 127, 8, 0.678 256, 1, 64, 23, 127, 8, 0.881 256, 15, 64, 23, 127, 8, 0.879 256, 2, 64, 23, 127, 8, 0.878 256, 30, 64, 23, 127, 8, 0.877 256, 3, 64, 23, 127, 8, 0.88 256, 45, 64, 23, 127, 8, 0.829 256, 4, 64, 23, 127, 8, 0.883 256, 60, 64, 23, 127, 8, 0.808 256, 5, 64, 23, 127, 8, 0.875 256, 75, 64, 23, 127, 8, 0.877 256, 6, 64, 23, 127, 8, 0.874 256, 90, 64, 23, 127, 8, 0.874 256, 7, 64, 23, 127, 8, 0.874 256, 105, 64, 23, 127, 8, 0.83 1, 0, 0, 23, 127, 8, 0.862 2, 0, 1, 23, 127, 8, 0.865 3, 0, 2, 23, 127, 8, 0.866 4, 0, 3, 23, 127, 8, 0.863 5, 0, 4, 23, 127, 8, 0.874 6, 0, 5, 23, 127, 8, 0.87 7, 0, 6, 23, 127, 8, 0.87 8, 0, 7, 23, 127, 8, 0.864 9, 0, 8, 23, 127, 8, 0.87 10, 0, 9, 23, 127, 8, 0.861 11, 0, 10, 23, 127, 8, 0.862 12, 0, 11, 23, 127, 8, 0.87 13, 0, 12, 23, 127, 8, 0.858 14, 0, 13, 23, 127, 8, 0.86 15, 0, 14, 23, 127, 8, 0.863 16, 0, 15, 23, 127, 8, 0.866 17, 0, 16, 23, 127, 8, 0.86 18, 0, 17, 23, 127, 8, 0.887 19, 0, 18, 23, 127, 8, 0.858 20, 0, 19, 23, 127, 8, 0.891 21, 0, 20, 23, 127, 8, 0.874 22, 0, 21, 23, 127, 8, 0.891 23, 0, 22, 23, 127, 8, 0.873 24, 0, 23, 23, 127, 8, 0.895 25, 0, 24, 23, 127, 8, 0.884 26, 0, 25, 23, 127, 8, 0.878 27, 0, 26, 23, 127, 8, 0.878 28, 0, 27, 23, 127, 8, 0.891 29, 0, 28, 23, 127, 8, 0.91 30, 0, 29, 23, 127, 8, 0.881 31, 0, 30, 23, 127, 8, 0.917 32, 0, 31, 23, 127, 8, 0.667 2048, 0, 32, 23, 127, 16, 0.86 2048, 1, 32, 23, 127, 16, 0.847 2048, 0, 64, 23, 127, 16, 0.846 2048, 2, 64, 23, 127, 16, 0.852 2048, 0, 128, 23, 127, 16, 0.82 2048, 3, 128, 23, 127, 16, 0.751 2048, 0, 
256, 23, 127, 16, 0.788 2048, 4, 256, 23, 127, 16, 0.712 2048, 0, 512, 23, 127, 16, 0.524 2048, 5, 512, 23, 127, 16, 0.517 2048, 0, 1024, 23, 127, 16, 0.583 2048, 6, 1024, 23, 127, 16, 0.682 2048, 0, 2048, 23, 127, 16, 0.77 2048, 7, 2048, 23, 127, 16, 0.659 2048, 0, 4096, 23, 127, 16, 0.7 2048, 8, 4096, 23, 127, 16, 0.7 256, 1, 64, 23, 127, 16, 0.798 256, 15, 64, 23, 127, 16, 0.873 256, 2, 64, 23, 127, 16, 0.875 256, 30, 64, 23, 127, 16, 0.877 256, 3, 64, 23, 127, 16, 0.875 256, 45, 64, 23, 127, 16, 0.834 256, 4, 64, 23, 127, 16, 0.873 256, 60, 64, 23, 127, 16, 0.809 256, 5, 64, 23, 127, 16, 0.879 256, 75, 64, 23, 127, 16, 0.884 256, 6, 64, 23, 127, 16, 0.874 256, 90, 64, 23, 127, 16, 0.876 256, 7, 64, 23, 127, 16, 0.876 256, 105, 64, 23, 127, 16, 0.827 1, 0, 0, 23, 127, 16, 0.859 2, 0, 1, 23, 127, 16, 0.864 3, 0, 2, 23, 127, 16, 0.871 4, 0, 3, 23, 127, 16, 0.869 5, 0, 4, 23, 127, 16, 0.881 6, 0, 5, 23, 127, 16, 0.869 7, 0, 6, 23, 127, 16, 0.867 8, 0, 7, 23, 127, 16, 0.877 9, 0, 8, 23, 127, 16, 0.862 10, 0, 9, 23, 127, 16, 0.861 11, 0, 10, 23, 127, 16, 0.859 12, 0, 11, 23, 127, 16, 0.858 13, 0, 12, 23, 127, 16, 0.867 14, 0, 13, 23, 127, 16, 0.857 15, 0, 14, 23, 127, 16, 0.858 16, 0, 15, 23, 127, 16, 0.857 17, 0, 16, 23, 127, 16, 0.858 18, 0, 17, 23, 127, 16, 0.867 19, 0, 18, 23, 127, 16, 0.875 20, 0, 19, 23, 127, 16, 0.868 21, 0, 20, 23, 127, 16, 0.861 22, 0, 21, 23, 127, 16, 0.868 23, 0, 22, 23, 127, 16, 0.866 24, 0, 23, 23, 127, 16, 0.858 25, 0, 24, 23, 127, 16, 0.859 26, 0, 25, 23, 127, 16, 0.857 27, 0, 26, 23, 127, 16, 0.866 28, 0, 27, 23, 127, 16, 0.875 29, 0, 28, 23, 127, 16, 0.896 30, 0, 29, 23, 127, 16, 0.889 31, 0, 30, 23, 127, 16, 0.903 32, 0, 31, 23, 127, 16, 0.667
sysdeps/x86_64/multiarch/strrchr-avx2.S | 415 +++++++++++++++---------
1 file changed, 258 insertions(+), 157 deletions(-)
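(For reference: each row above is one benchmark configuration and the last column is new time divided by old time, so lower is better. The "Geometric Mean of all benchmarks" figure quoted for these tables is the geometric mean of those ratios; a minimal sketch of that reduction, with a hypothetical helper name, is:

    #include <math.h>
    #include <stddef.h>

    /* Geometric mean of N New/Old ratios: exp of the mean of the
       logs.  A geometric mean is the appropriate summary for ratio
       data; an arithmetic mean would overweight the slow cases.  */
    static double
    geomean (const double *ratios, size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (ratios[i]);
      return exp (log_sum / (double) n);
    }
)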
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S index 1df2adfad0..9d1e45defc 100644 --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S @@ -27,9 +27,13 @@ # ifdef USE_AS_WCSRCHR # define VPBROADCAST vpbroadcastd # define VPCMPEQ vpcmpeqd +# define VPMIN vpminud +# define CHAR_SIZE 4 # else # define VPBROADCAST vpbroadcastb # define VPCMPEQ vpcmpeqb +# define VPMIN vpminub +# define CHAR_SIZE 1 # endif # ifndef VZEROUPPER @@ -41,196 +45,293 @@ # endif # define VEC_SIZE 32 +# define PAGE_SIZE 4096 - .section SECTION(.text),"ax",@progbits -ENTRY (STRRCHR) - movd %esi, %xmm4 - movl %edi, %ecx + .section SECTION(.text), "ax", @progbits +ENTRY(STRRCHR) + movd %esi, %xmm7 + movl %edi, %eax /* Broadcast CHAR to YMM4. */ - VPBROADCAST %xmm4, %ymm4 + VPBROADCAST %xmm7, %ymm7 vpxor %xmm0, %xmm0, %xmm0 - /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + /* Shift here instead of `andl` to save code size (saves a fetch + block). */ + sall $20, %eax + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax + ja L(cross_page) +L(page_cross_continue): vmovdqu (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - addq $VEC_SIZE, %rdi + /* Check end of string match. */ + VPCMPEQ %ymm1, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx + testl %ecx, %ecx + jz L(aligned_more) + + /* Only check match with search CHAR if needed. */ + VPCMPEQ %ymm1, %ymm7, %ymm1 + vpmovmskb %ymm1, %eax + /* Check if match before first zero. */ + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret0) + bsrl %eax, %eax + addq %rdi, %rax + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If + search CHAR is zero we are correct. Either way `andq + -CHAR_SIZE, %rax` gets the correct result. */ +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif +L(ret0): +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + /* Returns for first vec x1/x2 have hard coded backward search + path for earlier matches. */ + .p2align 4,, 10 +L(first_vec_x1): + VPCMPEQ %ymm2, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jnz L(first_vec_x1_return) + + .p2align 4,, 4 +L(first_vec_x0_test): + VPCMPEQ %ymm1, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + testl %eax, %eax + jz L(ret1) + bsrl %eax, %eax + addq %r8, %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif +L(ret1): + VZEROUPPER_RETURN + .p2align 4,, 10 +L(first_vec_x0_x1_test): + VPCMPEQ %ymm2, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax testl %eax, %eax - jnz L(first_vec) + jz L(first_vec_x0_test) + .p2align 4,, 4 +L(first_vec_x1_return): + bsrl %eax, %eax + leaq 1(%rdi, %rax), %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif + VZEROUPPER_RETURN - testl %ecx, %ecx - jnz L(return_null) - andq $-VEC_SIZE, %rdi - xorl %edx, %edx - jmp L(aligned_loop) + .p2align 4,, 10 +L(first_vec_x2): + VPCMPEQ %ymm3, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(first_vec_x0_x1_test) + bsrl %eax, %eax + leaq (VEC_SIZE + 1)(%rdi, %rax), %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif + VZEROUPPER_RETURN + .p2align 4 -L(first_vec): - /* Check if there is a nul CHAR. */ +L(aligned_more): + /* Save original pointer if match was in VEC 0. */ + movq %rdi, %r8 + + /* Align src. */ + orq $(VEC_SIZE - 1), %rdi + vmovdqu 1(%rdi), %ymm2 + VPCMPEQ %ymm2, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx testl %ecx, %ecx - jnz L(char_and_nul_in_first_vec) + jnz L(first_vec_x1) - /* Remember the match and keep searching. */ - movl %eax, %edx - movq %rdi, %rsi - andq $-VEC_SIZE, %rdi - jmp L(aligned_loop) + vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3 + VPCMPEQ %ymm3, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx + testl %ecx, %ecx + jnz L(first_vec_x2) + /* Save pointer again before realigning. */ + movq %rdi, %rsi + addq $(VEC_SIZE + 1), %rdi + andq $-(VEC_SIZE * 2), %rdi .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %edx - vpmovmskb %ymm3, %eax - shrl %cl, %edx - shrl %cl, %eax - addq $VEC_SIZE, %rdi - - /* Check if there is a CHAR. */ +L(first_aligned_loop): + /* Do 2x VEC at a time. Any more and the cost of finding the + match outweighs loop benefit. */ + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 + + VPCMPEQ %ymm4, %ymm7, %ymm6 + VPMIN %ymm4, %ymm5, %ymm8 + VPCMPEQ %ymm5, %ymm7, %ymm10 + vpor %ymm6, %ymm10, %ymm5 + VPCMPEQ %ymm8, %ymm0, %ymm8 + vpor %ymm5, %ymm8, %ymm9 + + vpmovmskb %ymm9, %eax + addq $(VEC_SIZE * 2), %rdi + /* No zero or search CHAR. */ testl %eax, %eax - jnz L(found_char) - - testl %edx, %edx - jnz L(return_null) + jz L(first_aligned_loop) - jmp L(aligned_loop) - - .p2align 4 -L(found_char): - testl %edx, %edx - jnz L(char_and_nul) + /* If no zero CHAR then go to second loop (this allows us to + throw away all prior work). */ + vpmovmskb %ymm8, %ecx + testl %ecx, %ecx + jz L(second_aligned_loop_prep) - /* Remember the match and keep searching. */ - movl %eax, %edx - leaq (%rdi, %rcx), %rsi + /* Search char could be zero so we need to get the true match. + */ + vpmovmskb %ymm5, %eax + testl %eax, %eax + jnz L(first_aligned_loop_return) - .p2align 4 -L(aligned_loop): - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - orl %eax, %ecx - jnz L(char_nor_null) - - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - add $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx + .p2align 4,, 4 +L(first_vec_x1_or_x2): + VPCMPEQ %ymm3, %ymm7, %ymm3 + VPCMPEQ %ymm2, %ymm7, %ymm2 vpmovmskb %ymm3, %eax - orl %eax, %ecx - jnz L(char_nor_null) - - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - orl %eax, %ecx - jnz L(char_nor_null) - - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - orl %eax, %ecx - jz L(aligned_loop) - - .p2align 4 -L(char_nor_null): - /* Find a CHAR or a nul CHAR in a loop. */ - testl %eax, %eax - jnz L(match) -L(return_value): - testl %edx, %edx - jz L(return_null) - movl %edx, %eax - movq %rsi, %rdi + vpmovmskb %ymm2, %edx + /* Use add for macro-fusion. */ + addq %rax, %rdx + jz L(first_vec_x0_test) + /* NB: We could move this shift to before the branch and save a + bit of code size / performance on the fall through. The + branch leads to the null case which generally seems hotter + than char in first 3x VEC. */ + salq $32, %rax + addq %rdx, %rax + bsrq %rax, %rax + leaq 1(%rsi, %rax), %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif + VZEROUPPER_RETURN + .p2align 4,, 8 +L(first_aligned_loop_return): + VPCMPEQ %ymm4, %ymm0, %ymm4 + vpmovmskb %ymm4, %edx + salq $32, %rcx + orq %rdx, %rcx + + vpmovmskb %ymm10, %eax + vpmovmskb %ymm6, %edx + salq $32, %rax + orq %rdx, %rax + blsmskq %rcx, %rcx + andq %rcx, %rax + jz L(first_vec_x1_or_x2) + + bsrq %rax, %rax + leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax # ifdef USE_AS_WCSRCHR - /* Keep the first bit for each matching CHAR for bsr. */ - andl $0x11111111, %eax + andq $-CHAR_SIZE, %rax # endif - bsrl %eax, %eax - leaq -VEC_SIZE(%rdi, %rax), %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN + VZEROUPPER_RETURN + /* Search char cannot be zero. */ .p2align 4 -L(match): - /* Find a CHAR. Check if there is a nul CHAR. */ - vpmovmskb %ymm2, %ecx - testl %ecx, %ecx - jnz L(find_nul) - - /* Remember the match and keep searching. */ - movl %eax, %edx +L(second_aligned_loop_set_furthest_match): + /* Save VEC and pointer from most recent match. */ +L(second_aligned_loop_prep): movq %rdi, %rsi - jmp L(aligned_loop) + vmovdqu %ymm6, %ymm2 + vmovdqu %ymm10, %ymm3 .p2align 4 -L(find_nul): -# ifdef USE_AS_WCSRCHR - /* Keep the first bit for each matching CHAR for bsr. */ - andl $0x11111111, %ecx - andl $0x11111111, %eax -# endif - /* Mask out any matching bits after the nul CHAR. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax +L(second_aligned_loop): + /* Search 2x at a time. */ + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 + + VPCMPEQ %ymm4, %ymm7, %ymm6 + VPMIN %ymm4, %ymm5, %ymm1 + VPCMPEQ %ymm5, %ymm7, %ymm10 + vpor %ymm6, %ymm10, %ymm5 + VPCMPEQ %ymm1, %ymm0, %ymm1 + vpor %ymm5, %ymm1, %ymm9 + + vpmovmskb %ymm9, %eax + addq $(VEC_SIZE * 2), %rdi testl %eax, %eax - /* If there is no CHAR here, return the remembered one. */ - jz L(return_value) - bsrl %eax, %eax - leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER_RETURN - - .p2align 4 -L(char_and_nul): - /* Find both a CHAR and a nul CHAR. */ - addq %rcx, %rdi - movl %edx, %ecx -L(char_and_nul_in_first_vec): -# ifdef USE_AS_WCSRCHR - /* Keep the first bit for each matching CHAR for bsr. */ - andl $0x11111111, %ecx - andl $0x11111111, %eax -# endif - /* Mask out any matching bits after the nul CHAR. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax + jz L(second_aligned_loop) + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jz L(second_aligned_loop_set_furthest_match) + vpmovmskb %ymm5, %eax testl %eax, %eax - /* Return null pointer if the nul CHAR comes first. */ - jz L(return_null) - bsrl %eax, %eax - leaq -VEC_SIZE(%rdi, %rax), %rax + jnz L(return_new_match) + + /* This is the hot path. We know CHAR is in bounds and that + ymm3/ymm2 have latest match. */ + .p2align 4,, 4 +L(return_old_match): + vpmovmskb %ymm3, %eax + vpmovmskb %ymm2, %edx + salq $32, %rax + orq %rdx, %rax + bsrq %rax, %rax + /* Search char cannot be zero so safe to just use lea for + wcsrchr. */ + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax VZEROUPPER_RETURN - .p2align 4 -L(return_null): - xorl %eax, %eax + /* Last iteration also potentially has a match. */ + .p2align 4,, 8 +L(return_new_match): + VPCMPEQ %ymm4, %ymm0, %ymm4 + vpmovmskb %ymm4, %edx + salq $32, %rcx + orq %rdx, %rcx + + vpmovmskb %ymm10, %eax + vpmovmskb %ymm6, %edx + salq $32, %rax + orq %rdx, %rax + blsmskq %rcx, %rcx + andq %rcx, %rax + jz L(return_old_match) + bsrq %rax, %rax + /* Search char cannot be zero so safe to just use lea for + wcsrchr. */ + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax VZEROUPPER_RETURN -END (STRRCHR) + .p2align 4,, 4 +L(cross_page): + movq %rdi, %rsi + andq $-VEC_SIZE, %rsi + vmovdqu (%rsi), %ymm1 + VPCMPEQ %ymm1, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx + shrxl %edi, %ecx, %ecx + testl %ecx, %ecx + jz L(page_cross_continue) + VPCMPEQ %ymm1, %ymm7, %ymm1 + vpmovmskb %ymm1, %eax + shrxl %edi, %eax, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret2) + bsrl %eax, %eax + addq %rdi, %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif +L(ret2): + VZEROUPPER_RETURN +END(STRRCHR) #endif
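(The entry sequence above replaces the old two-vector alignment check with a single page-cross test: shifting the address left by 20 keeps only the low 12 bits, the page offset, so the unsigned compare asks whether a VEC_SIZE load from that offset would run past the page. A C restatement of the same predicate, with constants as in the file and a hypothetical helper name:

    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE 32

    /* Nonzero when a VEC_SIZE-byte load at P would touch the next
       page, i.e. (p % PAGE_SIZE) > PAGE_SIZE - VEC_SIZE.  The asm
       gets the same result from `sall $20' plus an unsigned compare
       against (PAGE_SIZE - VEC_SIZE) << 20.  */
    static int
    page_cross (const void *p)
    {
      return ((uintptr_t) p & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
    }
)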
From patchwork Thu Apr 21 03:14:13 2022
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 53086
To: libc-alpha@sourceware.org
Subject: [PATCH v1 5/5] x86: Optimize {str|wcs}rchr-evex
Date: Wed, 20 Apr 2022 22:14:13 -0500
Message-Id: <20220421031410.2142238-5-goldstein.w.n@gmail.com>
In-Reply-To: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
References: <20220421031410.2142238-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr

Geometric Mean of N=30 runs.
Geometric Mean of all benchmarks New / Old: 0.755
Benchmark performance on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

len, align, pos, seek, max_char, freq, New Time / Old Time
2048, 0, 32, 0, 127, 1, 0.669 2048, 1, 32, 0, 127, 1, 0.672 2048, 0, 64, 0, 127, 1, 0.579 2048, 2, 64, 0, 127, 1, 0.579 2048, 0, 128, 0, 127, 1, 0.828 2048, 3, 128, 0, 127, 1, 0.827 2048, 0, 256, 0, 127, 1, 0.693 2048, 4, 256, 0, 127, 1, 0.692 2048, 0, 512, 0, 127, 1, 0.619 2048, 5, 512, 0, 127, 1, 0.622 2048, 0, 1024, 0, 127, 1, 0.626 2048, 6, 1024, 0, 127, 1, 0.627 2048, 0, 2048, 0, 127, 1, 0.85 2048, 7, 2048, 0, 127, 1, 0.855 2048, 0, 4096, 0, 127, 1, 0.849 2048, 8, 4096, 0, 127, 1, 0.848 256, 1, 64, 0, 127, 1, 0.579 256, 15, 64, 0, 127, 1, 0.579 256, 2, 64, 0, 127, 1, 0.579 256, 30, 64, 0, 127, 1, 0.579 256, 3, 64, 0, 127, 1, 0.579 256, 45, 64, 0, 127, 1, 0.551 256, 4, 64, 0, 127, 1, 0.579 256, 60, 64, 0, 127, 1, 0.553 256, 5, 64, 0, 127, 1, 0.579 256, 75, 64, 0, 127, 1, 0.578 256, 6, 64, 0, 127, 1, 0.578 256, 90, 64, 0, 127, 1, 0.579 256, 7, 64, 0, 127, 1, 0.579 256, 105, 64, 0, 127, 1, 0.55 1, 0, 0, 0, 127, 1, 0.795 2, 0, 1, 0, 127, 1, 0.797 3, 0, 2, 0, 127, 1, 0.796 4, 0, 3, 0, 127, 1, 0.792 5, 0, 4, 0, 127, 1, 0.789 6, 0, 5, 0, 127, 1, 0.791 7, 0, 6, 0, 127, 1, 0.793 8, 0, 7, 0, 127, 1, 0.789 9, 0, 8, 0, 127, 1, 0.797 10, 0, 9, 0, 127, 1, 0.788 11, 0, 10, 0, 127, 1, 0.796 12, 0, 11, 0, 127, 1, 0.793 13, 0, 12, 0, 127, 1, 0.797 14, 0, 13, 0, 127, 1, 0.795 15, 0, 14, 0, 127, 1, 0.795 16, 0, 15, 0, 127, 1, 0.791 17, 0, 16, 0, 127, 1, 0.798 18, 0, 17, 0, 127, 1, 0.8 19, 0, 18, 0, 127, 1, 0.797 20, 0, 19, 0, 127, 1, 0.798 21, 0, 20, 0, 127, 1, 0.797 22, 0, 21, 0, 127, 1, 0.796 23, 0, 22, 0, 127, 1, 0.792 24, 0, 23, 0, 127, 1, 0.791 25, 0, 24, 0, 127, 1, 0.794 26, 0, 25, 0, 127, 1, 0.797 27, 0, 26, 0, 127, 1, 0.793 28, 0, 27, 0, 127, 1, 0.79 29, 0, 28, 0, 127, 1, 0.79 30, 0, 29, 0, 127, 1, 0.791 31, 0, 30, 0, 127, 1, 0.791 32, 0, 31, 0, 127, 1, 0.79 2048, 0, 32, 23, 127, 1, 0.734 2048, 1, 32, 23, 127, 1, 0.748 2048, 0, 64, 23, 127, 1, 0.759 2048, 2, 64, 23, 127, 1, 0.753 2048, 0, 128, 23, 127, 1, 0.834 2048, 3, 128, 23, 127, 1, 0.835 2048, 0, 256, 23, 127, 1, 0.789 2048, 4, 256, 23, 127, 1, 0.791 2048, 0, 512, 23, 127, 1, 0.882 2048, 5, 512, 23, 127, 1, 0.861 2048, 0, 1024, 23,
127, 1, 0.643 2048, 6, 1024, 23, 127, 1, 0.643 2048, 0, 2048, 23, 127, 1, 0.931 2048, 7, 2048, 23, 127, 1, 0.929 2048, 0, 4096, 23, 127, 1, 0.922 2048, 8, 4096, 23, 127, 1, 0.934 256, 1, 64, 23, 127, 1, 0.73 256, 15, 64, 23, 127, 1, 0.729 256, 2, 64, 23, 127, 1, 0.725 256, 30, 64, 23, 127, 1, 0.728 256, 3, 64, 23, 127, 1, 0.727 256, 45, 64, 23, 127, 1, 0.749 256, 4, 64, 23, 127, 1, 0.73 256, 60, 64, 23, 127, 1, 0.752 256, 5, 64, 23, 127, 1, 0.729 256, 75, 64, 23, 127, 1, 0.727 256, 6, 64, 23, 127, 1, 0.693 256, 90, 64, 23, 127, 1, 0.73 256, 7, 64, 23, 127, 1, 0.73 256, 105, 64, 23, 127, 1, 0.751 1, 0, 0, 23, 127, 1, 0.797 2, 0, 1, 23, 127, 1, 0.794 3, 0, 2, 23, 127, 1, 0.797 4, 0, 3, 23, 127, 1, 0.792 5, 0, 4, 23, 127, 1, 0.781 6, 0, 5, 23, 127, 1, 0.783 7, 0, 6, 23, 127, 1, 0.79 8, 0, 7, 23, 127, 1, 0.791 9, 0, 8, 23, 127, 1, 0.794 10, 0, 9, 23, 127, 1, 0.795 11, 0, 10, 23, 127, 1, 0.795 12, 0, 11, 23, 127, 1, 0.795 13, 0, 12, 23, 127, 1, 0.794 14, 0, 13, 23, 127, 1, 0.792 15, 0, 14, 23, 127, 1, 0.79 16, 0, 15, 23, 127, 1, 0.793 17, 0, 16, 23, 127, 1, 0.795 18, 0, 17, 23, 127, 1, 0.797 19, 0, 18, 23, 127, 1, 0.796 20, 0, 19, 23, 127, 1, 0.796 21, 0, 20, 23, 127, 1, 0.794 22, 0, 21, 23, 127, 1, 0.794 23, 0, 22, 23, 127, 1, 0.793 24, 0, 23, 23, 127, 1, 0.792 25, 0, 24, 23, 127, 1, 0.795 26, 0, 25, 23, 127, 1, 0.792 27, 0, 26, 23, 127, 1, 0.789 28, 0, 27, 23, 127, 1, 0.794 29, 0, 28, 23, 127, 1, 0.793 30, 0, 29, 23, 127, 1, 0.795 31, 0, 30, 23, 127, 1, 0.797 32, 0, 31, 23, 127, 1, 0.775 2048, 0, 32, 23, 127, 2, 0.736 2048, 1, 32, 23, 127, 2, 0.738 2048, 0, 64, 23, 127, 2, 0.895 2048, 2, 64, 23, 127, 2, 0.897 2048, 0, 128, 23, 127, 2, 0.852 2048, 3, 128, 23, 127, 2, 0.845 2048, 0, 256, 23, 127, 2, 0.755 2048, 4, 256, 23, 127, 2, 0.712 2048, 0, 512, 23, 127, 2, 0.857 2048, 5, 512, 23, 127, 2, 0.849 2048, 0, 1024, 23, 127, 2, 0.626 2048, 6, 1024, 23, 127, 2, 0.661 2048, 0, 2048, 23, 127, 2, 0.67 2048, 7, 2048, 23, 127, 2, 0.67 2048, 0, 4096, 23, 127, 2, 0.928 2048, 8, 4096, 23, 127, 2, 0.935 256, 1, 64, 23, 127, 2, 0.693 256, 15, 64, 23, 127, 2, 0.692 256, 2, 64, 23, 127, 2, 0.693 256, 30, 64, 23, 127, 2, 0.692 256, 3, 64, 23, 127, 2, 0.692 256, 45, 64, 23, 127, 2, 0.701 256, 4, 64, 23, 127, 2, 0.692 256, 60, 64, 23, 127, 2, 0.701 256, 5, 64, 23, 127, 2, 0.69 256, 75, 64, 23, 127, 2, 0.693 256, 6, 64, 23, 127, 2, 0.691 256, 90, 64, 23, 127, 2, 0.692 256, 7, 64, 23, 127, 2, 0.693 256, 105, 64, 23, 127, 2, 0.701 1, 0, 0, 23, 127, 2, 0.797 2, 0, 1, 23, 127, 2, 0.787 3, 0, 2, 23, 127, 2, 0.797 4, 0, 3, 23, 127, 2, 0.793 5, 0, 4, 23, 127, 2, 0.792 6, 0, 5, 23, 127, 2, 0.795 7, 0, 6, 23, 127, 2, 0.791 8, 0, 7, 23, 127, 2, 0.792 9, 0, 8, 23, 127, 2, 0.796 10, 0, 9, 23, 127, 2, 0.797 11, 0, 10, 23, 127, 2, 0.797 12, 0, 11, 23, 127, 2, 0.798 13, 0, 12, 23, 127, 2, 0.799 14, 0, 13, 23, 127, 2, 0.796 15, 0, 14, 23, 127, 2, 0.796 16, 0, 15, 23, 127, 2, 0.794 17, 0, 16, 23, 127, 2, 0.795 18, 0, 17, 23, 127, 2, 0.797 19, 0, 18, 23, 127, 2, 0.793 20, 0, 19, 23, 127, 2, 0.795 21, 0, 20, 23, 127, 2, 0.794 22, 0, 21, 23, 127, 2, 0.794 23, 0, 22, 23, 127, 2, 0.796 24, 0, 23, 23, 127, 2, 0.794 25, 0, 24, 23, 127, 2, 0.794 26, 0, 25, 23, 127, 2, 0.794 27, 0, 26, 23, 127, 2, 0.788 28, 0, 27, 23, 127, 2, 0.791 29, 0, 28, 23, 127, 2, 0.791 30, 0, 29, 23, 127, 2, 0.793 31, 0, 30, 23, 127, 2, 0.796 32, 0, 31, 23, 127, 2, 0.628 2048, 0, 32, 23, 127, 4, 0.742 2048, 1, 32, 23, 127, 4, 0.742 2048, 0, 64, 23, 127, 4, 0.899 2048, 2, 64, 23, 127, 4, 0.912 2048, 0, 128, 23, 127, 4, 0.783 2048, 3, 128, 23, 127, 4, 0.815 2048, 0, 
256, 23, 127, 4, 0.854 2048, 4, 256, 23, 127, 4, 0.858 2048, 0, 512, 23, 127, 4, 0.907 2048, 5, 512, 23, 127, 4, 0.873 2048, 0, 1024, 23, 127, 4, 0.657 2048, 6, 1024, 23, 127, 4, 0.653 2048, 0, 2048, 23, 127, 4, 0.666 2048, 7, 2048, 23, 127, 4, 0.667 2048, 0, 4096, 23, 127, 4, 0.67 2048, 8, 4096, 23, 127, 4, 0.67 256, 1, 64, 23, 127, 4, 0.686 256, 15, 64, 23, 127, 4, 0.687 256, 2, 64, 23, 127, 4, 0.687 256, 30, 64, 23, 127, 4, 0.687 256, 3, 64, 23, 127, 4, 0.687 256, 45, 64, 23, 127, 4, 0.672 256, 4, 64, 23, 127, 4, 0.687 256, 60, 64, 23, 127, 4, 0.701 256, 5, 64, 23, 127, 4, 0.687 256, 75, 64, 23, 127, 4, 0.686 256, 6, 64, 23, 127, 4, 0.687 256, 90, 64, 23, 127, 4, 0.686 256, 7, 64, 23, 127, 4, 0.69 256, 105, 64, 23, 127, 4, 0.672 1, 0, 0, 23, 127, 4, 0.798 2, 0, 1, 23, 127, 4, 0.791 3, 0, 2, 23, 127, 4, 0.792 4, 0, 3, 23, 127, 4, 0.795 5, 0, 4, 23, 127, 4, 0.791 6, 0, 5, 23, 127, 4, 0.793 7, 0, 6, 23, 127, 4, 0.78 8, 0, 7, 23, 127, 4, 0.791 9, 0, 8, 23, 127, 4, 0.788 10, 0, 9, 23, 127, 4, 0.798 11, 0, 10, 23, 127, 4, 0.796 12, 0, 11, 23, 127, 4, 0.794 13, 0, 12, 23, 127, 4, 0.795 14, 0, 13, 23, 127, 4, 0.793 15, 0, 14, 23, 127, 4, 0.8 16, 0, 15, 23, 127, 4, 0.796 17, 0, 16, 23, 127, 4, 0.796 18, 0, 17, 23, 127, 4, 0.796 19, 0, 18, 23, 127, 4, 0.798 20, 0, 19, 23, 127, 4, 0.796 21, 0, 20, 23, 127, 4, 0.796 22, 0, 21, 23, 127, 4, 0.796 23, 0, 22, 23, 127, 4, 0.801 24, 0, 23, 23, 127, 4, 0.799 25, 0, 24, 23, 127, 4, 0.795 26, 0, 25, 23, 127, 4, 0.793 27, 0, 26, 23, 127, 4, 0.796 28, 0, 27, 23, 127, 4, 0.794 29, 0, 28, 23, 127, 4, 0.798 30, 0, 29, 23, 127, 4, 0.795 31, 0, 30, 23, 127, 4, 0.797 32, 0, 31, 23, 127, 4, 0.628 2048, 0, 32, 23, 127, 8, 0.738 2048, 1, 32, 23, 127, 8, 0.747 2048, 0, 64, 23, 127, 8, 0.905 2048, 2, 64, 23, 127, 8, 0.906 2048, 0, 128, 23, 127, 8, 0.822 2048, 3, 128, 23, 127, 8, 0.827 2048, 0, 256, 23, 127, 8, 0.825 2048, 4, 256, 23, 127, 8, 0.825 2048, 0, 512, 23, 127, 8, 0.851 2048, 5, 512, 23, 127, 8, 0.855 2048, 0, 1024, 23, 127, 8, 0.653 2048, 6, 1024, 23, 127, 8, 0.651 2048, 0, 2048, 23, 127, 8, 0.644 2048, 7, 2048, 23, 127, 8, 0.643 2048, 0, 4096, 23, 127, 8, 0.67 2048, 8, 4096, 23, 127, 8, 0.67 256, 1, 64, 23, 127, 8, 0.686 256, 15, 64, 23, 127, 8, 0.686 256, 2, 64, 23, 127, 8, 0.686 256, 30, 64, 23, 127, 8, 0.687 256, 3, 64, 23, 127, 8, 0.686 256, 45, 64, 23, 127, 8, 0.671 256, 4, 64, 23, 127, 8, 0.69 256, 60, 64, 23, 127, 8, 0.705 256, 5, 64, 23, 127, 8, 0.688 256, 75, 64, 23, 127, 8, 0.687 256, 6, 64, 23, 127, 8, 0.692 256, 90, 64, 23, 127, 8, 0.689 256, 7, 64, 23, 127, 8, 0.69 256, 105, 64, 23, 127, 8, 0.674 1, 0, 0, 23, 127, 8, 0.798 2, 0, 1, 23, 127, 8, 0.798 3, 0, 2, 23, 127, 8, 0.797 4, 0, 3, 23, 127, 8, 0.792 5, 0, 4, 23, 127, 8, 0.795 6, 0, 5, 23, 127, 8, 0.792 7, 0, 6, 23, 127, 8, 0.792 8, 0, 7, 23, 127, 8, 0.795 9, 0, 8, 23, 127, 8, 0.799 10, 0, 9, 23, 127, 8, 0.798 11, 0, 10, 23, 127, 8, 0.795 12, 0, 11, 23, 127, 8, 0.795 13, 0, 12, 23, 127, 8, 0.797 14, 0, 13, 23, 127, 8, 0.796 15, 0, 14, 23, 127, 8, 0.795 16, 0, 15, 23, 127, 8, 0.796 17, 0, 16, 23, 127, 8, 0.798 18, 0, 17, 23, 127, 8, 0.798 19, 0, 18, 23, 127, 8, 0.795 20, 0, 19, 23, 127, 8, 0.797 21, 0, 20, 23, 127, 8, 0.797 22, 0, 21, 23, 127, 8, 0.793 23, 0, 22, 23, 127, 8, 0.797 24, 0, 23, 23, 127, 8, 0.8 25, 0, 24, 23, 127, 8, 0.796 26, 0, 25, 23, 127, 8, 0.796 27, 0, 26, 23, 127, 8, 0.791 28, 0, 27, 23, 127, 8, 0.795 29, 0, 28, 23, 127, 8, 0.786 30, 0, 29, 23, 127, 8, 0.797 31, 0, 30, 23, 127, 8, 0.791 32, 0, 31, 23, 127, 8, 0.628 2048, 0, 32, 23, 127, 16, 0.736 2048, 1, 32, 23, 127, 16, 
0.737 2048, 0, 64, 23, 127, 16, 0.905 2048, 2, 64, 23, 127, 16, 0.908 2048, 0, 128, 23, 127, 16, 0.829 2048, 3, 128, 23, 127, 16, 0.824 2048, 0, 256, 23, 127, 16, 0.827 2048, 4, 256, 23, 127, 16, 0.825 2048, 0, 512, 23, 127, 16, 0.694 2048, 5, 512, 23, 127, 16, 0.687 2048, 0, 1024, 23, 127, 16, 0.568 2048, 6, 1024, 23, 127, 16, 0.667 2048, 0, 2048, 23, 127, 16, 0.766 2048, 7, 2048, 23, 127, 16, 0.781 2048, 0, 4096, 23, 127, 16, 0.646 2048, 8, 4096, 23, 127, 16, 0.646 256, 1, 64, 23, 127, 16, 0.697 256, 15, 64, 23, 127, 16, 0.686 256, 2, 64, 23, 127, 16, 0.687 256, 30, 64, 23, 127, 16, 0.687 256, 3, 64, 23, 127, 16, 0.686 256, 45, 64, 23, 127, 16, 0.672 256, 4, 64, 23, 127, 16, 0.686 256, 60, 64, 23, 127, 16, 0.701 256, 5, 64, 23, 127, 16, 0.686 256, 75, 64, 23, 127, 16, 0.686 256, 6, 64, 23, 127, 16, 0.691 256, 90, 64, 23, 127, 16, 0.687 256, 7, 64, 23, 127, 16, 0.688 256, 105, 64, 23, 127, 16, 0.674 1, 0, 0, 23, 127, 16, 0.797 2, 0, 1, 23, 127, 16, 0.798 3, 0, 2, 23, 127, 16, 0.786 4, 0, 3, 23, 127, 16, 0.792 5, 0, 4, 23, 127, 16, 0.792 6, 0, 5, 23, 127, 16, 0.795 7, 0, 6, 23, 127, 16, 0.796 8, 0, 7, 23, 127, 16, 0.798 9, 0, 8, 23, 127, 16, 0.795 10, 0, 9, 23, 127, 16, 0.797 11, 0, 10, 23, 127, 16, 0.797 12, 0, 11, 23, 127, 16, 0.797 13, 0, 12, 23, 127, 16, 0.799 14, 0, 13, 23, 127, 16, 0.798 15, 0, 14, 23, 127, 16, 0.798 16, 0, 15, 23, 127, 16, 0.796 17, 0, 16, 23, 127, 16, 0.798 18, 0, 17, 23, 127, 16, 0.796 19, 0, 18, 23, 127, 16, 0.797 20, 0, 19, 23, 127, 16, 0.797 21, 0, 20, 23, 127, 16, 0.798 22, 0, 21, 23, 127, 16, 0.797 23, 0, 22, 23, 127, 16, 0.797 24, 0, 23, 23, 127, 16, 0.797 25, 0, 24, 23, 127, 16, 0.798 26, 0, 25, 23, 127, 16, 0.794 27, 0, 26, 23, 127, 16, 0.796 28, 0, 27, 23, 127, 16, 0.796 29, 0, 28, 23, 127, 16, 0.792 30, 0, 29, 23, 127, 16, 0.788 31, 0, 30, 23, 127, 16, 0.79 32, 0, 31, 23, 127, 16, 0.628
sysdeps/x86_64/multiarch/strrchr-evex.S | 441 ++++++++++++++----------
1 file changed, 259 insertions(+), 182 deletions(-)
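(Both this patch and the AVX2 one lean on the same BMI idiom to find the last match that is not past the null terminator: `blsmsk' turns the zero-CHAR mask into "all bits up to and including the first zero", `and' filters the match mask with it, and `bsr' picks the highest surviving bit. A scalar sketch of that step, with a hypothetical helper and __builtin_clz standing in for `bsr':

    #include <stdint.h>

    /* CMASK has one bit per position that equals the search CHAR,
       ZMASK one bit per zero CHAR.  Return the position of the last
       match at or before the first zero, or -1 if there is none.
       When ZMASK is 0 the blsmsk result is all ones, so every match
       in the vector survives; when the search CHAR is '\0' its match
       bit coincides with the first zero bit and is kept, matching
       strrchr (s, '\0') semantics.  */
    static int
    last_match_before_null (uint32_t cmask, uint32_t zmask)
    {
      uint32_t before_null = zmask ^ (zmask - 1); /* blsmsk  */
      cmask &= before_null;
      return cmask ? 31 - __builtin_clz (cmask) : -1; /* bsr  */
    }
)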
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S index adeddaed32..5cf9a8315b 100644 --- a/sysdeps/x86_64/multiarch/strrchr-evex.S +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S @@ -24,242 +24,319 @@ # define STRRCHR __strrchr_evex # endif -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 # ifdef USE_AS_WCSRCHR +# define SHIFT_REG esi + +# define kunpck kunpckbw +# define kmov_2x kmovd +# define maskz_2x ecx +# define maskm_2x eax +# define CHAR_SIZE 4 +# define VPMIN vpminud +# define VPTESTN vptestnmd # define VPBROADCAST vpbroadcastd -# define VPCMP vpcmpd -# define SHIFT_REG r8d +# define VPCMP vpcmpd # else +# define SHIFT_REG edi + +# define kunpck kunpckdq +# define kmov_2x kmovq +# define maskz_2x rcx +# define maskm_2x rax + +# define CHAR_SIZE 1 +# define VPMIN vpminub +# define VPTESTN vptestnmb # define VPBROADCAST vpbroadcastb -# define VPCMP vpcmpb -# define SHIFT_REG ecx +# define VPCMP vpcmpb # endif # define XMMZERO xmm16 # define YMMZERO ymm16 # define YMMMATCH ymm17 -# define YMM1 ymm18 +# define YMMSAVE ymm18 + +# define YMM1 ymm19 +# define YMM2 ymm20 +# define YMM3 ymm21 +# define YMM4 ymm22 +# define YMM5 ymm23 +# define YMM6 ymm24 +# define YMM7 ymm25 +# define YMM8 ymm26 -# define VEC_SIZE 32 - .section .text.evex,"ax",@progbits -ENTRY (STRRCHR) - movl %edi, %ecx +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + .section .text.evex, "ax", @progbits +ENTRY(STRRCHR) + movl %edi, %eax /* Broadcast CHAR to YMMMATCH. */ VPBROADCAST %esi, %YMMMATCH - vpxorq %XMMZERO, %XMMZERO, %XMMZERO - - /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(cross_page_boundary) +L(page_cross_continue): VMOVU (%rdi), %YMM1 - - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 + VPTESTN %YMM1, %YMM1, %k0 kmovd %k0, %ecx - kmovd %k1, %eax - - addq $VEC_SIZE, %rdi - - testl %eax, %eax - jnz L(first_vec) - testl %ecx, %ecx - jnz L(return_null) - - andq $-VEC_SIZE, %rdi - xorl %edx, %edx - jmp L(aligned_loop) - - .p2align 4 -L(first_vec): - /* Check if there is a null byte. */ - testl %ecx, %ecx - jnz L(char_and_nul_in_first_vec) - - /* Remember the match and keep searching. */ - movl %eax, %edx - movq %rdi, %rsi - andq $-VEC_SIZE, %rdi - jmp L(aligned_loop) - - .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - + jz L(aligned_more) + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k1, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret0) + bsrl %eax, %eax # ifdef USE_AS_WCSRCHR - /* NB: Divide shift count by 4 since each bit in K1 represent 4 - bytes. */ - movl %ecx, %SHIFT_REG - sarl $2, %SHIFT_REG + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax # endif +L(ret0): + ret - VMOVA (%rdi), %YMM1 - - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ + /* Returns for first vec x1/x2/x3 have hard coded backward + search path for earlier matches. */ + .p2align 4,, 6 +L(first_vec_x1): + VPCMP $0, %YMMMATCH, %YMM2, %k1 + kmovd %k1, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jnz L(first_vec_x1_return) + .p2align 4,, 4 +L(first_vec_x0_test): VPCMP $0, %YMMMATCH, %YMM1, %k1 - kmovd %k0, %edx kmovd %k1, %eax - - shrxl %SHIFT_REG, %edx, %edx - shrxl %SHIFT_REG, %eax, %eax - addq $VEC_SIZE, %rdi - - /* Check if there is a CHAR. */ testl %eax, %eax - jnz L(found_char) - - testl %edx, %edx - jnz L(return_null) - - jmp L(aligned_loop) - - .p2align 4 -L(found_char): - testl %edx, %edx - jnz L(char_and_nul) - - /* Remember the match and keep searching. */ - movl %eax, %edx - leaq (%rdi, %rcx), %rsi + jz L(ret1) + bsrl %eax, %eax +# ifdef USE_AS_WCSRCHR + leaq (%rsi, %rax, CHAR_SIZE), %rax +# else + addq %rsi, %rax +# endif +L(ret1): + ret - .p2align 4 -L(aligned_loop): - VMOVA (%rdi), %YMM1 - addq $VEC_SIZE, %rdi + .p2align 4,, 10 +L(first_vec_x1_or_x2): + VPCMP $0, %YMM3, %YMMMATCH, %k3 + VPCMP $0, %YMM2, %YMMMATCH, %k2 + kortestd %k2, %k3 + jz L(first_vec_x0_test) + + kunpck %k2, %k3, %k3 + kmovq %k3, %rax + bsrq %rax, %rax + leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax + ret - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 - kmovd %k0, %ecx + .p2align 4,, 6 +L(first_vec_x3): + VPCMP $0, %YMMMATCH, %YMM4, %k1 kmovd %k1, %eax - orl %eax, %ecx - jnz L(char_nor_null) + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(first_vec_x1_or_x2) + bsrl %eax, %eax + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret - VMOVA (%rdi), %YMM1 - add $VEC_SIZE, %rdi + .p2align 4,, 6 +L(first_vec_x0_x1_test): + VPCMP $0, %YMMMATCH, %YMM2, %k1 + kmovd %k1, %eax + testl %eax, %eax + jz L(first_vec_x0_test) + .p2align 4,, 4 +L(first_vec_x1_return): + bsrl %eax, %eax + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 - kmovd %k0, %ecx + .p2align 4,, 10 +L(first_vec_x2): + VPCMP $0, %YMMMATCH, %YMM3, %k1 kmovd %k1, %eax - orl %eax, %ecx - jnz L(char_nor_null) + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(first_vec_x0_x1_test) + bsrl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret - VMOVA (%rdi), %YMM1 - addq $VEC_SIZE, %rdi - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 + .p2align 4 +L(aligned_more): + /* Need to keep original pointer in case YMM1 has last match. */ + movq %rdi, %rsi + andq $-VEC_SIZE, %rdi + VMOVU VEC_SIZE(%rdi), %YMM2 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %ecx - kmovd %k1, %eax - orl %eax, %ecx - jnz L(char_nor_null) + testl %ecx, %ecx + jnz L(first_vec_x1) - VMOVA (%rdi), %YMM1 - addq $VEC_SIZE, %rdi + VMOVU (VEC_SIZE * 2)(%rdi), %YMM3 + VPTESTN %YMM3, %YMM3, %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(first_vec_x2) - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 + VMOVU (VEC_SIZE * 3)(%rdi), %YMM4 + VPTESTN %YMM4, %YMM4, %k0 kmovd %k0, %ecx - kmovd %k1, %eax - orl %eax, %ecx - jz L(aligned_loop) + movq %rdi, %r8 + testl %ecx, %ecx + jnz L(first_vec_x3) + andq $-(VEC_SIZE * 2), %rdi .p2align 4 -L(char_nor_null): - /* Find a CHAR or a null byte in a loop. */ +L(first_aligned_loop): + /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee + they don't store a match. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM5 + VMOVA (VEC_SIZE * 5)(%rdi), %YMM6 + + VPCMP $0, %YMM5, %YMMMATCH, %k2 + vpxord %YMM6, %YMMMATCH, %YMM7 + + VPMIN %YMM5, %YMM6, %YMM8 + VPMIN %YMM8, %YMM7, %YMM7 + + VPTESTN %YMM7, %YMM7, %k1 + subq $(VEC_SIZE * -2), %rdi + kortestd %k1, %k2 + jz L(first_aligned_loop) + + VPCMP $0, %YMM6, %YMMMATCH, %k3 + VPTESTN %YMM8, %YMM8, %k1 + ktestd %k1, %k1 + jz L(second_aligned_loop_prep) + + kortestd %k2, %k3 + jnz L(return_first_aligned_loop) + + .p2align 4,, 6 +L(first_vec_x1_or_x2_or_x3): + VPCMP $0, %YMM4, %YMMMATCH, %k4 + kmovd %k4, %eax testl %eax, %eax - jnz L(match) -L(return_value): - testl %edx, %edx - jz L(return_null) - movl %edx, %eax - movq %rsi, %rdi + jz L(first_vec_x1_or_x2) bsrl %eax, %eax -# ifdef USE_AS_WCSRCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq -VEC_SIZE(%rdi, %rax, 4), %rax -# else - leaq -VEC_SIZE(%rdi, %rax), %rax -# endif + leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax ret - .p2align 4 -L(match): - /* Find a CHAR. Check if there is a null byte. */ - kmovd %k0, %ecx - testl %ecx, %ecx - jnz L(find_nul) + .p2align 4,, 8 +L(return_first_aligned_loop): + VPTESTN %YMM5, %YMM5, %k0 + kunpck %k0, %k1, %k0 + kmov_2x %k0, %maskz_2x + + blsmsk %maskz_2x, %maskz_2x + kunpck %k2, %k3, %k3 + kmov_2x %k3, %maskm_2x + and %maskz_2x, %maskm_2x + jz L(first_vec_x1_or_x2_or_x3) + + bsr %maskm_2x, %maskm_2x + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret - /* Remember the match and keep searching. */ - movl %eax, %edx + .p2align 4 + /* We can throw away the work done for the first 4x checks here + as we have a later match. This is the 'fast' path per se. + */ +L(second_aligned_loop_prep): +L(second_aligned_loop_set_furthest_match): movq %rdi, %rsi - jmp L(aligned_loop) + kunpck %k2, %k3, %k4 .p2align 4 -L(find_nul): - /* Mask out any matching bits after the null byte. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax - testl %eax, %eax - /* If there is no CHAR here, return the remembered one. */ - jz L(return_value) - bsrl %eax, %eax +L(second_aligned_loop): + VMOVU (VEC_SIZE * 4)(%rdi), %YMM1 + VMOVU (VEC_SIZE * 5)(%rdi), %YMM2 + + VPCMP $0, %YMM1, %YMMMATCH, %k2 + vpxord %YMM2, %YMMMATCH, %YMM3 + + VPMIN %YMM1, %YMM2, %YMM4 + VPMIN %YMM3, %YMM4, %YMM3 + + VPTESTN %YMM3, %YMM3, %k1 + subq $(VEC_SIZE * -2), %rdi + kortestd %k1, %k2 + jz L(second_aligned_loop) + + VPCMP $0, %YMM2, %YMMMATCH, %k3 + VPTESTN %YMM4, %YMM4, %k1 + ktestd %k1, %k1 + jz L(second_aligned_loop_set_furthest_match) + + kortestd %k2, %k3 + /* Branch here because there is a significant advantage in terms + of output dependency chains in using edx. */ + jnz L(return_new_match) +L(return_old_match): + kmovq %k4, %rax + bsrq %rax, %rax + leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax + ret + +L(return_new_match): + VPTESTN %YMM1, %YMM1, %k0 + kunpck %k0, %k1, %k0 + kmov_2x %k0, %maskz_2x + + blsmsk %maskz_2x, %maskz_2x + kunpck %k2, %k3, %k3 + kmov_2x %k3, %maskm_2x + and %maskz_2x, %maskm_2x + jz L(return_old_match) + + bsr %maskm_2x, %maskm_2x + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + /* This block is horribly aligned (% 16 == 15). This is + intentional. The L(cross_page_boundary) block is exactly + 32 bytes of code size. Ultimately this is a cold case so + save the code size by leaving misaligned. */ +L(cross_page_boundary): + xorq %rdi, %rax + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1 + VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %ecx # ifdef USE_AS_WCSRCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq -VEC_SIZE(%rdi, %rax, 4), %rax - # else - leaq -VEC_SIZE(%rdi, %rax), %rax + movl %edi, %esi + andl $(VEC_SIZE - 1), %esi + shrl $2, %esi # endif - ret + shrxl %SHIFT_REG, %ecx, %ecx - .p2align 4 -L(char_and_nul): - /* Find both a CHAR and a null byte. */ - addq %rcx, %rdi - movl %edx, %ecx -L(char_and_nul_in_first_vec): - /* Mask out any matching bits after the null byte. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax - testl %eax, %eax - /* Return null pointer if the null byte comes first. */ - jz L(return_null) + testl %ecx, %ecx + jz L(page_cross_continue) + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k1, %eax + shrxl %SHIFT_REG, %eax, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret3) bsrl %eax, %eax # ifdef USE_AS_WCSRCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes.
*/ - leaq -VEC_SIZE(%rdi, %rax, 4), %rax + leaq (%rdi, %rax, CHAR_SIZE), %rax # else - leaq -VEC_SIZE(%rdi, %rax), %rax + addq %rdi, %rax # endif +L(ret3): ret - .p2align 4 -L(return_null): - xorl %eax, %eax - ret - -END (STRRCHR) +END(STRRCHR) #endif
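(For completeness, the contract both the AVX2 and EVEX rewrites must preserve, and which the xcheck runs exercise, is just the ISO C one, including the c == '\0' case several of the comments above deal with. A minimal portable reference, with a hypothetical name:

    #include <stddef.h>

    /* Return a pointer to the last occurrence of (char) C in S, or
       NULL if it does not occur.  When C is '\0' this returns a
       pointer to the terminator.  */
    static char *
    strrchr_ref (const char *s, int c)
    {
      const char *last = NULL;
      for (;; s++)
        {
          if (*s == (char) c)
            last = s;           /* Remember the most recent match.  */
          if (*s == '\0')
            return (char *) last;
        }
    }
)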