From patchwork Wed May 19 02:24:36 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 43493
To: libc-alpha@sourceware.org
Subject: [PATCH v2 1/3] x86: Expand bench-memcmp.c and test-memcmp.c
Date: Tue, 18 May 2021 22:24:36 -0400
Message-Id: <20210519022438.2986411-1-goldstein.w.n@gmail.com>
In-Reply-To: <20210517184406.2609574-1-goldstein.w.n@gmail.com>
References: <20210517184406.2609574-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

No bug. This commit adds additional performance test cases to
bench-memcmp.c and test-memcmp.c. The new benchtests cover some
medium-range sizes as well as small sizes near a page cross. The new
correctness tests correspond to the new benchtests and add further
cases for checking the page-cross logic.

Signed-off-by: Noah Goldstein
Reviewed-by: H.J. Lu
---
 benchtests/bench-memcmp.c | 32 ++++++++++++++++++++++++++++++------
 string/test-memcmp.c      | 40 ++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 60 insertions(+), 12 deletions(-)

diff --git a/benchtests/bench-memcmp.c b/benchtests/bench-memcmp.c index eb0f94a0f1..744c7ec5ba 100644 --- a/benchtests/bench-memcmp.c +++ b/benchtests/bench-memcmp.c @@ -88,11 +88,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len, if (len == 0) return; - align1 &= 63; + align1 &= (4096 - CHARBYTES); if (align1 + (len + 1) * CHARBYTES >= page_size) return; - align2 &= 63; + align2 &= (4096 - CHARBYTES); if (align2 + (len + 1) * CHARBYTES >= page_size) return; @@ -100,6 +100,7 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len, json_attr_uint (json_ctx, "length", (double) len); json_attr_uint (json_ctx, "align1", (double) align1); json_attr_uint (json_ctx, "align2", (double) align2); + json_attr_uint (json_ctx, "result", (double) exp_result); json_array_begin (json_ctx, "timings"); FOR_EACH_IMPL (impl, 0) @@ -145,18 +146,31 @@ test_main (void) json_array_end (&json_ctx); json_array_begin (&json_ctx, "results"); - for (i = 1; i < 16; ++i) + for (i = 1; i < 32; ++i) { do_test (&json_ctx, i * CHARBYTES, i * CHARBYTES, i, 0); do_test (&json_ctx, i * CHARBYTES, i * CHARBYTES, i, 1); do_test (&json_ctx, i * CHARBYTES, i * CHARBYTES, i, -1); } - for (i = 0; i < 16; ++i) + for (i = 0; i < 32; ++i) { do_test (&json_ctx, 0, 0, i, 0); do_test (&json_ctx, 0, 0, i, 1); do_test (&json_ctx, 0, 0, i, -1); + do_test (&json_ctx, 4096 - i, 0, i, 0); + do_test (&json_ctx, 4096 - i, 0, i, 1); + do_test (&json_ctx, 4096 - i, 0, i, -1); + } + + for (i = 33; i < 385; i += 32) + { + do_test (&json_ctx, 0, 0, i, 0); + do_test (&json_ctx, 0, 0, i, 1); + do_test (&json_ctx, 0, 0, i, -1); + do_test (&json_ctx, i, 0, i, 0); + do_test (&json_ctx, 0, i, i, 1); + do_test 
(&json_ctx, i, i, i, -1); } for (i = 1; i < 10; ++i) @@ -164,13 +178,19 @@ test_main (void) { do_test (&json_ctx, 0, 0, 2 << i, 0); do_test (&json_ctx, 0, 0, 2 << i, 1); do_test (&json_ctx, 0, 0, 2 << i, -1); - do_test (&json_ctx, 0, 0, 16 << i, 0); do_test (&json_ctx, (8 - i) * CHARBYTES, (2 * i) * CHARBYTES, 16 << i, 0); + do_test (&json_ctx, 0, 0, 16 << i, 0); do_test (&json_ctx, 0, 0, 16 << i, 1); do_test (&json_ctx, 0, 0, 16 << i, -1); + do_test (&json_ctx, i, 0, 2 << i, 0); + do_test (&json_ctx, 0, i, 2 << i, 1); + do_test (&json_ctx, i, i, 2 << i, -1); + do_test (&json_ctx, i, 0, 16 << i, 0); + do_test (&json_ctx, 0, i, 16 << i, 1); + do_test (&json_ctx, i, i, 16 << i, -1); } - for (i = 1; i < 8; ++i) + for (i = 1; i < 10; ++i) { do_test (&json_ctx, i * CHARBYTES, 2 * (i * CHARBYTES), 8 << i, 0); do_test (&json_ctx, i * CHARBYTES, 2 * (i * CHARBYTES), 8 << i, 1); diff --git a/string/test-memcmp.c b/string/test-memcmp.c index 02ea9b782d..fbda26a41e 100644 --- a/string/test-memcmp.c +++ b/string/test-memcmp.c @@ -111,11 +111,11 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result) if (len == 0) return; - align1 &= 63; + align1 &= (4096 - CHARBYTES); if (align1 + (len + 1) * CHARBYTES >= page_size) return; - align2 &= 63; + align2 &= (4096 - CHARBYTES); if (align2 + (len + 1) * CHARBYTES >= page_size) return; @@ -487,18 +487,40 @@ test_main (void) printf ("\t%s", impl->name); putchar ('\n'); - for (i = 1; i < 16; ++i) + for (i = 1; i < 32; ++i) { do_test (i * CHARBYTES, i * CHARBYTES, i, 0); do_test (i * CHARBYTES, i * CHARBYTES, i, 1); do_test (i * CHARBYTES, i * CHARBYTES, i, -1); } - for (i = 0; i < 16; ++i) + for (i = 0; i < 32; ++i) { do_test (0, 0, i, 0); do_test (0, 0, i, 1); do_test (0, 0, i, -1); + do_test (4096 - i, 0, i, 0); + do_test (4096 - i, 0, i, 1); + do_test (4096 - i, 0, i, -1); + do_test (4095, 0, i, 0); + do_test (4095, 0, i, 1); + do_test (4095, 0, i, -1); + do_test (4095, 4095, i, 0); + do_test (4095, 4095, i, 1); + do_test (4095, 4095, i, -1); + do_test (4000, 95, i, 0); + do_test (4000, 95, i, 1); + do_test (4000, 95, i, -1); + } + + for (i = 33; i < 385; i += 32) + { + do_test (0, 0, i, 0); + do_test (0, 0, i, 1); + do_test (0, 0, i, -1); + do_test (i, 0, i, 0); + do_test (0, i, i, 1); + do_test (i, i, i, -1); } for (i = 1; i < 10; ++i) @@ -506,13 +528,19 @@ test_main (void) { do_test (0, 0, 2 << i, 0); do_test (0, 0, 2 << i, 1); do_test (0, 0, 2 << i, -1); - do_test (0, 0, 16 << i, 0); do_test ((8 - i) * CHARBYTES, (2 * i) * CHARBYTES, 16 << i, 0); + do_test (0, 0, 16 << i, 0); do_test (0, 0, 16 << i, 1); do_test (0, 0, 16 << i, -1); + do_test (i, 0, 2 << i, 0); + do_test (0, i, 2 << i, 1); + do_test (i, i, 2 << i, -1); + do_test (i, 0, 16 << i, 0); + do_test (0, i, 16 << i, 1); + do_test (i, i, 16 << i, -1); } - for (i = 1; i < 8; ++i) + for (i = 1; i < 10; ++i) { do_test (i * CHARBYTES, 2 * (i * CHARBYTES), 8 << i, 0); do_test (i * CHARBYTES, 2 * (i * CHARBYTES), 8 << i, 1);
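The most important additions in the diff above are the cases with align1 = 4096 - i (and the 4095/4000 variants in test-memcmp.c): they pin the end of one input to the last bytes of a page, which is exactly the situation the reworked page-cross logic in the following two patches has to get right. A rough standalone illustration of what such a case catches is sketched below; it is my own sketch under stated assumptions (fixed 4096-byte page, a manually mapped guard page), not glibc test code:

/* Standalone sketch, not part of the patch: the glibc string test
   harness arranges its own protected page.  An implementation that
   reads even one byte past the given length faults here.  */
#include <assert.h>
#include <string.h>
#include <sys/mman.h>

int
main (void)
{
  const size_t page = 4096;
  char *buf = mmap (NULL, 2 * page, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  assert (buf != MAP_FAILED);
  /* Make the second page inaccessible so any over-read is caught.  */
  assert (mprotect (buf + page, page, PROT_NONE) == 0);

  for (size_t len = 1; len < 32; ++len)
    {
      char *s1 = buf + page - len;      /* Ends exactly at the boundary.  */
      char *s2 = buf;                   /* Far away from the boundary.  */
      memset (s1, 'a', len);
      memset (s2, 'a', len);
      assert (memcmp (s1, s2, len) == 0);
    }
  return 0;
}

The glibc harness drives the same condition through do_test's alignment parameters, which is why the alignment masks above were widened from 63 to 4096 - CHARBYTES.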
From patchwork Wed May 19 02:24:37 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 43494
To: libc-alpha@sourceware.org
Subject: [PATCH v2 2/3] x86: Optimize memcmp-avx2-movbe.S
Date: Tue, 18 May 2021 22:24:37 -0400
Message-Id: <20210519022438.2986411-2-goldstein.w.n@gmail.com>
In-Reply-To: <20210519022438.2986411-1-goldstein.w.n@gmail.com>
References: <20210517184406.2609574-1-goldstein.w.n@gmail.com> <20210519022438.2986411-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

No bug. This commit optimizes memcmp-avx2-movbe.S. The optimizations
include adding a new vector-compare path for small sizes, reorganizing
the entry control flow, and removing some unnecessary ALU instructions
from the main loop. test-memcmp and test-wmemcmp both pass.

Signed-off-by: Noah Goldstein
Reviewed-by: H.J.
Lu --- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 1 + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++-------- 3 files changed, 402 insertions(+), 281 deletions(-) diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 2d811a550b..15eda47667 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, memcmp, IFUNC_IMPL_ADD (array, i, memcmp, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (MOVBE)), __memcmp_avx2_movbe) IFUNC_IMPL_ADD (array, i, memcmp, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (MOVBE) && CPU_FEATURE_USABLE (RTM)), __memcmp_avx2_movbe_rtm) IFUNC_IMPL_ADD (array, i, memcmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (MOVBE)), __memcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), @@ -729,16 +732,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, wmemcmp, IFUNC_IMPL_ADD (array, i, wmemcmp, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (MOVBE)), __wmemcmp_avx2_movbe) IFUNC_IMPL_ADD (array, i, wmemcmp, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (MOVBE) && CPU_FEATURE_USABLE (RTM)), __wmemcmp_avx2_movbe_rtm) IFUNC_IMPL_ADD (array, i, wmemcmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (MOVBE)), __wmemcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index 8bee1aff75..89e2129968 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -33,6 +33,7 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) { if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S index ad0fa962a1..2621ec907a 100644 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -19,17 +19,23 @@ #if IS_IN (libc) /* memcmp/wmemcmp is implemented as: - 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap - to avoid branches. - 2. Use overlapping compare to avoid branch. - 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 - bytes for wmemcmp. - 4. If size is 8 * VEC_SIZE or less, unroll the loop. - 5. Compare 4 * VEC_SIZE at a time with the aligned first memory + 1. Use ymm vector compares when possible. The only case where + vector compares is not possible for when size < VEC_SIZE + and loading from either s1 or s2 would cause a page cross. + 2. For size from 2 to 7 bytes on page cross, load as big endian + with movbe and bswap to avoid branches. + 3. Use xmm vector compare when size >= 4 bytes for memcmp or + size >= 8 bytes for wmemcmp. + 4. Optimistically compare up to first 4 * VEC_SIZE one at a + to check for early mismatches. 
Only do this if its guranteed the + work is not wasted. + 5. If size is 8 * VEC_SIZE or less, unroll the loop. + 6. Compare 4 * VEC_SIZE at a time with the aligned first memory area. - 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. - 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. - 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ + 7. Use 2 vector compares when size is 2 * VEC_SIZE or less. + 8. Use 4 vector compares when size is 4 * VEC_SIZE or less. + 9. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ + # include @@ -38,8 +44,10 @@ # endif # ifdef USE_AS_WMEMCMP +# define CHAR_SIZE 4 # define VPCMPEQ vpcmpeqd # else +# define CHAR_SIZE 1 # define VPCMPEQ vpcmpeqb # endif @@ -52,7 +60,7 @@ # endif # define VEC_SIZE 32 -# define VEC_MASK ((1 << VEC_SIZE) - 1) +# define PAGE_SIZE 4096 /* Warning! wmemcmp has to use SIGNED comparison for elements. @@ -71,136 +79,359 @@ ENTRY (MEMCMP) jb L(less_vec) /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ - vmovdqu (%rsi), %ymm2 - VPCMPEQ (%rdi), %ymm2, %ymm2 - vpmovmskb %ymm2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + vpmovmskb %ymm1, %eax + /* NB: eax must be destination register if going to + L(return_vec_[0,2]). For L(return_vec_3 destination register + must be ecx. */ + incl %eax + jnz L(return_vec_0) cmpq $(VEC_SIZE * 2), %rdx - jbe L(last_vec) - - VPCMPEQ %ymm0, %ymm0, %ymm0 - /* More than 2 * VEC. */ - cmpq $(VEC_SIZE * 8), %rdx - ja L(more_8x_vec) - cmpq $(VEC_SIZE * 4), %rdx - jb L(last_4x_vec) - - /* From 4 * VEC to 8 * VEC, inclusively. */ - vmovdqu (%rsi), %ymm1 - VPCMPEQ (%rdi), %ymm1, %ymm1 + jbe L(last_1x_vec) + /* Check second VEC no matter what. */ vmovdqu VEC_SIZE(%rsi), %ymm2 - VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + /* If all 4 VEC where equal eax will be all 1s so incl will + overflow and set zero flag. */ + incl %eax + jnz L(return_vec_1) - vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + /* Less than 4 * VEC. */ + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_2x_vec) + /* Check third and fourth VEC no matter what. */ + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vpmovmskb %ymm3, %eax + incl %eax + jnz L(return_vec_2) vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + vpmovmskb %ymm4, %ecx + incl %ecx + jnz L(return_vec_3) - vpand %ymm1, %ymm2, %ymm5 - vpand %ymm3, %ymm4, %ymm6 - vpand %ymm5, %ymm6, %ymm5 + /* Go to 4x VEC loop. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) - vptest %ymm0, %ymm5 - jnc L(4x_vec_end) + /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any + branches. */ + /* Load first two VEC from s2 before adjusting addresses. */ + vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1 + vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2 leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi - vmovdqu (%rsi), %ymm1 - VPCMPEQ (%rdi), %ymm1, %ymm1 - vmovdqu VEC_SIZE(%rsi), %ymm2 - VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 - vpand %ymm2, %ymm1, %ymm5 + /* Wait to load from s1 until addressed adjust due to + unlamination of microfusion with complex address mode. 
*/ + VPCMPEQ (%rdi), %ymm1, %ymm1 + VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2 vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 - vpand %ymm3, %ymm5, %ymm5 - + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 - vpand %ymm4, %ymm5, %ymm5 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 - vptest %ymm0, %ymm5 - jnc L(4x_vec_end) - xorl %eax, %eax + /* Reduce VEC0 - VEC4. */ + vpand %ymm1, %ymm2, %ymm5 + vpand %ymm3, %ymm4, %ymm6 + vpand %ymm5, %ymm6, %ymm7 + vpmovmskb %ymm7, %ecx + incl %ecx + jnz L(return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + VZEROUPPER_RETURN + + .p2align 4 +L(return_vec_0): + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCMP + movl (%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (%rsi, %rax), %ecx + movzbl (%rdi, %rax), %eax + subl %ecx, %eax +# endif L(return_vzeroupper): ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 -L(last_2x_vec): - /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ - vmovdqu (%rsi), %ymm2 - VPCMPEQ (%rdi), %ymm2, %ymm2 - vpmovmskb %ymm2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) +L(return_vec_1): + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCMP + movl VEC_SIZE(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl VEC_SIZE(%rsi, %rax), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl VEC_SIZE(%rsi, %rax), %ecx + movzbl VEC_SIZE(%rdi, %rax), %eax + subl %ecx, %eax +# endif + VZEROUPPER_RETURN + + .p2align 4 +L(return_vec_2): + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 2)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax +# endif + VZEROUPPER_RETURN + + /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned. */ + .p2align 5 +L(8x_return_vec_0_1_2_3): + /* Returning from L(more_8x_vec) requires restoring rsi. */ + addq %rdi, %rsi +L(return_vec_0_1_2_3): + vpmovmskb %ymm1, %eax + incl %eax + jnz L(return_vec_0) -L(last_vec): - /* Use overlapping loads to avoid branches. */ - leaq -VEC_SIZE(%rdi, %rdx), %rdi - leaq -VEC_SIZE(%rsi, %rdx), %rsi - vmovdqu (%rsi), %ymm2 - VPCMPEQ (%rdi), %ymm2, %ymm2 vpmovmskb %ymm2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) + incl %eax + jnz L(return_vec_1) + + vpmovmskb %ymm3, %eax + incl %eax + jnz L(return_vec_2) +L(return_vec_3): + tzcntl %ecx, %ecx +# ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 3)(%rdi, %rcx), %eax + xorl %edx, %edx + cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx + subl %ecx, %eax +# endif + VZEROUPPER_RETURN + + .p2align 4 +L(more_8x_vec): + /* Set end of s1 in rdx. */ + leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx + /* rsi stores s2 - s1. This allows loop to only update one + pointer. */ + subq %rdi, %rsi + /* Align s1 pointer. */ + andq $-VEC_SIZE, %rdi + /* Adjust because first 4x vec where check already. */ + subq $-(VEC_SIZE * 4), %rdi + .p2align 4 +L(loop_4x_vec): + /* rsi has s2 - s1 so get correct address by adding s1 (in rdi). 
+ */ + vmovdqu (%rsi, %rdi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + + vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + + vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + + vpand %ymm1, %ymm2, %ymm5 + vpand %ymm3, %ymm4, %ymm6 + vpand %ymm5, %ymm6, %ymm7 + vpmovmskb %ymm7, %ecx + incl %ecx + jnz L(8x_return_vec_0_1_2_3) + subq $-(VEC_SIZE * 4), %rdi + /* Check if s1 pointer at end. */ + cmpq %rdx, %rdi + jb L(loop_4x_vec) + + subq %rdx, %rdi + /* rdi has 4 * VEC_SIZE - remaining length. */ + cmpl $(VEC_SIZE * 3), %edi + jae L(8x_last_1x_vec) + /* Load regardless of branch. */ + vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3 + cmpl $(VEC_SIZE * 2), %edi + jae L(8x_last_2x_vec) + + /* Check last 4 VEC. */ + vmovdqu (%rsi, %rdx), %ymm1 + VPCMPEQ (%rdx), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2 + VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2 + + VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 + + vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 + + vpand %ymm1, %ymm2, %ymm5 + vpand %ymm3, %ymm4, %ymm6 + vpand %ymm5, %ymm6, %ymm7 + vpmovmskb %ymm7, %ecx + /* Restore s1 pointer to rdi. */ + movq %rdx, %rdi + incl %ecx + jnz L(8x_return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + VZEROUPPER_RETURN + + /* Only entry is from L(more_8x_vec). */ + .p2align 4 +L(8x_last_2x_vec): + /* Check second to last VEC. rdx store end pointer of s1 and + ymm3 has already been loaded with second to last VEC from s2. + */ + VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 + vpmovmskb %ymm3, %eax + incl %eax + jnz L(8x_return_vec_2) + /* Check last VEC. */ + .p2align 4 +L(8x_last_1x_vec): + vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 + vpmovmskb %ymm4, %eax + incl %eax + jnz L(8x_return_vec_3) VZEROUPPER_RETURN .p2align 4 -L(first_vec): - /* A byte or int32 is different within 16 or 32 bytes. */ - tzcntl %eax, %ecx +L(last_2x_vec): + /* Check second to last VEC. */ + vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1 + VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1 + vpmovmskb %ymm1, %eax + incl %eax + jnz L(return_vec_1_end) + /* Check last VEC. 
*/ +L(last_1x_vec): + vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1 + VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1 + vpmovmskb %ymm1, %eax + incl %eax + jnz L(return_vec_0_end) + VZEROUPPER_RETURN + + .p2align 4 +L(8x_return_vec_2): + subq $VEC_SIZE, %rdx +L(8x_return_vec_3): + tzcntl %eax, %eax + addq %rdx, %rax # ifdef USE_AS_WMEMCMP - xorl %eax, %eax - movl (%rdi, %rcx), %edx - cmpl (%rsi, %rcx), %edx -L(wmemcmp_return): - setl %al - negl %eax - orl $1, %eax + movl (VEC_SIZE * 3)(%rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax # else - movzbl (%rdi, %rcx), %eax - movzbl (%rsi, %rcx), %edx - sub %edx, %eax + movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 3)(%rax), %eax + subl %ecx, %eax # endif VZEROUPPER_RETURN -# ifdef USE_AS_WMEMCMP .p2align 4 -L(4): - xorl %eax, %eax - movl (%rdi), %edx - cmpl (%rsi), %edx - jne L(wmemcmp_return) - ret +L(return_vec_1_end): + tzcntl %eax, %eax + addl %edx, %eax +# ifdef USE_AS_WMEMCMP + movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax # else + movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx + movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax +# endif + VZEROUPPER_RETURN + .p2align 4 -L(between_4_7): - /* Load as big endian with overlapping movbe to avoid branches. */ - movbe (%rdi), %eax - movbe (%rsi), %ecx - shlq $32, %rax - shlq $32, %rcx - movbe -4(%rdi, %rdx), %edi - movbe -4(%rsi, %rdx), %esi - orq %rdi, %rax - orq %rsi, %rcx - subq %rcx, %rax - je L(exit) - sbbl %eax, %eax - orl $1, %eax - ret +L(return_vec_0_end): + tzcntl %eax, %eax + addl %edx, %eax +# ifdef USE_AS_WMEMCMP + movl -VEC_SIZE(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl -VEC_SIZE(%rsi, %rax), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl -VEC_SIZE(%rsi, %rax), %ecx + movzbl -VEC_SIZE(%rdi, %rax), %eax + subl %ecx, %eax +# endif + VZEROUPPER_RETURN .p2align 4 -L(exit): - ret +L(less_vec): + /* Check if one or less CHAR. This is necessary for size = 0 but + is also faster for size = CHAR_SIZE. */ + cmpl $CHAR_SIZE, %edx + jbe L(one_or_less) + + /* Check if loading one VEC from either s1 or s2 could cause a + page cross. This can have false positives but is by far the + fastest method. */ + movl %edi, %eax + orl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(page_cross_less_vec) + + /* No page cross possible. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + incl %eax + /* Result will be zero if s1 and s2 match. Otherwise first set + bit will be first mismatch. */ + bzhil %edx, %eax, %edx + jnz L(return_vec_0) + xorl %eax, %eax + VZEROUPPER_RETURN .p2align 4 -L(between_2_3): +L(page_cross_less_vec): + /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 + bytes. */ + cmpl $16, %edx + jae L(between_16_31) +# ifndef USE_AS_WMEMCMP + cmpl $8, %edx + jae L(between_8_15) + cmpl $4, %edx + jae L(between_4_7) + /* Load as big endian to avoid branches. */ movzwl (%rdi), %eax movzwl (%rsi), %ecx @@ -208,223 +439,106 @@ L(between_2_3): shll $8, %ecx bswap %eax bswap %ecx - movb -1(%rdi, %rdx), %al - movb -1(%rsi, %rdx), %cl + movzbl -1(%rdi, %rdx), %edi + movzbl -1(%rsi, %rdx), %esi + orl %edi, %eax + orl %esi, %ecx /* Subtraction is okay because the upper 8 bits are zero. */ subl %ecx, %eax + /* No ymm register was touched. 
*/ ret .p2align 4 -L(1): - movzbl (%rdi), %eax +L(one_or_less): + jb L(zero) movzbl (%rsi), %ecx + movzbl (%rdi), %eax subl %ecx, %eax - ret -# endif - - .p2align 4 -L(zero): - xorl %eax, %eax + /* No ymm register was touched. */ ret .p2align 4 -L(less_vec): -# ifdef USE_AS_WMEMCMP - /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ - cmpb $4, %dl - je L(4) - jb L(zero) -# else - cmpb $1, %dl - je L(1) - jb L(zero) - cmpb $4, %dl - jb L(between_2_3) - cmpb $8, %dl - jb L(between_4_7) +L(between_8_15): # endif - cmpb $16, %dl - jae L(between_16_31) - /* It is between 8 and 15 bytes. */ + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ vmovq (%rdi), %xmm1 vmovq (%rsi), %xmm2 - VPCMPEQ %xmm1, %xmm2, %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 vpmovmskb %xmm2, %eax - subl $0xffff, %eax - jnz L(first_vec) + subl $0xffff, %eax + jnz L(return_vec_0) /* Use overlapping loads to avoid branches. */ leaq -8(%rdi, %rdx), %rdi leaq -8(%rsi, %rdx), %rsi vmovq (%rdi), %xmm1 vmovq (%rsi), %xmm2 - VPCMPEQ %xmm1, %xmm2, %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 vpmovmskb %xmm2, %eax - subl $0xffff, %eax - jnz L(first_vec) + subl $0xffff, %eax + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret + + .p2align 4 +L(zero): + xorl %eax, %eax ret .p2align 4 L(between_16_31): /* From 16 to 31 bytes. No branch when size == 16. */ vmovdqu (%rsi), %xmm2 - VPCMPEQ (%rdi), %xmm2, %xmm2 + VPCMPEQ (%rdi), %xmm2, %xmm2 vpmovmskb %xmm2, %eax - subl $0xffff, %eax - jnz L(first_vec) + subl $0xffff, %eax + jnz L(return_vec_0) /* Use overlapping loads to avoid branches. */ + + vmovdqu -16(%rsi, %rdx), %xmm2 leaq -16(%rdi, %rdx), %rdi leaq -16(%rsi, %rdx), %rsi - vmovdqu (%rsi), %xmm2 - VPCMPEQ (%rdi), %xmm2, %xmm2 + VPCMPEQ (%rdi), %xmm2, %xmm2 vpmovmskb %xmm2, %eax - subl $0xffff, %eax - jnz L(first_vec) + subl $0xffff, %eax + jnz L(return_vec_0) + /* No ymm register was touched. */ ret - .p2align 4 -L(more_8x_vec): - /* More than 8 * VEC. Check the first VEC. */ - vmovdqu (%rsi), %ymm2 - VPCMPEQ (%rdi), %ymm2, %ymm2 - vpmovmskb %ymm2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) - - /* Align the first memory area for aligned loads in the loop. - Compute how much the first memory area is misaligned. */ - movq %rdi, %rcx - andl $(VEC_SIZE - 1), %ecx - /* Get the negative of offset for alignment. */ - subq $VEC_SIZE, %rcx - /* Adjust the second memory area. */ - subq %rcx, %rsi - /* Adjust the first memory area which should be aligned now. */ - subq %rcx, %rdi - /* Adjust length. */ - addq %rcx, %rdx - -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - vmovdqu (%rsi), %ymm1 - VPCMPEQ (%rdi), %ymm1, %ymm1 - - vmovdqu VEC_SIZE(%rsi), %ymm2 - VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 - vpand %ymm2, %ymm1, %ymm5 - - vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 - vpand %ymm3, %ymm5, %ymm5 - - vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 - vpand %ymm4, %ymm5, %ymm5 - - vptest %ymm0, %ymm5 - jnc L(4x_vec_end) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rsi - - subq $(VEC_SIZE * 4), %rdx - cmpq $(VEC_SIZE * 4), %rdx - jae L(loop_4x_vec) - - /* Less than 4 * VEC. */ - cmpq $VEC_SIZE, %rdx - jbe L(last_vec) - cmpq $(VEC_SIZE * 2), %rdx - jbe L(last_2x_vec) - -L(last_4x_vec): - /* From 2 * VEC to 4 * VEC. 
*/ - vmovdqu (%rsi), %ymm2 - VPCMPEQ (%rdi), %ymm2, %ymm2 - vpmovmskb %ymm2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) - - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rsi - vmovdqu (%rsi), %ymm2 - VPCMPEQ (%rdi), %ymm2, %ymm2 - vpmovmskb %ymm2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) - - /* Use overlapping loads to avoid branches. */ - leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi - leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi - vmovdqu (%rsi), %ymm2 - VPCMPEQ (%rdi), %ymm2, %ymm2 - vpmovmskb %ymm2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) - - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rsi - vmovdqu (%rsi), %ymm2 - VPCMPEQ (%rdi), %ymm2, %ymm2 - vpmovmskb %ymm2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) - VZEROUPPER_RETURN - - .p2align 4 -L(4x_vec_end): - vpmovmskb %ymm1, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) - vpmovmskb %ymm2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec_x1) - vpmovmskb %ymm3, %eax - subl $VEC_MASK, %eax - jnz L(first_vec_x2) - vpmovmskb %ymm4, %eax - subl $VEC_MASK, %eax - tzcntl %eax, %ecx # ifdef USE_AS_WMEMCMP - xorl %eax, %eax - movl (VEC_SIZE * 3)(%rdi, %rcx), %edx - cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx - jmp L(wmemcmp_return) -# else - movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax - movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx - sub %edx, %eax -# endif - VZEROUPPER_RETURN - .p2align 4 -L(first_vec_x1): - tzcntl %eax, %ecx -# ifdef USE_AS_WMEMCMP - xorl %eax, %eax - movl VEC_SIZE(%rdi, %rcx), %edx - cmpl VEC_SIZE(%rsi, %rcx), %edx - jmp L(wmemcmp_return) +L(one_or_less): + jb L(zero) + movl (%rdi), %ecx + xorl %edx, %edx + cmpl (%rsi), %ecx + je L(zero) + setg %dl + leal -1(%rdx, %rdx), %eax + /* No ymm register was touched. */ + ret # else - movzbl VEC_SIZE(%rdi, %rcx), %eax - movzbl VEC_SIZE(%rsi, %rcx), %edx - sub %edx, %eax -# endif - VZEROUPPER_RETURN .p2align 4 -L(first_vec_x2): - tzcntl %eax, %ecx -# ifdef USE_AS_WMEMCMP - xorl %eax, %eax - movl (VEC_SIZE * 2)(%rdi, %rcx), %edx - cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx - jmp L(wmemcmp_return) -# else - movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax - movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx - sub %edx, %eax +L(between_4_7): + /* Load as big endian with overlapping movbe to avoid branches. + */ + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + jz L(zero_4_7) + sbbl %eax, %eax + orl $1, %eax +L(zero_4_7): + /* No ymm register was touched. 
*/ + ret # endif - VZEROUPPER_RETURN + END (MEMCMP) #endif
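The heart of the new small-size path in memcmp-avx2-movbe.S above is the L(less_vec) entry: if neither pointer lies in the last VEC_SIZE bytes of a page (the or/and/cmp on the two addresses, which tolerates false positives), every remaining size below 32 bytes is handled by a single full-vector compare whose result is masked with bzhi; that is also why the ifunc selectors in this patch now require BMI2. The C rendering below is my own illustration under stated assumptions (4096-byte pages, AVX2/BMI1/BMI2 available, compile with -mavx2 -mbmi -mbmi2), not the patch's code, and it substitutes a plain byte loop for the movbe/bswap page-cross path:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* Hypothetical helper, only to illustrate the strategy; requires
   0 < len < VEC_SIZE.  */
static int
memcmp_less_vec_sketch (const unsigned char *s1, const unsigned char *s2,
                        size_t len)
{
  /* Page-cross filter from the patch: OR-ing the two addresses is a cheap
     over-approximation, so this can give false positives, but if it says
     "no", a full 32-byte load from either pointer cannot fault.  */
  if ((((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1))
      > PAGE_SIZE - VEC_SIZE)
    {
      /* Page-cross fallback: the assembly uses overlapping movbe/bswap
         loads; a byte loop keeps the sketch short.  */
      for (size_t i = 0; i < len; ++i)
        if (s1[i] != s2[i])
          return s1[i] - s2[i];
      return 0;
    }

  /* One full-vector compare.  The over-read past len never crosses into
     the next page; that is fine at the assembly level but formally out of
     bounds in portable C, so this is illustration only.  */
  __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
  __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
  uint32_t eq = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v1, v2));
  /* bzhi drops the bits at positions >= len, i.e. differences that lie
     beyond the buffers, mirroring the bzhil in the patch.  */
  uint32_t neq = _bzhi_u32 (~eq, (uint32_t) len);
  if (neq == 0)
    return 0;
  uint32_t i = _tzcnt_u32 (neq);
  return s1[i] - s2[i];
}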
From patchwork Wed May 19 02:24:38 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 43495
To: libc-alpha@sourceware.org
Subject: [PATCH v2 3/3] x86: Optimize memcmp-evex-movbe.S
Date: Tue, 18 May 2021 22:24:38 -0400
Message-Id: <20210519022438.2986411-3-goldstein.w.n@gmail.com>
In-Reply-To: <20210519022438.2986411-1-goldstein.w.n@gmail.com>
References: <20210517184406.2609574-1-goldstein.w.n@gmail.com> <20210519022438.2986411-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

No bug. This commit optimizes memcmp-evex-movbe.S. The optimizations
include adding a new vector-compare path for small sizes, reorganizing
the entry control flow, removing some unnecessary ALU instructions from
the main loop, and, most importantly, replacing the heavy use of
vpcmp + kand logic with vpxor + vpternlogd. test-memcmp and
test-wmemcmp both pass.

Signed-off-by: Noah Goldstein
Reviewed-by: H.J. Lu
---
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++--------
 1 file changed, 408 insertions(+), 302 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S index 9c093972e1..654dc7ac8c 100644 --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S @@ -19,17 +19,22 @@ #if IS_IN (libc) /* memcmp/wmemcmp is implemented as: - 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap - to avoid branches. - 2. Use overlapping compare to avoid branch. - 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 - bytes for wmemcmp. - 4. If size is 8 * VEC_SIZE or less, unroll the loop. - 5. Compare 4 * VEC_SIZE at a time with the aligned first memory + 1. Use ymm vector compares when possible. The only case where + vector compares is not possible for when size < CHAR_PER_VEC + and loading from either s1 or s2 would cause a page cross. + 2. For size from 2 to 7 bytes on page cross, load as big endian + with movbe and bswap to avoid branches. + 3. Use xmm vector compare when size >= 4 bytes for memcmp or + size >= 8 bytes for wmemcmp. + 4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a + to check for early mismatches. Only do this if its guranteed the + work is not wasted. + 5. If size is 8 * VEC_SIZE or less, unroll the loop. + 6. Compare 4 * VEC_SIZE at a time with the aligned first memory area. - 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. - 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. - 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ + 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less. + 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less. + 9.
Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */ # include @@ -40,11 +45,21 @@ # define VMOVU vmovdqu64 # ifdef USE_AS_WMEMCMP -# define VPCMPEQ vpcmpeqd +# define CHAR_SIZE 4 +# define VPCMP vpcmpd # else -# define VPCMPEQ vpcmpeqb +# define CHAR_SIZE 1 +# define VPCMP vpcmpub # endif +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +# define XMM0 xmm16 +# define XMM1 xmm17 +# define XMM2 xmm18 +# define YMM0 ymm16 # define XMM1 xmm17 # define XMM2 xmm18 # define YMM1 ymm17 @@ -54,15 +69,6 @@ # define YMM5 ymm21 # define YMM6 ymm22 -# define VEC_SIZE 32 -# ifdef USE_AS_WMEMCMP -# define VEC_MASK 0xff -# define XMM_MASK 0xf -# else -# define VEC_MASK 0xffffffff -# define XMM_MASK 0xffff -# endif - /* Warning! wmemcmp has to use SIGNED comparison for elements. memcmp has to use UNSIGNED comparison for elemnts. @@ -70,145 +76,370 @@ .section .text.evex,"ax",@progbits ENTRY (MEMCMP) -# ifdef USE_AS_WMEMCMP - shl $2, %RDX_LP -# elif defined __ILP32__ +# ifdef __ILP32__ /* Clear the upper 32 bits. */ movl %edx, %edx # endif - cmp $VEC_SIZE, %RDX_LP + cmp $CHAR_PER_VEC, %RDX_LP jb L(less_vec) /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ - VMOVU (%rsi), %YMM2 - VPCMPEQ (%rdi), %YMM2, %k1 + VMOVU (%rsi), %YMM1 + /* Use compare not equals to directly check for mismatch. */ + VPCMP $4, (%rdi), %YMM1, %k1 kmovd %k1, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) - - cmpq $(VEC_SIZE * 2), %rdx - jbe L(last_vec) - - /* More than 2 * VEC. */ - cmpq $(VEC_SIZE * 8), %rdx - ja L(more_8x_vec) - cmpq $(VEC_SIZE * 4), %rdx - jb L(last_4x_vec) + /* NB: eax must be destination register if going to + L(return_vec_[0,2]). For L(return_vec_3 destination register + must be ecx. */ + testl %eax, %eax + jnz L(return_vec_0) - /* From 4 * VEC to 8 * VEC, inclusively. */ - VMOVU (%rsi), %YMM1 - VPCMPEQ (%rdi), %YMM1, %k1 + cmpq $(CHAR_PER_VEC * 2), %rdx + jbe L(last_1x_vec) + /* Check second VEC no matter what. */ VMOVU VEC_SIZE(%rsi), %YMM2 - VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_1) + + /* Less than 4 * VEC. */ + cmpq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_2x_vec) + /* Check third and fourth VEC no matter what. */ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 - VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_2) VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 - VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(return_vec_3) - kandd %k1, %k2, %k5 - kandd %k3, %k4, %k6 - kandd %k5, %k6, %k6 + /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so + compare with zero to get a mask is needed. */ + vpxorq %XMM0, %XMM0, %XMM0 - kmovd %k6, %eax - cmpl $VEC_MASK, %eax - jne L(4x_vec_end) + /* Go to 4x VEC loop. */ + cmpq $(CHAR_PER_VEC * 8), %rdx + ja L(more_8x_vec) - leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi - leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi - VMOVU (%rsi), %YMM1 - VPCMPEQ (%rdi), %YMM1, %k1 + /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any + branches. */ - VMOVU VEC_SIZE(%rsi), %YMM2 - VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 - kandd %k1, %k2, %k5 + /* Load first two VEC from s2 before adjusting addresses. 
*/ + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1 + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2 + leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi + + /* Wait to load from s1 until addressed adjust due to + unlamination of microfusion with complex address mode. */ + + /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it + will have some 1s. */ + vpxorq (%rdi), %YMM1, %YMM1 + vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2 VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 - VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 - kandd %k3, %k5, %k5 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 + /* Or together YMM1, YMM2, and YMM3 into YMM3. */ + vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 - VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 - kandd %k4, %k5, %k5 + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while + oring with YMM3. Result is stored in YMM4. */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 + /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ + VPCMP $4, %YMM4, %YMM0, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + ret - kmovd %k5, %eax - cmpl $VEC_MASK, %eax - jne L(4x_vec_end) - xorl %eax, %eax + /* NB: aligning 32 here allows for the rest of the jump targets + to be tuned for 32 byte alignment. Most important this ensures + the L(more_8x_vec) loop is 32 byte aligned. */ + .p2align 5 +L(less_vec): + /* Check if one or less CHAR. This is necessary for size = 0 but + is also faster for size = CHAR_SIZE. */ + cmpl $1, %edx + jbe L(one_or_less) + + /* Check if loading one VEC from either s1 or s2 could cause a + page cross. This can have false positives but is by far the + fastest method. */ + movl %edi, %eax + orl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(page_cross_less_vec) + + /* No page cross possible. */ + VMOVU (%rsi), %YMM2 + VPCMP $4, (%rdi), %YMM2, %k1 + kmovd %k1, %eax + /* Create mask in ecx for potentially in bound matches. */ + bzhil %edx, %eax, %eax + jnz L(return_vec_0) ret .p2align 4 -L(last_2x_vec): - /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ - VMOVU (%rsi), %YMM2 - VPCMPEQ (%rdi), %YMM2, %k2 - kmovd %k2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) +L(return_vec_0): + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCMP + movl (%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx + cmpl (%rsi, %rax, CHAR_SIZE), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (%rsi, %rax), %ecx + movzbl (%rdi, %rax), %eax + subl %ecx, %eax +# endif + ret -L(last_vec): - /* Use overlapping loads to avoid branches. */ - leaq -VEC_SIZE(%rdi, %rdx), %rdi - leaq -VEC_SIZE(%rsi, %rdx), %rsi - VMOVU (%rsi), %YMM2 - VPCMPEQ (%rdi), %YMM2, %k2 - kmovd %k2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) + /* NB: No p2align necessary. Alignment % 16 is naturally 1 + which is good enough for a target not in a loop. */ +L(return_vec_1): + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCMP + movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx + cmpl VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl VEC_SIZE(%rsi, %rax), %ecx + movzbl VEC_SIZE(%rdi, %rax), %eax + subl %ecx, %eax +# endif ret - .p2align 4 -L(first_vec): - /* A byte or int32 is different within 16 or 32 bytes. */ - tzcntl %eax, %ecx + /* NB: No p2align necessary. 
Alignment % 16 is naturally 2 + which is good enough for a target not in a loop. */ +L(return_vec_2): + tzcntl %eax, %eax # ifdef USE_AS_WMEMCMP - xorl %eax, %eax - movl (%rdi, %rcx, 4), %edx - cmpl (%rsi, %rcx, 4), %edx -L(wmemcmp_return): - setl %al - negl %eax - orl $1, %eax + movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax # else - movzbl (%rdi, %rcx), %eax - movzbl (%rsi, %rcx), %edx - sub %edx, %eax + movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax # endif ret + .p2align 4 +L(8x_return_vec_0_1_2_3): + /* Returning from L(more_8x_vec) requires restoring rsi. */ + addq %rdi, %rsi +L(return_vec_0_1_2_3): + VPCMP $4, %YMM1, %YMM0, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(return_vec_0) + + VPCMP $4, %YMM2, %YMM0, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(return_vec_1) + + VPCMP $4, %YMM3, %YMM0, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(return_vec_2) +L(return_vec_3): + tzcntl %ecx, %ecx # ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax + xorl %edx, %edx + cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx + subl %ecx, %eax +# endif + ret + .p2align 4 -L(4): - xorl %eax, %eax - movl (%rdi), %edx - cmpl (%rsi), %edx - jne L(wmemcmp_return) +L(more_8x_vec): + /* Set end of s1 in rdx. */ + leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx + /* rsi stores s2 - s1. This allows loop to only update one + pointer. */ + subq %rdi, %rsi + /* Align s1 pointer. */ + andq $-VEC_SIZE, %rdi + /* Adjust because first 4x vec where check already. */ + subq $-(VEC_SIZE * 4), %rdi + .p2align 4 +L(loop_4x_vec): + VMOVU (%rsi, %rdi), %YMM1 + vpxorq (%rdi), %YMM1, %YMM1 + + VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 + vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2 + + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 + vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 + + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 + vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 + VPCMP $4, %YMM4, %YMM0, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(8x_return_vec_0_1_2_3) + subq $-(VEC_SIZE * 4), %rdi + cmpq %rdx, %rdi + jb L(loop_4x_vec) + + subq %rdx, %rdi + /* rdi has 4 * VEC_SIZE - remaining length. */ + cmpl $(VEC_SIZE * 3), %edi + jae L(8x_last_1x_vec) + /* Load regardless of branch. */ + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3 + cmpl $(VEC_SIZE * 2), %edi + jae L(8x_last_2x_vec) + + VMOVU (%rsi, %rdx), %YMM1 + vpxorq (%rdx), %YMM1, %YMM1 + + VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 + vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 + + vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 + vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 + + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 + vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4 + VPCMP $4, %YMM4, %YMM0, %k1 + kmovd %k1, %ecx + /* Restore s1 pointer to rdi. */ + movq %rdx, %rdi + testl %ecx, %ecx + jnz L(8x_return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + ret + + /* Only entry is from L(more_8x_vec). */ + .p2align 4 +L(8x_last_2x_vec): + VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(8x_return_vec_2) + /* Naturally aligned to 16 bytes. 
*/ +L(8x_last_1x_vec): + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1 + VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(8x_return_vec_3) + ret + + .p2align 4 +L(last_2x_vec): + /* Check second to last VEC. */ + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 + VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_1_end) + + /* Check last VEC. */ + .p2align 4 +L(last_1x_vec): + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1 + VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_0_end) ret + + .p2align 4 +L(8x_return_vec_2): + subq $VEC_SIZE, %rdx +L(8x_return_vec_3): + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCMP + leaq (%rdx, %rax, CHAR_SIZE), %rax + movl (VEC_SIZE * 3)(%rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax # else + addq %rdx, %rax + movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 3)(%rax), %eax + subl %ecx, %eax +# endif + ret + .p2align 4 -L(between_4_7): - /* Load as big endian with overlapping movbe to avoid branches. */ - movbe (%rdi), %eax - movbe (%rsi), %ecx - shlq $32, %rax - shlq $32, %rcx - movbe -4(%rdi, %rdx), %edi - movbe -4(%rsi, %rdx), %esi - orq %rdi, %rax - orq %rsi, %rcx - subq %rcx, %rax - je L(exit) - sbbl %eax, %eax - orl $1, %eax +L(return_vec_0_end): + tzcntl %eax, %eax + addl %edx, %eax +# ifdef USE_AS_WMEMCMP + movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx + cmpl -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl -VEC_SIZE(%rsi, %rax), %ecx + movzbl -VEC_SIZE(%rdi, %rax), %eax + subl %ecx, %eax +# endif ret .p2align 4 -L(exit): +L(return_vec_1_end): + tzcntl %eax, %eax + addl %edx, %eax +# ifdef USE_AS_WMEMCMP + movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx + cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx + movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax +# endif ret + .p2align 4 +L(page_cross_less_vec): + /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 + bytes. */ + cmpl $(16 / CHAR_SIZE), %edx + jae L(between_16_31) +# ifndef USE_AS_WMEMCMP + cmpl $8, %edx + jae L(between_8_15) + cmpl $4, %edx + jae L(between_4_7) L(between_2_3): /* Load as big endian to avoid branches. */ movzwl (%rdi), %eax @@ -217,224 +448,99 @@ L(between_2_3): shll $8, %ecx bswap %eax bswap %ecx - movb -1(%rdi, %rdx), %al - movb -1(%rsi, %rdx), %cl + movzbl -1(%rdi, %rdx), %edi + movzbl -1(%rsi, %rdx), %esi + orl %edi, %eax + orl %esi, %ecx /* Subtraction is okay because the upper 8 bits are zero. */ subl %ecx, %eax ret - .p2align 4 -L(1): - movzbl (%rdi), %eax +L(one_or_less): + jb L(zero) movzbl (%rsi), %ecx + movzbl (%rdi), %eax subl %ecx, %eax ret -# endif - - .p2align 4 -L(zero): - xorl %eax, %eax - ret .p2align 4 -L(less_vec): -# ifdef USE_AS_WMEMCMP - /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ - cmpb $4, %dl - je L(4) - jb L(zero) -# else - cmpb $1, %dl - je L(1) - jb L(zero) - cmpb $4, %dl - jb L(between_2_3) - cmpb $8, %dl - jb L(between_4_7) +L(between_8_15): # endif - cmpb $16, %dl - jae L(between_16_31) - /* It is between 8 and 15 bytes. */ + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. 
*/ vmovq (%rdi), %XMM1 vmovq (%rsi), %XMM2 - VPCMPEQ %XMM1, %XMM2, %k2 - kmovw %k2, %eax - subl $XMM_MASK, %eax - jnz L(first_vec) + VPCMP $4, %XMM1, %XMM2, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_0) /* Use overlapping loads to avoid branches. */ - leaq -8(%rdi, %rdx), %rdi - leaq -8(%rsi, %rdx), %rsi + leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi + leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi vmovq (%rdi), %XMM1 vmovq (%rsi), %XMM2 - VPCMPEQ %XMM1, %XMM2, %k2 - kmovw %k2, %eax - subl $XMM_MASK, %eax - jnz L(first_vec) + VPCMP $4, %XMM1, %XMM2, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_0) ret .p2align 4 -L(between_16_31): - /* From 16 to 31 bytes. No branch when size == 16. */ - VMOVU (%rsi), %XMM2 - VPCMPEQ (%rdi), %XMM2, %k2 - kmovw %k2, %eax - subl $XMM_MASK, %eax - jnz L(first_vec) - - /* Use overlapping loads to avoid branches. */ - leaq -16(%rdi, %rdx), %rdi - leaq -16(%rsi, %rdx), %rsi - VMOVU (%rsi), %XMM2 - VPCMPEQ (%rdi), %XMM2, %k2 - kmovw %k2, %eax - subl $XMM_MASK, %eax - jnz L(first_vec) +L(zero): + xorl %eax, %eax ret .p2align 4 -L(more_8x_vec): - /* More than 8 * VEC. Check the first VEC. */ - VMOVU (%rsi), %YMM2 - VPCMPEQ (%rdi), %YMM2, %k2 - kmovd %k2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) - - /* Align the first memory area for aligned loads in the loop. - Compute how much the first memory area is misaligned. */ - movq %rdi, %rcx - andl $(VEC_SIZE - 1), %ecx - /* Get the negative of offset for alignment. */ - subq $VEC_SIZE, %rcx - /* Adjust the second memory area. */ - subq %rcx, %rsi - /* Adjust the first memory area which should be aligned now. */ - subq %rcx, %rdi - /* Adjust length. */ - addq %rcx, %rdx - -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - VMOVU (%rsi), %YMM1 - VPCMPEQ (%rdi), %YMM1, %k1 - - VMOVU VEC_SIZE(%rsi), %YMM2 - VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 - kandd %k2, %k1, %k5 - - VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 - VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 - kandd %k3, %k5, %k5 - - VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 - VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 - kandd %k4, %k5, %k5 - - kmovd %k5, %eax - cmpl $VEC_MASK, %eax - jne L(4x_vec_end) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rsi - - subq $(VEC_SIZE * 4), %rdx - cmpq $(VEC_SIZE * 4), %rdx - jae L(loop_4x_vec) - - /* Less than 4 * VEC. */ - cmpq $VEC_SIZE, %rdx - jbe L(last_vec) - cmpq $(VEC_SIZE * 2), %rdx - jbe L(last_2x_vec) - -L(last_4x_vec): - /* From 2 * VEC to 4 * VEC. */ - VMOVU (%rsi), %YMM2 - VPCMPEQ (%rdi), %YMM2, %k2 - kmovd %k2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) - - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rsi - VMOVU (%rsi), %YMM2 - VPCMPEQ (%rdi), %YMM2, %k2 - kmovd %k2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) +L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + VMOVU (%rsi), %XMM2 + VPCMP $4, (%rdi), %XMM2, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_0) /* Use overlapping loads to avoid branches. 
*/ - leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi - leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi - VMOVU (%rsi), %YMM2 - VPCMPEQ (%rdi), %YMM2, %k2 - kmovd %k2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rsi - VMOVU (%rsi), %YMM2 - VPCMPEQ (%rdi), %YMM2, %k2 - kmovd %k2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) - ret - - .p2align 4 -L(4x_vec_end): + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2 + leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi + leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi + VPCMP $4, (%rdi), %XMM2, %k1 kmovd %k1, %eax - subl $VEC_MASK, %eax - jnz L(first_vec) - kmovd %k2, %eax - subl $VEC_MASK, %eax - jnz L(first_vec_x1) - kmovd %k3, %eax - subl $VEC_MASK, %eax - jnz L(first_vec_x2) - kmovd %k4, %eax - subl $VEC_MASK, %eax - tzcntl %eax, %ecx -# ifdef USE_AS_WMEMCMP - xorl %eax, %eax - movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx - cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx - jmp L(wmemcmp_return) -# else - movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax - movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx - sub %edx, %eax -# endif + testl %eax, %eax + jnz L(return_vec_0) ret - .p2align 4 -L(first_vec_x1): - tzcntl %eax, %ecx # ifdef USE_AS_WMEMCMP - xorl %eax, %eax - movl VEC_SIZE(%rdi, %rcx, 4), %edx - cmpl VEC_SIZE(%rsi, %rcx, 4), %edx - jmp L(wmemcmp_return) -# else - movzbl VEC_SIZE(%rdi, %rcx), %eax - movzbl VEC_SIZE(%rsi, %rcx), %edx - sub %edx, %eax -# endif + .p2align 4 +L(one_or_less): + jb L(zero) + movl (%rdi), %ecx + xorl %edx, %edx + cmpl (%rsi), %ecx + je L(zero) + setg %dl + leal -1(%rdx, %rdx), %eax ret +# else .p2align 4 -L(first_vec_x2): - tzcntl %eax, %ecx -# ifdef USE_AS_WMEMCMP - xorl %eax, %eax - movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx - cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx - jmp L(wmemcmp_return) -# else - movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax - movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx - sub %edx, %eax -# endif +L(between_4_7): + /* Load as big endian with overlapping movbe to avoid branches. + */ + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + jz L(zero_4_7) + sbbl %eax, %eax + orl $1, %eax +L(zero_4_7): ret +# endif + END (MEMCMP) #endif
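The largest single change in this last patch is the 4x-VEC reduction: instead of four vpcmp's whose masks are combined with kand, each pair of vectors is xored (equal vectors xor to zero), the results are folded with vpternlogd, and one vpcmp against the zeroed YMM0 yields the mismatch mask. The intrinsics sketch below is my own illustration of that reduction, assuming AVX512VL/AVX512BW (compile with -mavx512vl -mavx512bw) and a plain 4 x 32-byte block, ignoring wmemcmp and tail handling:

#include <immintrin.h>

/* Illustrative only: the function name and fixed layout are assumptions
   of this sketch, not glibc code.  */
static __mmask32
any_mismatch_4x_vec (const unsigned char *s1, const unsigned char *s2)
{
  /* XOR of matching 32-byte blocks is all zero.  */
  __m256i x0 = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) s1),
                                 _mm256_loadu_si256 ((const __m256i *) s2));
  __m256i x1 = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) (s1 + 32)),
                                 _mm256_loadu_si256 ((const __m256i *) (s2 + 32)));
  __m256i x2 = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) (s1 + 64)),
                                 _mm256_loadu_si256 ((const __m256i *) (s2 + 64)));
  __m256i x3 = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) (s1 + 96)),
                                 _mm256_loadu_si256 ((const __m256i *) (s2 + 96)));

  /* vpternlogd with immediate 0xfe computes A | B | C.  (The patch also
     uses immediate 0xde, which folds the last XOR and the OR into one
     instruction.)  */
  __m256i acc = _mm256_ternarylogic_epi32 (x0, x1, x2, 0xfe);
  acc = _mm256_ternarylogic_epi32 (acc, x3, x3, 0xfe);

  /* Nonzero mask <=> at least one mismatching byte in the 128 bytes.  */
  return _mm256_cmpneq_epu8_mask (acc, _mm256_setzero_si256 ());
}

On a mismatch the real code re-checks the individual xor results against the zeroed register to find which vector, and then which byte, differs.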