From patchwork Tue Aug 24 19:32:23 2021 X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 44785 Received: from localhost.localdomain (node-17-161.flex.volo.net.
[76.191.17.161]) by smtp.googlemail.com with ESMTPSA id h10sm10224923ilj.71.2021.08.24.12.32.37 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 24 Aug 2021 12:32:37 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1 1/5] string: Make tests birdirectional test-memcpy.c Date: Tue, 24 Aug 2021 15:32:23 -0400 Message-Id: <20210824193227.3474346-1-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20210824082753.3356637-1-goldstein.w.n@gmail.com> References: <20210824082753.3356637-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.3 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" This commit updates the memcpy tests to test both dst > src and dst < src. This is because there is logic in the code based on the condition. --- string/test-memcpy.c | 125 +++++++++++++++++++++++++++++++++--------- string/test-memmove.c | 73 +++++++++++++++++++++++- 2 files changed, 170 insertions(+), 28 deletions(-) diff --git a/string/test-memcpy.c b/string/test-memcpy.c index c9dfc88fed..705d79ba13 100644 --- a/string/test-memcpy.c +++ b/string/test-memcpy.c @@ -79,7 +79,7 @@ do_one_test (impl_t *impl, char *dst, const char *src, static void do_test (size_t align1, size_t align2, size_t len) { - size_t i, j; + size_t i, j, repeats; char *s1, *s2; align1 &= 4095; @@ -92,12 +92,14 @@ do_test (size_t align1, size_t align2, size_t len) s1 = (char *) (buf1 + align1); s2 = (char *) (buf2 + align2); + for (repeats = 0; repeats < 2; ++repeats) + { + for (i = 0, j = 1; i < len; i++, j += 23) + s1[i] = j; - for (i = 0, j = 1; i < len; i++, j += 23) - s1[i] = j; - - FOR_EACH_IMPL (impl, 0) - do_one_test (impl, s2, s1, len); + FOR_EACH_IMPL (impl, 0) + do_one_test (impl, s2, s1, len); + } } static void @@ -213,56 +215,88 @@ do_random_tests (void) } static void -do_test1 (size_t size) +do_test1 (size_t align1, size_t align2, size_t size) { void *large_buf; - large_buf = mmap (NULL, size * 2 + page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); + size_t mmap_size, region_size; + + align1 &= (page_size - 1); + if (align1 == 0) + align1 = page_size; + + align2 &= (page_size - 1); + if (align2 == 0) + align2 = page_size; + + region_size = (size + page_size - 1) & (~(page_size - 1)); + + mmap_size = region_size * 2 + 3 * page_size; + large_buf = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); if (large_buf == MAP_FAILED) { - puts ("Failed to allocat large_buf, skipping do_test1"); + puts ("Failed to allocate large_buf, skipping do_test1"); return; } - - if (mprotect (large_buf + size, page_size, PROT_NONE)) + if (mprotect (large_buf + region_size + page_size, page_size, PROT_NONE)) error (EXIT_FAILURE, errno, "mprotect failed"); - size_t arrary_size = size / sizeof (uint32_t); - uint32_t *dest = large_buf; - uint32_t *src = large_buf + size + page_size; + size_t array_size = size / sizeof 
(uint32_t); + uint32_t *dest = large_buf + align1; + uint32_t *src = large_buf + region_size + 2 * page_size + align2; size_t i; size_t repeats; for(repeats = 0; repeats < 2; repeats++) { - for (i = 0; i < arrary_size; i++) + for (i = 0; i < array_size; i++) src[i] = (uint32_t) i; - FOR_EACH_IMPL (impl, 0) { - printf ("\t\tRunning: %s\n", impl->name); + // printf ("\t\tRunning: %s\n", impl->name); memset (dest, -1, size); CALL (impl, (char *) dest, (char *) src, size); - for (i = 0; i < arrary_size; i++) + for (i = 0; i < array_size; i++) if (dest[i] != src[i]) { error (0, 0, "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"", impl->name, dest, src, i); ret = 1; - munmap ((void *) large_buf, size * 2 + page_size); + munmap ((void *) large_buf, mmap_size); return; } } - dest = src; - src = large_buf; + dest = large_buf + region_size + 2 * page_size + align1; + src = large_buf + align2; + } + munmap ((void *) large_buf, mmap_size); +} + +static void +do_random_large_tests (void) +{ + size_t i, align1, align2, size; + for (i = 0; i < 32; ++i) + { + align1 = random (); + align2 = random (); + size = (random() % 0x1000000) + 0x200000; + do_test1 (align1, align2, size); + } + + for (i = 0; i < 128; ++i) + { + align1 = random (); + align2 = random (); + size = (random() % 32768) + 4096; + do_test1 (align1, align2, size); } - munmap ((void *) large_buf, size * 2 + page_size); } int test_main (void) { - size_t i; + size_t i, j; test_init (); @@ -299,6 +333,7 @@ test_main (void) for (i = 19; i <= 25; ++i) { do_test (255, 0, 1 << i); + do_test (0, 4000, 1 << i); do_test (0, 255, i); do_test (0, 4000, i); } @@ -307,8 +342,46 @@ test_main (void) do_random_tests (); - do_test1 (0x100000); - do_test1 (0x2000000); + do_test1 (0, 0, 0x100000); + do_test1 (0, 0, 0x2000000); + + for (i = 4096; i < 32768; i += 4096) + { + for (j = 1; j <= 1024; j <<= 1) + { + do_test1 (0, j, i); + do_test1 (4095, j, i); + do_test1 (4096 - j, 0, i); + + do_test1 (0, j - 1, i); + do_test1 (4095, j - 1, i); + do_test1 (4096 - j - 1, 0, i); + + do_test1 (0, j + 1, i); + do_test1 (4095, j + 1, i); + do_test1 (4096 - j, 1, i); + } + } + + for (i = 0x300000; i < 0x2000000; i += 0x235689) + { + for (j = 64; j <= 1024; j <<= 1) + { + do_test1 (0, j, i); + do_test1 (4095, j, i); + do_test1 (4096 - j, 0, i); + + do_test1 (0, j - 1, i); + do_test1 (4095, j - 1, i); + do_test1 (4096 - j - 1, 0, i); + + do_test1 (0, j + 1, i); + do_test1 (4095, j + 1, i); + do_test1 (4096 - j, 1, i); + } + } + + do_random_large_tests (); return ret; } diff --git a/string/test-memmove.c b/string/test-memmove.c index 670094c9dc..5ba79acf61 100644 --- a/string/test-memmove.c +++ b/string/test-memmove.c @@ -101,11 +101,11 @@ do_test (size_t align1, size_t align2, size_t len) size_t i, j; char *s1, *s2; - align1 &= 63; + align1 &= (getpagesize() - 1); if (align1 + len >= page_size) return; - align2 &= 63; + align2 &= (getpagesize() - 1); if (align2 + len >= page_size) return; @@ -356,6 +356,51 @@ do_test3 (size_t bytes_move, size_t offset) munmap ((void *) buf, size); } +static void +do_test4 (size_t bytes_move, size_t offset1, size_t offset2) +{ + size_t size, repeats, i; + uint8_t *buf, *dst, *src; + + size = bytes_move + MAX(offset1, offset2); + buf = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + + if (buf == MAP_FAILED) + error (EXIT_UNSUPPORTED, errno, "mmap failed"); + + dst = &buf[offset1]; + src = &buf[offset2]; + for (repeats = 0; repeats < 2; ++repeats) + { + FOR_EACH_IMPL (impl, 0) + { + for (i = 0; i < 
bytes_move; i++) + src[i] = (uint8_t) i; +#ifdef TEST_BCOPY + CALL (impl, (char *) src, (char *) dst, bytes_move); +#else + CALL (impl, (char *) dst, (char *) src, bytes_move); +#endif + for (i = 0; i < bytes_move; i++) + { + if (dst[i] != (uint8_t) i) + { + error (0, 0, + "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zd\"", + impl->name, dst, buf, i); + ret = 1; + break; + } + } + } + dst = &buf[offset2]; + src = &buf[offset1]; + } + munmap ((void *) buf, size); +} + + int test_main (void) { @@ -396,13 +441,37 @@ test_main (void) do_random_tests (); + do_test2 (0); do_test2 (33); do_test2 (0x200000); + do_test2 (0x200000 - 1); + do_test2 (0x200000 + 1); + do_test2 (0x1000000 + 1); do_test2 (0x4000000 - 1); do_test2 (0x4000000); + /* Copy 16KB data. */ do_test3 (16384, 3); + for (i = 4096; i <= 16384; i <<= 1) + { + do_test4 (i, 0, i); + do_test4 (i, 0, i - 1); + do_test4 (i, 0, i + 1); + do_test4 (i, 63, i + 63); + do_test4 (i, 63, i + 64); + do_test4 (i, 63, i); + + do_test4 (i, 0, 1); + do_test4 (i, 0, 15); + do_test4 (i, 0, 31); + do_test4 (i, 0, 63); + do_test4 (i, 0, 64); + do_test4 (i, 0, 65); + do_test4 (i, 0, 127); + do_test4 (i, 0, 129); + } + return ret; } From patchwork Tue Aug 24 19:32:24 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 44789 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 379543858020 for ; Tue, 24 Aug 2021 19:36:46 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 379543858020 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1629833806; bh=vtR/+cqij0b5cH6cLp/K2mBFLxE3d22yRFVA52K5/zQ=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=mQGMlpY669q1L46qCaysOiRus4caZh+Z55cMug43g24Q0iD8f1bMwTu4Jxx3oynAJ 9UmUwUo4QdW/98UaMfWr48qS2RQ0mFQhNFu4jrKqzFrtJJWrlOm4hZnVojbwaym8c7 OqVa5OofYgnSlSDe1oFKcY7z1Z4be0jeEvyNz/J0= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-il1-x132.google.com (mail-il1-x132.google.com [IPv6:2607:f8b0:4864:20::132]) by sourceware.org (Postfix) with ESMTPS id 49C7E3858426 for ; Tue, 24 Aug 2021 19:32:40 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 49C7E3858426 Received: by mail-il1-x132.google.com with SMTP id j15so21650296ila.1 for ; Tue, 24 Aug 2021 12:32:40 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=vtR/+cqij0b5cH6cLp/K2mBFLxE3d22yRFVA52K5/zQ=; b=Y+A/UNrruzYadflr4StuYnti8S+7WPNuowzCOJNjBlwTVZGyOA9a/weTuSyGGis8jq NKO2CzbcfQzIaLleTQ82LQuPe/+QgY89auwCTEULYuN9UxdpqaRC9hIhM1IMNJuOdhVc mPMpJkeDevLlwZzTuy2EMZIgqichGvGH26Hsuc4G40KftcTAYgc5ILU/rP++zxpL6S7O EIJ0rtvm37SuBUsTZyN9Zo3jxO9LRhKEJfWOsXsFquT0gj/zI9nCnJ2rqqgFACFpqMO3 wZM5uDL962hmndyqvct9afRBDQ1COsJxc8ntMRT4SkAC87gyds9fu5hF9ubni2RkqTpI tTmg== X-Gm-Message-State: AOAM531es2S8DsKndytjnKhWKGwgTnFN30GNRt0vRQbQSe+nOi0eASUf 1hVqR6fiU0c8AaZtCgK+2mBb52svmSvfAg== X-Google-Smtp-Source: ABdhPJz9dHpqgQnezQyclDIUd8h8r7erb5vweDcoA26nYWv3QKikvE63WB17twE1TbTwqizWJeaYeg== X-Received: by 2002:a92:611:: with SMTP id x17mr27225596ilg.41.1629833559497; Tue, 24 Aug 2021 
12:32:39 -0700 (PDT) Received: from localhost.localdomain (node-17-161.flex.volo.net. [76.191.17.161]) by smtp.googlemail.com with ESMTPSA id h10sm10224923ilj.71.2021.08.24.12.32.39 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 24 Aug 2021 12:32:39 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1 2/5] benchtests: Add new random cases to bench-memcpy-random.c Date: Tue, 24 Aug 2021 15:32:24 -0400 Message-Id: <20210824193227.3474346-2-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20210824193227.3474346-1-goldstein.w.n@gmail.com> References: <20210824082753.3356637-1-goldstein.w.n@gmail.com> <20210824193227.3474346-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.3 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" This commit adds three new benchmarks for the SPEC2017 distribution. One randomized if dst > src and the other two set it either 1/0. As well add some tests for fixed sizes with randomize alignment and value of dst > src. This can be useful for testing different alignment configurations. --- benchtests/bench-memcpy-random.c | 103 +++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 11 deletions(-) diff --git a/benchtests/bench-memcpy-random.c b/benchtests/bench-memcpy-random.c index c490b73ed0..eeeef42fc1 100644 --- a/benchtests/bench-memcpy-random.c +++ b/benchtests/bench-memcpy-random.c @@ -16,7 +16,8 @@ License along with the GNU C Library; if not, see . */ -#define MIN_PAGE_SIZE (512*1024+getpagesize()) +#define MAX_TEST_SIZE (512*1024) +#define MIN_PAGE_SIZE (3*MAX_TEST_SIZE+getpagesize()) #define TEST_MAIN #define TEST_NAME "memcpy" #include "bench-string.h" @@ -89,9 +90,12 @@ static align_data_t dst_align_freq[] = typedef struct { - uint64_t src : 24; - uint64_t dst : 24; - uint64_t len : 16; +/* 26 bits for src and dst so we have extra bit for alternating dst > + src without a branch. */ + uint64_t src : 26; + uint64_t dst : 26; +/* For size < 4096 12 bits is enough. 
*/ + uint64_t len : 12; } copy_t; static copy_t copy[MAX_COPIES]; @@ -142,34 +146,100 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src, } static void -do_test (json_ctx_t *json_ctx, size_t max_size) +do_one_fixed_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src, + copy_t *copy, size_t n, size_t size) { - int i; + timing_t start, stop, cur; + size_t iters = INNER_LOOP_ITERS_SMALL; + + for (int j = 0; j < n; j++) + CALL (impl, dst + copy[j].dst, src + copy[j].src, size); + + TIMING_NOW (start); + for (int i = 0; i < iters; ++i) + for (int j = 0; j < n; j++) + CALL (impl, dst + copy[j].dst, src + copy[j].src, size); + TIMING_NOW (stop); + + TIMING_DIFF (cur, start, stop); + + json_element_double (json_ctx, (double) cur / (double) iters); +} + + +static size_t +init_copy(size_t max_size, int dst_gt_src) +{ + size_t i, dst_offset, src_offset; + if (dst_gt_src <= 0) + { + dst_offset = 0; + src_offset = max_size; + } + else + { + dst_offset = max_size; + src_offset = 0; + } - memset (buf1, 1, max_size); /* Create a random set of copies with the given size and alignment distributions. */ for (i = 0; i < MAX_COPIES; i++) { + dst_offset = dst_gt_src == -1 + ? (rand() & 1) ? max_size : 0 + : dst_offset; copy[i].dst = (rand () & (max_size - 1)); copy[i].dst &= ~dst_align_arr[rand () & ALIGN_MASK]; + copy[i].dst += dst_offset; copy[i].src = (rand () & (max_size - 1)); copy[i].src &= ~src_align_arr[rand () & ALIGN_MASK]; + copy[i].src += src_offset; copy[i].len = size_arr[rand () & SIZE_MASK]; } + memset (buf1, 1, 3 * max_size); + return i; +} + +static void +do_test (json_ctx_t *json_ctx, size_t max_size, int dst_gt_src) +{ + size_t n; + n = init_copy(max_size, dst_gt_src); + json_element_object_begin (json_ctx); + json_attr_uint (json_ctx, "region-size", (double) 3 * max_size); + json_attr_int (json_ctx, "dst > src", (double) dst_gt_src); + json_attr_uint (json_ctx, "with-fixed-size", (double) 0); + json_array_begin (json_ctx, "timings"); + + FOR_EACH_IMPL (impl, 0) + do_one_test (json_ctx, impl, (char *) buf1, (char *) buf1, copy, n); + + json_array_end (json_ctx); + json_element_object_end (json_ctx); +} +static void +do_test_fixed_size (json_ctx_t *json_ctx, size_t size, size_t max_size, int dst_gt_src) +{ + size_t n; + n = init_copy(3 * max_size, dst_gt_src); json_element_object_begin (json_ctx); - json_attr_uint (json_ctx, "length", (double) max_size); + json_attr_uint (json_ctx, "region-size", (double) 3 * max_size); + json_attr_int (json_ctx, "dst > src", (double) dst_gt_src); + json_attr_uint (json_ctx, "with-fixed-size", (double) 1); + json_attr_uint (json_ctx, "size", (double) size); json_array_begin (json_ctx, "timings"); FOR_EACH_IMPL (impl, 0) - do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, i); + do_one_fixed_test (json_ctx, impl, (char *) buf1, (char *) buf1, copy, n, size); json_array_end (json_ctx); json_element_object_end (json_ctx); } + int test_main (void) { @@ -193,8 +263,19 @@ test_main (void) json_array_end (&json_ctx); json_array_begin (&json_ctx, "results"); - for (int i = 4; i <= 512; i = i * 2) - do_test (&json_ctx, i * 1024); + for (int i = 4096; i < MAX_TEST_SIZE; i = i * 2) + { + do_test (&json_ctx, i, 0); + do_test (&json_ctx, i, 1); + do_test (&json_ctx, i, -1); + } + + for (int i = 4096; i <= 65536; i = i * 2) + { + do_test_fixed_size (&json_ctx, i, i, 0); + do_test_fixed_size (&json_ctx, i, i, 1); + do_test_fixed_size (&json_ctx, i, i, -1); + } json_array_end (&json_ctx); json_attr_object_end (&json_ctx); From 
patchwork Tue Aug 24 19:32:25 2021 X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 44786 Received: from localhost.localdomain (node-17-161.flex.volo.net.
[76.191.17.161]) by smtp.googlemail.com with ESMTPSA id h10sm10224923ilj.71.2021.08.24.12.32.40 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 24 Aug 2021 12:32:40 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1 3/5] benchtests: Add partial overlap case in bench-memmove-walk.c Date: Tue, 24 Aug 2021 15:32:25 -0400 Message-Id: <20210824193227.3474346-3-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20210824193227.3474346-1-goldstein.w.n@gmail.com> References: <20210824082753.3356637-1-goldstein.w.n@gmail.com> <20210824193227.3474346-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.3 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" This commit adds a new partial overlap benchmark. This is generally the most interesting performance case for memmove and was missing. --- benchtests/bench-memmove-walk.c | 67 ++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/benchtests/bench-memmove-walk.c b/benchtests/bench-memmove-walk.c index b5fdb2a422..18b716f5cb 100644 --- a/benchtests/bench-memmove-walk.c +++ b/benchtests/bench-memmove-walk.c @@ -36,6 +36,10 @@ # define TIMEOUT (20 * 60) # include "bench-string.h" +#define NO_OVERLAP 0 +#define PARTIAL_OVERLAP 1 +#define COMPLETE_OVERLAP 2 + IMPL (memmove, 1) #endif @@ -66,20 +70,40 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src, } static void -do_test (json_ctx_t *json_ctx, size_t len, bool overlap) +do_test (json_ctx_t *json_ctx, size_t len, int overlap, int both_ways) { - json_element_object_begin (json_ctx); - json_attr_uint (json_ctx, "length", (double) len); - json_array_begin (json_ctx, "timings"); - - if (overlap) - buf2 = buf1; - - FOR_EACH_IMPL (impl, 0) - do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len); - - json_array_end (json_ctx); - json_element_object_end (json_ctx); + char *s1, *s2, *tmp; + size_t repeats; + + s1 = (char *) (buf1); + s2 = (char *) (buf2); + if (overlap != NO_OVERLAP) + s2 = s1; + if (overlap == PARTIAL_OVERLAP) + s2 += len / 2; + + for (repeats = both_ways ? 2 : 1; repeats; --repeats) + { + json_element_object_begin (json_ctx); + json_attr_uint (json_ctx, "length", (double) len); + json_attr_string(json_ctx, "overlap", + overlap == NO_OVERLAP ? "none" + : overlap == PARTIAL_OVERLAP ? "partial" + : "complete"); + json_attr_uint (json_ctx, "dst > src", (double) (s2 > s1)); + json_array_begin (json_ctx, "timings"); + + + FOR_EACH_IMPL (impl, 0) + do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len); + + json_array_end (json_ctx); + json_element_object_end (json_ctx); + + tmp = s1; + s1 = s2; + s2 = tmp; + } } int @@ -107,15 +131,22 @@ test_main (void) /* Non-overlapping buffers. 
*/ for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1) { - do_test (&json_ctx, i, false); - do_test (&json_ctx, i + 1, false); + do_test (&json_ctx, i, NO_OVERLAP, 1); + do_test (&json_ctx, i + 1, NO_OVERLAP, 1); + } + + /* Partially-overlapping buffers. */ + for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE / 2; i <<= 1) + { + do_test (&json_ctx, i, PARTIAL_OVERLAP, 1); + do_test (&json_ctx, i + 1, PARTIAL_OVERLAP, 1); } - /* Overlapping buffers. */ + /* Complete-overlapping buffers. */ for (size_t i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1) { - do_test (&json_ctx, i, true); - do_test (&json_ctx, i + 1, true); + do_test (&json_ctx, i, COMPLETE_OVERLAP, 0); + do_test (&json_ctx, i + 1, COMPLETE_OVERLAP, 0); } json_array_end (&json_ctx); From patchwork Tue Aug 24 19:32:26 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 44787 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 1EFA63857C5B for ; Tue, 24 Aug 2021 19:35:13 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 1EFA63857C5B DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1629833713; bh=BAQfAEzvQakxpVlR8YEAI3dwiQj3gaBhWPdOQ79RLsM=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=enApI/4jKQIphQhVKIFwfy1f+tAQjt6ZmPjanqEuLFf4uQnIVyIpmWqAO98DcbnTE XFjt7EVAv3bzsA2Q1/OjXxk550nCikVXBcbXx7ZXu8ySMhN1w2d9CwL3XQJ0CHSZP+ AxeR2NKAuVTWFWLnlFIDBccQC8zQAnYTxcZ7hjl8= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-io1-xd2f.google.com (mail-io1-xd2f.google.com [IPv6:2607:f8b0:4864:20::d2f]) by sourceware.org (Postfix) with ESMTPS id AA6BF3858416 for ; Tue, 24 Aug 2021 19:32:42 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org AA6BF3858416 Received: by mail-io1-xd2f.google.com with SMTP id a21so27793749ioq.6 for ; Tue, 24 Aug 2021 12:32:42 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=BAQfAEzvQakxpVlR8YEAI3dwiQj3gaBhWPdOQ79RLsM=; b=t3FY/sa6jd35zNGJgXzKbVA5zefgzPskE7GecmwhjC+nwLs9+VMD9dWDw95wu5xkbA 0kM6jxpx83Dx/py/ButaKteN2Z/Q6xpRXJILd8Z79P5QrTuiqoKa6km2TK0fxKWyH3Dw gCqsGYmk9aTSEKT0t6w1ba+Yf3ipTfq2zIT/+YI94CZ/qAoQqyAEOQ8DeXNPbcIj3UTK UboqppS0/wFpRtuZzaKAujvbeTX8+Nj0myt6i1OGzuGOzk50mtcWsck5iWR3Nrn8rPMQ zsgkInb2DIey0GI2mtmMoLAvdRjmHdse9TnY/RFw8mxhhvKtI+uRLcYzvR29EMBDU+Lq hGzg== X-Gm-Message-State: AOAM533CQcbsCqH3ZwIWjc3g33cXydEtXIA5rMiQgQ5Lx0ERxSIelNIz HrEYraCMs+0YYdBkjfqOQlFsUEqhVPGBeg== X-Google-Smtp-Source: ABdhPJzZPHTsE7a/rfgAUKeZk1OzKvrXwFJA2WFxTusPqdqvd4yJwS9Sqhjz8R8Xf5R4jGHaruPtzQ== X-Received: by 2002:a5d:8d06:: with SMTP id p6mr33443206ioj.7.1629833561981; Tue, 24 Aug 2021 12:32:41 -0700 (PDT) Received: from localhost.localdomain (node-17-161.flex.volo.net. 
[76.191.17.161]) by smtp.googlemail.com with ESMTPSA id h10sm10224923ilj.71.2021.08.24.12.32.41 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 24 Aug 2021 12:32:41 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1 4/5] benchtests: Add additional cases to bench-memcpy.c and bench-memmove.c Date: Tue, 24 Aug 2021 15:32:26 -0400 Message-Id: <20210824193227.3474346-4-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20210824193227.3474346-1-goldstein.w.n@gmail.com> References: <20210824082753.3356637-1-goldstein.w.n@gmail.com> <20210824193227.3474346-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.3 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" This commit adds more benchmarks for the common memcpy/memmove benchmarks. The most signifcant cases are the half page offsets. The current versions leaves dst and src near page aligned which leads to false 4k aliasing on x86_64. This can add noise due to false dependencies from one run to the next. As well, this seems like more of an edge case that common case so it shouldn't be the only thing benchmarked. --- benchtests/bench-memcpy.c | 42 ++++++++++++++++++++++++++++++++++---- benchtests/bench-memmove.c | 21 +++++++++++++++++-- 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c index d9236a2282..b9e661c997 100644 --- a/benchtests/bench-memcpy.c +++ b/benchtests/bench-memcpy.c @@ -60,11 +60,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len, size_t i, j; char *s1, *s2; size_t repeats; - align1 &= 63; + align1 &= (getpagesize () - 1); if (align1 + len >= page_size) return; - align2 &= 63; + align2 &= (getpagesize () - 1); if (align2 + len >= page_size) return; @@ -99,7 +99,7 @@ test_main (void) { json_ctx_t json_ctx; size_t i; - + size_t half_page = getpagesize () / 2; test_init (); json_init (&json_ctx, 0, stdout); @@ -121,8 +121,15 @@ test_main (void) { do_test (&json_ctx, 0, 0, 1 << i, 1); do_test (&json_ctx, i, 0, 1 << i, 1); + do_test (&json_ctx, i + 32, 0, 1 << i, 1); do_test (&json_ctx, 0, i, 1 << i, 1); + do_test (&json_ctx, 0, i + 32, 1 << i, 1); do_test (&json_ctx, i, i, 1 << i, 1); + do_test (&json_ctx, i + 32, i + 32, 1 << i, 1); + do_test (&json_ctx, half_page, 0, 1 << i, 1); + do_test (&json_ctx, half_page + i, 0, 1 << i, 1); + do_test (&json_ctx, half_page, i, 1 << i, 1); + do_test (&json_ctx, half_page + i, i, 1 << i, 1); } for (i = 0; i < 32; ++i) @@ -131,6 +138,12 @@ test_main (void) do_test (&json_ctx, i, 0, i, 0); do_test (&json_ctx, 0, i, i, 0); do_test (&json_ctx, i, i, i, 0); + do_test (&json_ctx, half_page, 0, i, 0); + do_test (&json_ctx, half_page + i, 0, i, 0); + do_test (&json_ctx, half_page, i, i, 0); + do_test (&json_ctx, half_page + i, i, i, 0); + do_test (&json_ctx, getpagesize () - 1, 0, i, 0); + do_test (&json_ctx, 0, getpagesize () - 
1, i, 0); } for (i = 3; i < 32; ++i) @@ -141,6 +154,10 @@ test_main (void) do_test (&json_ctx, i, 0, 16 * i, 1); do_test (&json_ctx, 0, i, 16 * i, 1); do_test (&json_ctx, i, i, 16 * i, 1); + do_test (&json_ctx, half_page, 0, 16 * i, 1); + do_test (&json_ctx, half_page + i, 0, 16 * i, 1); + do_test (&json_ctx, half_page, i, 16 * i, 1); + do_test (&json_ctx, half_page + i, i, 16 * i, 1); } for (i = 32; i < 64; ++i) @@ -149,16 +166,33 @@ test_main (void) do_test (&json_ctx, i, 0, 32 * i, 1); do_test (&json_ctx, 0, i, 32 * i, 1); do_test (&json_ctx, i, i, 32 * i, 1); + do_test (&json_ctx, half_page, 0, 32 * i, 1); + do_test (&json_ctx, half_page + i, 0, 32 * i, 1); + do_test (&json_ctx, half_page, i, 32 * i, 1); + do_test (&json_ctx, half_page + i, i, 32 * i, 1); } do_test (&json_ctx, 0, 0, getpagesize (), 1); - for (i = 0; i <= 32; ++i) + for (i = 0; i <= 48; ++i) { do_test (&json_ctx, 0, 0, 2048 + 64 * i, 1); do_test (&json_ctx, i, 0, 2048 + 64 * i, 1); + do_test (&json_ctx, i + 32, 0, 2048 + 64 * i, 1); do_test (&json_ctx, 0, i, 2048 + 64 * i, 1); + do_test (&json_ctx, 0, i + 32, 2048 + 64 * i, 1); do_test (&json_ctx, i, i, 2048 + 64 * i, 1); + do_test (&json_ctx, i + 32, i + 32, 2048 + 64 * i, 1); + do_test (&json_ctx, half_page, 0, 2048 + 64 * i, 1); + do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i, 1); + do_test (&json_ctx, half_page, i, 2048 + 64 * i, 1); + do_test (&json_ctx, half_page + i, i, 2048 + 64 * i, 1); + do_test (&json_ctx, i, 1, 2048 + 64 * i, 1); + do_test (&json_ctx, 1, i, 2048 + 64 * i, 1); + do_test (&json_ctx, i + 32, 1, 2048 + 64 * i, 1); + do_test (&json_ctx, 1, i + 32, 2048 + 64 * i, 1); + do_test (&json_ctx, half_page + i, 1, 2048 + 64 * i, 1); + do_test (&json_ctx, half_page + 1, i, 2048 + 64 * i, 1); } json_array_end (&json_ctx); diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c index 6becbf4782..bec1455f7b 100644 --- a/benchtests/bench-memmove.c +++ b/benchtests/bench-memmove.c @@ -53,11 +53,11 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len) size_t i, j; char *s1, *s2; - align1 &= 63; + align1 &= (getpagesize () - 1); if (align1 + len >= page_size) return; - align2 &= 63; + align2 &= (getpagesize () - 1); if (align2 + len >= page_size) return; @@ -85,6 +85,7 @@ test_main (void) { json_ctx_t json_ctx; size_t i; + size_t half_page = getpagesize () / 2; test_init (); @@ -138,6 +139,22 @@ test_main (void) do_test (&json_ctx, i, i, 32 * i); } + for (i = 0; i <= 48; ++i) + { + do_test (&json_ctx, 0, 0, 2048 + 64 * i); + do_test (&json_ctx, i, 0, 2048 + 64 * i); + do_test (&json_ctx, 0, i, 2048 + 64 * i); + do_test (&json_ctx, i, i, 2048 + 64 * i); + do_test (&json_ctx, half_page, 0, 2048 + 64 * i); + do_test (&json_ctx, 0, half_page, 2048 + 64 * i); + do_test (&json_ctx, half_page + i, 0, 2048 + 64 * i); + do_test (&json_ctx, i, half_page, 2048 + 64 * i); + do_test (&json_ctx, half_page, i, 2048 + 64 * i); + do_test (&json_ctx, 0, half_page + i, 2048 + 64 * i); + do_test (&json_ctx, half_page + i, i, 2048 + 64 * i); + do_test (&json_ctx, i, half_page + i, 2048 + 64 * i); + } + json_array_end (&json_ctx); json_attr_object_end (&json_ctx); json_attr_object_end (&json_ctx); From patchwork Tue Aug 24 19:32:27 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 44788 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by 
sourceware.org (Postfix) with ESMTP id 64831385802B for ; Tue, 24 Aug 2021 19:35:57 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 64831385802B DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1629833757; bh=V0txMAggnTnP28wpK0VkPjsxcma7P6ft1m0i8THpeUI=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=AFnBVFpSHk1jwrAU1O471O+jKer0ZDpnyFqVmxFJTHRJwqrGq0lg7xLEKpR+kbWlB syys5330UIN4WE4w27KPyh6tY1oO54GGUgf0wl8IHQYej8NzlFymwgjdWuC4Ueretd f6+mLBdfRzoOC9chdx8D/QEJuPWKk/2KbuR6o0U8= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-il1-x12a.google.com (mail-il1-x12a.google.com [IPv6:2607:f8b0:4864:20::12a]) by sourceware.org (Postfix) with ESMTPS id 166D23858038 for ; Tue, 24 Aug 2021 19:32:44 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 166D23858038 Received: by mail-il1-x12a.google.com with SMTP id z2so21680336iln.0 for ; Tue, 24 Aug 2021 12:32:44 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=V0txMAggnTnP28wpK0VkPjsxcma7P6ft1m0i8THpeUI=; b=IsD4G5sZnrCMzkKmFDDtqF08DEkOQSgvM1uMEQVvecmCVSPElCPFCWIAn7hfv2FJAo xIkbCNzh1rrcZImAGk1JM6UiiyKU8ty4zE0jVB/MFksrsC+DoY2QP0N5TNKfDKt/Q/aN 2p1dWJjhosp7OM06ASIRegQie7a2A0wOijL6QoqxAyT2X5T+ROCsIY9bK+lNyZhIPh6C qYs90/dNhHxuw+nOm5tJKDzreAvHMR8wl4R8it1pxeDpcfOc5nANJcEup9d21PbBrwjQ UydfwclQSCJcW/ohjsYMDNyaSUTKHNUAOmimUC9Hc9GDOfP6MAZLyAMFTL8jcupBdbnz fOYw== X-Gm-Message-State: AOAM531iuGjthpR1KxA/PCClBceG5AZSq+SFVssnxsINKvFaBFwpfp4L hnnl4K+1fqWsiac+HwOJq39CuoOuJ3lhRw== X-Google-Smtp-Source: ABdhPJz5/KkK/XBN0ZsbaThRftwXEQQdoalcVGL9DFPL8kBvCNDZ3Ss0MjzAh+vWPkGia9viMstOTg== X-Received: by 2002:a92:c04e:: with SMTP id o14mr26706406ilf.289.1629833563293; Tue, 24 Aug 2021 12:32:43 -0700 (PDT) Received: from localhost.localdomain (node-17-161.flex.volo.net. [76.191.17.161]) by smtp.googlemail.com with ESMTPSA id h10sm10224923ilj.71.2021.08.24.12.32.42 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 24 Aug 2021 12:32:43 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S Date: Tue, 24 Aug 2021 15:32:27 -0400 Message-Id: <20210824193227.3474346-5-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20210824193227.3474346-1-goldstein.w.n@gmail.com> References: <20210824082753.3356637-1-goldstein.w.n@gmail.com> <20210824193227.3474346-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-11.6 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_STOCKGEN, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" No bug. This commit optimizes memmove-vec-unaligned.S. 
The optimizations are in descending order of importance to the L(less_vec), L(movsb), the 8x forward/backward loops and various target alignments that have minimal code size impact. The L(less_vec) optimizations are to: 1. Readjust the branch order to either given hotter paths a fall through case or have less branches in there way. 2. Moderately change the size classes to make hot branches hotter and thus increase predictability. 3. Try and minimize branch aliasing to avoid BPU thrashing based misses. 4. 64 byte the prior function entry. This is to avoid cases where seemingly unrelated changes end up have severe negative performance impacts. The L(movsb) optimizations are to: 1. Reduce the number of taken branches needed to determine if movsb should be used. 2. 64 byte align either dst if the CPU has fsrm or if dst and src do not 4k alias. 3. 64 byte align src if the CPU does not have fsrm and dst and src do 4k alias. The 8x forward/backward loop optimizations are to: 1. Reduce instructions needed for aligning to VEC_SIZE. 2. Reduce uops and code size of the loops. All tests in string/ passing. --- sysdeps/x86/sysdep.h | 13 +- .../multiarch/memmove-vec-unaligned-erms.S | 484 +++++++++++------- 2 files changed, 317 insertions(+), 180 deletions(-) diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h index cac1d762fb..9226d2c6c9 100644 --- a/sysdeps/x86/sysdep.h +++ b/sysdeps/x86/sysdep.h @@ -78,15 +78,18 @@ enum cf_protection_level #define ASM_SIZE_DIRECTIVE(name) .size name,.-name; /* Define an entry point visible from C. */ -#define ENTRY(name) \ - .globl C_SYMBOL_NAME(name); \ - .type C_SYMBOL_NAME(name),@function; \ - .align ALIGNARG(4); \ +#define P2ALIGN_ENTRY(name, alignment) \ + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ + .align ALIGNARG(alignment); \ C_LABEL(name) \ cfi_startproc; \ - _CET_ENDBR; \ + _CET_ENDBR; \ CALL_MCOUNT +#define ENTRY(name) P2ALIGN_ENTRY(name, 4) + + #undef END #define END(name) \ cfi_endproc; \ diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index 9f02624375..75b6efe969 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -165,6 +165,32 @@ # error Invalid LARGE_LOAD_SIZE #endif +/* Whether to align before movsb. Ultimately we want 64 byte align + and not worth it to load 4x VEC for VEC_SIZE == 16. */ +#define ALIGN_MOVSB (VEC_SIZE > 16) + +/* Number of VECs to align movsb to. */ +#if VEC_SIZE == 64 +# define MOVSB_ALIGN_TO (VEC_SIZE) +#else +# define MOVSB_ALIGN_TO (VEC_SIZE * 2) +#endif + +/* Macro for copying inclusive power of 2 range with two register + loads. */ +#define COPY_BLOCK(mov_inst, src_reg, dst_reg, size_reg, len, tmp_reg0, tmp_reg1) \ + mov_inst (%src_reg), %tmp_reg0; \ + mov_inst -(len)(%src_reg, %size_reg), %tmp_reg1; \ + mov_inst %tmp_reg0, (%dst_reg); \ + mov_inst %tmp_reg1, -(len)(%dst_reg, %size_reg); + +/* Define all copies used by L(less_vec) for VEC_SIZE of 16, 32, or + 64. */ +#define COPY_4_8 COPY_BLOCK(movl, rsi, rdi, rdx, 4, ecx, esi) +#define COPY_8_16 COPY_BLOCK(movq, rsi, rdi, rdx, 8, rcx, rsi) +#define COPY_16_32 COPY_BLOCK(vmovdqu, rsi, rdi, rdx, 16, xmm0, xmm1) +#define COPY_32_64 COPY_BLOCK(vmovdqu64, rsi, rdi, rdx, 32, ymm16, ymm17) + #ifndef SECTION # error SECTION is not defined! 
#endif @@ -198,7 +224,13 @@ L(start): movl %edx, %edx # endif cmp $VEC_SIZE, %RDX_LP + /* Based on SPEC2017 distribution both 16 and 32 memcpy calls are + really hot so we want them to take the same branch path. */ +#if VEC_SIZE > 16 + jbe L(less_vec) +#else jb L(less_vec) +#endif cmp $(VEC_SIZE * 2), %RDX_LP ja L(more_2x_vec) #if !defined USE_MULTIARCH || !IS_IN (libc) @@ -206,15 +238,10 @@ L(last_2x_vec): #endif /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU (%rsi), %VEC(0) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1) VMOVU %VEC(0), (%rdi) - VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) -#if !defined USE_MULTIARCH || !IS_IN (libc) -L(nop): - ret -#else + VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx) VZEROUPPER_RETURN -#endif #if defined USE_MULTIARCH && IS_IN (libc) END (MEMMOVE_SYMBOL (__memmove, unaligned)) @@ -289,7 +316,9 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) # endif -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) +/* Cache align entry so that branch heavy L(less_vec) maintains good + alignment. */ +P2ALIGN_ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6) movq %rdi, %rax L(start_erms): # ifdef __ILP32__ @@ -297,123 +326,217 @@ L(start_erms): movl %edx, %edx # endif cmp $VEC_SIZE, %RDX_LP + /* Based on SPEC2017 distribution both 16 and 32 memcpy calls are + really hot so we want them to take the same branch path. */ +# if VEC_SIZE > 16 + jbe L(less_vec) +# else jb L(less_vec) +# endif cmp $(VEC_SIZE * 2), %RDX_LP ja L(movsb_more_2x_vec) L(last_2x_vec): - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU (%rsi), %VEC(0) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1) VMOVU %VEC(0), (%rdi) - VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx) L(return): -#if VEC_SIZE > 16 +# if VEC_SIZE > 16 ZERO_UPPER_VEC_REGISTERS_RETURN -#else +# else ret +# endif #endif +#if VEC_SIZE == 64 +L(copy_8_15): + COPY_8_16 + ret -L(movsb): - cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP - jae L(more_8x_vec) - cmpq %rsi, %rdi - jb 1f - /* Source == destination is less common. */ - je L(nop) - leaq (%rsi,%rdx), %r9 - cmpq %r9, %rdi - /* Avoid slow backward REP MOVSB. */ - jb L(more_8x_vec_backward) -# if AVOID_SHORT_DISTANCE_REP_MOVSB - andl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) - jz 3f - movq %rdi, %rcx - subq %rsi, %rcx - jmp 2f -# endif -1: -# if AVOID_SHORT_DISTANCE_REP_MOVSB - andl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) - jz 3f - movq %rsi, %rcx - subq %rdi, %rcx -2: -/* Avoid "rep movsb" if RCX, the distance between source and destination, - is N*4GB + [1..63] with N >= 0. */ - cmpl $63, %ecx - jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */ -3: -# endif - mov %RDX_LP, %RCX_LP - rep movsb -L(nop): +L(copy_33_63): + COPY_32_64 ret #endif - + /* Only worth aligning if near end of 16 byte block and won't get + first branch in first decode after jump. */ + .p2align 4,, 6 L(less_vec): - /* Less than 1 VEC. */ #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 # error Unsupported VEC_SIZE! #endif -#if VEC_SIZE > 32 - cmpb $32, %dl - jae L(between_32_63) + /* Second set of branches for smallest copies. 
*/ + cmpl $(VEC_SIZE / 4), %edx + jb L(less_quarter_vec) + + cmpl $(VEC_SIZE / 2), %edx +#if VEC_SIZE == 64 + /* We branch to [33, 63] instead of [16, 32] to give [16, 32] fall + through path as [16, 32] is hotter. */ + ja L(copy_33_63) + COPY_16_32 +#elif VEC_SIZE == 32 + /* Branch to [8, 15]. Fall through to [16, 32]. */ + jb L(copy_8_15) + COPY_16_32 +#else + /* Branch to [4, 7]. Fall through to [8, 15]. */ + jb L(copy_4_7) + COPY_8_16 #endif -#if VEC_SIZE > 16 - cmpb $16, %dl - jae L(between_16_31) -#endif - cmpb $8, %dl - jae L(between_8_15) - cmpb $4, %dl - jae L(between_4_7) - cmpb $1, %dl - ja L(between_2_3) - jb 1f + ret + /* Align if won't cost too many bytes. */ + .p2align 4,, 6 +L(copy_4_7): + COPY_4_8 + ret + + /* Cold target. No need to align. */ +L(copy_1): movzbl (%rsi), %ecx movb %cl, (%rdi) -1: ret + + /* Colder copy case for [0, VEC_SIZE / 4 - 1]. */ +L(less_quarter_vec): #if VEC_SIZE > 32 -L(between_32_63): - /* From 32 to 63. No branch when size == 32. */ - VMOVU (%rsi), %YMM0 - VMOVU -32(%rsi,%rdx), %YMM1 - VMOVU %YMM0, (%rdi) - VMOVU %YMM1, -32(%rdi,%rdx) - VZEROUPPER_RETURN + cmpl $8, %edx + jae L(copy_8_15) #endif #if VEC_SIZE > 16 - /* From 16 to 31. No branch when size == 16. */ -L(between_16_31): - VMOVU (%rsi), %XMM0 - VMOVU -16(%rsi,%rdx), %XMM1 - VMOVU %XMM0, (%rdi) - VMOVU %XMM1, -16(%rdi,%rdx) - VZEROUPPER_RETURN -#endif -L(between_8_15): - /* From 8 to 15. No branch when size == 8. */ - movq -8(%rsi,%rdx), %rcx - movq (%rsi), %rsi - movq %rcx, -8(%rdi,%rdx) - movq %rsi, (%rdi) - ret -L(between_4_7): - /* From 4 to 7. No branch when size == 4. */ - movl -4(%rsi,%rdx), %ecx - movl (%rsi), %esi - movl %ecx, -4(%rdi,%rdx) - movl %esi, (%rdi) + cmpl $4, %edx + jae L(copy_4_7) +#endif + cmpl $1, %edx + je L(copy_1) + jb L(copy_0) + /* Fall through into copy [2, 3] as it is more common than [0, 1]. + */ + movzwl (%rsi), %ecx + movzbl -1(%rsi, %rdx), %esi + movw %cx, (%rdi) + movb %sil, -1(%rdi, %rdx) +L(copy_0): ret -L(between_2_3): - /* From 2 to 3. No branch when size == 2. */ - movzwl -2(%rsi,%rdx), %ecx - movzwl (%rsi), %esi - movw %cx, -2(%rdi,%rdx) - movw %si, (%rdi) + + .p2align 4 +#if VEC_SIZE == 32 +L(copy_8_15): + COPY_8_16 ret + /* COPY_8_16 is exactly 17 bytes so don't want to p2align after as + it wastes 15 bytes of code and 1 byte off is fine. */ +#endif + +#if defined USE_MULTIARCH && IS_IN (libc) +L(movsb): + movq %rdi, %rcx + subq %rsi, %rcx + /* Go to backwards temporal copy if overlap no matter what as + backward movsb is slow. */ + cmpq %rdx, %rcx + /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ + jb L(more_8x_vec_backward_check_nop) + /* If above __x86_rep_movsb_stop_threshold most likely is candidate + for NT moves aswell. */ + cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP + jae L(large_memcpy_2x_check) +# if ALIGN_MOVSB + VMOVU (%rsi), %VEC(0) +# if MOVSB_ALIGN_TO > VEC_SIZE + VMOVU VEC_SIZE(%rsi), %VEC(1) +# endif +# if MOVSB_ALIGN_TO > (VEC_SIZE * 2) +# error Unsupported MOVSB_ALIGN_TO +# endif + /* Store dst for use after rep movsb. */ + movq %rdi, %r8 +# endif +# if AVOID_SHORT_DISTANCE_REP_MOVSB + /* Only avoid short movsb if CPU has FSRM. */ + testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) + jz L(skip_short_movsb_check) + /* Avoid "rep movsb" if RCX, the distance between source and + destination, is N*4GB + [1..63] with N >= 0. */ + + /* ecx contains dst - src. 
Early check for backward copy conditions + means only case of slow movsb with src = dst + [0, 63] is ecx in + [-63, 0]. Use unsigned comparison with -64 check for that case. */ + cmpl $-64, %ecx + ja L(more_8x_vec_forward) +# endif +# if ALIGN_MOVSB + /* Fall through means cpu has FSRM. In that case exclusively align + destination. */ + + /* Subtract dst from src. Add back after dst aligned. */ + subq %rdi, %rsi + /* Add dst to len. Subtract back after dst aligned. */ + leaq (%rdi, %rdx), %rcx + /* Exclusively align dst to MOVSB_ALIGN_TO (64). */ + addq $(MOVSB_ALIGN_TO - 1), %rdi + andq $-(MOVSB_ALIGN_TO), %rdi + /* Restore src and len adjusted with new values for aligned dst. */ + addq %rdi, %rsi + subq %rdi, %rcx + rep movsb + VMOVU %VEC(0), (%r8) +# if MOVSB_ALIGN_TO > VEC_SIZE + VMOVU %VEC(1), VEC_SIZE(%r8) +# endif + VZEROUPPER_RETURN +L(movsb_align_dst): + /* Subtract dst from src. Add back after dst aligned. */ + subq %rdi, %rsi + /* Add dst to len. Subtract back after dst aligned. -1 because dst + is initially aligned to MOVSB_ALIGN_TO - 1. */ + leaq -(1)(%rdi, %rdx), %rcx + /* Inclusively align dst to MOVSB_ALIGN_TO - 1. */ + orq $(MOVSB_ALIGN_TO - 1), %rdi + leaq 1(%rdi, %rsi), %rsi + /* Restore src and len adjusted with new values for aligned dst. */ + subq %rdi, %rcx + /* Finish aligning dst. */ + incq %rdi + rep movsb + VMOVU %VEC(0), (%r8) +# if MOVSB_ALIGN_TO > VEC_SIZE + VMOVU %VEC(1), VEC_SIZE(%r8) +# endif + VZEROUPPER_RETURN + +L(skip_short_movsb_check): + /* If CPU does not have FSRM two options for aligning. Align src if + dst and src 4k alias. Otherwise align dst. */ + testl $(PAGE_SIZE - 512), %ecx + jnz L(movsb_align_dst) + /* rcx already has dst - src. */ + movq %rcx, %r9 + /* Add src to len. Subtract back after src aligned. -1 because src + is initially aligned to MOVSB_ALIGN_TO - 1. */ + leaq -(1)(%rsi, %rdx), %rcx + /* Inclusively align src to MOVSB_ALIGN_TO - 1. */ + orq $(MOVSB_ALIGN_TO - 1), %rsi + /* Restore dst and len adjusted with new values for aligned dst. */ + leaq 1(%rsi, %r9), %rdi + subq %rsi, %rcx + /* Finish aligning src. */ + incq %rsi + rep movsb + VMOVU %VEC(0), (%r8) +# if MOVSB_ALIGN_TO > VEC_SIZE + VMOVU %VEC(1), VEC_SIZE(%r8) +# endif + VZEROUPPER_RETURN +# else + /* Not alignined rep movsb so just copy. */ + mov %RDX_LP, %RCX_LP + rep movsb + ret +# endif +#endif + /* Align if doesn't cost too many bytes. */ + .p2align 4,, 6 #if defined USE_MULTIARCH && IS_IN (libc) L(movsb_more_2x_vec): cmp __x86_rep_movsb_threshold(%rip), %RDX_LP @@ -426,50 +549,60 @@ L(more_2x_vec): ja L(more_8x_vec) cmpq $(VEC_SIZE * 4), %rdx jbe L(last_4x_vec) - /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ + /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. 
*/ VMOVU (%rsi), %VEC(0) VMOVU VEC_SIZE(%rsi), %VEC(1) VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) - VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) - VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) - VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7) VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), VEC_SIZE(%rdi) VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) - VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) - VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) - VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) - VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) + VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx) + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx) + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx) + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx) VZEROUPPER_RETURN + /* Align if doesn't cost too much code size. 6 bytes so that after + jump to target a full mov instruction will always be able to be + fetched. */ + .p2align 4,, 6 L(last_4x_vec): - /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ + /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ VMOVU (%rsi), %VEC(0) VMOVU VEC_SIZE(%rsi), %VEC(1) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) - VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3) VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), VEC_SIZE(%rdi) - VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) - VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx) + /* Keep nop target close to jmp for 2-byte encoding. */ +L(nop): VZEROUPPER_RETURN - + /* Align if doesn't cost too much code size. */ + .p2align 4,, 10 L(more_8x_vec): /* Check if non-temporal move candidate. */ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) /* Check non-temporal store threshold. */ - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP ja L(large_memcpy_2x) #endif - /* Entry if rdx is greater than non-temporal threshold but there - is overlap. */ + /* Entry if rdx is greater than non-temporal threshold but there is + overlap. */ L(more_8x_vec_check): cmpq %rsi, %rdi ja L(more_8x_vec_backward) /* Source == destination is less common. */ je L(nop) + /* Entry if rdx is greater than movsb or stop movsb threshold but + there is overlap with dst > src. */ +L(more_8x_vec_forward): /* Load the first VEC and last 4 * VEC to support overlapping addresses. */ VMOVU (%rsi), %VEC(4) @@ -477,22 +610,18 @@ L(more_8x_vec_check): VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) - /* Save start and stop of the destination buffer. */ - movq %rdi, %r11 - leaq -VEC_SIZE(%rdi, %rdx), %rcx - /* Align destination for aligned stores in the loop. Compute - how much destination is misaligned. */ - movq %rdi, %r8 - andq $(VEC_SIZE - 1), %r8 - /* Get the negative of offset for alignment. */ - subq $VEC_SIZE, %r8 - /* Adjust source. */ - subq %r8, %rsi - /* Adjust destination which should be aligned now. */ - subq %r8, %rdi - /* Adjust length. */ - addq %r8, %rdx - + /* Subtract dst from src. Add back after dst aligned. */ + subq %rdi, %rsi + /* Store end of buffer minus tail in rdx. */ + leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx + /* Save begining of dst. 
*/ + movq %rdi, %rcx + /* Align dst to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi + /* Restore src adjusted with new value for aligned dst. */ + leaq 1(%rdi, %rsi), %rsi + /* Finish aligning dst. */ + incq %rdi .p2align 4 L(loop_4x_vec_forward): /* Copy 4 * VEC a time forward. */ @@ -501,23 +630,27 @@ L(loop_4x_vec_forward): VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) subq $-(VEC_SIZE * 4), %rsi - addq $-(VEC_SIZE * 4), %rdx VMOVA %VEC(0), (%rdi) VMOVA %VEC(1), VEC_SIZE(%rdi) VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) subq $-(VEC_SIZE * 4), %rdi - cmpq $(VEC_SIZE * 4), %rdx + cmpq %rdi, %rdx ja L(loop_4x_vec_forward) /* Store the last 4 * VEC. */ - VMOVU %VEC(5), (%rcx) - VMOVU %VEC(6), -VEC_SIZE(%rcx) - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx) + VMOVU %VEC(7), VEC_SIZE(%rdx) + VMOVU %VEC(8), (%rdx) /* Store the first VEC. */ - VMOVU %VEC(4), (%r11) + VMOVU %VEC(4), (%rcx) + /* Keep nop target close to jmp for 2-byte encoding. */ +L(nop2): VZEROUPPER_RETURN - + /* Entry from fail movsb. Need to test if dst - src == 0 still. */ +L(more_8x_vec_backward_check_nop): + testq %rcx, %rcx + jz L(nop2) L(more_8x_vec_backward): /* Load the first 4 * VEC and last VEC to support overlapping addresses. */ @@ -525,49 +658,50 @@ L(more_8x_vec_backward): VMOVU VEC_SIZE(%rsi), %VEC(5) VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) - /* Save stop of the destination buffer. */ - leaq -VEC_SIZE(%rdi, %rdx), %r11 - /* Align destination end for aligned stores in the loop. Compute - how much destination end is misaligned. */ - leaq -VEC_SIZE(%rsi, %rdx), %rcx - movq %r11, %r9 - movq %r11, %r8 - andq $(VEC_SIZE - 1), %r8 - /* Adjust source. */ - subq %r8, %rcx - /* Adjust the end of destination which should be aligned now. */ - subq %r8, %r9 - /* Adjust length. */ - subq %r8, %rdx - - .p2align 4 + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8) + /* Subtract dst from src. Add back after dst aligned. */ + subq %rdi, %rsi + /* Save begining of buffer. */ + movq %rdi, %rcx + /* Set dst to begining of region to copy. -1 for inclusive + alignment. */ + leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rdi + /* Align dst. */ + andq $-(VEC_SIZE), %rdi + /* Restore src. */ + addq %rdi, %rsi + /* Don't use multi-byte nop to align. */ + .p2align 4,, 11 L(loop_4x_vec_backward): /* Copy 4 * VEC a time backward. */ - VMOVU (%rcx), %VEC(0) - VMOVU -VEC_SIZE(%rcx), %VEC(1) - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) - addq $-(VEC_SIZE * 4), %rcx - addq $-(VEC_SIZE * 4), %rdx - VMOVA %VEC(0), (%r9) - VMOVA %VEC(1), -VEC_SIZE(%r9) - VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) - VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) - addq $-(VEC_SIZE * 4), %r9 - cmpq $(VEC_SIZE * 4), %rdx - ja L(loop_4x_vec_backward) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(0) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 1)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 0)(%rsi), %VEC(3) + addq $(VEC_SIZE * -4), %rsi + VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi) + VMOVA %VEC(1), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 1)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 0)(%rdi) + addq $(VEC_SIZE * -4), %rdi + cmpq %rdi, %rcx + jb L(loop_4x_vec_backward) /* Store the first 4 * VEC. 
*/ - VMOVU %VEC(4), (%rdi) - VMOVU %VEC(5), VEC_SIZE(%rdi) - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(4), (%rcx) + VMOVU %VEC(5), VEC_SIZE(%rcx) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rcx) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rcx) /* Store the last VEC. */ - VMOVU %VEC(8), (%r11) + VMOVU %VEC(8), -VEC_SIZE(%rdx, %rcx) VZEROUPPER_RETURN #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) .p2align 4 + /* Entry if dst > stop movsb threshold (usually set to non-temporal + threshold). */ +L(large_memcpy_2x_check): + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + jb L(more_8x_vec_forward) L(large_memcpy_2x): /* Compute absolute value of difference between source and destination. */