From patchwork Wed Sep 22 00:50:18 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 45263
To: libc-alpha@sourceware.org
Subject: [PATCH v1 1/2] x86: Modify ENTRY in sysdep.h so that p2align can be specified
Date: Tue, 21 Sep 2021 19:50:18 -0500
Message-Id: <20210922005018.1542107-1-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.25.1
From: Noah Goldstein via Libc-alpha

No bug.  This change adds a new macro ENTRY_P2ALIGN which takes a
second argument, log2 of the desired function alignment.  The old
ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4), so this doesn't
affect any existing functionality.
---
 sysdeps/x86/sysdep.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index cac1d762fb..8d4f4b9fd2 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -78,15 +78,19 @@ enum cf_protection_level
 #define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
 
 /* Define an entry point visible from C.  */
-#define ENTRY(name)						\
+#define ENTRY_P2ALIGN(name, alignment)				\
   .globl C_SYMBOL_NAME(name);					\
   .type C_SYMBOL_NAME(name),@function;				\
-  .align ALIGNARG(4);						\
+  .align ALIGNARG(12);						\
+  .align ALIGNARG(alignment);					\
   C_LABEL(name)							\
   cfi_startproc;						\
   _CET_ENDBR;							\
   CALL_MCOUNT
 
+/* Common entry 16 byte aligns.  */
+#define ENTRY(name) ENTRY_P2ALIGN (name, 4)
+
 #undef END
 #define END(name)						\
   cfi_endproc;							\
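
For reference, the second macro argument is the log2 of the requested
alignment: ENTRY_P2ALIGN (name, 4) keeps the historical 16-byte entry
alignment, and patch 2/2 uses ENTRY_P2ALIGN (MEMCMP, 6) to put the
memcmp entry on a 64-byte cache-line boundary.  A minimal C sketch of
that arithmetic follows (illustration only, not glibc code;
is_p2aligned is a made-up helper, and which alignment the memcmp a
program actually calls has depends on the build and on which IFUNC
variant the dynamic linker selects):

/* Illustration of "log2 of the desired function alignment".  */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
is_p2aligned (uintptr_t addr, unsigned int p2align)
{
  /* Aligned to 1 << p2align bytes iff the low p2align bits are zero.  */
  return (addr & (((uintptr_t) 1 << p2align) - 1)) == 0;
}

int
main (void)
{
  printf ("ENTRY_P2ALIGN (name, 4): %u-byte entry alignment\n", 1u << 4);
  printf ("ENTRY_P2ALIGN (name, 6): %u-byte entry alignment\n", 1u << 6);
  printf ("this memcmp is 64-byte aligned: %d\n",
          is_p2aligned ((uintptr_t) memcmp, 6));
  return 0;
}
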
From patchwork Wed Sep 22 00:50:20 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 45264
To: libc-alpha@sourceware.org
Subject: [PATCH v1 2/2] x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
Date: Tue, 21 Sep 2021 19:50:20 -0500
Message-Id: <20210922005018.1542107-2-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20210922005018.1542107-1-goldstein.w.n@gmail.com>
References: <20210922005018.1542107-1-goldstein.w.n@gmail.com>
From: Noah Goldstein via Libc-alpha

No bug.

The frontend optimizations are to:
 1. Reorganize logically connected basic blocks so they are either in
    the same cache line or in adjacent cache lines.
 2. Avoid cases where basic blocks unnecessarily cross cache lines.
 3. Try to 32-byte align any basic blocks possible without sacrificing
    code size.  Smaller / less hot basic blocks are used for this.

Overall code size shrank by 168 bytes.  This should make up for any
extra costs due to aligning to 64 bytes.

In general, performance before deviated a great deal depending on
whether entry alignment % 64 was 0, 16, 32, or 48.  These changes
essentially make it so that the current implementation is at least as
fast as the best-aligned variant of the original for any arguments.

The only additional optimization is in the page cross case.  The
branch on the equals case was removed from the size == [4, 7] case.
As well, the [4, 7] and [2, 3] cases were swapped, as [4, 7] is likely
the hotter argument size.

test-memcmp and test-wmemcmp are both passing.

Reviewed-by: H.J. Lu
---
Performance results attached in reply.

Benchmark CPU:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

Results are the geometric mean of N = 20 runs.  Times for the new
implementation are shown.  Score(N) is the New Time / Old Time with
entry alignment == N.

In general there is one performance degradation, for the size range
[2, 3] in the page cross case.  This is expected, as the new code
intentionally gives the fallthrough path to the hotter size range
[4, 7].

For the micro benchmarks there is no outright performance improvement,
in the sense that for each benchmark configuration there is at least
one entry alignment where the old version had roughly the same
performance.  But this should provide a great deal more performance
stability.

The hope, however, is that with the code size shrink and the
reorganization of basic blocks for better icache behavior, concerns
such as those brought up in this paper:
https://storage.googleapis.com/pub-tools-public-publication-data/pdf/e52f61fd2c51e8962305120548581efacbc06ffc.pdf
should be better addressed.
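
For the attached numbers, each configuration is timed 20 times, the
geometric mean of those runs is taken for both the old and the new
implementation, and the reported Score is New Time / Old Time at a
given entry alignment, so values below 1.0 favor the new code.  A
small sketch of that aggregation, with made-up run times (this is not
the actual benchmark harness; build with -lm):

#include <math.h>
#include <stdio.h>

static double
geomean (const double *times, int n)
{
  /* Geometric mean computed via logs to avoid overflow.  */
  double sum_of_logs = 0.0;
  for (int i = 0; i < n; i++)
    sum_of_logs += log (times[i]);
  return exp (sum_of_logs / n);
}

int
main (void)
{
  /* Illustrative timings only; the posted results use N = 20 runs.  */
  double new_times[] = { 10.1, 9.9, 10.3, 10.0 };
  double old_times[] = { 11.2, 11.0, 10.9, 11.1 };
  int n = 4;

  double score = geomean (new_times, n) / geomean (old_times, n);
  printf ("Score = %.3f (< 1.0 means the new implementation is faster)\n",
          score);
  return 0;
}
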
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++-------- 1 file changed, 242 insertions(+), 192 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S index 654dc7ac8c..2761b54f2e 100644 --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S @@ -34,7 +34,24 @@ area. 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less. 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less. - 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */ + 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. + +When possible the implementation tries to optimize for frontend in the +following ways: +Throughput: + 1. All code sections that fit are able to run optimally out of the + LSD. + 2. All code sections that fit are able to run optimally out of the + DSB + 3. Basic blocks are contained in minimum number of fetch blocks + necessary. + +Latency: + 1. Logically connected basic blocks are put in the same + cache-line. + 2. Logically connected basic blocks that do not fit in the same + cache-line are put in adjacent lines. This can get beneficial + L2 spatial prefetching and L1 next-line prefetching. */ # include @@ -47,9 +64,11 @@ # ifdef USE_AS_WMEMCMP # define CHAR_SIZE 4 # define VPCMP vpcmpd +# define VPTEST vptestmd # else # define CHAR_SIZE 1 # define VPCMP vpcmpub +# define VPTEST vptestmb # endif # define VEC_SIZE 32 @@ -75,7 +94,9 @@ */ .section .text.evex,"ax",@progbits -ENTRY (MEMCMP) +/* Cache align memcmp entry. This allows for much more thorough + frontend optimization. */ +ENTRY_P2ALIGN (MEMCMP, 6) # ifdef __ILP32__ /* Clear the upper 32 bits. */ movl %edx, %edx @@ -89,7 +110,7 @@ ENTRY (MEMCMP) VPCMP $4, (%rdi), %YMM1, %k1 kmovd %k1, %eax /* NB: eax must be destination register if going to - L(return_vec_[0,2]). For L(return_vec_3 destination register + L(return_vec_[0,2]). For L(return_vec_3) destination register must be ecx. */ testl %eax, %eax jnz L(return_vec_0) @@ -121,10 +142,6 @@ ENTRY (MEMCMP) testl %ecx, %ecx jnz L(return_vec_3) - /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so - compare with zero to get a mask is needed. */ - vpxorq %XMM0, %XMM0, %XMM0 - /* Go to 4x VEC loop. */ cmpq $(CHAR_PER_VEC * 8), %rdx ja L(more_8x_vec) @@ -148,47 +165,61 @@ ENTRY (MEMCMP) VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 - /* Or together YMM1, YMM2, and YMM3 into YMM3. */ - vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while - oring with YMM3. Result is stored in YMM4. */ - vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 - /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ - VPCMP $4, %YMM4, %YMM0, %k1 + oring with YMM1. Result is stored in YMM4. */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + + /* Or together YMM2, YMM3, and YMM4 into YMM4. */ + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + + /* Test YMM4 against itself. Store any CHAR mismatches in k1. + */ + VPTEST %YMM4, %YMM4, %k1 + /* k1 must go to ecx for L(return_vec_0_1_2_3). */ kmovd %k1, %ecx testl %ecx, %ecx jnz L(return_vec_0_1_2_3) /* NB: eax must be zero to reach here. */ ret - /* NB: aligning 32 here allows for the rest of the jump targets - to be tuned for 32 byte alignment. Most important this ensures - the L(more_8x_vec) loop is 32 byte aligned. */ - .p2align 5 -L(less_vec): - /* Check if one or less CHAR. 
This is necessary for size = 0 but - is also faster for size = CHAR_SIZE. */ - cmpl $1, %edx - jbe L(one_or_less) + .p2align 4 +L(8x_end_return_vec_0_1_2_3): + movq %rdx, %rdi +L(8x_return_vec_0_1_2_3): + addq %rdi, %rsi +L(return_vec_0_1_2_3): + VPTEST %YMM1, %YMM1, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(return_vec_0) - /* Check if loading one VEC from either s1 or s2 could cause a - page cross. This can have false positives but is by far the - fastest method. */ - movl %edi, %eax - orl %esi, %eax - andl $(PAGE_SIZE - 1), %eax - cmpl $(PAGE_SIZE - VEC_SIZE), %eax - jg L(page_cross_less_vec) + VPTEST %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(return_vec_1) - /* No page cross possible. */ - VMOVU (%rsi), %YMM2 - VPCMP $4, (%rdi), %YMM2, %k1 - kmovd %k1, %eax - /* Create mask in ecx for potentially in bound matches. */ - bzhil %edx, %eax, %eax - jnz L(return_vec_0) + VPTEST %YMM3, %YMM3, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(return_vec_2) +L(return_vec_3): + /* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one + fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache + line. */ + bsfl %ecx, %ecx +# ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax + xorl %edx, %edx + cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx + subl %ecx, %eax +# endif ret .p2align 4 @@ -209,10 +240,11 @@ L(return_vec_0): # endif ret - /* NB: No p2align necessary. Alignment % 16 is naturally 1 - which is good enough for a target not in a loop. */ + .p2align 4 L(return_vec_1): - tzcntl %eax, %eax + /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one + fetch block. */ + bsfl %eax, %eax # ifdef USE_AS_WMEMCMP movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx xorl %edx, %edx @@ -226,10 +258,11 @@ L(return_vec_1): # endif ret - /* NB: No p2align necessary. Alignment % 16 is naturally 2 - which is good enough for a target not in a loop. */ + .p2align 4,, 10 L(return_vec_2): - tzcntl %eax, %eax + /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one + fetch block. */ + bsfl %eax, %eax # ifdef USE_AS_WMEMCMP movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx xorl %edx, %edx @@ -243,40 +276,6 @@ L(return_vec_2): # endif ret - .p2align 4 -L(8x_return_vec_0_1_2_3): - /* Returning from L(more_8x_vec) requires restoring rsi. */ - addq %rdi, %rsi -L(return_vec_0_1_2_3): - VPCMP $4, %YMM1, %YMM0, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(return_vec_0) - - VPCMP $4, %YMM2, %YMM0, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(return_vec_1) - - VPCMP $4, %YMM3, %YMM0, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(return_vec_2) -L(return_vec_3): - tzcntl %ecx, %ecx -# ifdef USE_AS_WMEMCMP - movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax - xorl %edx, %edx - cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax - setg %dl - leal -1(%rdx, %rdx), %eax -# else - movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax - movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx - subl %ecx, %eax -# endif - ret - .p2align 4 L(more_8x_vec): /* Set end of s1 in rdx. */ @@ -288,21 +287,19 @@ L(more_8x_vec): andq $-VEC_SIZE, %rdi /* Adjust because first 4x vec where check already. 
*/ subq $-(VEC_SIZE * 4), %rdi + .p2align 4 L(loop_4x_vec): VMOVU (%rsi, %rdi), %YMM1 vpxorq (%rdi), %YMM1, %YMM1 - VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2 - VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 - vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 - VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 - vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 - VPCMP $4, %YMM4, %YMM0, %k1 + vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 kmovd %k1, %ecx testl %ecx, %ecx jnz L(8x_return_vec_0_1_2_3) @@ -319,28 +316,25 @@ L(loop_4x_vec): cmpl $(VEC_SIZE * 2), %edi jae L(8x_last_2x_vec) + vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 + VMOVU (%rsi, %rdx), %YMM1 vpxorq (%rdx), %YMM1, %YMM1 VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 - - vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 - vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 - VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 - vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4 - VPCMP $4, %YMM4, %YMM0, %k1 + vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 kmovd %k1, %ecx - /* Restore s1 pointer to rdi. */ - movq %rdx, %rdi testl %ecx, %ecx - jnz L(8x_return_vec_0_1_2_3) + jnz L(8x_end_return_vec_0_1_2_3) /* NB: eax must be zero to reach here. */ ret /* Only entry is from L(more_8x_vec). */ - .p2align 4 + .p2align 4,, 10 L(8x_last_2x_vec): VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 kmovd %k1, %eax @@ -355,7 +349,31 @@ L(8x_last_1x_vec): jnz L(8x_return_vec_3) ret - .p2align 4 + /* Not ideally aligned (at offset +9 bytes in fetch block) but + not aligning keeps it in the same cache line as + L(8x_last_1x/2x_vec) so likely worth it. As well, saves code + size. */ + .p2align 4,, 4 +L(8x_return_vec_2): + subq $VEC_SIZE, %rdx +L(8x_return_vec_3): + bsfl %eax, %eax +# ifdef USE_AS_WMEMCMP + leaq (%rdx, %rax, CHAR_SIZE), %rax + movl (VEC_SIZE * 3)(%rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax +# else + addq %rdx, %rax + movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 3)(%rax), %eax + subl %ecx, %eax +# endif + ret + + .p2align 4,, 10 L(last_2x_vec): /* Check second to last VEC. */ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 @@ -374,26 +392,49 @@ L(last_1x_vec): jnz L(return_vec_0_end) ret - .p2align 4 -L(8x_return_vec_2): - subq $VEC_SIZE, %rdx -L(8x_return_vec_3): - tzcntl %eax, %eax + .p2align 4,, 10 +L(return_vec_1_end): + /* Use bsf to save code size. This is necessary to have + L(one_or_less) fit in aligning bytes between. */ + bsfl %eax, %eax + addl %edx, %eax # ifdef USE_AS_WMEMCMP - leaq (%rdx, %rax, CHAR_SIZE), %rax - movl (VEC_SIZE * 3)(%rax), %ecx + movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx xorl %edx, %edx - cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx + cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx setg %dl leal -1(%rdx, %rdx), %eax # else - addq %rdx, %rax - movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx - movzbl (VEC_SIZE * 3)(%rax), %eax + movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx + movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax subl %ecx, %eax # endif ret + /* NB: L(one_or_less) fits in alignment padding between + L(return_vec_1_end) and L(return_vec_0_end). 
*/ +# ifdef USE_AS_WMEMCMP +L(one_or_less): + jb L(zero) + movl (%rdi), %ecx + xorl %edx, %edx + cmpl (%rsi), %ecx + je L(zero) + setg %dl + leal -1(%rdx, %rdx), %eax + ret +# else +L(one_or_less): + jb L(zero) + movzbl (%rsi), %ecx + movzbl (%rdi), %eax + subl %ecx, %eax + ret +# endif +L(zero): + xorl %eax, %eax + ret + .p2align 4 L(return_vec_0_end): tzcntl %eax, %eax @@ -412,23 +453,56 @@ L(return_vec_0_end): ret .p2align 4 -L(return_vec_1_end): +L(less_vec): + /* Check if one or less CHAR. This is necessary for size == 0 + but is also faster for size == CHAR_SIZE. */ + cmpl $1, %edx + jbe L(one_or_less) + + /* Check if loading one VEC from either s1 or s2 could cause a + page cross. This can have false positives but is by far the + fastest method. */ + movl %edi, %eax + orl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(page_cross_less_vec) + + /* No page cross possible. */ + VMOVU (%rsi), %YMM2 + VPCMP $4, (%rdi), %YMM2, %k1 + kmovd %k1, %eax + /* Check if any matches where in bounds. Intentionally not + storing result in eax to limit dependency chain if it goes to + L(return_vec_0_lv). */ + bzhil %edx, %eax, %edx + jnz L(return_vec_0_lv) + xorl %eax, %eax + ret + + /* Essentially duplicate of L(return_vec_0). Ends up not costing + any code as shrinks L(less_vec) by allowing 2-byte encoding of + the jump and ends up fitting in aligning bytes. As well fits on + same cache line as L(less_vec) so also saves a line from having + to be fetched on cold calls to memcmp. */ + .p2align 4,, 4 +L(return_vec_0_lv): tzcntl %eax, %eax - addl %edx, %eax # ifdef USE_AS_WMEMCMP - movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + movl (%rdi, %rax, CHAR_SIZE), %ecx xorl %edx, %edx - cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx + cmpl (%rsi, %rax, CHAR_SIZE), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ setg %dl leal -1(%rdx, %rdx), %eax # else - movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx - movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax + movzbl (%rsi, %rax), %ecx + movzbl (%rdi, %rax), %eax subl %ecx, %eax # endif ret - .p2align 4 L(page_cross_less_vec): /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 @@ -439,108 +513,84 @@ L(page_cross_less_vec): cmpl $8, %edx jae L(between_8_15) cmpl $4, %edx - jae L(between_4_7) -L(between_2_3): - /* Load as big endian to avoid branches. */ - movzwl (%rdi), %eax - movzwl (%rsi), %ecx - shll $8, %eax - shll $8, %ecx - bswap %eax - bswap %ecx - movzbl -1(%rdi, %rdx), %edi - movzbl -1(%rsi, %rdx), %esi - orl %edi, %eax - orl %esi, %ecx - /* Subtraction is okay because the upper 8 bits are zero. */ - subl %ecx, %eax - ret - .p2align 4 -L(one_or_less): - jb L(zero) - movzbl (%rsi), %ecx - movzbl (%rdi), %eax - subl %ecx, %eax + jb L(between_2_3) + + /* Load as big endian with overlapping movbe to avoid branches. + */ + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + /* edx is guranteed to be positive int32 in range [4, 7]. */ + cmovne %edx, %eax + /* ecx is -1 if rcx > rax. Otherwise 0. */ + sbbl %ecx, %ecx + /* If rcx > rax, then ecx is 0 and eax is positive. If rcx == + rax then eax and ecx are zero. If rax < rax then ecx is -1 so + eax doesn't matter. */ + orl %ecx, %eax ret - .p2align 4 + .p2align 4,, 8 L(between_8_15): # endif /* If USE_AS_WMEMCMP fall through into 8-15 byte case. 
*/ - vmovq (%rdi), %XMM1 - vmovq (%rsi), %XMM2 - VPCMP $4, %XMM1, %XMM2, %k1 + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 + VPCMP $4, %xmm1, %xmm2, %k1 kmovd %k1, %eax testl %eax, %eax - jnz L(return_vec_0) + jnz L(return_vec_0_lv) /* Use overlapping loads to avoid branches. */ - leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi - leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi - vmovq (%rdi), %XMM1 - vmovq (%rsi), %XMM2 - VPCMP $4, %XMM1, %XMM2, %k1 + vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1 + vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2 + VPCMP $4, %xmm1, %xmm2, %k1 + addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx kmovd %k1, %eax testl %eax, %eax - jnz L(return_vec_0) - ret - - .p2align 4 -L(zero): - xorl %eax, %eax + jnz L(return_vec_0_end) ret - .p2align 4 + .p2align 4,, 8 L(between_16_31): /* From 16 to 31 bytes. No branch when size == 16. */ - VMOVU (%rsi), %XMM2 - VPCMP $4, (%rdi), %XMM2, %k1 + + /* Use movups to save code size. */ + movups (%rsi), %xmm2 + VPCMP $4, (%rdi), %xmm2, %k1 kmovd %k1, %eax testl %eax, %eax - jnz L(return_vec_0) - + jnz L(return_vec_0_lv) /* Use overlapping loads to avoid branches. */ - - VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2 - leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi - leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi - VPCMP $4, (%rdi), %XMM2, %k1 + movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2 + VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 + addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx kmovd %k1, %eax testl %eax, %eax - jnz L(return_vec_0) - ret - -# ifdef USE_AS_WMEMCMP - .p2align 4 -L(one_or_less): - jb L(zero) - movl (%rdi), %ecx - xorl %edx, %edx - cmpl (%rsi), %ecx - je L(zero) - setg %dl - leal -1(%rdx, %rdx), %eax + jnz L(return_vec_0_end) ret -# else - .p2align 4 -L(between_4_7): - /* Load as big endian with overlapping movbe to avoid branches. - */ - movbe (%rdi), %eax - movbe (%rsi), %ecx - shlq $32, %rax - shlq $32, %rcx - movbe -4(%rdi, %rdx), %edi - movbe -4(%rsi, %rdx), %esi - orq %rdi, %rax - orq %rsi, %rcx - subq %rcx, %rax - jz L(zero_4_7) - sbbl %eax, %eax - orl $1, %eax -L(zero_4_7): +# ifndef USE_AS_WMEMCMP +L(between_2_3): + /* Load as big endian to avoid branches. */ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx + shll $8, %eax + shll $8, %ecx + bswap %eax + bswap %ecx + movzbl -1(%rdi, %rdx), %edi + movzbl -1(%rsi, %rdx), %esi + orl %edi, %eax + orl %esi, %ecx + /* Subtraction is okay because the upper 8 bits are zero. */ + subl %ecx, %eax ret # endif - END (MEMCMP) #endif
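
For reference, the two vpternlogd immediates used in the 4x VEC
reduction are 8-bit truth tables: for each bit position, the result is
bit ((dst << 2) | (src1 << 1) | src2) of the immediate.  So $0xde
computes (A ^ C) | B, which xors the memory operand into the
destination while oring in the first source, and $0xfe computes
A | B | C.  Below is a small C sketch that models this per bit and
checks both identities (ternlog32 is a scalar stand-in for
illustration, not an intrinsic):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
ternlog32 (uint8_t imm, uint32_t a, uint32_t b, uint32_t c)
{
  /* Apply the 8-entry truth table imm independently to every bit
     position of a (destination), b (first source) and c (second
     source), as vpternlogd does per bit of its operands.  */
  uint32_t result = 0;
  for (int bit = 0; bit < 32; bit++)
    {
      unsigned int idx = (((a >> bit) & 1) << 2)
                         | (((b >> bit) & 1) << 1)
                         | ((c >> bit) & 1);
      result |= (uint32_t) ((imm >> idx) & 1) << bit;
    }
  return result;
}

int
main (void)
{
  uint32_t a = 0xdeadbeef, b = 0x12345678, c = 0x0f0f1234;

  /* $0xde: xor the memory operand into the destination while oring in
     the first source, i.e. (A ^ C) | B.  */
  assert (ternlog32 (0xde, a, b, c) == ((a ^ c) | b));

  /* $0xfe: plain three-way or, A | B | C.  */
  assert (ternlog32 (0xfe, a, b, c) == (a | b | c));

  /* After the two vpternlogd steps, YMM4 is therefore nonzero iff any
     of the four s1 ^ s2 VEC results was nonzero, which is why a single
     VPTEST of YMM4 against itself detects a mismatch.  */
  puts ("ternlog immediates check out");
  return 0;
}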