From patchwork Sat Dec 25 03:22:57 2021
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 49253
To: libc-alpha@sourceware.org
Subject: [PATCH v1 2/2] x86: Optimize L(less_vec) case in memcmpeq-evex.S
Date: Fri, 24 Dec 2021 21:22:57 -0600
Message-Id: <20211225032257.2887327-2-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20211225032257.2887327-1-goldstein.w.n@gmail.com>
References: <20211225032257.2887327-1-goldstein.w.n@gmail.com>
From: Noah Goldstein

No bug. The optimizations are twofold:

1) Replace the page-cross and 0/1-length checks in L(less_vec) with
   masked load instructions. In applications this reduces
   branch-misses in the hot [0, 32] case.
2) Change the control flow so that the L(less_vec) case gets the
   fall-through.

Change 2) helps compares in the [0, 32] size range but comes at the
cost of compares in the [33, 64] size range. In profiles of GCC and
Python3, 94%+ and 99%+ of calls respectively fall in the [0, 32]
range, so this appears to be the right tradeoff.

Reviewed-by: H.J. Lu
---
An illustrative C sketch of the masked-load idea follows the patch
below.

 sysdeps/x86_64/multiarch/memcmpeq-evex.S | 170 ++++++-----------------
 1 file changed, 43 insertions(+), 127 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
index f27e732036..b5e1edbdff 100644
--- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
+++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
@@ -39,6 +39,7 @@
 # define MEMCMPEQ	__memcmpeq_evex
 # endif
 
+# define VMOVU_MASK	vmovdqu8
 # define VMOVU	vmovdqu64
 # define VPCMP	vpcmpub
 # define VPTEST	vptestmb
@@ -62,12 +63,39 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
 	movl	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
-	jb	L(less_vec)
+	/* Fall through for [0, VEC_SIZE] as its the hottest.  */
+	ja	L(more_1x_vec)
+
+	/* Create mask of bytes that are guranteed to be valid because
+	   of length (edx). Using masked movs allows us to skip checks for
+	   page crosses/zero size.  */
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k2
+
+	/* Use masked loads as VEC_SIZE could page cross where length
+	   (edx) would not.  */
+	VMOVU_MASK (%rsi), %YMM2{%k2}
+	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
+	kmovd	%k1, %eax
+	ret
 
-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+
+L(last_1x_vec):
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
+	kmovd	%k1, %eax
+L(return_neq0):
+	ret
+
+
+
+	.p2align 4
+L(more_1x_vec):
+	/* From VEC + 1 to 2 * VEC.  */
 	VMOVU	(%rsi), %YMM1
 	/* Use compare not equals to directly check for mismatch.  */
-	VPCMP	$4, (%rdi), %YMM1, %k1
+	VPCMP	$4,(%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_neq0)
@@ -88,13 +116,13 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
 	/* Check third and fourth VEC no matter what.
 	 */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_neq0)
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_neq0)
@@ -132,66 +160,6 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
 	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
 	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %eax
-L(return_neq0):
-	ret
-
-	/* Fits in padding needed to .p2align 5 L(less_vec).  */
-L(last_1x_vec):
-	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
-	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
-	kmovd	%k1, %eax
-	ret
-
-	/* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32
-	   byte aligned.  */
-	.p2align 5
-L(less_vec):
-	/* Check if one or less char. This is necessary for size = 0 but
-	   is also faster for size = 1.  */
-	cmpl	$1, %edx
-	jbe	L(one_or_less)
-
-	/* Check if loading one VEC from either s1 or s2 could cause a
-	   page cross. This can have false positives but is by far the
-	   fastest method.  */
-	movl	%edi, %eax
-	orl	%esi, %eax
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(page_cross_less_vec)
-
-	/* No page cross possible.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMP	$4, (%rdi), %YMM2, %k1
-	kmovd	%k1, %eax
-	/* Result will be zero if s1 and s2 match. Otherwise first set
-	   bit will be first mismatch.  */
-	bzhil	%edx, %eax, %eax
-	ret
-
-	/* Relatively cold but placing close to L(less_vec) for 2 byte
-	   jump encoding.  */
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movzbl	(%rsi), %ecx
-	movzbl	(%rdi), %eax
-	subl	%ecx, %eax
-	/* No ymm register was touched.  */
-	ret
-	/* Within the same 16 byte block is L(one_or_less).  */
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
-L(last_2x_vec):
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
-	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
-	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
-	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
-	VPTEST	%YMM2, %YMM2, %k1
-	kmovd	%k1, %eax
 	ret
 
 	.p2align 4
@@ -211,7 +179,7 @@ L(loop_4x_vec):
 	vpxorq	(%rdi), %YMM1, %YMM1
 
 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
-	vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2
+	vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
 
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
@@ -238,7 +206,7 @@ L(loop_4x_vec):
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
 	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
 	   oring with YMM4. Result is stored in YMM4.  */
-	vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
+	vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
 	cmpl	$(VEC_SIZE * 2), %edi
 	jae	L(8x_last_2x_vec)
 
@@ -256,68 +224,16 @@ L(8x_last_2x_vec):
 L(return_neq2):
 	ret
 
 
-	/* Relatively cold case as page cross are unexpected.  */
-	.p2align 4
-L(page_cross_less_vec):
-	cmpl	$16, %edx
-	jae	L(between_16_31)
-	cmpl	$8, %edx
-	ja	L(between_9_15)
-	cmpl	$4, %edx
-	jb	L(between_2_3)
-	/* From 4 to 8 bytes.  No branch when size == 4.  */
-	movl	(%rdi), %eax
-	subl	(%rsi), %eax
-	movl	-4(%rdi, %rdx), %ecx
-	movl	-4(%rsi, %rdx), %edi
-	subl	%edi, %ecx
-	orl	%ecx, %eax
-	ret
-
-	.p2align 4,, 8
-L(between_16_31):
-	/* From 16 to 31 bytes.  No branch when size == 16.  */
-
-	/* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe.
-	 */
-	vmovdqu	(%rsi), %xmm1
-	vpcmpeqb (%rdi), %xmm1, %xmm1
-	vmovdqu	-16(%rsi, %rdx), %xmm2
-	vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2
-	vpand	%xmm1, %xmm2, %xmm2
-	vpmovmskb %xmm2, %eax
-	notw	%ax
-	/* No ymm register was touched.  */
-	ret
-	.p2align 4,, 8
-L(between_9_15):
-	/* From 9 to 15 bytes.  */
-	movq	(%rdi), %rax
-	subq	(%rsi), %rax
-	movq	-8(%rdi, %rdx), %rcx
-	movq	-8(%rsi, %rdx), %rdi
-	subq	%rdi, %rcx
-	orq	%rcx, %rax
-	/* edx is guranteed to be a non-zero int.  */
-	cmovnz	%edx, %eax
-	ret
-
-	/* Don't align. This is cold and aligning here will cause code
-	   to spill into next cache line.  */
-L(between_2_3):
-	/* From 2 to 3 bytes.  No branch when size == 2.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	subl	%ecx, %eax
-	movzbl	-1(%rdi, %rdx), %ecx
-	/* All machines that support evex will insert a "merging uop"
-	   avoiding any serious partial register stalls.  */
-	subb	-1(%rsi, %rdx), %cl
-	orl	%ecx, %eax
-	/* No ymm register was touched.  */
+L(last_2x_vec):
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
+	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
+	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
+	VPTEST	%YMM2, %YMM2, %k1
+	kmovd	%k1, %eax
 	ret
 
-	/* 4 Bytes from next cache line.  */
+	/* 1 Bytes from next cache line.  */
 END (MEMCMPEQ)
 #endif
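
For readers who want the masked-load idea in a higher-level form, here is a
rough, illustrative C intrinsics sketch of the new [0, VEC_SIZE] fall-through
path. It is not the glibc implementation: the helper name
memcmpeq_small_sketch is hypothetical, the mask is built with a shift instead
of bzhi, and both operands are loaded with masked loads, whereas the assembly
folds the s1 access into the masked VPCMP. It assumes AVX512BW/AVX512VL and
len <= 32 (VEC_SIZE for the YMM variant).

#include <immintrin.h>
#include <stddef.h>

/* Hypothetical helper, for illustration only: compare the first len bytes
   (len in [0, 32]) of s1 and s2, returning 0 iff they are equal, in the
   spirit of the L(less_vec) replacement above.  */
int
memcmpeq_small_sketch (const void *s1, const void *s2, size_t len)
{
  /* Mask with the low len bits set; the assembly builds this with
     movl $-1 + bzhil.  Works for len == 0 and len == 32.  */
  __mmask32 k2 = (__mmask32) ((1ULL << len) - 1);

  /* Masked (zeroing) loads suppress faults on masked-out bytes, so no
     page-cross or zero-length checks are needed.  */
  __m256i v1 = _mm256_maskz_loadu_epi8 (k2, s1);
  __m256i v2 = _mm256_maskz_loadu_epi8 (k2, s2);

  /* Unsigned byte compare-not-equal under the same mask (vpcmpub with
     predicate 4).  Any set bit is a mismatch within the first len bytes;
     __memcmpeq only has to return zero/non-zero, so the raw mask is a
     usable return value.  */
  return (int) _mm256_mask_cmpneq_epu8_mask (k2, v1, v2);
}

Built with something like gcc -O2 -mavx512vl -mavx512bw, this should compile
down to roughly the same bzhi/kmov/masked-load/vpcmpub sequence as the
fall-through path in the patch, though the exact code generation is up to the
compiler.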