From patchwork Wed Mar 23 21:57:38 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 52285 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id D2132386549C for ; Wed, 23 Mar 2022 22:11:03 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org D2132386549C DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1648073463; bh=fEUjKbJUmRYseDxTgs2h/4YjBnqqEZ+lzofGwxrXJTY=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=BL5TpBr4LJcfC7eZ4YTpKwKfGWZnhfbzyuY6efFZYryNJBg6ZbP36MUpvwPJBz0c0 e24AptzpVyYHxbv5Cf5hyc9YheP79fnhPT3C9PeFRQ1Cra27IOqK0IBdre2U5rWorT idfIGQMsriGgUDfF5qKtdjeGLFN8060xm1JIJDRI= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-il1-x135.google.com (mail-il1-x135.google.com [IPv6:2607:f8b0:4864:20::135]) by sourceware.org (Postfix) with ESMTPS id 4EBE63840C32 for ; Wed, 23 Mar 2022 22:02:04 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 4EBE63840C32 Received: by mail-il1-x135.google.com with SMTP id h21so1989526ila.7 for ; Wed, 23 Mar 2022 15:02:04 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=fEUjKbJUmRYseDxTgs2h/4YjBnqqEZ+lzofGwxrXJTY=; b=zpyMiFv5V8NnEwuSYS4ES7kQhBb0xxPfjbtUY3vAfbVYCT7uP1TRJMjLk5zzyHaAIi qIzsC+OHjkPpyT3JFfAfZ8SffAvDo99rzG1L8YX78pkHojvUE8UiXw6jw0d3bRbjoVvV LMOFlCykAO2VF41T4MdaHmH4IrR8t1rhdN6xMWruX8tOVJY4VWQZt4M19DD+ckvXe2jo uChxAUaVPi9ld0wafjQQ9mj0V0zeOnTmyoQCp4nKXBJNAyiXZvUz9NvzTi/wLsUxIQrI 
i4AxgRmJts5yTCYKbuGuTfxw/r3w2H+hb+IpLklXoHaeV155cNpvsZcYu7hlJmGTDbmM GA4A== X-Gm-Message-State: AOAM531JKfoG3reRnpbMg45GJ5KQ4I9GskuYArraEgY05xNN6Ftw6LHf io0f0Cn8FIaAsDNHKUayMJguKOFv5WI= X-Google-Smtp-Source: ABdhPJzsRogD1Vzoc/yeFETjo+lq71MbU9bI69PucSbMGUj7txaK5ZG3s5a7ox5I7pSoK9Q8Np+Q+A== X-Received: by 2002:a05:6e02:198c:b0:2c8:4e15:7cb9 with SMTP id g12-20020a056e02198c00b002c84e157cb9mr1100313ilf.24.1648072923306; Wed, 23 Mar 2022 15:02:03 -0700 (PDT) Received: from localhost.localdomain (node-17-161.flex.volo.net. [76.191.17.161]) by smtp.googlemail.com with ESMTPSA id c22-20020a5ea816000000b00649d360663asm529227ioa.40.2022.03.23.15.02.02 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 23 Mar 2022 15:02:03 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S Date: Wed, 23 Mar 2022 16:57:38 -0500 Message-Id: <20220323215734.3927131-18-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220323215734.3927131-1-goldstein.w.n@gmail.com> References: <20220323215734.3927131-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.5 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" Slightly faster method of doing TOLOWER that saves an instruction. Also replace the hard-coded 5-byte NOP with .p2align 4. 
On builds with CET enabled, this misaligned the entry to strcasecmp. geometric_mean(N=40) of all benchmarks New / Original: .920 All string/memory tests pass. Reviewed-by: H.J. Lu --- Geometric Mean N=40 runs; All functions page aligned length, align1, align2, max_char, New Time / Old Time 1, 1, 1, 127, 0.914 2, 2, 2, 127, 0.952 3, 3, 3, 127, 0.924 4, 4, 4, 127, 0.995 5, 5, 5, 127, 0.985 6, 6, 6, 127, 1.017 7, 7, 7, 127, 1.031 8, 0, 0, 127, 0.967 9, 1, 1, 127, 0.969 10, 2, 2, 127, 0.951 11, 3, 3, 127, 0.938 12, 4, 4, 127, 0.937 13, 5, 5, 127, 0.967 14, 6, 6, 127, 0.941 15, 7, 7, 127, 0.951 4, 0, 0, 127, 0.959 4, 0, 0, 254, 0.98 8, 0, 0, 254, 0.959 16, 0, 0, 127, 0.895 16, 0, 0, 254, 0.901 32, 0, 0, 127, 0.85 32, 0, 0, 254, 0.851 64, 0, 0, 127, 0.897 64, 0, 0, 254, 0.895 128, 0, 0, 127, 0.944 128, 0, 0, 254, 0.935 256, 0, 0, 127, 0.922 256, 0, 0, 254, 0.913 512, 0, 0, 127, 0.921 512, 0, 0, 254, 0.914 1024, 0, 0, 127, 0.845 1024, 0, 0, 254, 0.84 16, 1, 2, 127, 0.923 16, 2, 1, 254, 0.955 32, 2, 4, 127, 0.979 32, 4, 2, 254, 0.957 64, 3, 6, 127, 0.866 64, 6, 3, 254, 0.849 128, 4, 0, 127, 0.882 128, 0, 4, 254, 0.876 256, 5, 2, 127, 0.877 256, 2, 5, 254, 0.882 512, 6, 4, 127, 0.822 512, 4, 6, 254, 0.862 1024, 7, 6, 127, 0.903 1024, 6, 7, 254, 0.908 sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++-------------- 1 file changed, 35 insertions(+), 48 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S index 580feb90e9..7805ae9d41 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp)) movq __libc_tsd_LOCALE@gottpoff(%rip),%rax mov %fs:(%rax),%RDX_LP - // XXX 5 byte should be before the function - /* 5-byte NOP. */ - .byte 0x0f,0x1f,0x44,0x00,0x00 + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 END (GLABEL(__strcasecmp)) /* FALLTHROUGH to strcasecmp_l. 
*/ #endif @@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp)) movq __libc_tsd_LOCALE@gottpoff(%rip),%rax mov %fs:(%rax),%RCX_LP - // XXX 5 byte should be before the function - /* 5-byte NOP. */ - .byte 0x0f,0x1f,0x44,0x00,0x00 + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 END (GLABEL(__strncasecmp)) /* FALLTHROUGH to strncasecmp_l. */ #endif @@ -169,27 +167,22 @@ STRCMP_SSE42: #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L .section .rodata.cst16,"aM",@progbits,16 .align 16 -LABEL(belowupper): - .quad 0x4040404040404040 - .quad 0x4040404040404040 -LABEL(topupper): -# ifdef USE_AVX - .quad 0x5a5a5a5a5a5a5a5a - .quad 0x5a5a5a5a5a5a5a5a -# else - .quad 0x5b5b5b5b5b5b5b5b - .quad 0x5b5b5b5b5b5b5b5b -# endif -LABEL(touppermask): +LABEL(lcase_min): + .quad 0x3f3f3f3f3f3f3f3f + .quad 0x3f3f3f3f3f3f3f3f +LABEL(lcase_max): + .quad 0x9999999999999999 + .quad 0x9999999999999999 +LABEL(case_add): .quad 0x2020202020202020 .quad 0x2020202020202020 .previous - movdqa LABEL(belowupper)(%rip), %xmm4 -# define UCLOW_reg %xmm4 - movdqa LABEL(topupper)(%rip), %xmm5 -# define UCHIGH_reg %xmm5 - movdqa LABEL(touppermask)(%rip), %xmm6 -# define LCQWORD_reg %xmm6 + movdqa LABEL(lcase_min)(%rip), %xmm4 +# define LCASE_MIN_reg %xmm4 + movdqa LABEL(lcase_max)(%rip), %xmm5 +# define LCASE_MAX_reg %xmm5 + movdqa LABEL(case_add)(%rip), %xmm6 +# define CASE_ADD_reg %xmm6 #endif cmp $0x30, %ecx ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ @@ -200,32 +193,26 @@ LABEL(touppermask): #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L # ifdef USE_AVX # define TOLOWER(reg1, reg2) \ - vpcmpgtb UCLOW_reg, reg1, %xmm7; \ - vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ - vpcmpgtb UCLOW_reg, reg2, %xmm9; \ - vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ - vpandn %xmm7, %xmm8, %xmm8; \ - vpandn %xmm9, %xmm10, %xmm10; \ - vpand LCQWORD_reg, %xmm8, %xmm8; \ - vpand LCQWORD_reg, %xmm10, %xmm10; \ - vpor reg1, %xmm8, reg1; \ - vpor reg2, %xmm10, reg2 + vpaddb 
LCASE_MIN_reg, reg1, %xmm7; \ + vpaddb LCASE_MIN_reg, reg2, %xmm8; \ + vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ + vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ + vpandn CASE_ADD_reg, %xmm7, %xmm7; \ + vpandn CASE_ADD_reg, %xmm8, %xmm8; \ + vpaddb %xmm7, reg1, reg1; \ + vpaddb %xmm8, reg2, reg2 # else # define TOLOWER(reg1, reg2) \ - movdqa reg1, %xmm7; \ - movdqa UCHIGH_reg, %xmm8; \ - movdqa reg2, %xmm9; \ - movdqa UCHIGH_reg, %xmm10; \ - pcmpgtb UCLOW_reg, %xmm7; \ - pcmpgtb reg1, %xmm8; \ - pcmpgtb UCLOW_reg, %xmm9; \ - pcmpgtb reg2, %xmm10; \ - pand %xmm8, %xmm7; \ - pand %xmm10, %xmm9; \ - pand LCQWORD_reg, %xmm7; \ - pand LCQWORD_reg, %xmm9; \ - por %xmm7, reg1; \ - por %xmm9, reg2 + movdqa LCASE_MIN_reg, %xmm7; \ + movdqa LCASE_MIN_reg, %xmm8; \ + paddb reg1, %xmm7; \ + paddb reg2, %xmm8; \ + pcmpgtb LCASE_MAX_reg, %xmm7; \ + pcmpgtb LCASE_MAX_reg, %xmm8; \ + pandn CASE_ADD_reg, %xmm7; \ + pandn CASE_ADD_reg, %xmm8; \ + paddb %xmm7, reg1; \ + paddb %xmm8, reg2 # endif TOLOWER (%xmm1, %xmm2) #else