From patchwork Fri Mar 25 22:13:32 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 52379 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 30FB3388981B for ; Fri, 25 Mar 2022 22:14:02 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 30FB3388981B DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1648246442; bh=yD5ohQBDiC69R2dVaw3l2rOX1J9prgOREipNFKBq/iQ=; h=To:Subject:Date:List-Id:List-Unsubscribe:List-Archive:List-Post: List-Help:List-Subscribe:From:Reply-To:From; b=VecolhML4kzoaOZFBxooANOqsfOkMA7A8dWBPWRLXECSin41tMnqX3HB06mSzTU8I OA97VJrlwOxTZwwaUD2UcwIC6bCCIRBuF1mSFXvpm1YyVOPgL29t6CMqMoN0MpGKec E8l9vcJiAR2gFO6N5uBtj/U/m6hN4c1jWIoyMZ/g= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-io1-xd36.google.com (mail-io1-xd36.google.com [IPv6:2607:f8b0:4864:20::d36]) by sourceware.org (Postfix) with ESMTPS id F3CEF3858D37 for ; Fri, 25 Mar 2022 22:13:39 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org F3CEF3858D37 Received: by mail-io1-xd36.google.com with SMTP id z7so10508116iom.1 for ; Fri, 25 Mar 2022 15:13:39 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:mime-version :content-transfer-encoding; bh=yD5ohQBDiC69R2dVaw3l2rOX1J9prgOREipNFKBq/iQ=; b=6SutZrzufWNZK0uGFSje5CaFbJqoYyddmfMRs3em7q0GG7NwrXa1VCmlie9+vLlpp4 ehAXKizcJPRODWAQgiHZEs5Am/0C5oLjwDm2k98L2C3e1Vk9jlaAyCQWOqnsMcLEOMlw hjWfOjL5GoiQycZDLSjfQisGN/ZTAICNVbwmeI5YBgGZGkOMouakpa4lKg4colQuSVDV rhyD0TTbw1VAqCb756tRDqZDWyagF9J7DUpYGT9aA7PYneTUM3RbQlaeRyq3raqVsgde Bix+OxPcC1FF01YLTOi5g7RQO97N3pgs2Rh11O/Zv2tUS+Qnq2f/lS5tHk1SL1wxA+47 TfYw== 
X-Gm-Message-State: AOAM5316H64WtW20LTzn6+65h5J/w/33V9369oYPgKhpkJ7AbnX0yr51 gdlua92pAoZw8XnHq3i9AgCLtcEftIQ= X-Google-Smtp-Source: ABdhPJw6rW0VkD8MenqL6JIl2OdcCbIZMOelJcX48zQOyhiR9n737zJhu8JZF15rHeYvqklrmin0Lg== X-Received: by 2002:a05:6602:22da:b0:645:ec83:6393 with SMTP id e26-20020a05660222da00b00645ec836393mr600778ioe.165.1648246418993; Fri, 25 Mar 2022 15:13:38 -0700 (PDT) Received: from localhost.localdomain (node-17-161.flex.volo.net. [76.191.17.161]) by smtp.googlemail.com with ESMTPSA id k1-20020a056e021a8100b002c64cf94399sm3784845ilv.44.2022.03.25.15.13.38 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Fri, 25 Mar 2022 15:13:38 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1 1/2] x86: Small improvements for wcscpy-ssse3 Date: Fri, 25 Mar 2022 17:13:32 -0500 Message-Id: <20220325221333.3079015-1-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 MIME-Version: 1.0 X-Spam-Status: No, score=-12.1 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_NUMSUBJECT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" Just a few small QOL changes. 1. Prefer `add` > `lea` as it has more execution units it can run on. 2. Don't break macro-fusion between `test` and `jcc`. geometric_mean(N=20) of all benchmarks New / Original: 0.973 All string/memory tests pass. Reviewed-by: H.J. 
Lu --- sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 194 ++++++++++++------------ 1 file changed, 97 insertions(+), 97 deletions(-) diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S index 34b09af327..aa2b9d030f 100644 --- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S @@ -52,7 +52,7 @@ ENTRY (__wcscpy_ssse3) jnz L(CopyFrom1To16Bytes) mov %rdx, %rax - lea 16(%rdx), %rdx + addq $16, %rdx and $-16, %rdx sub %rdx, %rax sub %rax, %rcx @@ -75,55 +75,55 @@ L(Align16Both): movaps 16(%rcx), %xmm2 movaps %xmm1, (%rdx) pcmpeqd %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi + pmovmskb %xmm0, %eax + addq $16, %rsi - test %rax, %rax + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps 16(%rcx, %rsi), %xmm3 movaps %xmm2, (%rdx, %rsi) pcmpeqd %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi + pmovmskb %xmm0, %eax + addq $16, %rsi - test %rax, %rax + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps 16(%rcx, %rsi), %xmm4 movaps %xmm3, (%rdx, %rsi) pcmpeqd %xmm4, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi + pmovmskb %xmm0, %eax + addq $16, %rsi - test %rax, %rax + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps 16(%rcx, %rsi), %xmm1 movaps %xmm4, (%rdx, %rsi) pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi + pmovmskb %xmm0, %eax + addq $16, %rsi - test %rax, %rax + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps 16(%rcx, %rsi), %xmm2 movaps %xmm1, (%rdx, %rsi) pcmpeqd %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi + pmovmskb %xmm0, %eax + addq $16, %rsi - test %rax, %rax + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps 16(%rcx, %rsi), %xmm3 movaps %xmm2, (%rdx, %rsi) pcmpeqd %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi + pmovmskb %xmm0, %eax + addq $16, %rsi - test %rax, %rax + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps %xmm3, (%rdx, %rsi) @@ -147,10 +147,10 @@ L(Aligned64Loop): pminub %xmm7, %xmm3 pminub %xmm2, %xmm3 pcmpeqd %xmm0, 
%xmm3 - pmovmskb %xmm3, %rax - lea 64(%rdx), %rdx - lea 64(%rcx), %rcx - test %rax, %rax + pmovmskb %xmm3, %eax + addq $64, %rdx + addq $64, %rcx + testl %eax, %eax jnz L(Aligned64Leave) movaps %xmm4, -64(%rdx) movaps %xmm5, -48(%rdx) @@ -160,32 +160,32 @@ L(Aligned64Loop): L(Aligned64Leave): pcmpeqd %xmm4, %xmm0 - pmovmskb %xmm0, %rax - test %rax, %rax + pmovmskb %xmm0, %eax + test %eax, %eax jnz L(CopyFrom1To16Bytes) pcmpeqd %xmm5, %xmm0 - pmovmskb %xmm0, %rax + pmovmskb %xmm0, %eax movaps %xmm4, -64(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi + addq $16, %rsi + test %eax, %eax jnz L(CopyFrom1To16Bytes) pcmpeqd %xmm6, %xmm0 - pmovmskb %xmm0, %rax + pmovmskb %xmm0, %eax movaps %xmm5, -48(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi + addq $16, %rsi + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps %xmm6, -32(%rdx) pcmpeqd %xmm7, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi - test %rax, %rax + pmovmskb %xmm0, %eax + addq $16, %rsi + test %eax, %eax jnz L(CopyFrom1To16Bytes) mov $-0x40, %rsi @@ -198,10 +198,10 @@ L(Shl4): movaps 12(%rcx), %xmm2 L(Shl4Start): pcmpeqd %xmm2, %xmm0 - pmovmskb %xmm0, %rax + pmovmskb %xmm0, %eax movaps %xmm2, %xmm3 - test %rax, %rax + test %eax, %eax jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 @@ -209,12 +209,12 @@ L(Shl4Start): movaps 28(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx movaps %xmm2, %xmm1 - test %rax, %rax + test %eax, %eax jnz L(Shl4LoopExit) palignr $4, %xmm3, %xmm2 @@ -222,12 +222,12 @@ L(Shl4Start): movaps 28(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx movaps %xmm2, %xmm3 - test %rax, %rax + test %eax, %eax jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 @@ -235,22 +235,22 @@ L(Shl4Start): movaps 28(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx 
+ addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx - test %rax, %rax + test %eax, %eax jnz L(Shl4LoopExit) palignr $4, %xmm3, %xmm2 movaps %xmm2, (%rdx) - lea 28(%rcx), %rcx - lea 16(%rdx), %rdx + addq $28, %rcx + addq $16, %rdx mov %rcx, %rax and $-0x40, %rcx sub %rcx, %rax - lea -12(%rcx), %rcx + addq $-12, %rcx sub %rax, %rdx movaps -4(%rcx), %xmm1 @@ -267,22 +267,22 @@ L(Shl4LoopStart): pminub %xmm5, %xmm7 pminub %xmm6, %xmm7 pcmpeqd %xmm0, %xmm7 - pmovmskb %xmm7, %rax + pmovmskb %xmm7, %eax movaps %xmm5, %xmm7 palignr $4, %xmm4, %xmm5 - test %rax, %rax palignr $4, %xmm3, %xmm4 + test %eax, %eax jnz L(Shl4Start) palignr $4, %xmm2, %xmm3 - lea 64(%rcx), %rcx + addq $64, %rcx palignr $4, %xmm1, %xmm2 movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) movaps %xmm4, 32(%rdx) movaps %xmm3, 16(%rdx) movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx + addq $64, %rdx jmp L(Shl4LoopStart) L(Shl4LoopExit): @@ -297,10 +297,10 @@ L(Shl8): movaps 8(%rcx), %xmm2 L(Shl8Start): pcmpeqd %xmm2, %xmm0 - pmovmskb %xmm0, %rax + pmovmskb %xmm0, %eax movaps %xmm2, %xmm3 - test %rax, %rax + test %eax, %eax jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 @@ -308,12 +308,12 @@ L(Shl8Start): movaps 24(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx movaps %xmm2, %xmm1 - test %rax, %rax + test %eax, %eax jnz L(Shl8LoopExit) palignr $8, %xmm3, %xmm2 @@ -321,12 +321,12 @@ L(Shl8Start): movaps 24(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx movaps %xmm2, %xmm3 - test %rax, %rax + test %eax, %eax jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 @@ -334,22 +334,22 @@ L(Shl8Start): movaps 24(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx - test %rax, %rax + test %eax, %eax jnz 
L(Shl8LoopExit) palignr $8, %xmm3, %xmm2 movaps %xmm2, (%rdx) - lea 24(%rcx), %rcx - lea 16(%rdx), %rdx + addq $24, %rcx + addq $16, %rdx mov %rcx, %rax and $-0x40, %rcx sub %rcx, %rax - lea -8(%rcx), %rcx + addq $-8, %rcx sub %rax, %rdx movaps -8(%rcx), %xmm1 @@ -366,22 +366,22 @@ L(Shl8LoopStart): pminub %xmm5, %xmm7 pminub %xmm6, %xmm7 pcmpeqd %xmm0, %xmm7 - pmovmskb %xmm7, %rax + pmovmskb %xmm7, %eax movaps %xmm5, %xmm7 palignr $8, %xmm4, %xmm5 - test %rax, %rax palignr $8, %xmm3, %xmm4 + test %eax, %eax jnz L(Shl8Start) palignr $8, %xmm2, %xmm3 - lea 64(%rcx), %rcx + addq $64, %rcx palignr $8, %xmm1, %xmm2 movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) movaps %xmm4, 32(%rdx) movaps %xmm3, 16(%rdx) movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx + addq $64, %rdx jmp L(Shl8LoopStart) L(Shl8LoopExit): @@ -396,10 +396,10 @@ L(Shl12): movaps 4(%rcx), %xmm2 L(Shl12Start): pcmpeqd %xmm2, %xmm0 - pmovmskb %xmm0, %rax + pmovmskb %xmm0, %eax movaps %xmm2, %xmm3 - test %rax, %rax + test %eax, %eax jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 @@ -407,12 +407,12 @@ L(Shl12Start): movaps 20(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx movaps %xmm2, %xmm1 - test %rax, %rax + test %eax, %eax jnz L(Shl12LoopExit) palignr $12, %xmm3, %xmm2 @@ -420,12 +420,12 @@ L(Shl12Start): movaps 20(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx movaps %xmm2, %xmm3 - test %rax, %rax + test %eax, %eax jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 @@ -433,22 +433,22 @@ L(Shl12Start): movaps 20(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx - test %rax, %rax + test %eax, %eax jnz L(Shl12LoopExit) palignr $12, %xmm3, %xmm2 movaps %xmm2, (%rdx) - lea 20(%rcx), %rcx - lea 16(%rdx), 
%rdx + addq $20, %rcx + addq $16, %rdx mov %rcx, %rax and $-0x40, %rcx sub %rcx, %rax - lea -4(%rcx), %rcx + addq $-4, %rcx sub %rax, %rdx movaps -12(%rcx), %xmm1 @@ -465,21 +465,21 @@ L(Shl12LoopStart): pminub %xmm5, %xmm7 pminub %xmm6, %xmm7 pcmpeqd %xmm0, %xmm7 - pmovmskb %xmm7, %rax + pmovmskb %xmm7, %eax movaps %xmm5, %xmm7 palignr $12, %xmm4, %xmm5 - test %rax, %rax palignr $12, %xmm3, %xmm4 + test %eax, %eax jnz L(Shl12Start) palignr $12, %xmm2, %xmm3 - lea 64(%rcx), %rcx + addq $64, %rcx palignr $12, %xmm1, %xmm2 movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) movaps %xmm4, 32(%rdx) movaps %xmm3, 16(%rdx) movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx + addq $64, %rdx jmp L(Shl12LoopStart) L(Shl12LoopExit): From patchwork Fri Mar 25 22:13:33 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 52380 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 175493889806 for ; Fri, 25 Mar 2022 22:14:44 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 175493889806 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1648246484; bh=xl3fehAkBeCn861ug96LFmEtQ9ol8PWqWs2HCfwQuQI=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=LGw/cpr7J9LpNhPblGeA1Izr8U0nAo9My1SLMn0VU+n0g/ONHdB69/9w3P6pq2GIs bTOkAFMgntUlhIZ1WnbUrMX+2NE0EdHC4VAd6f3ExFZRYncmWh3VKLup4Fgmx18vFr Tk9CxMLX8NZaaVzkV8MJJIYf9+UzyjeqWFaKeRB8= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-io1-xd30.google.com (mail-io1-xd30.google.com [IPv6:2607:f8b0:4864:20::d30]) by sourceware.org (Postfix) with ESMTPS id 1EC783858036 for ; Fri, 25 Mar 2022 22:13:41 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 
sourceware.org 1EC783858036 Received: by mail-io1-xd30.google.com with SMTP id p22so10512158iod.2 for ; Fri, 25 Mar 2022 15:13:41 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=xl3fehAkBeCn861ug96LFmEtQ9ol8PWqWs2HCfwQuQI=; b=4jUB5dX31/CudSM3sLF6zW3A1YNS0MW+W0O2Laf3ENyz+7vh6v0uN5uNsYBas4rWCV 8VCYlI/Ao4a2tRG6JvoaxEzCRPujOp3bTkHritpPJAuiOp5Vv5ay1AtHx+FzRL6gOb2p tZNAwPrQpZ6v7/lRzjYXvoUvCKnpzbF59HKsjtwP5LQLeeD83LbGTS/yNYp6NMsDnAg7 NcwkK+e1spi1eWvO8o0BSIn5PgmeRIDXutShEdkc2q0TKMRsATKiQkUZAOKyItp/U4jy Wdpat/E4GVNqukaPr4G8xOMw/iw+lE17kuAMpH28CFkANZXfg5glRLAgLWDYdEgTzoNo JoTw== X-Gm-Message-State: AOAM530Fz4o5oCL0o4+u65eWVcX0XRcPcXiScQbjI4MqhW59FTp4NLKm T4mV0DoD5JQg1U1CuuCQEzrhmomNFG8= X-Google-Smtp-Source: ABdhPJyP3Nxd15DDL6U9gOME3yYDmYBiWDUwP4m/rJutjfWEO9EVPz5LDMBanWXel+ToK8tm+1zn+A== X-Received: by 2002:a05:6602:2a42:b0:611:799e:bf30 with SMTP id k2-20020a0566022a4200b00611799ebf30mr608633iov.113.1648246420227; Fri, 25 Mar 2022 15:13:40 -0700 (PDT) Received: from localhost.localdomain (node-17-161.flex.volo.net. 
[76.191.17.161]) by smtp.googlemail.com with ESMTPSA id k1-20020a056e021a8100b002c64cf94399sm3784845ilv.44.2022.03.25.15.13.39 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Fri, 25 Mar 2022 15:13:39 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1 2/2] x86: Small improvements for wcslen Date: Fri, 25 Mar 2022 17:13:33 -0500 Message-Id: <20220325221333.3079015-2-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220325221333.3079015-1-goldstein.w.n@gmail.com> References: <20220325221333.3079015-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-11.9 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SCC_5_SHORT_WORD_LINES, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" Just a few QOL changes. 1. Prefer `add` > `lea` as it has more execution units it can run on. 2. Don't break macro-fusion between `test` and `jcc`. 3. Reduce code size by removing gratuitous padding bytes (-90 bytes). geometric_mean(N=20) of all benchmarks New / Original: 0.959 All string/memory tests pass. Reviewed-by: H.J. 
Lu --- sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 45 deletions(-) diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S index c9165dbf03..d641141d75 100644 --- a/sysdeps/x86_64/wcslen.S +++ b/sysdeps/x86_64/wcslen.S @@ -40,82 +40,82 @@ ENTRY (__wcslen) pxor %xmm0, %xmm0 lea 32(%rdi), %rax - lea 16(%rdi), %rcx + addq $16, %rdi and $-16, %rax pcmpeqd (%rax), %xmm0 pmovmskb %xmm0, %edx pxor %xmm1, %xmm1 + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm1 pmovmskb %xmm1, %edx pxor %xmm2, %xmm2 + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm2 pmovmskb %xmm2, %edx pxor %xmm3, %xmm3 + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm3 pmovmskb %xmm3, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm0 pmovmskb %xmm0, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm1 pmovmskb %xmm1, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm2 pmovmskb %xmm2, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm3 pmovmskb %xmm3, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm0 pmovmskb %xmm0, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm1 pmovmskb %xmm1, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm2 pmovmskb %xmm2, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm3 pmovmskb %xmm3, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) and $-0x40, %rax @@ -132,104 +132,100 @@ L(aligned_64_loop): pminub %xmm0, %xmm2 pcmpeqd %xmm3, %xmm2 pmovmskb %xmm2, %edx + addq $64, %rax test %edx, %edx - lea 64(%rax), %rax jz L(aligned_64_loop) pcmpeqd -64(%rax), %xmm3 
pmovmskb %xmm3, %edx + addq $48, %rdi test %edx, %edx - lea 48(%rcx), %rcx jnz L(exit) pcmpeqd %xmm1, %xmm3 pmovmskb %xmm3, %edx + addq $-16, %rdi test %edx, %edx - lea -16(%rcx), %rcx jnz L(exit) pcmpeqd -32(%rax), %xmm3 pmovmskb %xmm3, %edx + addq $-16, %rdi test %edx, %edx - lea -16(%rcx), %rcx jnz L(exit) pcmpeqd %xmm6, %xmm3 pmovmskb %xmm3, %edx + addq $-16, %rdi test %edx, %edx - lea -16(%rcx), %rcx - jnz L(exit) - - jmp L(aligned_64_loop) + jz L(aligned_64_loop) .p2align 4 L(exit): - sub %rcx, %rax + sub %rdi, %rax shr $2, %rax test %dl, %dl jz L(exit_high) - mov %dl, %cl - and $15, %cl + andl $15, %edx jz L(exit_1) ret - .p2align 4 + /* No align here. Naturally aligned % 16 == 1. */ L(exit_high): - mov %dh, %ch - and $15, %ch + andl $(15 << 8), %edx jz L(exit_3) add $2, %rax ret - .p2align 4 + .p2align 3 L(exit_1): add $1, %rax ret - .p2align 4 + .p2align 3 L(exit_3): add $3, %rax ret - .p2align 4 + .p2align 3 L(exit_tail0): - xor %rax, %rax + xorl %eax, %eax ret - .p2align 4 + .p2align 3 L(exit_tail1): - mov $1, %rax + movl $1, %eax ret - .p2align 4 + .p2align 3 L(exit_tail2): - mov $2, %rax + movl $2, %eax ret - .p2align 4 + .p2align 3 L(exit_tail3): - mov $3, %rax + movl $3, %eax ret - .p2align 4 + .p2align 3 L(exit_tail4): - mov $4, %rax + movl $4, %eax ret - .p2align 4 + .p2align 3 L(exit_tail5): - mov $5, %rax + movl $5, %eax ret - .p2align 4 + .p2align 3 L(exit_tail6): - mov $6, %rax + movl $6, %eax ret - .p2align 4 + .p2align 3 L(exit_tail7): - mov $7, %rax + movl $7, %eax ret END (__wcslen)