From patchwork Tue Jul 12 19:29:08 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 55991 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 489843829BD4 for ; Tue, 12 Jul 2022 19:31:17 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 489843829BD4 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1657654277; bh=aC3BtLVEZ9/fB+ftrW/LhI4xZrwNh05gqPdVegugbLo=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=ePfRIzj26wvLQPX/ESAyjN4hMzx/BRgWokYlm9Ca7Nf4Ec+nvpYmjHtoMLfMI3jvg feUrac/gqfogBE1Gz5jVuQ/Ocpzqq8pE1xCneeCbk+qmwd/+odqpfLe+N4COookEHn iEhJYMdGpI01DDPxRbbVVhUrYPLxDRqy+oCMj/tw= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-pj1-x102a.google.com (mail-pj1-x102a.google.com [IPv6:2607:f8b0:4864:20::102a]) by sourceware.org (Postfix) with ESMTPS id 382603836F96 for ; Tue, 12 Jul 2022 19:29:23 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 382603836F96 Received: by mail-pj1-x102a.google.com with SMTP id i8-20020a17090a4b8800b001ef8a65bfbdso9543327pjh.1 for ; Tue, 12 Jul 2022 12:29:23 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=aC3BtLVEZ9/fB+ftrW/LhI4xZrwNh05gqPdVegugbLo=; b=lecUEVBgjAHinZj6UTzgxSwX0VW7BtXy5ZPTqPKe9hOYAzYbeE33JeP8mOdmd6abFL RJSqj/aWa2lbrZ2F/XeZRXmGfNbK43Uub1dwcNwClXJT4/ennVRt4sBTmNia+05h3Q16 jSrjePADM0r3TFszMij9r8oppFSUtVE5n8s3iwfbom/duEQ+RM22BqrwUhZuOWfQE4MH 40AvNgB/qyIdFAEcPwIXuyl3k4ag9NjGoWovz5eqEdsBNdCEMHCSacKMuzIKCDNw88ia 
728JdefoLxx1Pra3hbHrhM6lp0waQkPJn2UowYbwO3iBZO7ouuEYRKYm40SZWN0D7lXy 5nNw== X-Gm-Message-State: AJIora8w5WHQG/oBgCI8kddcJT5tgKLJHgPZcYsXYNqF9jQVC+LDIZet 8G/2iFT4goxrIgu0DIdfDcN2LzTeMCU= X-Google-Smtp-Source: AGRyM1smbYTgomPHL1m2Xf/MtQTtpvL0OUFAnGK5ngHdgguAw2xCa9JBeP+kF9RxN2zv8jz0+2dWig== X-Received: by 2002:a17:90b:3596:b0:1ef:76fb:d8b7 with SMTP id mm22-20020a17090b359600b001ef76fbd8b7mr6075976pjb.109.1657654161923; Tue, 12 Jul 2022 12:29:21 -0700 (PDT) Received: from noah-tgl.. ([192.55.60.37]) by smtp.gmail.com with ESMTPSA id w7-20020a170902e88700b0016c28fbd7e5sm7274704plg.268.2022.07.12.12.29.20 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 12 Jul 2022 12:29:21 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S Date: Tue, 12 Jul 2022 12:29:08 -0700 Message-Id: <20220712192910.351121-8-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20220712192910.351121-1-goldstein.w.n@gmail.com> References: <20220712192910.351121-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-11.6 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_SHORT, RCVD_IN_DNSWL_NONE, SCC_5_SHORT_WORD_LINES, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" This commit doesn't affect libc.so.6; it's just housekeeping to prepare for adding explicit ISA level support.
Tested build on x86_64 and x86_32 with/without multiarch. --- sysdeps/x86_64/multiarch/wcslen-sse2.S | 221 ++++++++++++++++++++++++- sysdeps/x86_64/wcslen.S | 216 +----------------------- 2 files changed, 218 insertions(+), 219 deletions(-) diff --git a/sysdeps/x86_64/multiarch/wcslen-sse2.S b/sysdeps/x86_64/multiarch/wcslen-sse2.S index 2b3a9efd64..944c3bd9c6 100644 --- a/sysdeps/x86_64/multiarch/wcslen-sse2.S +++ b/sysdeps/x86_64/multiarch/wcslen-sse2.S @@ -17,10 +17,221 @@ <https://www.gnu.org/licenses/>. */ #if IS_IN (libc) -# define __wcslen __wcslen_sse2 - -# undef weak_alias -# define weak_alias(__wcslen, wcslen) +# ifndef WCSLEN +# define WCSLEN __wcslen_sse2 +# endif #endif -#include "../wcslen.S" +#include <sysdep.h> + + .text +ENTRY (WCSLEN) + cmpl $0, (%rdi) + jz L(exit_tail0) + cmpl $0, 4(%rdi) + jz L(exit_tail1) + cmpl $0, 8(%rdi) + jz L(exit_tail2) + cmpl $0, 12(%rdi) + jz L(exit_tail3) + cmpl $0, 16(%rdi) + jz L(exit_tail4) + cmpl $0, 20(%rdi) + jz L(exit_tail5) + cmpl $0, 24(%rdi) + jz L(exit_tail6) + cmpl $0, 28(%rdi) + jz L(exit_tail7) + + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax + addq $16, %rdi + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, 
%edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + and $-0x40, %rax + + .p2align 4 +L(aligned_64_loop): + movaps (%rax), %xmm0 + movaps 16(%rax), %xmm1 + movaps 32(%rax), %xmm2 + movaps 48(%rax), %xmm6 + + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx + addq $64, %rax + test %edx, %edx + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $48, %rdi + test %edx, %edx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx + addq $-16, %rdi + test %edx, %edx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $-16, %rdi + test %edx, %edx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx + addq $-16, %rdi + test %edx, %edx + jz L(aligned_64_loop) + + .p2align 4 +L(exit): + sub %rdi, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + + andl $15, %edx + jz L(exit_1) + ret + + /* No align here. Naturally aligned % 16 == 1. 
*/ +L(exit_high): + andl $(15 << 8), %edx + jz L(exit_3) + add $2, %rax + ret + + .p2align 3 +L(exit_1): + add $1, %rax + ret + + .p2align 3 +L(exit_3): + add $3, %rax + ret + + .p2align 3 +L(exit_tail0): + xorl %eax, %eax + ret + + .p2align 3 +L(exit_tail1): + movl $1, %eax + ret + + .p2align 3 +L(exit_tail2): + movl $2, %eax + ret + + .p2align 3 +L(exit_tail3): + movl $3, %eax + ret + + .p2align 3 +L(exit_tail4): + movl $4, %eax + ret + + .p2align 3 +L(exit_tail5): + movl $5, %eax + ret + + .p2align 3 +L(exit_tail6): + movl $6, %eax + ret + + .p2align 3 +L(exit_tail7): + movl $7, %eax + ret + +END (WCSLEN) diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S index d641141d75..588a0fbe01 100644 --- a/sysdeps/x86_64/wcslen.S +++ b/sysdeps/x86_64/wcslen.S @@ -16,218 +16,6 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include <sysdep.h> - - .text -ENTRY (__wcslen) - cmpl $0, (%rdi) - jz L(exit_tail0) - cmpl $0, 4(%rdi) - jz L(exit_tail1) - cmpl $0, 8(%rdi) - jz L(exit_tail2) - cmpl $0, 12(%rdi) - jz L(exit_tail3) - cmpl $0, 16(%rdi) - jz L(exit_tail4) - cmpl $0, 20(%rdi) - jz L(exit_tail5) - cmpl $0, 24(%rdi) - jz L(exit_tail6) - cmpl $0, 28(%rdi) - jz L(exit_tail7) - - pxor %xmm0, %xmm0 - - lea 32(%rdi), %rax - addq $16, %rdi - and $-16, %rax - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - addq $16, %rax - test 
%edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - and $-0x40, %rax - - .p2align 4 -L(aligned_64_loop): - movaps (%rax), %xmm0 - movaps 16(%rax), %xmm1 - movaps 32(%rax), %xmm2 - movaps 48(%rax), %xmm6 - - pminub %xmm1, %xmm0 - pminub %xmm6, %xmm2 - pminub %xmm0, %xmm2 - pcmpeqd %xmm3, %xmm2 - pmovmskb %xmm2, %edx - addq $64, %rax - test %edx, %edx - jz L(aligned_64_loop) - - pcmpeqd -64(%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $48, %rdi - test %edx, %edx - jnz L(exit) - - pcmpeqd %xmm1, %xmm3 - pmovmskb %xmm3, %edx - addq $-16, %rdi - test %edx, %edx - jnz L(exit) - - pcmpeqd -32(%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $-16, %rdi - test %edx, %edx - jnz L(exit) - - pcmpeqd %xmm6, %xmm3 - pmovmskb %xmm3, %edx - addq $-16, %rdi - test %edx, %edx - jz L(aligned_64_loop) - - .p2align 4 -L(exit): - sub %rdi, %rax - shr $2, %rax - test %dl, %dl - jz L(exit_high) - - andl $15, %edx - jz L(exit_1) - ret - - /* No align here. Naturally aligned % 16 == 1. 
*/ -L(exit_high): - andl $(15 << 8), %edx - jz L(exit_3) - add $2, %rax - ret - - .p2align 3 -L(exit_1): - add $1, %rax - ret - - .p2align 3 -L(exit_3): - add $3, %rax - ret - - .p2align 3 -L(exit_tail0): - xorl %eax, %eax - ret - - .p2align 3 -L(exit_tail1): - movl $1, %eax - ret - - .p2align 3 -L(exit_tail2): - movl $2, %eax - ret - - .p2align 3 -L(exit_tail3): - movl $3, %eax - ret - - .p2align 3 -L(exit_tail4): - movl $4, %eax - ret - - .p2align 3 -L(exit_tail5): - movl $5, %eax - ret - - .p2align 3 -L(exit_tail6): - movl $6, %eax - ret - - .p2align 3 -L(exit_tail7): - movl $7, %eax - ret - -END (__wcslen) - +#define WCSLEN __wcslen +#include "multiarch/wcslen-sse2.S" weak_alias(__wcslen, wcslen)