From patchwork Tue Jul 12 19:29:07 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 55987 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id BB33F382458E for ; Tue, 12 Jul 2022 19:30:25 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org BB33F382458E DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1657654225; bh=zvBVUif88b/gcnG/5Hvkr33T5oSlFj/ri+4z+ci07Gc=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=O98K9m0wEOe8MkxOC5CnSC8jxIoNC0kM1N3vvzOEbcZtINbbQXPri+NUq2YPlNC4f E70wEeA8WS2gXoF6u4Iy//jY1/Qyy3s6PfyFlfouuJzTR0i/zVm2lbcdQT+su2qRRZ SyzZYJ5oNiaaEXkkJQUQp6roVkFXk6DF+0XCv/t0= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-pg1-x52e.google.com (mail-pg1-x52e.google.com [IPv6:2607:f8b0:4864:20::52e]) by sourceware.org (Postfix) with ESMTPS id 96CB33838AA6 for ; Tue, 12 Jul 2022 19:29:21 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 96CB33838AA6 Received: by mail-pg1-x52e.google.com with SMTP id o18so8464189pgu.9 for ; Tue, 12 Jul 2022 12:29:21 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=zvBVUif88b/gcnG/5Hvkr33T5oSlFj/ri+4z+ci07Gc=; b=0eg+FCglWmKUUiSYrY0Qokimg3adfIvWnqXw+wxP9fTmvS3LiqPtEw64g5KqNE5jcf MX0ydWgZtLR8J9iVV+dk43Z6U8BjCNCofa0ubk8f8zNFem0H6iOFLajPiwR7HrImmoVT 8zZhL0wVnC6X12F6R5qSpVjQJggGHErjlS28p8e3/0NA0LOydlhcgDucNVaY8rfHbx+s InfpSBS0jZGHy32DrzK3j20jnxhUXBEHW8YIB+pgd+VChXeeqo5GDh1QSnxH9sFkQtCk 
x3wkUTfofI5TK8RtwVC5LERBE5SPEsV0e3HwkBgzCWk4NlZ2rimqMMckjcVRuEiMa/Ro kRYg== X-Gm-Message-State: AJIora+O7nW5ml4lnk7r3bXVVpoitSoXM+P1kqPb88vKbzHVDrFGpXi2 fg/tiLjAblOocF+tdRkI6LGNOvArPns= X-Google-Smtp-Source: AGRyM1tvI0dCR1Exv7rM5Ajd6g/2izZQtOXfHvyWxaJpCytjRlLPvcnFJsGg2JksLJ99KKfAPaS21w== X-Received: by 2002:a63:5366:0:b0:411:415a:5888 with SMTP id t38-20020a635366000000b00411415a5888mr21273062pgl.286.1657654160457; Tue, 12 Jul 2022 12:29:20 -0700 (PDT) Received: from noah-tgl.. ([192.55.60.37]) by smtp.gmail.com with ESMTPSA id w7-20020a170902e88700b0016c28fbd7e5sm7274704plg.268.2022.07.12.12.29.19 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 12 Jul 2022 12:29:19 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S Date: Tue, 12 Jul 2022 12:29:07 -0700 Message-Id: <20220712192910.351121-7-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20220712192910.351121-1-goldstein.w.n@gmail.com> References: <20220712192910.351121-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.1 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" This commit doesn't affect libc.so.6, it's just housekeeping to prepare for adding explicit ISA level support. 
Tested build on x86_64 and x86_32 with/without multiarch. --- sysdeps/x86_64/multiarch/wcschr-sse2.S | 145 +++++++++++++++++++++++-- sysdeps/x86_64/wcschr.S | 135 +---------------------- 2 files changed, 138 insertions(+), 142 deletions(-) diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S index 218ea609b9..c872926ba9 100644 --- a/sysdeps/x86_64/multiarch/wcschr-sse2.S +++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S @@ -17,14 +17,141 @@ . */ #if IS_IN (libc) -# define __wcschr __wcschr_sse2 - -# undef weak_alias -# define weak_alias(__wcschr, wcschr) -# undef libc_hidden_def -# define libc_hidden_def(__wcschr) -# undef libc_hidden_weak -# define libc_hidden_weak(wcschr) +# ifndef WCSCHR +# define WCSCHR __wcschr_sse2 +# endif #endif -#include "../wcschr.S" +#include + + .text +ENTRY (WCSCHR) /* In: %rdi = start of the string, low 32 bits of %rsi = element to find. Out: %rax = address of the first matching 32-bit element, or 0 when a zero element comes first. Uses %rcx, %rdx, %xmm0, %xmm1, %xmm2 as scratch. */ + + movd %rsi, %xmm1 /* Low 32 bits of %rsi become lane 0 of %xmm1. */ + pxor %xmm2, %xmm2 /* %xmm2 = 0, the pattern used to detect the terminating null. */ + mov %rdi, %rcx + punpckldq %xmm1, %xmm1 + punpckldq %xmm1, %xmm1 /* The two unpacks copy lane 0 into all four dword lanes. */ + + and $63, %rcx + cmp $48, %rcx /* %rcx = offset of %rdi inside its 64-byte block; above 48 a 16-byte load from %rdi would run past the end of that block. */ + ja L(cross_cache) + + movdqu (%rdi), %xmm0 /* First 16 bytes, read unaligned from the string start. */ + pcmpeqd %xmm0, %xmm2 /* %xmm2 = lanes equal to zero. */ + add $16, %rdi + pcmpeqd %xmm1, %xmm0 /* %xmm0 = lanes equal to the search element. */ + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx /* Nonzero when either a match or a null was seen. */ + jnz L(matches) + + and $-16, %rdi /* %rdi was already advanced by 16; round it down to a 16-byte boundary so the following loads can use movdqa. Bytes that get scanned again held neither a match nor a null. */ + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + jmp L(loop) + +L(cross_cache): + and $15, %rcx + and $-16, %rdi /* %rcx = misalignment within 16 bytes; load the aligned chunk that contains the string start instead of reading across the block end. */ + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + + sar %cl, %rdx + sar %cl, %rax /* Shift out the mask bits for bytes that precede the string start. */ + test %rax, %rax + je L(unaligned_no_match) + + bsf %rax, %rax /* Byte index of the first match, counted from the string start. */ + test %rdx, %rdx + je L(unaligned_match) + bsf %rdx, %rdx + cmp %rdx, %rax + ja L(return_null) /* The null lies before the match. */ + +L(unaligned_match): + add %rdi, %rax + add %rcx, %rax /* Result = aligned base, plus match index, plus misalignment. */ + ret + + .p2align 4 +L(unaligned_no_match): + test %rdx, %rdx + jne L(return_null) + pxor %xmm2, %xmm2 /* Zero lanes located before the string start may have set bits in %xmm2; clear it again before entering the loop. */ + + add $16, %rdi + + .p2align 4 +/* Loop start on aligned string. 
*/ +L(loop): + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) /* Falling through means the null mask was 0, so %xmm2 is all zero again and needs no explicit reset before the next compare. */ + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + jmp L(loop) /* Each iteration scans four 16-byte chunks. */ + + .p2align 4 +L(matches): + pmovmskb %xmm2, %rdx /* %rdx was overwritten by the or at the jump site; reload the null mask. */ + test %rax, %rax + jz L(return_null) /* Only a null was found in this chunk. */ + bsf %rax, %rax + test %rdx, %rdx + je L(match) + bsf %rdx, %rcx + cmp %rcx, %rax + ja L(return_null) /* The null lies before the match. Equal indexes fall through, so a search for the zero element returns the terminator. */ +L(match): + sub $16, %rdi /* %rdi already points past the chunk that was just scanned. */ + add %rdi, %rax + ret + + .p2align 4 +L(return_null): + xor %rax, %rax + ret + +END (WCSCHR) diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S index 2131220382..80b12c4286 100644 --- a/sysdeps/x86_64/wcschr.S +++ b/sysdeps/x86_64/wcschr.S @@ -16,140 +16,9 @@ License along with the GNU C Library; if not, see . 
*/ -#include - - .text -ENTRY (__wcschr) - - movd %rsi, %xmm1 - pxor %xmm2, %xmm2 - mov %rdi, %rcx - punpckldq %xmm1, %xmm1 - punpckldq %xmm1, %xmm1 - - and $63, %rcx - cmp $48, %rcx - ja L(cross_cache) - - movdqu (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - and $-16, %rdi - - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - jmp L(loop) - -L(cross_cache): - and $15, %rcx - and $-16, %rdi - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - - sar %cl, %rdx - sar %cl, %rax - test %rax, %rax - je L(unaligned_no_match) - - bsf %rax, %rax - test %rdx, %rdx - je L(unaligned_match) - bsf %rdx, %rdx - cmp %rdx, %rax - ja L(return_null) - -L(unaligned_match): - add %rdi, %rax - add %rcx, %rax - ret - - .p2align 4 -L(unaligned_no_match): - test %rdx, %rdx - jne L(return_null) - pxor %xmm2, %xmm2 - - add $16, %rdi - - .p2align 4 -/* Loop start on aligned string. 
*/ -L(loop): - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - jmp L(loop) - - .p2align 4 -L(matches): - pmovmskb %xmm2, %rdx - test %rax, %rax - jz L(return_null) - bsf %rax, %rax - test %rdx, %rdx - je L(match) - bsf %rdx, %rcx - cmp %rcx, %rax - ja L(return_null) -L(match): - sub $16, %rdi - add %rdi, %rax - ret - - .p2align 4 -L(return_null): - xor %rax, %rax - ret - -END (__wcschr) +#define WCSCHR __wcschr /* multiarch/wcschr-sse2.S only falls back to the name __wcschr_sse2 when WCSCHR is not yet defined, so defining it first makes the included body assemble under the name __wcschr that the alias macros below refer to. */ +#include "multiarch/wcschr-sse2.S" libc_hidden_def(__wcschr) weak_alias (__wcschr, wcschr) libc_hidden_weak (wcschr)