From patchwork Fri Oct 14 18:22:05 2022
X-Patchwork-Submitter: Noah Goldstein
X-Patchwork-Id: 58862
To: libc-alpha@sourceware.org
Subject: [PATCH v3 3/3] x86: Update strlen-evex-base to use new reg/vec macros.
Date: Fri, 14 Oct 2022 13:22:05 -0500
Message-Id: <20221014182205.115792-3-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20221014182205.115792-1-goldstein.w.n@gmail.com>
References: <20221014164008.1325863-1-goldstein.w.n@gmail.com>
 <20221014182205.115792-1-goldstein.w.n@gmail.com>
From: Noah Goldstein
Reply-To: Noah Goldstein

To avoid duplicating the VMM / GPR / mask insn macros in all upcoming
evex512 files, use the macros defined in 'reg-macros.h' and
'{vec}-macros.h'.

This commit does not change libc.so.

Tested build on x86-64.
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 116 +++++++-------
 sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
 2 files changed, 44 insertions(+), 76 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 418e9f8411..8af9791e92 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -36,42 +36,10 @@
 # define CHAR_SIZE	1
 # endif

-# define XMM0		xmm16
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

-# if VEC_SIZE == 64
-# define KMOV		kmovq
-# define KORTEST	kortestq
-# define RAX		rax
-# define RCX		rcx
-# define RDX		rdx
-# define SHR		shrq
-# define TEXTSUFFIX	evex512
-# define VMM0		zmm16
-# define VMM1		zmm17
-# define VMM2		zmm18
-# define VMM3		zmm19
-# define VMM4		zmm20
-# define VMOVA		vmovdqa64
-# elif VEC_SIZE == 32
-/* Currently Unused.  */
-# define KMOV		kmovd
-# define KORTEST	kortestd
-# define RAX		eax
-# define RCX		ecx
-# define RDX		edx
-# define SHR		shrl
-# define TEXTSUFFIX	evex256
-# define VMM0		ymm16
-# define VMM1		ymm17
-# define VMM2		ymm18
-# define VMM3		ymm19
-# define VMM4		ymm20
-# define VMOVA		vmovdqa32
-# endif
-
-	.section .text.TEXTSUFFIX, "ax", @progbits
+	.section SECTION(.text),"ax",@progbits
 	/* Aligning entry point to 64 byte, provides better performance
 	   for one vector length string.  */
 ENTRY_P2ALIGN (STRLEN, 6)
@@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
 # endif

 	movl	%edi, %eax
-	vpxorq	%XMM0, %XMM0, %XMM0
+	vpxorq	%VEC_xmm(0), %VEC_xmm(0), %VEC_xmm(0)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(page_cross)

 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM0, %k0
-	KMOV	%k0, %RAX
-	test	%RAX, %RAX
+	VPCMP	$0, (%rdi), %VEC(0), %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(align_more)

-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
@@ -120,7 +88,7 @@ L(align_more):
 	movq	%rax, %rdx
 	subq	%rdi, %rdx
 # ifdef USE_AS_WCSLEN
-	SHR	$2, %RDX
+	shr	$2, %VRDX
 # endif
 	/* At this point rdx contains [w]chars already compared.  */
 	subq	%rsi, %rdx
@@ -131,9 +99,9 @@ L(align_more):
 # endif

 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (%rax), %VEC(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)

 # ifdef USE_AS_STRNLEN
@@ -141,9 +109,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif

-	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, VEC_SIZE(%rax), %VEC(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)

 # ifdef USE_AS_STRNLEN
@@ -151,9 +119,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif

-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VEC(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)

 # ifdef USE_AS_STRNLEN
@@ -161,9 +129,9 @@ L(align_more):
 	jbe	L(ret_max)
 # endif

-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
-	KMOV	%k0, %RCX
-	test	%RCX, %RCX
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VEC(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)

 # ifdef USE_AS_STRNLEN
@@ -179,7 +147,7 @@ L(align_more):
 # ifdef USE_AS_STRNLEN
 	subq	%rax, %rcx
 # ifdef USE_AS_WCSLEN
-	SHR	$2, %RCX
+	shr	$2, %VRCX
 # endif
 	/* rcx contains number of [w]char will be recompared due to
 	   alignment fixes.  rdx must be incremented by rcx to offset
@@ -199,42 +167,42 @@ L(loop_entry):
 # endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+	VMOVA	(VEC_SIZE * 4)(%rax), %VEC(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VEC(1), %VEC(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VEC(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VEC(3), %VEC(4)

-	VPTESTN	%VMM2, %VMM2, %k0
-	VPTESTN	%VMM4, %VMM4, %k1
+	VPTESTN	%VEC(2), %VEC(2), %k0
+	VPTESTN	%VEC(4), %VEC(4), %k1

 	subq	$-(VEC_SIZE * 4), %rax
 	KORTEST	%k0, %k1
 	jz	L(loop)

-	VPTESTN	%VMM1, %VMM1, %k2
-	KMOV	%k2, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VEC(1), %VEC(1), %k2
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)

-	KMOV	%k0, %RCX
+	KMOV	%k0, %VRCX
 	/* At this point, if k0 is non zero, null char must be in the
 	   second vector.  */
-	test	%RCX, %RCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)

-	VPTESTN	%VMM3, %VMM3, %k3
-	KMOV	%k3, %RCX
-	test	%RCX, %RCX
+	VPTESTN	%VEC(3), %VEC(3), %k3
+	KMOV	%k3, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)

 	/* At this point null [w]char must be in the fourth vector so no
 	   need to check.  */
-	KMOV	%k1, %RCX
+	KMOV	%k1, %VRCX

 	/* Fourth, third, second vector terminating are pretty much
 	   same, implemented this way to avoid branching and reuse code
 	   from pre loop exit condition.  */
 L(ret_vec_x4):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 3), %rax
@@ -250,7 +218,7 @@ L(ret_vec_x4):
 	ret

 L(ret_vec_x3):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	subq	$-(VEC_SIZE * 2), %rax
@@ -268,7 +236,7 @@ L(ret_vec_x3):
 L(ret_vec_x2):
 	subq	$-VEC_SIZE, %rax
 L(ret_vec_x1):
-	bsf	%RCX, %RCX
+	bsf	%VRCX, %VRCX
 	subq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
@@ -289,13 +257,13 @@ L(page_cross):
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
 	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
-	KMOV	%k0, %RAX
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VEC(0), %k0
+	KMOV	%k0, %VRAX
 	/* Ignore number of character for alignment adjustment.  */
-	SHR	%cl, %RAX
+	shr	%cl, %VRAX
 	jz	L(align_more)
-	bsf	%RAX, %RAX
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
 	cmovnb	%rsi, %rax
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
index 116f8981c8..dfd0a7821b 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -2,6 +2,6 @@
 # define STRLEN __strlen_evex512
 #endif

-#define VEC_SIZE 64
-
+#include "evex512-vecs.h"
+#include "reg-macros.h"
 #include "strlen-evex-base.S"
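
For reference, the width dispatch that 'reg-macros.h' and the
'{vec}-macros.h' headers centralize works roughly as sketched below.
This is an illustrative reconstruction, not a verbatim copy of those
headers: the VEC_hi_zmm / VEC_hi_xmm helper names are assumptions,
while the concrete expansions (VEC(0) -> zmm16, VRAX -> rax vs. eax,
KMOV -> kmovq vs. kmovd) mirror the per-size #define table this patch
deletes from strlen-evex-base.S:

	/* evex512-vecs.h (sketch): vector geometry and names.  */
	#define VEC_SIZE	64
	#define SECTION(p)	p##.evex512
	#define VEC(i)		VEC_hi_zmm(i)	/* VEC(0) -> zmm16, as VMM0 was.  */
	#define VEC_xmm(i)	VEC_hi_xmm(i)	/* VEC_xmm(0) -> xmm16, as XMM0 was.  */

	/* reg-macros.h (sketch): mask/GPR width follows VEC_SIZE, so
	   the same generic names pick the right forms per build.  */
	#if VEC_SIZE == 64	/* 64-bit kmask -> 64-bit GPRs/insns.  */
	# define VRAX		rax
	# define VRCX		rcx
	# define VRDX		rdx
	# define KMOV		kmovq
	# define KORTEST	kortestq
	#elif VEC_SIZE == 32	/* 32-bit kmask -> 32-bit GPRs/insns.  */
	# define VRAX		eax
	# define VRCX		ecx
	# define VRDX		edx
	# define KMOV		kmovd
	# define KORTEST	kortestd
	#endif

With the table defined once in shared headers, strlen-evex-base.S is
written a single time against the generic names, and a future evex256
build would only need to swap in the corresponding vecs header rather
than repeat thirty lines of register aliases per file.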