From patchwork Wed Dec 7 08:52:22 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 61633 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 4419D396E858 for ; Wed, 7 Dec 2022 08:56:19 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 4419D396E858 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1670403379; bh=oGylcp8HcfrIWcg5GcPG2o/H7AI9JexJnfxgA8wJs0Y=; h=To:Cc:Subject:Date:In-Reply-To:References:List-Id: List-Unsubscribe:List-Archive:List-Post:List-Help:List-Subscribe: From:Reply-To:From; b=J2Fa7M6vEyp32a7o3kUy5HaKI3HMuhnUXBTWd4DnZd2k65REFpdzKgaVqyjrfowrP QVK1Bro3tNkFbn6jsjEJkqgnx6/k8qUR6YY3h6u4tXTEtRxwk084vzSvxSoUUvHr9+ 2BmrhqqjICL1Sjw3eupZxm54ZuiI+Ir4k8zHqJGk= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-ej1-x635.google.com (mail-ej1-x635.google.com [IPv6:2a00:1450:4864:20::635]) by sourceware.org (Postfix) with ESMTPS id 3C369392B173 for ; Wed, 7 Dec 2022 08:53:05 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 3C369392B173 Received: by mail-ej1-x635.google.com with SMTP id n21so12283133ejb.9 for ; Wed, 07 Dec 2022 00:53:05 -0800 (PST) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=oGylcp8HcfrIWcg5GcPG2o/H7AI9JexJnfxgA8wJs0Y=; b=Cdf75SNbTacn5hhPaoebYuufD+M7qx5+CFz3SmaD2rojmnKo+wTuFsSnB/NXgkf4on cEiTmKiz7DN40SZ5SQAwegrUbwhf/x2H1qoHcMZFmw3kBwAUda5qZRL7RQ3otsAV08o9 2nKisUdbXh7zASoAcHp0AsOUFlN7S5fnJK4QJgXt0gezqUY9CCRot/vijravO/lRvFwf 
qc4suK+WILRny4GSkRqTUXr2ghDe7QU3qFmwoT/dPsfFJd4Cp/LHRClTqwwRhO4hMDmx cckBwdS9VakvbTiRU9cE4Elbn7zDLCbmA5/2t4Qw7edi75UfAQ1FgsfvRTHwWvsgLQk9 kyTw== X-Gm-Message-State: ANoB5pnyMzMmtCdlFaGQELusMPkie68iklWk/FpzKAWmMBlBzX4Yyed0 tHald4Pfzv5wSXvGnPiBwqY/TLjreXo= X-Google-Smtp-Source: AA0mqf5uiOONtyI+AATLRbKPenJBXNMXZnMunAVu6Ajdv8iYN0OC345z6O1tn2iOGdtzNfejpsvd6Q== X-Received: by 2002:a17:906:6a1d:b0:7c0:b569:8efd with SMTP id qw29-20020a1709066a1d00b007c0b5698efdmr22355661ejc.652.1670403183706; Wed, 07 Dec 2022 00:53:03 -0800 (PST) Received: from noahgold-desk.lan (2603-8080-1301-76c6-feb7-1b9b-f2dd-08f7.res6.spectrum.com. [2603:8080:1301:76c6:feb7:1b9b:f2dd:8f7]) by smtp.gmail.com with ESMTPSA id k17-20020aa7c051000000b0046bd3b366f9sm1931767edo.32.2022.12.07.00.53.02 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 07 Dec 2022 00:53:03 -0800 (PST) To: libc-alpha@sourceware.org Cc: goldstein.w.n@gmail.com, hjl.tools@gmail.com, andrey.kolesov@intel.com, carlos@systemhalted.org Subject: [PATCH v1 13/27] x86/fpu: Optimize svml_s_atanf8_core_avx2.S Date: Wed, 7 Dec 2022 00:52:22 -0800 Message-Id: <20221207085236.1424424-13-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20221207085236.1424424-1-goldstein.w.n@gmail.com> References: <20221207085236.1424424-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.1 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: 
libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" 1. Clean up some missed optimizations in instruction selection / unnecessary repeated rodata references. 2. Remove unused rodata. 3. Use common data definitions where possible. Code Size Change: -12 Bytes (163 - 175) Input New Time / Old Time 0F (0x00000000) -> 0.8484 0F (0x0000ffff, Denorm) -> 0.9993 .1F (0x3dcccccd) -> 0.9368 5F (0x40a00000) -> 0.9476 2315255808F (0x4f0a0000) -> 0.9454 -NaN (0xffffffff) -> 0.9193 --- .../fpu/multiarch/svml_s_atanf8_core_avx2.S | 162 +++++++----------- 1 file changed, 58 insertions(+), 104 deletions(-) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S index ee49a3e10e..649277c682 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S @@ -28,120 +28,74 @@ * */ -/* Offsets for data table __svml_satan_data_internal - */ -#define _sSIGN_MASK 0 -#define _sABS_MASK 32 -#define _sONE 64 -#define _sPIO2 96 -#define _sPC8 128 -#define _sPC7 160 -#define _sPC6 192 -#define _sPC5 224 -#define _sPC4 256 -#define _sPC3 288 -#define _sPC2 320 -#define _sPC1 352 -#define _sPC0 384 + +#define LOCAL_DATA_NAME __svml_satan_data_internal +#include "svml_s_common_avx2_rodata_offsets.h" +/* Offsets for data table __svml_satan_data_internal. 
*/ +#define _sPC8 0 +#define _sPC7 32 +#define _sPC6 64 +#define _sPC5 96 +#define _sPC4 128 +#define _sPC3 160 +#define _sPC2 192 +#define _sPC1 224 +#define _sPC0 256 #include .section .text.avx2, "ax", @progbits ENTRY(_ZGVdN8v_atanf_avx2) - /* - * 1) If x>1, then r=-1/x, PIO2=Pi/2 - * 2) If -1<=x<=1, then r=x, PIO2=0 - * 3) If x<-1, then r=-1/x, PIO2=-Pi/2 - */ - vmovups _sONE+__svml_satan_data_internal(%rip), %ymm2 - vmovups __svml_satan_data_internal(%rip), %ymm7 - vmovups _sPC7+__svml_satan_data_internal(%rip), %ymm13 + /* 1) If x>1, then r=-1/x, PIO2=Pi/2 + 2) If -1<=x<=1, then r=x, PIO2=0 + 3) If x<-1, then r=-1/x, PIO2=-Pi/2. */ + vmovups COMMON_DATA(_AbsMask)(%rip), %ymm7 + vmovups COMMON_DATA(_OneF)(%rip), %ymm2 + + vandps %ymm0, %ymm7, %ymm3 + /* Use minud\maxud operations for argument reduction. */ + vpmaxud %ymm3, %ymm2, %ymm5 + vpminud %ymm3, %ymm2, %ymm4 - /* - * To use minps\maxps operations for argument reduction - * uncomment _AT_USEMINMAX_ definition - * Declarations - * Variables - * Constants - */ - vandps _sABS_MASK+__svml_satan_data_internal(%rip), %ymm0, %ymm3 - vmaxps %ymm3, %ymm2, %ymm5 - vminps %ymm3, %ymm2, %ymm4 - vcmple_oqps %ymm2, %ymm3, %ymm6 - vdivps %ymm5, %ymm4, %ymm11 - vandps %ymm7, %ymm0, %ymm9 - vandnps %ymm7, %ymm6, %ymm8 - vxorps %ymm9, %ymm8, %ymm10 - vxorps %ymm11, %ymm10, %ymm15 + vdivps %ymm5, %ymm4, %ymm4 - /* Polynomial. 
*/ - vmulps %ymm15, %ymm15, %ymm14 - vmovups _sPC8+__svml_satan_data_internal(%rip), %ymm0 - vmulps %ymm14, %ymm14, %ymm12 - vfmadd213ps _sPC6+__svml_satan_data_internal(%rip), %ymm12, %ymm0 - vfmadd213ps _sPC5+__svml_satan_data_internal(%rip), %ymm12, %ymm13 - vfmadd213ps _sPC4+__svml_satan_data_internal(%rip), %ymm12, %ymm0 - vfmadd213ps _sPC3+__svml_satan_data_internal(%rip), %ymm12, %ymm13 - vfmadd213ps _sPC2+__svml_satan_data_internal(%rip), %ymm12, %ymm0 - vfmadd213ps _sPC1+__svml_satan_data_internal(%rip), %ymm12, %ymm13 - vfmadd213ps %ymm13, %ymm14, %ymm0 - vfmadd213ps _sPC0+__svml_satan_data_internal(%rip), %ymm14, %ymm0 - vandnps _sPIO2+__svml_satan_data_internal(%rip), %ymm6, %ymm1 - vxorps %ymm9, %ymm1, %ymm1 + vpcmpgtd %ymm2, %ymm3, %ymm6 + vandnps %ymm0, %ymm7, %ymm3 + vandnps %ymm6, %ymm7, %ymm7 + vxorps %ymm3, %ymm7, %ymm5 + vxorps %ymm4, %ymm5, %ymm7 + /* Polynomial. */ + vmulps %ymm4, %ymm4, %ymm1 + vmovups LOCAL_DATA(_sPC8)(%rip), %ymm0 + vmovups LOCAL_DATA(_sPC7)(%rip), %ymm4 + vmulps %ymm1, %ymm1, %ymm5 + vfmadd213ps LOCAL_DATA(_sPC6)(%rip), %ymm5, %ymm0 + vfmadd213ps LOCAL_DATA(_sPC5)(%rip), %ymm5, %ymm4 + vfmadd213ps LOCAL_DATA(_sPC4)(%rip), %ymm5, %ymm0 + vfmadd213ps LOCAL_DATA(_sPC3)(%rip), %ymm5, %ymm4 + vfmadd213ps LOCAL_DATA(_sPC2)(%rip), %ymm5, %ymm0 + vfmadd213ps LOCAL_DATA(_sPC1)(%rip), %ymm5, %ymm4 + vfmadd213ps %ymm4, %ymm1, %ymm0 + vfmadd213ps %ymm2, %ymm1, %ymm0 + vandps COMMON_DATA(_TanSPI1_FMA)(%rip), %ymm6, %ymm1 + vxorps %ymm3, %ymm1, %ymm1 - /* Reconstruction. */ - vfmadd213ps %ymm1, %ymm15, %ymm0 + /* Reconstruction. 
*/ + vfmadd213ps %ymm1, %ymm7, %ymm0 ret END(_ZGVdN8v_atanf_avx2) - .section .rodata, "a" - .align 32 - -#ifdef __svml_satan_data_internal_typedef -typedef unsigned int VUINT32; -typedef struct { - __declspec(align(32)) VUINT32 _sSIGN_MASK[8][1]; - __declspec(align(32)) VUINT32 _sABS_MASK[8][1]; - __declspec(align(32)) VUINT32 _sONE[8][1]; - __declspec(align(32)) VUINT32 _sPIO2[8][1]; - __declspec(align(32)) VUINT32 _sPC8[8][1]; - __declspec(align(32)) VUINT32 _sPC7[8][1]; - __declspec(align(32)) VUINT32 _sPC6[8][1]; - __declspec(align(32)) VUINT32 _sPC5[8][1]; - __declspec(align(32)) VUINT32 _sPC4[8][1]; - __declspec(align(32)) VUINT32 _sPC3[8][1]; - __declspec(align(32)) VUINT32 _sPC2[8][1]; - __declspec(align(32)) VUINT32 _sPC1[8][1]; - __declspec(align(32)) VUINT32 _sPC0[8][1]; -} __svml_satan_data_internal; -#endif -__svml_satan_data_internal: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 // _sSIGN_MASK - .align 32 - .long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // _sABS_MASK - .align 32 - .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sONE - .align 32 - .long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // _sPIO2 - .align 32 - .long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // _sPC8 - .align 32 - .long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // _sPC7 - .align 32 - .long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // _sPC6 - .align 32 - .long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // _sPC5 - .align 32 - .long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // _sPC4 - .align 32 - .long 
0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // _sPC3 - .align 32 - .long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // _sPC2 - .align 32 - .long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // _sPC1 - .align 32 - .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sPC0 + .section .rodata.avx2, "a" .align 32 - .type __svml_satan_data_internal, @object - .size __svml_satan_data_internal, .-__svml_satan_data_internal +LOCAL_DATA_NAME: + DATA_VEC (LOCAL_DATA_NAME, _sPC8, 0x3B322CC0) + DATA_VEC (LOCAL_DATA_NAME, _sPC7, 0xBC7F2631) + DATA_VEC (LOCAL_DATA_NAME, _sPC6, 0x3D2BC384) + DATA_VEC (LOCAL_DATA_NAME, _sPC5, 0xBD987629) + DATA_VEC (LOCAL_DATA_NAME, _sPC4, 0x3DD96474) + DATA_VEC (LOCAL_DATA_NAME, _sPC3, 0xBE1161F8) + DATA_VEC (LOCAL_DATA_NAME, _sPC2, 0x3E4CB79F) + DATA_VEC (LOCAL_DATA_NAME, _sPC1, 0xBEAAAA49) + .type LOCAL_DATA_NAME, @object + .size LOCAL_DATA_NAME, .-LOCAL_DATA_NAME