[v1,11/27] x86/fpu: Optimize svml_s_atanf16_core_avx512.S

Message ID 20221207085236.1424424-11-goldstein.w.n@gmail.com (mailing list archive)
State New
Headers
Series [v1,01/27] x86/fpu: Create helper file for common data macros |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Noah Goldstein Dec. 7, 2022, 8:52 a.m. UTC
1. Change the algorithm to match the avx2 implementation, which
   seems to be faster.
2. Clean up some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

Changing the algorithm (1) causes a slight ULP error increase (exact
same as the avx2 version).

Before:

   ulp:
	 0: 4127324924 (0.9610)
	 1:  167635550 (0.0390)
	 2:       6822 (0.0000)
	 3:          0 (0.0000)
	 4:          0 (0.0000)

After:
   ulp:
	 0: 4088299128 (0.9519)
	 1:  206531674 (0.0481)
	 2:     136494 (0.0000)
	 3:          0 (0.0000)
	 4:          0 (0.0000)

Since the max ULP is the same and the distribution matches the avx2
implementation, this seems like an acceptable "regression", as it
doesn't seem plausible that any application could have been relying
on the precision distribution.

Code Size Change: -79 Bytes (193 - 272)

Perf Changes:
Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.7612
0F          (0x0000ffff, Denorm)   -> 1.3234
.1F         (0x3dcccccd)           -> 0.7690
5F          (0x40a00000)           -> 0.7752
2315255808F (0x4f0a0000)           -> 0.7712
-NaN        (0xffffffff)           -> 0.7824

Note that the ~32% regression in the denorm case is due to
additional microcode assists (caused by the algorithm change).
This generally seems worth it for the ~23-24% perf improvement
in other cases as denormal inputs are almost certainly cold cases.
---
 .../multiarch/svml_s_atanf16_core_avx512.S    | 199 ++++++------------
 1 file changed, 67 insertions(+), 132 deletions(-)
  

Patch

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
index 88b44a989c..abb3c76209 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
@@ -28,146 +28,81 @@ 
  *
  */
 
-/* Offsets for data table __svml_satan_data_internal_avx512
- */
-#define AbsMask				0
-#define Shifter				64
-#define MaxThreshold			128
-#define MOne				192
-#define One				256
-#define LargeX				320
-#define Zero				384
-#define Tbl_H				448
-#define Pi2				576
-#define coeff_1				640
-#define coeff_2				704
-#define coeff_3				768
+#define LOCAL_DATA_NAME	__svml_satan_data_internal
+#include "svml_s_common_evex512_rodata_offsets.h"
+/* Offsets for data table __svml_satan_data_internal.  Entries are 64 apart (presumably bytes, one zmm vector each -- confirm against LOCAL_DATA).  */
+#define _sPC8	0
+#define _sPC7	64
+#define _sPC6	128
+#define _sPC5	192
+#define _sPC4	256
+#define _sPC3	320
+#define _sPC2	384
+#define _sPC1	448
+#define _sPIO2	512
 
 #include <sysdep.h>
 
 	.section .text.evex512, "ax", @progbits
 ENTRY(_ZGVeN16v_atanf_skx)
-	vandps	__svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
-	vmovups	MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
-	vmovups	One+__svml_satan_data_internal_avx512(%rip), %zmm8
-
-	/* round to 2 bits after binary point */
-	vreduceps $40, {sae}, %zmm7, %zmm5
-
-	/* saturate X range */
-	vmovups	LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
-	vmovups	Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
-	vcmpps	$29, {sae}, %zmm3, %zmm7, %k1
-
-	/* table lookup sequence */
-	vmovups	Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
-	vsubps	{rn-sae}, %zmm5, %zmm7, %zmm4
-	vaddps	{rn-sae}, %zmm2, %zmm7, %zmm1
-	vxorps	%zmm0, %zmm7, %zmm0
-	vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
-	vmovups	coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
-
-	/* if|X|>=MaxThreshold, set DiffX=-1 */
-	vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
-	vmovups	coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
-
-	/* if|X|>=MaxThreshold, set Y=X */
-	vminps	{sae}, %zmm7, %zmm6, %zmm8{%k1}
-
-	/* R+Rl = DiffX/Y */
-	vgetmantps $0, {sae}, %zmm9, %zmm12
-	vgetexpps {sae}, %zmm9, %zmm10
-	vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
-	vgetmantps $0, {sae}, %zmm8, %zmm15
-	vgetexpps {sae}, %zmm8, %zmm11
-	vmovups	coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
-
-	/* set table value to Pi/2 for large X */
-	vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
-	vrcp14ps %zmm15, %zmm13
-	vsubps	{rn-sae}, %zmm11, %zmm10, %zmm2
-	vmulps	{rn-sae}, %zmm13, %zmm12, %zmm14
-	vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
-	vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
-	vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
-
-	/* polynomial evaluation */
-	vmulps	{rn-sae}, %zmm7, %zmm7, %zmm8
-	vmulps	{rn-sae}, %zmm7, %zmm8, %zmm6
-	vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
-	vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
-	vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
-	vaddps	{rn-sae}, %zmm9, %zmm8, %zmm10
-	vxorps	%zmm0, %zmm10, %zmm0
+	/* 1) If x>1,      then r=-1/x, PIO2=Pi/2
+	   2) If -1<=x<=1, then r=x,    PIO2=0
+	   3) If x<-1,     then r=-1/x, PIO2=-Pi/2.  */
+	vmovups	COMMON_DATA(_OneF)(%rip), %zmm2	/* zmm2 = _OneF (presumably 1.0f per lane -- confirm).  */
+	vmovups	COMMON_DATA(_SignMask)(%rip), %zmm7	/* zmm7 = _SignMask (presumably sign bit only -- confirm).  */
+
+
+	/* Use minud/maxud operations for argument reduction: r = min (|x|, One) / max (|x|, One).  */
+	vandnps	%zmm0, %zmm7, %zmm3	/* zmm3 = ~zmm7 & x, i.e. |x| if zmm7 is the sign mask.  */
+	vpcmpgtd %zmm2, %zmm3, %k1	/* k1 = lanes where zmm3 > zmm2, compared as signed dwords.  */
+
+	vpmaxud	%zmm3, %zmm2, %zmm4	/* zmm4 = unsigned dword max (zmm2, zmm3).  */
+	vpminud	%zmm3, %zmm2, %zmm5	/* zmm5 = unsigned dword min (zmm2, zmm3).  */
+
+	vdivps	%zmm4, %zmm5, %zmm4	/* zmm4 = zmm5 / zmm4 = min / max.  */
+
+	vandps	%zmm7, %zmm0, %zmm3	/* zmm3 = x & zmm7 (sign of x under the same assumption).  */
+	vmovdqa32 %zmm7, %zmm7{%k1}{z}	/* Keep zmm7 only in k1 lanes; zero it elsewhere.  */
+
+	vmulps	%zmm4, %zmm4, %zmm1	/* zmm1 = r^2.  */
+	vpternlogq $0x96, %zmm3, %zmm4, %zmm7	/* zmm7 = zmm7 ^ zmm4 ^ zmm3 (0x96 is a 3-way XOR): signed r.  */
+
+	/* Polynomial, as two interleaved chains in r^4 (zmm0: PC8/6/4/2, zmm4: PC7/5/3/1).  */
+
+	vmovups	LOCAL_DATA(_sPC8)(%rip), %zmm0	/* zmm0 = PC8.  */
+	vmovups	LOCAL_DATA(_sPC7)(%rip), %zmm4	/* zmm4 = PC7.  */
+
+	vmulps	%zmm1, %zmm1, %zmm5	/* zmm5 = r^4.  */
+
+	vfmadd213ps LOCAL_DATA(_sPC6)(%rip), %zmm5, %zmm0	/* zmm0 = zmm0 * r^4 + PC6.  */
+	vfmadd213ps LOCAL_DATA(_sPC5)(%rip), %zmm5, %zmm4	/* zmm4 = zmm4 * r^4 + PC5.  */
+	vfmadd213ps LOCAL_DATA(_sPC4)(%rip), %zmm5, %zmm0	/* zmm0 = zmm0 * r^4 + PC4.  */
+	vfmadd213ps LOCAL_DATA(_sPC3)(%rip), %zmm5, %zmm4	/* zmm4 = zmm4 * r^4 + PC3.  */
+	vfmadd213ps LOCAL_DATA(_sPC2)(%rip), %zmm5, %zmm0	/* zmm0 = zmm0 * r^4 + PC2.  */
+	vfmadd213ps LOCAL_DATA(_sPC1)(%rip), %zmm5, %zmm4	/* zmm4 = zmm4 * r^4 + PC1.  */
+	vfmadd213ps %zmm4, %zmm1, %zmm0	/* zmm0 = zmm0 * r^2 + zmm4 (merge the two chains).  */
+	vfmadd213ps %zmm2, %zmm1, %zmm0	/* zmm0 = zmm0 * r^2 + zmm2.  */
+	vorps	LOCAL_DATA(_sPIO2)(%rip), %zmm3, %zmm3{%k1}	/* k1 lanes: zmm3 |= PIO2; other lanes keep zmm3.  */
+
+	/* Reconstruction: result = zmm7 * zmm0 + zmm3.  */
+	vfmadd213ps %zmm3, %zmm7, %zmm0
 	ret
 
 END(_ZGVeN16v_atanf_skx)
 
-	.section .rodata, "a"
+	.section .rodata.evex512, "a"
 	.align	64
 
-#ifdef __svml_satan_data_internal_avx512_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 AbsMask[16][1];
-	__declspec(align(64)) VUINT32 Shifter[16][1];
-	__declspec(align(64)) VUINT32 MaxThreshold[16][1];
-	__declspec(align(64)) VUINT32 MOne[16][1];
-	__declspec(align(64)) VUINT32 One[16][1];
-	__declspec(align(64)) VUINT32 LargeX[16][1];
-	__declspec(align(64)) VUINT32 Zero[16][1];
-	__declspec(align(64)) VUINT32 Tbl_H[32][1];
-	__declspec(align(64)) VUINT32 Pi2[16][1];
-	__declspec(align(64)) VUINT32 coeff[3][16][1];
-} __svml_satan_data_internal_avx512;
-#endif
-__svml_satan_data_internal_avx512:
-	/* AbsMask */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	/* Shifter */
-	.align	64
-	.long	0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
-	/* MaxThreshold */
-	.align	64
-	.long	0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
-	/* MOne */
-	.align	64
-	.long	0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
-	/* One */
-	.align	64
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* LargeX */
-	.align	64
-	.long	0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
-	/* Zero */
-	.align	64
-	.long	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
-	/* Tbl_H */
-	.align	64
-	.long	0x00000000, 0x3e7adbb0
-	.long	0x3eed6338, 0x3f24bc7d
-	.long	0x3f490fdb, 0x3f6563e3
-	.long	0x3f7b985f, 0x3f869c79
-	.long	0x3f8db70d, 0x3f93877b
-	.long	0x3f985b6c, 0x3f9c6b53
-	.long	0x3f9fe0bb, 0x3fa2daa4
-	.long	0x3fa57088, 0x3fa7b46f
-	.long	0x3fa9b465, 0x3fab7b7a
-	.long	0x3fad1283, 0x3fae809e
-	.long	0x3fafcb99, 0x3fb0f836
-	.long	0x3fb20a6a, 0x3fb30581
-	.long	0x3fb3ec43, 0x3fb4c10a
-	.long	0x3fb585d7, 0x3fb63c64
-	.long	0x3fb6e62c, 0x3fb78478
-	.long	0x3fb81868, 0x3fb8a2f5
-	/* Pi2 */
-	.align	64
-	.long	0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
-	/* coeff3 */
-	.align	64
-	.long	0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
-	.long	0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
-	.long	0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
-	.align	64
-	.type	__svml_satan_data_internal_avx512, @object
-	.size	__svml_satan_data_internal_avx512, .-__svml_satan_data_internal_avx512
+LOCAL_DATA_NAME:	/* Constants read by _ZGVeN16v_atanf_skx; _sPCn scales r^(2n) in its polynomial.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPC8, 0x3B322CC0)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC7, 0xBC7F2631)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC6, 0x3D2BC384)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC5, 0xBD987629)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC4, 0x3DD96474)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC3, 0xBE1161F8)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC2, 0x3E4CB79F)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC1, 0xBEAAAA49)
+	DATA_VEC (LOCAL_DATA_NAME, _sPIO2, 0x3FC90FDB)	/* Pi/2; same bits as the old Pi2 entry.  */
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME