[v1,12/27] x86/fpu: Optimize svml_s_atanf4_core_sse4.S

Message ID 20221207085236.1424424-12-goldstein.w.n@gmail.com
State New
Headers
Series [v1,01/27] x86/fpu: Create helper file for common data macros |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Noah Goldstein Dec. 7, 2022, 8:52 a.m. UTC
1. Clean up some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
2. Remove unused rodata.
3. Use common data definitions where possible.

Code Size Change: -31 Bytes (173 - 204)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.9446
0F          (0x0000ffff, Denorm)   -> 0.9977
.1F         (0x3dcccccd)           -> 0.9380
5F          (0x40a00000)           -> 0.9542
2315255808F (0x4f0a0000)           -> 1.0115
-NaN        (0xffffffff)           -> 0.9232
---
 .../fpu/multiarch/svml_s_atanf4_core_sse4.S   | 198 +++++++-----------
 1 file changed, 75 insertions(+), 123 deletions(-)
  

Patch

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
index 83cecb8ee5..2ab599f7a8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
@@ -28,136 +28,88 @@ 
  *
  */
 
-/* Offsets for data table __svml_satan_data_internal
- */
-#define _sSIGN_MASK			0
-#define _sABS_MASK			16
-#define _sONE				32
-#define _sPIO2				48
-#define _sPC8				64
-#define _sPC7				80
-#define _sPC6				96
-#define _sPC5				112
-#define _sPC4				128
-#define _sPC3				144
-#define _sPC2				160
-#define _sPC1				176
-#define _sPC0				192
+#define LOCAL_DATA_NAME	__svml_satan_data_internal
+#include "svml_s_common_sse4_rodata_offsets.h"
+/* Offsets for data table __svml_satan_data_internal; consecutive entries are 16 apart.  */
+#define _SignMask	0
+#define _sPIO2	16
+#define _sPC7	32
+#define _sPC5	48
+#define _sPC3	64
+#define _sPC1	80
+#define _sPC8	96
+#define _sPC6	112
+#define _sPC4	128
+#define _sPC2	144
+#define _sPC0	160	/* NOTE(review): the table in this file has no DATA_VEC entry for _sPC0 and the code never references LOCAL_DATA(_sPC0); confirm whether this define is still needed.  */
 
 #include <sysdep.h>
 
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_atanf_sse4)
-	/*
-	 * To use minps\maxps operations for argument reduction
-	 * uncomment _AT_USEMINMAX_ definition
-	 *  Declarations
-	 * Variables
-	 * Constants
-	 */
-	movups	_sABS_MASK+__svml_satan_data_internal(%rip), %xmm2
-
-	/*
-	 * 1) If x>1,      then r=-1/x, PIO2=Pi/2
-	 * 2) If -1<=x<=1, then r=x,    PIO2=0
-	 * 3) If x<-1,     then r=-1/x, PIO2=-Pi/2
-	 */
-	movups	_sONE+__svml_satan_data_internal(%rip), %xmm1
-	andps	%xmm0, %xmm2
-	movaps	%xmm2, %xmm9
-	movaps	%xmm1, %xmm3
-	cmpleps	%xmm1, %xmm9
-	maxps	%xmm2, %xmm3
-	minps	%xmm2, %xmm1
-	divps	%xmm3, %xmm1
-	movups	__svml_satan_data_internal(%rip), %xmm4
-	movaps	%xmm9, %xmm10
-	andps	%xmm4, %xmm0
-	andnps	%xmm4, %xmm9
-	pxor	%xmm0, %xmm9
-	pxor	%xmm1, %xmm9
-
-	/* Polynomial. */
-	movaps	%xmm9, %xmm8
-	mulps	%xmm9, %xmm8
-	movaps	%xmm8, %xmm7
-	mulps	%xmm8, %xmm7
-	movups	_sPC8+__svml_satan_data_internal(%rip), %xmm6
-	mulps	%xmm7, %xmm6
-	movups	_sPC7+__svml_satan_data_internal(%rip), %xmm5
-	mulps	%xmm7, %xmm5
-	addps	_sPC6+__svml_satan_data_internal(%rip), %xmm6
-	mulps	%xmm7, %xmm6
-	addps	_sPC5+__svml_satan_data_internal(%rip), %xmm5
-	mulps	%xmm7, %xmm5
-	addps	_sPC4+__svml_satan_data_internal(%rip), %xmm6
-	mulps	%xmm7, %xmm6
-	addps	_sPC3+__svml_satan_data_internal(%rip), %xmm5
-	mulps	%xmm5, %xmm7
-	addps	_sPC2+__svml_satan_data_internal(%rip), %xmm6
-	mulps	%xmm8, %xmm6
-	addps	_sPC1+__svml_satan_data_internal(%rip), %xmm7
-	andnps	_sPIO2+__svml_satan_data_internal(%rip), %xmm10
-	addps	%xmm6, %xmm7
-	mulps	%xmm7, %xmm8
-	pxor	%xmm0, %xmm10
-	addps	_sPC0+__svml_satan_data_internal(%rip), %xmm8
-
-	/* Reconstruction. */
-	mulps	%xmm8, %xmm9
-	addps	%xmm9, %xmm10
-	movaps	%xmm10, %xmm0
+	/* 1) If x>1,      then r=-1/x, PIO2=Pi/2
+	   2) If -1<=x<=1, then r=x,    PIO2=0
+	   3) If x<-1,     then r=-1/x, PIO2=-Pi/2.  */
+	movups	COMMON_DATA(_OneF)(%rip), %xmm1
+	/* Use pminud/pmaxud (unsigned dword min/max) for argument reduction.  */
+	movups	LOCAL_DATA(_SignMask)(%rip), %xmm5
+	movaps	%xmm5, %xmm6
+	andnps	%xmm0, %xmm5	/* xmm5 = x & ~SignMask.  */
+	andps	%xmm6, %xmm0	/* xmm0 = x & SignMask.  */
+	movaps	%xmm5, %xmm7
+
+	movaps	%xmm1, %xmm4	/* Keep a copy of _OneF; it is added back as the last polynomial term.  */
+	pminud	%xmm5, %xmm1	/* xmm1 = numerator of r.  */
+	pmaxud	%xmm4, %xmm7	/* xmm7 = denominator of r.  */
+	pcmpgtd	%xmm1, %xmm5	/* xmm5 = lane mask where (x & ~SignMask) is greater than the minimum.  */
+	divps	%xmm7, %xmm1
+
+	andps	%xmm5, %xmm6	/* SignMask only in lanes selected by the mask.  */
+	pxor	%xmm0, %xmm6
+	andps	LOCAL_DATA(_sPIO2)(%rip), %xmm5	/* _sPIO2 in masked lanes, 0 elsewhere.  */
+	pxor	%xmm0, %xmm5	/* Give the PIO2 term the sign of x.  */
+	pxor	%xmm1, %xmm6	/* xmm6 = signed reduced argument r.  */
+	/* Polynomial.  */
+	mulps	%xmm1, %xmm1
+	movaps	%xmm1, %xmm0	/* xmm0 = r^2.  */
+	mulps	%xmm1, %xmm1	/* xmm1 = r^4.  */
+	movups	LOCAL_DATA(_sPC7)(%rip), %xmm2
+	mulps	%xmm1, %xmm2
+	addps	LOCAL_DATA(_sPC5)(%rip), %xmm2
+	mulps	%xmm1, %xmm2
+	addps	LOCAL_DATA(_sPC3)(%rip), %xmm2
+	mulps	%xmm1, %xmm2
+	addps	LOCAL_DATA(_sPC1)(%rip), %xmm2
+	movups	LOCAL_DATA(_sPC8)(%rip), %xmm3
+	mulps	%xmm1, %xmm3
+	addps	LOCAL_DATA(_sPC6)(%rip), %xmm3
+	mulps	%xmm1, %xmm3
+	addps	LOCAL_DATA(_sPC4)(%rip), %xmm3
+	mulps	%xmm1, %xmm3
+	addps	LOCAL_DATA(_sPC2)(%rip), %xmm3
+	mulps	%xmm0, %xmm3
+	addps	%xmm3, %xmm2
+	mulps	%xmm2, %xmm0
+	addps	%xmm4, %xmm0	/* Add the _OneF copy saved in xmm4.  */
+	/* Reconstruction.  */
+	mulps	%xmm6, %xmm0
+	addps	%xmm5, %xmm0
 	ret
-
 END(_ZGVbN4v_atanf_sse4)
 
-	.section .rodata, "a"
+	.section .rodata.sse4, "a"
 	.align	16
+LOCAL_DATA_NAME:	/* Entries follow the same order as the offset #defines at the top of the file.  */
+	DATA_VEC (LOCAL_DATA_NAME, _SignMask, 0x80000000)	/* Sign bit of an IEEE-754 binary32.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPIO2, 0x3fc90fdb)	/* Bit pattern of pi/2 rounded to float.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPC7, 0xBC7F2631)	/* _sPC7, _sPC5, _sPC3, _sPC1: first coefficient chain of the polynomial.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPC5, 0xBD987629)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC3, 0xBE1161F8)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC1, 0xBEAAAA49)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC8, 0x3B322CC0)	/* _sPC8, _sPC6, _sPC4, _sPC2: second coefficient chain of the polynomial.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPC6, 0x3D2BC384)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC4, 0x3DD96474)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC2, 0x3E4CB79F)
 
-#ifdef __svml_satan_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(16)) VUINT32 _sSIGN_MASK[4][1];
-	__declspec(align(16)) VUINT32 _sABS_MASK[4][1];
-	__declspec(align(16)) VUINT32 _sONE[4][1];
-	__declspec(align(16)) VUINT32 _sPIO2[4][1];
-	__declspec(align(16)) VUINT32 _sPC8[4][1];
-	__declspec(align(16)) VUINT32 _sPC7[4][1];
-	__declspec(align(16)) VUINT32 _sPC6[4][1];
-	__declspec(align(16)) VUINT32 _sPC5[4][1];
-	__declspec(align(16)) VUINT32 _sPC4[4][1];
-	__declspec(align(16)) VUINT32 _sPC3[4][1];
-	__declspec(align(16)) VUINT32 _sPC2[4][1];
-	__declspec(align(16)) VUINT32 _sPC1[4][1];
-	__declspec(align(16)) VUINT32 _sPC0[4][1];
-} __svml_satan_data_internal;
-#endif
-__svml_satan_data_internal:
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000 // _sSIGN_MASK
-	.align	16
-	.long	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // _sABS_MASK
-	.align	16
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sONE
-	.align	16
-	.long	0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // _sPIO2
-	.align	16
-	.long	0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // _sPC8
-	.align	16
-	.long	0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // _sPC7
-	.align	16
-	.long	0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // _sPC6
-	.align	16
-	.long	0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // _sPC5
-	.align	16
-	.long	0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // _sPC4
-	.align	16
-	.long	0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // _sPC3
-	.align	16
-	.long	0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // _sPC2
-	.align	16
-	.long	0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // _sPC1
-	.align	16
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sPC0
-	.align	16
-	.type	__svml_satan_data_internal, @object
-	.size	__svml_satan_data_internal, .-__svml_satan_data_internal
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME