[1/3] AArch64: Improve codegen for SVE erfcf

Message ID 20250108093910.4709-2-yatlong.poon@arm.com (mailing list archive)
State New
Headers
Series AArch64: Improve codegen for SVE pow, powf and erfcf |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed

Commit Message

Yat Long Poon Jan. 8, 2025, 9:39 a.m. UTC
  Reduce number of MOV/MOVPRFXs and use unpredicated FMUL.
Replace MUL with LSL.
Speedup on Neoverse V1: 6%.
---
 sysdeps/aarch64/fpu/erfcf_sve.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
  

Patch

diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c
index 2743f9dbb5..b57ab514b7 100644
--- a/sysdeps/aarch64/fpu/erfcf_sve.c
+++ b/sysdeps/aarch64/fpu/erfcf_sve.c
@@ -76,7 +76,7 @@  svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
   svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx);
 
   /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables.  */
-  i = svmul_x (pg, i, 2);
+  i = svlsl_x (svptrue_b32 (), i, 1);
   const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
   svfloat32_t erfcr = svld1_gather_index (pg, p, i);
   svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
@@ -84,15 +84,15 @@  svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
   /* erfc(x) ~ erfc(r) - scale * d * poly(r, d).  */
   svfloat32_t r = svsub_x (pg, z, shift);
   svfloat32_t d = svsub_x (pg, a, r);
-  svfloat32_t d2 = svmul_x (pg, d, d);
-  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
 
   svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third);
-  svfloat32_t third = svdup_lane (coeffs, 0);
 
   svfloat32_t p1 = r;
-  svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1);
-  svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
+  svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1);
+  svfloat32_t p3
+      = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
   svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2);
   p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4);