aarch64/fpu: Sync libmvec routines from 2.39 and before with AOR
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
redhat-pt-bot/TryBot-32bit |
success
|
Build for i686
|
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_build--master-arm |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-arm |
success
|
Testing passed
|
Commit Message
This includes a fix for big-endian in AdvSIMD log, some cosmetic
changes, and numerous small optimisations mainly around inlining and
using indexed variants of MLA intrinsics.
---
Thanks,
Joe
sysdeps/aarch64/fpu/acos_advsimd.c | 4 ++--
sysdeps/aarch64/fpu/asin_advsimd.c | 4 ++--
sysdeps/aarch64/fpu/atan2_sve.c | 29 +++++++++++++--------------
sysdeps/aarch64/fpu/atan2f_sve.c | 30 +++++++++++++++-------------
sysdeps/aarch64/fpu/cos_advsimd.c | 3 +--
sysdeps/aarch64/fpu/cosf_advsimd.c | 3 +--
sysdeps/aarch64/fpu/exp10_advsimd.c | 4 ++--
sysdeps/aarch64/fpu/exp10f_advsimd.c | 21 ++++++++++---------
sysdeps/aarch64/fpu/exp2_advsimd.c | 20 ++++++++++---------
sysdeps/aarch64/fpu/exp2f_sve.c | 4 +++-
sysdeps/aarch64/fpu/exp_advsimd.c | 4 ++--
sysdeps/aarch64/fpu/expm1_advsimd.c | 11 +++++-----
sysdeps/aarch64/fpu/expm1f_advsimd.c | 17 ++++++++--------
sysdeps/aarch64/fpu/log_advsimd.c | 5 +++++
sysdeps/aarch64/fpu/sin_advsimd.c | 3 +--
sysdeps/aarch64/fpu/sinf_advsimd.c | 3 +--
sysdeps/aarch64/fpu/tan_advsimd.c | 26 ++++++++++++------------
sysdeps/aarch64/fpu/tanf_advsimd.c | 25 ++++++++++++-----------
18 files changed, 111 insertions(+), 105 deletions(-)
Comments
On 20/02/24 13:44, Joe Ramsay wrote:
> This includes a fix for big-endian in AdvSIMD log, some cosmetic
> changes, and numerous small optimisations mainly around inlining and
> using indexed variants of MLA intrinsics.
LGTM, thanks.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
> ---
> Thanks,
> Joe
> sysdeps/aarch64/fpu/acos_advsimd.c | 4 ++--
> sysdeps/aarch64/fpu/asin_advsimd.c | 4 ++--
> sysdeps/aarch64/fpu/atan2_sve.c | 29 +++++++++++++--------------
> sysdeps/aarch64/fpu/atan2f_sve.c | 30 +++++++++++++++-------------
> sysdeps/aarch64/fpu/cos_advsimd.c | 3 +--
> sysdeps/aarch64/fpu/cosf_advsimd.c | 3 +--
> sysdeps/aarch64/fpu/exp10_advsimd.c | 4 ++--
> sysdeps/aarch64/fpu/exp10f_advsimd.c | 21 ++++++++++---------
> sysdeps/aarch64/fpu/exp2_advsimd.c | 20 ++++++++++---------
> sysdeps/aarch64/fpu/exp2f_sve.c | 4 +++-
> sysdeps/aarch64/fpu/exp_advsimd.c | 4 ++--
> sysdeps/aarch64/fpu/expm1_advsimd.c | 11 +++++-----
> sysdeps/aarch64/fpu/expm1f_advsimd.c | 17 ++++++++--------
> sysdeps/aarch64/fpu/log_advsimd.c | 5 +++++
> sysdeps/aarch64/fpu/sin_advsimd.c | 3 +--
> sysdeps/aarch64/fpu/sinf_advsimd.c | 3 +--
> sysdeps/aarch64/fpu/tan_advsimd.c | 26 ++++++++++++------------
> sysdeps/aarch64/fpu/tanf_advsimd.c | 25 ++++++++++++-----------
> 18 files changed, 111 insertions(+), 105 deletions(-)
>
> diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c
> index a8eabb5e71..0a86c9823a 100644
> --- a/sysdeps/aarch64/fpu/acos_advsimd.c
> +++ b/sysdeps/aarch64/fpu/acos_advsimd.c
> @@ -40,8 +40,8 @@ static const struct data
> };
>
> #define AllMask v_u64 (0xffffffffffffffff)
> -#define Oneu (0x3ff0000000000000)
> -#define Small (0x3e50000000000000) /* 2^-53. */
> +#define Oneu 0x3ff0000000000000
> +#define Small 0x3e50000000000000 /* 2^-53. */
>
> #if WANT_SIMD_EXCEPT
> static float64x2_t VPCS_ATTR NOINLINE
> diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c
> index 141646e954..2de6eff407 100644
> --- a/sysdeps/aarch64/fpu/asin_advsimd.c
> +++ b/sysdeps/aarch64/fpu/asin_advsimd.c
> @@ -39,8 +39,8 @@ static const struct data
> };
>
> #define AllMask v_u64 (0xffffffffffffffff)
> -#define One (0x3ff0000000000000)
> -#define Small (0x3e50000000000000) /* 2^-12. */
> +#define One 0x3ff0000000000000
> +#define Small 0x3e50000000000000 /* 2^-12. */
>
> #if WANT_SIMD_EXCEPT
> static float64x2_t VPCS_ATTR NOINLINE
> diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c
> index 09a4c559b8..04fa71fa37 100644
> --- a/sysdeps/aarch64/fpu/atan2_sve.c
> +++ b/sysdeps/aarch64/fpu/atan2_sve.c
> @@ -37,9 +37,6 @@ static const struct data
> .pi_over_2 = 0x1.921fb54442d18p+0,
> };
>
> -/* Useful constants. */
> -#define SignMask sv_u64 (0x8000000000000000)
> -
> /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
> static svfloat64_t NOINLINE
> special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret,
> @@ -72,14 +69,15 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
> svbool_t cmp_y = zeroinfnan (iy, pg);
> svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
>
> - svuint64_t sign_x = svand_x (pg, ix, SignMask);
> - svuint64_t sign_y = svand_x (pg, iy, SignMask);
> - svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
> -
> svfloat64_t ax = svabs_x (pg, x);
> svfloat64_t ay = svabs_x (pg, y);
> + svuint64_t iax = svreinterpret_u64 (ax);
> + svuint64_t iay = svreinterpret_u64 (ay);
> +
> + svuint64_t sign_x = sveor_x (pg, ix, iax);
> + svuint64_t sign_y = sveor_x (pg, iy, iay);
> + svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
>
> - svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
> svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
>
> /* Set up z for call to atan. */
> @@ -88,8 +86,9 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
> svfloat64_t z = svdiv_x (pg, n, d);
>
> /* Work out the correct shift. */
> - svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0));
> - shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
> + svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1));
> + shift = svsel (pred_aygtax, sv_f64 (1.0), shift);
> + shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift)));
> shift = svmul_x (pg, shift, data_ptr->pi_over_2);
>
> /* Use split Estrin scheme for P(z^2) with deg(P)=19. */
> @@ -109,10 +108,10 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
> ret = svadd_m (pg, ret, shift);
>
> /* Account for the sign of x and y. */
> - ret = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
> -
> if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
> - return special_case (y, x, ret, cmp_xy);
> -
> - return ret;
> + return special_case (
> + y, x,
> + svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)),
> + cmp_xy);
> + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
> }
> diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c
> index b92f83cdea..9ea197147c 100644
> --- a/sysdeps/aarch64/fpu/atan2f_sve.c
> +++ b/sysdeps/aarch64/fpu/atan2f_sve.c
> @@ -32,10 +32,8 @@ static const struct data
> .pi_over_2 = 0x1.921fb6p+0f,
> };
>
> -#define SignMask sv_u32 (0x80000000)
> -
> /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
> -static inline svfloat32_t
> +static svfloat32_t NOINLINE
> special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret,
> const svbool_t cmp)
> {
> @@ -67,14 +65,15 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
> svbool_t cmp_y = zeroinfnan (iy, pg);
> svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
>
> - svuint32_t sign_x = svand_x (pg, ix, SignMask);
> - svuint32_t sign_y = svand_x (pg, iy, SignMask);
> - svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
> -
> svfloat32_t ax = svabs_x (pg, x);
> svfloat32_t ay = svabs_x (pg, y);
> + svuint32_t iax = svreinterpret_u32 (ax);
> + svuint32_t iay = svreinterpret_u32 (ay);
> +
> + svuint32_t sign_x = sveor_x (pg, ix, iax);
> + svuint32_t sign_y = sveor_x (pg, iy, iay);
> + svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
>
> - svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
> svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
>
> /* Set up z for call to atan. */
> @@ -83,11 +82,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
> svfloat32_t z = svdiv_x (pg, n, d);
>
> /* Work out the correct shift. */
> - svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0));
> - shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
> + svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1));
> + shift = svsel (pred_aygtax, sv_f32 (1.0), shift);
> + shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift)));
> shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2));
>
> - /* Use split Estrin scheme for P(z^2) with deg(P)=7. */
> + /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */
> svfloat32_t z2 = svmul_x (pg, z, z);
> svfloat32_t z4 = svmul_x (pg, z2, z2);
> svfloat32_t z8 = svmul_x (pg, z4, z4);
> @@ -101,10 +101,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
> ret = svadd_m (pg, ret, shift);
>
> /* Account for the sign of x and y. */
> - ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
>
> if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
> - return special_case (y, x, ret, cmp_xy);
> + return special_case (
> + y, x,
> + svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)),
> + cmp_xy);
>
> - return ret;
> + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
> }
> diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c
> index 2897e8b909..3924c9ce44 100644
> --- a/sysdeps/aarch64/fpu/cos_advsimd.c
> +++ b/sysdeps/aarch64/fpu/cos_advsimd.c
> @@ -63,8 +63,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
> special-case handler later. */
> r = vbslq_f64 (cmp, v_f64 (1.0), r);
> #else
> - cmp = vcageq_f64 (d->range_val, x);
> - cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */
> + cmp = vcageq_f64 (x, d->range_val);
> r = x;
> #endif
>
> diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
> index 60abc8dfcf..d0c285b03a 100644
> --- a/sysdeps/aarch64/fpu/cosf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
> @@ -64,8 +64,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
> special-case handler later. */
> r = vbslq_f32 (cmp, v_f32 (1.0f), r);
> #else
> - cmp = vcageq_f32 (d->range_val, x);
> - cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */
> + cmp = vcageq_f32 (x, d->range_val);
> r = x;
> #endif
>
> diff --git a/sysdeps/aarch64/fpu/exp10_advsimd.c b/sysdeps/aarch64/fpu/exp10_advsimd.c
> index fe7149b191..eeb31ca839 100644
> --- a/sysdeps/aarch64/fpu/exp10_advsimd.c
> +++ b/sysdeps/aarch64/fpu/exp10_advsimd.c
> @@ -57,7 +57,7 @@ const static struct data
> # define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */
> # define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */
>
> -static inline float64x2_t VPCS_ATTR
> +static float64x2_t VPCS_ATTR NOINLINE
> special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
> {
> /* If fenv exceptions are to be triggered correctly, fall back to the scalar
> @@ -72,7 +72,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
> # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
> # define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
>
> -static float64x2_t VPCS_ATTR NOINLINE
> +static inline float64x2_t VPCS_ATTR
> special_case (float64x2_t s, float64x2_t y, float64x2_t n,
> const struct data *d)
> {
> diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
> index 7ee0c90948..ab117b69da 100644
> --- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
> +++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
> @@ -25,7 +25,8 @@
> static const struct data
> {
> float32x4_t poly[5];
> - float32x4_t shift, log10_2, log2_10_hi, log2_10_lo;
> + float32x4_t log10_2_and_inv, shift;
> +
> #if !WANT_SIMD_EXCEPT
> float32x4_t scale_thresh;
> #endif
> @@ -38,9 +39,9 @@ static const struct data
> .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
> V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
> .shift = V4 (0x1.8p23f),
> - .log10_2 = V4 (0x1.a934fp+1),
> - .log2_10_hi = V4 (0x1.344136p-2),
> - .log2_10_lo = V4 (-0x1.ec10cp-27),
> +
> + /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
> + .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
> #if !WANT_SIMD_EXCEPT
> .scale_thresh = V4 (ScaleBound)
> #endif
> @@ -98,24 +99,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
> #if WANT_SIMD_EXCEPT
> /* asuint(x) - TinyBound >= BigBound - TinyBound. */
> uint32x4_t cmp = vcgeq_u32 (
> - vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
> - TinyBound),
> - Thres);
> + vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
> float32x4_t xm = x;
> /* If any lanes are special, mask them with 1 and retain a copy of x to allow
> special case handler to fix special lanes later. This is only necessary if
> fenv exceptions are to be triggered correctly. */
> if (__glibc_unlikely (v_any_u32 (cmp)))
> - x = vbslq_f32 (cmp, v_f32 (1), x);
> + x = v_zerofy_f32 (x, cmp);
> #endif
>
> /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
> with poly(r) in [1/sqrt(2), sqrt(2)] and
> x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
> - float32x4_t z = vfmaq_f32 (d->shift, x, d->log10_2);
> + float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
> float32x4_t n = vsubq_f32 (z, d->shift);
> - float32x4_t r = vfmsq_f32 (x, n, d->log2_10_hi);
> - r = vfmsq_f32 (r, n, d->log2_10_lo);
> + float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
> + r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
> uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
>
> float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
> diff --git a/sysdeps/aarch64/fpu/exp2_advsimd.c b/sysdeps/aarch64/fpu/exp2_advsimd.c
> index 391a93180c..ae1e63d503 100644
> --- a/sysdeps/aarch64/fpu/exp2_advsimd.c
> +++ b/sysdeps/aarch64/fpu/exp2_advsimd.c
> @@ -24,6 +24,7 @@
> #define IndexMask (N - 1)
> #define BigBound 1022.0
> #define UOFlowBound 1280.0
> +#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
>
> static const struct data
> {
> @@ -48,14 +49,13 @@ lookup_sbits (uint64x2_t i)
>
> #if WANT_SIMD_EXCEPT
>
> -# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
> # define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
>
> /* Call scalar exp2 as a fallback. */
> static float64x2_t VPCS_ATTR NOINLINE
> -special_case (float64x2_t x)
> +special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
> {
> - return v_call_f64 (exp2, x, x, v_u64 (0xffffffffffffffff));
> + return v_call_f64 (exp2, x, y, is_special);
> }
>
> #else
> @@ -65,7 +65,7 @@ special_case (float64x2_t x)
> # define SpecialBias1 0x7000000000000000 /* 0x1p769. */
> # define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
>
> -static float64x2_t VPCS_ATTR
> +static inline float64x2_t VPCS_ATTR
> special_case (float64x2_t s, float64x2_t y, float64x2_t n,
> const struct data *d)
> {
> @@ -94,10 +94,10 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
> #if WANT_SIMD_EXCEPT
> uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
> cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres));
> - /* If any special case (inf, nan, small and large x) is detected,
> - fall back to scalar for all lanes. */
> - if (__glibc_unlikely (v_any_u64 (cmp)))
> - return special_case (x);
> + /* Mask special lanes and retain a copy of x for passing to special-case
> + handler. */
> + float64x2_t xc = x;
> + x = v_zerofy_f64 (x, cmp);
> #else
> cmp = vcagtq_f64 (x, d->scale_big_bound);
> #endif
> @@ -120,9 +120,11 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
> float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly);
> y = vmulq_f64 (r, y);
>
> -#if !WANT_SIMD_EXCEPT
> if (__glibc_unlikely (v_any_u64 (cmp)))
> +#if !WANT_SIMD_EXCEPT
> return special_case (s, y, n, d);
> +#else
> + return special_case (xc, vfmaq_f64 (s, s, y), cmp);
> #endif
> return vfmaq_f64 (s, s, y);
> }
> diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
> index 9a5a523a10..8a686e3e05 100644
> --- a/sysdeps/aarch64/fpu/exp2f_sve.c
> +++ b/sysdeps/aarch64/fpu/exp2f_sve.c
> @@ -20,6 +20,8 @@
> #include "sv_math.h"
> #include "poly_sve_f32.h"
>
> +#define Thres 0x1.5d5e2ap+6f
> +
> static const struct data
> {
> float poly[5];
> @@ -33,7 +35,7 @@ static const struct data
> .shift = 0x1.903f8p17f,
> /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
> correctly by FEXPA. */
> - .thres = 0x1.5d5e2ap+6f,
> + .thres = Thres,
> };
>
> static svfloat32_t NOINLINE
> diff --git a/sysdeps/aarch64/fpu/exp_advsimd.c b/sysdeps/aarch64/fpu/exp_advsimd.c
> index fd215f1d2c..5e3a9a0d44 100644
> --- a/sysdeps/aarch64/fpu/exp_advsimd.c
> +++ b/sysdeps/aarch64/fpu/exp_advsimd.c
> @@ -54,7 +54,7 @@ const static volatile struct
> # define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
> # define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
>
> -static inline float64x2_t VPCS_ATTR
> +static float64x2_t VPCS_ATTR NOINLINE
> special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
> {
> /* If fenv exceptions are to be triggered correctly, fall back to the scalar
> @@ -69,7 +69,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
> # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
> # define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
>
> -static float64x2_t VPCS_ATTR NOINLINE
> +static inline float64x2_t VPCS_ATTR
> special_case (float64x2_t s, float64x2_t y, float64x2_t n)
> {
> /* 2^(n/N) may overflow, break it up into s1*s2. */
> diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
> index 0b85bd06f3..3628398674 100644
> --- a/sysdeps/aarch64/fpu/expm1_advsimd.c
> +++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
> @@ -23,7 +23,7 @@
> static const struct data
> {
> float64x2_t poly[11];
> - float64x2_t invln2, ln2_lo, ln2_hi, shift;
> + float64x2_t invln2, ln2, shift;
> int64x2_t exponent_bias;
> #if WANT_SIMD_EXCEPT
> uint64x2_t thresh, tiny_bound;
> @@ -38,8 +38,7 @@ static const struct data
> V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
> V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
> .invln2 = V2 (0x1.71547652b82fep0),
> - .ln2_hi = V2 (0x1.62e42fefa39efp-1),
> - .ln2_lo = V2 (0x1.abc9e3b39803fp-56),
> + .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
> .shift = V2 (0x1.8p52),
> .exponent_bias = V2 (0x3ff0000000000000),
> #if WANT_SIMD_EXCEPT
> @@ -83,7 +82,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
> x = v_zerofy_f64 (x, special);
> #else
> /* Large input, NaNs and Infs. */
> - uint64x2_t special = vceqzq_u64 (vcaltq_f64 (x, d->oflow_bound));
> + uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
> #endif
>
> /* Reduce argument to smaller range:
> @@ -93,8 +92,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
> where 2^i is exact because i is an integer. */
> float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
> int64x2_t i = vcvtq_s64_f64 (n);
> - float64x2_t f = vfmsq_f64 (x, n, d->ln2_hi);
> - f = vfmsq_f64 (f, n, d->ln2_lo);
> + float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0);
> + f = vfmsq_laneq_f64 (f, n, d->ln2, 1);
>
> /* Approximate expm1(f) using polynomial.
> Taylor expansion for expm1(x) has the form:
> diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
> index 8d4c9a2193..93db200f61 100644
> --- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
> +++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
> @@ -23,7 +23,8 @@
> static const struct data
> {
> float32x4_t poly[5];
> - float32x4_t invln2, ln2_lo, ln2_hi, shift;
> + float32x4_t invln2_and_ln2;
> + float32x4_t shift;
> int32x4_t exponent_bias;
> #if WANT_SIMD_EXCEPT
> uint32x4_t thresh;
> @@ -34,9 +35,8 @@ static const struct data
> /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
> .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
> V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
> - .invln2 = V4 (0x1.715476p+0f),
> - .ln2_hi = V4 (0x1.62e4p-1f),
> - .ln2_lo = V4 (0x1.7f7d1cp-20f),
> + /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
> + .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
> .shift = V4 (0x1.8p23f),
> .exponent_bias = V4 (0x3f800000),
> #if !WANT_SIMD_EXCEPT
> @@ -80,7 +80,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
> x = v_zerofy_f32 (x, special);
> #else
> /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
> - uint32x4_t special = vceqzq_u32 (vcaltq_f32 (x, d->oflow_bound));
> + uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
> #endif
>
> /* Reduce argument to smaller range:
> @@ -88,10 +88,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
> and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
> exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
> where 2^i is exact because i is an integer. */
> - float32x4_t j = vsubq_f32 (vfmaq_f32 (d->shift, d->invln2, x), d->shift);
> + float32x4_t j = vsubq_f32 (
> + vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
> int32x4_t i = vcvtq_s32_f32 (j);
> - float32x4_t f = vfmsq_f32 (x, j, d->ln2_hi);
> - f = vfmsq_f32 (f, j, d->ln2_lo);
> + float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
> + f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
>
> /* Approximate expm1(f) using polynomial.
> Taylor expansion for expm1(x) has the form:
> diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c
> index 067ae79613..21df61728c 100644
> --- a/sysdeps/aarch64/fpu/log_advsimd.c
> +++ b/sysdeps/aarch64/fpu/log_advsimd.c
> @@ -58,8 +58,13 @@ lookup (uint64x2_t i)
> uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
> float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
> float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
> +#if __BYTE_ORDER == __LITTLE_ENDIAN
> e.invc = vuzp1q_f64 (e0, e1);
> e.logc = vuzp2q_f64 (e0, e1);
> +#else
> + e.invc = vuzp1q_f64 (e1, e0);
> + e.logc = vuzp2q_f64 (e1, e0);
> +#endif
> return e;
> }
>
> diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c
> index efce183e86..a0d9d3b819 100644
> --- a/sysdeps/aarch64/fpu/sin_advsimd.c
> +++ b/sysdeps/aarch64/fpu/sin_advsimd.c
> @@ -75,8 +75,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
> r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
> #else
> r = x;
> - cmp = vcageq_f64 (d->range_val, x);
> - cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */
> + cmp = vcageq_f64 (x, d->range_val);
> #endif
>
> /* n = rint(|x|/pi). */
> diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c
> index 60cf3f2ca1..375dfc3331 100644
> --- a/sysdeps/aarch64/fpu/sinf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/sinf_advsimd.c
> @@ -67,8 +67,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
> r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
> #else
> r = x;
> - cmp = vcageq_f32 (d->range_val, x);
> - cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */
> + cmp = vcageq_f32 (x, d->range_val);
> #endif
>
> /* n = rint(|x|/pi) */
> diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
> index d7e5ba7b1a..0459821ab2 100644
> --- a/sysdeps/aarch64/fpu/tan_advsimd.c
> +++ b/sysdeps/aarch64/fpu/tan_advsimd.c
> @@ -23,7 +23,7 @@
> static const struct data
> {
> float64x2_t poly[9];
> - float64x2_t half_pi_hi, half_pi_lo, two_over_pi, shift;
> + float64x2_t half_pi, two_over_pi, shift;
> #if !WANT_SIMD_EXCEPT
> float64x2_t range_val;
> #endif
> @@ -34,8 +34,7 @@ static const struct data
> V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9),
> V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11),
> V2 (0x1.4e4fd14147622p-12) },
> - .half_pi_hi = V2 (0x1.921fb54442d18p0),
> - .half_pi_lo = V2 (0x1.1a62633145c07p-54),
> + .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
> .two_over_pi = V2 (0x1.45f306dc9c883p-1),
> .shift = V2 (0x1.8p52),
> #if !WANT_SIMD_EXCEPT
> @@ -56,15 +55,15 @@ special_case (float64x2_t x)
>
> /* Vector approximation for double-precision tan.
> Maximum measured error is 3.48 ULP:
> - __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
> - want -0x1.f6ccd8ecf7deap+37. */
> + _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
> + want -0x1.f6ccd8ecf7deap+37. */
> float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
> {
> const struct data *dat = ptr_barrier (&data);
> - /* Our argument reduction cannot calculate q with sufficient accuracy for very
> - large inputs. Fall back to scalar routine for all lanes if any are too
> - large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny
> - input to avoid underflow. */
> + /* Our argument reduction cannot calculate q with sufficient accuracy for
> + very large inputs. Fall back to scalar routine for all lanes if any are
> + too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
> + tiny input to avoid underflow. */
> #if WANT_SIMD_EXCEPT
> uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
> /* iax - tiny_bound > range_val - tiny_bound. */
> @@ -82,8 +81,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
> /* Use q to reduce x to r in [-pi/4, pi/4], by:
> r = x - q * pi/2, in extended precision. */
> float64x2_t r = x;
> - r = vfmsq_f64 (r, q, dat->half_pi_hi);
> - r = vfmsq_f64 (r, q, dat->half_pi_lo);
> + r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
> + r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
> /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
> formula. */
> r = vmulq_n_f64 (r, 0.5);
> @@ -106,14 +105,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
> and reciprocity around pi/2:
> tan(x) = 1 / (tan(pi/2 - x))
> to assemble result using change-of-sign and conditional selection of
> - numerator/denominator, dependent on odd/even-ness of q (hence quadrant). */
> + numerator/denominator, dependent on odd/even-ness of q (hence quadrant).
> + */
> float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p);
> float64x2_t d = vaddq_f64 (p, p);
>
> uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
>
> #if !WANT_SIMD_EXCEPT
> - uint64x2_t special = vceqzq_u64 (vcaleq_f64 (x, dat->range_val));
> + uint64x2_t special = vcageq_f64 (x, dat->range_val);
> if (__glibc_unlikely (v_any_u64 (special)))
> return special_case (x);
> #endif
> diff --git a/sysdeps/aarch64/fpu/tanf_advsimd.c b/sysdeps/aarch64/fpu/tanf_advsimd.c
> index 1f16103f8a..5a7489390a 100644
> --- a/sysdeps/aarch64/fpu/tanf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/tanf_advsimd.c
> @@ -23,7 +23,8 @@
> static const struct data
> {
> float32x4_t poly[6];
> - float32x4_t neg_half_pi_1, neg_half_pi_2, neg_half_pi_3, two_over_pi, shift;
> + float32x4_t pi_consts;
> + float32x4_t shift;
> #if !WANT_SIMD_EXCEPT
> float32x4_t range_val;
> #endif
> @@ -31,10 +32,9 @@ static const struct data
> /* Coefficients generated using FPMinimax. */
> .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f),
> V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) },
> - .neg_half_pi_1 = V4 (-0x1.921fb6p+0f),
> - .neg_half_pi_2 = V4 (0x1.777a5cp-25f),
> - .neg_half_pi_3 = V4 (0x1.ee59dap-50f),
> - .two_over_pi = V4 (0x1.45f306p-1f),
> + /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi. */
> + .pi_consts
> + = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
> .shift = V4 (0x1.8p+23f),
> #if !WANT_SIMD_EXCEPT
> .range_val = V4 (0x1p15f),
> @@ -58,10 +58,11 @@ eval_poly (float32x4_t z, const struct data *d)
> {
> float32x4_t z2 = vmulq_f32 (z, z);
> #if WANT_SIMD_EXCEPT
> - /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions
> - are to be triggered correctly, sidestep this by fixing such lanes to 0. */
> + /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
> + If fp exceptions are to be triggered correctly,
> + sidestep this by fixing such lanes to 0. */
> uint32x4_t will_uflow
> - = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
> + = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
> if (__glibc_unlikely (v_any_u32 (will_uflow)))
> z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
> #endif
> @@ -94,16 +95,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
> #endif
>
> /* n = rint(x/(pi/2)). */
> - float32x4_t q = vfmaq_f32 (d->shift, d->two_over_pi, x);
> + float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
> float32x4_t n = vsubq_f32 (q, d->shift);
> /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
> uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
>
> /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */
> float32x4_t r;
> - r = vfmaq_f32 (x, d->neg_half_pi_1, n);
> - r = vfmaq_f32 (r, d->neg_half_pi_2, n);
> - r = vfmaq_f32 (r, d->neg_half_pi_3, n);
> + r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
> + r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
> + r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
>
> /* If x lives in an interval, where |tan(x)|
> - is finite, then use a polynomial approximation of the form
Thanks Adhemerval. I don't have commit rights - would you be able to
commit this for me?
On 23/02/2024 12:19, Adhemerval Zanella Netto wrote:
>
> On 20/02/24 13:44, Joe Ramsay wrote:
>> This includes a fix for big-endian in AdvSIMD log, some cosmetic
>> changes, and numerous small optimisations mainly around inlining and
>> using indexed variants of MLA intrinsics.
> LGTM, thanks.
>
> Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
>
>> ---
>> Thanks,
>> Joe
>> sysdeps/aarch64/fpu/acos_advsimd.c | 4 ++--
>> sysdeps/aarch64/fpu/asin_advsimd.c | 4 ++--
>> sysdeps/aarch64/fpu/atan2_sve.c | 29 +++++++++++++--------------
>> sysdeps/aarch64/fpu/atan2f_sve.c | 30 +++++++++++++++-------------
>> sysdeps/aarch64/fpu/cos_advsimd.c | 3 +--
>> sysdeps/aarch64/fpu/cosf_advsimd.c | 3 +--
>> sysdeps/aarch64/fpu/exp10_advsimd.c | 4 ++--
>> sysdeps/aarch64/fpu/exp10f_advsimd.c | 21 ++++++++++---------
>> sysdeps/aarch64/fpu/exp2_advsimd.c | 20 ++++++++++---------
>> sysdeps/aarch64/fpu/exp2f_sve.c | 4 +++-
>> sysdeps/aarch64/fpu/exp_advsimd.c | 4 ++--
>> sysdeps/aarch64/fpu/expm1_advsimd.c | 11 +++++-----
>> sysdeps/aarch64/fpu/expm1f_advsimd.c | 17 ++++++++--------
>> sysdeps/aarch64/fpu/log_advsimd.c | 5 +++++
>> sysdeps/aarch64/fpu/sin_advsimd.c | 3 +--
>> sysdeps/aarch64/fpu/sinf_advsimd.c | 3 +--
>> sysdeps/aarch64/fpu/tan_advsimd.c | 26 ++++++++++++------------
>> sysdeps/aarch64/fpu/tanf_advsimd.c | 25 ++++++++++++-----------
>> 18 files changed, 111 insertions(+), 105 deletions(-)
>>
>> diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c
>> index a8eabb5e71..0a86c9823a 100644
>> --- a/sysdeps/aarch64/fpu/acos_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/acos_advsimd.c
>> @@ -40,8 +40,8 @@ static const struct data
>> };
>>
>> #define AllMask v_u64 (0xffffffffffffffff)
>> -#define Oneu (0x3ff0000000000000)
>> -#define Small (0x3e50000000000000) /* 2^-53. */
>> +#define Oneu 0x3ff0000000000000
>> +#define Small 0x3e50000000000000 /* 2^-53. */
>>
>> #if WANT_SIMD_EXCEPT
>> static float64x2_t VPCS_ATTR NOINLINE
>> diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c
>> index 141646e954..2de6eff407 100644
>> --- a/sysdeps/aarch64/fpu/asin_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/asin_advsimd.c
>> @@ -39,8 +39,8 @@ static const struct data
>> };
>>
>> #define AllMask v_u64 (0xffffffffffffffff)
>> -#define One (0x3ff0000000000000)
>> -#define Small (0x3e50000000000000) /* 2^-12. */
>> +#define One 0x3ff0000000000000
>> +#define Small 0x3e50000000000000 /* 2^-12. */
>>
>> #if WANT_SIMD_EXCEPT
>> static float64x2_t VPCS_ATTR NOINLINE
>> diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c
>> index 09a4c559b8..04fa71fa37 100644
>> --- a/sysdeps/aarch64/fpu/atan2_sve.c
>> +++ b/sysdeps/aarch64/fpu/atan2_sve.c
>> @@ -37,9 +37,6 @@ static const struct data
>> .pi_over_2 = 0x1.921fb54442d18p+0,
>> };
>>
>> -/* Useful constants. */
>> -#define SignMask sv_u64 (0x8000000000000000)
>> -
>> /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
>> static svfloat64_t NOINLINE
>> special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret,
>> @@ -72,14 +69,15 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
>> svbool_t cmp_y = zeroinfnan (iy, pg);
>> svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
>>
>> - svuint64_t sign_x = svand_x (pg, ix, SignMask);
>> - svuint64_t sign_y = svand_x (pg, iy, SignMask);
>> - svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
>> -
>> svfloat64_t ax = svabs_x (pg, x);
>> svfloat64_t ay = svabs_x (pg, y);
>> + svuint64_t iax = svreinterpret_u64 (ax);
>> + svuint64_t iay = svreinterpret_u64 (ay);
>> +
>> + svuint64_t sign_x = sveor_x (pg, ix, iax);
>> + svuint64_t sign_y = sveor_x (pg, iy, iay);
>> + svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
>>
>> - svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
>> svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
>>
>> /* Set up z for call to atan. */
>> @@ -88,8 +86,9 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
>> svfloat64_t z = svdiv_x (pg, n, d);
>>
>> /* Work out the correct shift. */
>> - svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0));
>> - shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
>> + svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1));
>> + shift = svsel (pred_aygtax, sv_f64 (1.0), shift);
>> + shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift)));
>> shift = svmul_x (pg, shift, data_ptr->pi_over_2);
>>
>> /* Use split Estrin scheme for P(z^2) with deg(P)=19. */
>> @@ -109,10 +108,10 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
>> ret = svadd_m (pg, ret, shift);
>>
>> /* Account for the sign of x and y. */
>> - ret = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
>> -
>> if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
>> - return special_case (y, x, ret, cmp_xy);
>> -
>> - return ret;
>> + return special_case (
>> + y, x,
>> + svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)),
>> + cmp_xy);
>> + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
>> }
>> diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c
>> index b92f83cdea..9ea197147c 100644
>> --- a/sysdeps/aarch64/fpu/atan2f_sve.c
>> +++ b/sysdeps/aarch64/fpu/atan2f_sve.c
>> @@ -32,10 +32,8 @@ static const struct data
>> .pi_over_2 = 0x1.921fb6p+0f,
>> };
>>
>> -#define SignMask sv_u32 (0x80000000)
>> -
>> /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
>> -static inline svfloat32_t
>> +static svfloat32_t NOINLINE
>> special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret,
>> const svbool_t cmp)
>> {
>> @@ -67,14 +65,15 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
>> svbool_t cmp_y = zeroinfnan (iy, pg);
>> svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
>>
>> - svuint32_t sign_x = svand_x (pg, ix, SignMask);
>> - svuint32_t sign_y = svand_x (pg, iy, SignMask);
>> - svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
>> -
>> svfloat32_t ax = svabs_x (pg, x);
>> svfloat32_t ay = svabs_x (pg, y);
>> + svuint32_t iax = svreinterpret_u32 (ax);
>> + svuint32_t iay = svreinterpret_u32 (ay);
>> +
>> + svuint32_t sign_x = sveor_x (pg, ix, iax);
>> + svuint32_t sign_y = sveor_x (pg, iy, iay);
>> + svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
>>
>> - svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
>> svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
>>
>> /* Set up z for call to atan. */
>> @@ -83,11 +82,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
>> svfloat32_t z = svdiv_x (pg, n, d);
>>
>> /* Work out the correct shift. */
>> - svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0));
>> - shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
>> + svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1));
>> + shift = svsel (pred_aygtax, sv_f32 (1.0), shift);
>> + shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift)));
>> shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2));
>>
>> - /* Use split Estrin scheme for P(z^2) with deg(P)=7. */
>> + /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */
>> svfloat32_t z2 = svmul_x (pg, z, z);
>> svfloat32_t z4 = svmul_x (pg, z2, z2);
>> svfloat32_t z8 = svmul_x (pg, z4, z4);
>> @@ -101,10 +101,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
>> ret = svadd_m (pg, ret, shift);
>>
>> /* Account for the sign of x and y. */
>> - ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
>>
>> if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
>> - return special_case (y, x, ret, cmp_xy);
>> + return special_case (
>> + y, x,
>> + svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)),
>> + cmp_xy);
>>
>> - return ret;
>> + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
>> }
>> diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c
>> index 2897e8b909..3924c9ce44 100644
>> --- a/sysdeps/aarch64/fpu/cos_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/cos_advsimd.c
>> @@ -63,8 +63,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
>> special-case handler later. */
>> r = vbslq_f64 (cmp, v_f64 (1.0), r);
>> #else
>> - cmp = vcageq_f64 (d->range_val, x);
>> - cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */
>> + cmp = vcageq_f64 (x, d->range_val);
>> r = x;
>> #endif
>>
>> diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
>> index 60abc8dfcf..d0c285b03a 100644
>> --- a/sysdeps/aarch64/fpu/cosf_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
>> @@ -64,8 +64,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
>> special-case handler later. */
>> r = vbslq_f32 (cmp, v_f32 (1.0f), r);
>> #else
>> - cmp = vcageq_f32 (d->range_val, x);
>> - cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */
>> + cmp = vcageq_f32 (x, d->range_val);
>> r = x;
>> #endif
>>
>> diff --git a/sysdeps/aarch64/fpu/exp10_advsimd.c b/sysdeps/aarch64/fpu/exp10_advsimd.c
>> index fe7149b191..eeb31ca839 100644
>> --- a/sysdeps/aarch64/fpu/exp10_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/exp10_advsimd.c
>> @@ -57,7 +57,7 @@ const static struct data
>> # define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */
>> # define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */
>>
>> -static inline float64x2_t VPCS_ATTR
>> +static float64x2_t VPCS_ATTR NOINLINE
>> special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
>> {
>> /* If fenv exceptions are to be triggered correctly, fall back to the scalar
>> @@ -72,7 +72,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
>> # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
>> # define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
>>
>> -static float64x2_t VPCS_ATTR NOINLINE
>> +static inline float64x2_t VPCS_ATTR
>> special_case (float64x2_t s, float64x2_t y, float64x2_t n,
>> const struct data *d)
>> {
>> diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
>> index 7ee0c90948..ab117b69da 100644
>> --- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
>> @@ -25,7 +25,8 @@
>> static const struct data
>> {
>> float32x4_t poly[5];
>> - float32x4_t shift, log10_2, log2_10_hi, log2_10_lo;
>> + float32x4_t log10_2_and_inv, shift;
>> +
>> #if !WANT_SIMD_EXCEPT
>> float32x4_t scale_thresh;
>> #endif
>> @@ -38,9 +39,9 @@ static const struct data
>> .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
>> V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
>> .shift = V4 (0x1.8p23f),
>> - .log10_2 = V4 (0x1.a934fp+1),
>> - .log2_10_hi = V4 (0x1.344136p-2),
>> - .log2_10_lo = V4 (-0x1.ec10cp-27),
>> +
>> + /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
>> + .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
>> #if !WANT_SIMD_EXCEPT
>> .scale_thresh = V4 (ScaleBound)
>> #endif
>> @@ -98,24 +99,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
>> #if WANT_SIMD_EXCEPT
>> /* asuint(x) - TinyBound >= BigBound - TinyBound. */
>> uint32x4_t cmp = vcgeq_u32 (
>> - vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
>> - TinyBound),
>> - Thres);
>> + vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
>> float32x4_t xm = x;
>> /* If any lanes are special, mask them with 1 and retain a copy of x to allow
>> special case handler to fix special lanes later. This is only necessary if
>> fenv exceptions are to be triggered correctly. */
>> if (__glibc_unlikely (v_any_u32 (cmp)))
>> - x = vbslq_f32 (cmp, v_f32 (1), x);
>> + x = v_zerofy_f32 (x, cmp);
>> #endif
>>
>> /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
>> with poly(r) in [1/sqrt(2), sqrt(2)] and
>> x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
>> - float32x4_t z = vfmaq_f32 (d->shift, x, d->log10_2);
>> + float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
>> float32x4_t n = vsubq_f32 (z, d->shift);
>> - float32x4_t r = vfmsq_f32 (x, n, d->log2_10_hi);
>> - r = vfmsq_f32 (r, n, d->log2_10_lo);
>> + float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
>> + r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
>> uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
>>
>> float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
>> diff --git a/sysdeps/aarch64/fpu/exp2_advsimd.c b/sysdeps/aarch64/fpu/exp2_advsimd.c
>> index 391a93180c..ae1e63d503 100644
>> --- a/sysdeps/aarch64/fpu/exp2_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/exp2_advsimd.c
>> @@ -24,6 +24,7 @@
>> #define IndexMask (N - 1)
>> #define BigBound 1022.0
>> #define UOFlowBound 1280.0
>> +#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
>>
>> static const struct data
>> {
>> @@ -48,14 +49,13 @@ lookup_sbits (uint64x2_t i)
>>
>> #if WANT_SIMD_EXCEPT
>>
>> -# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
>> # define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
>>
>> /* Call scalar exp2 as a fallback. */
>> static float64x2_t VPCS_ATTR NOINLINE
>> -special_case (float64x2_t x)
>> +special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
>> {
>> - return v_call_f64 (exp2, x, x, v_u64 (0xffffffffffffffff));
>> + return v_call_f64 (exp2, x, y, is_special);
>> }
>>
>> #else
>> @@ -65,7 +65,7 @@ special_case (float64x2_t x)
>> # define SpecialBias1 0x7000000000000000 /* 0x1p769. */
>> # define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
>>
>> -static float64x2_t VPCS_ATTR
>> +static inline float64x2_t VPCS_ATTR
>> special_case (float64x2_t s, float64x2_t y, float64x2_t n,
>> const struct data *d)
>> {
>> @@ -94,10 +94,10 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
>> #if WANT_SIMD_EXCEPT
>> uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
>> cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres));
>> - /* If any special case (inf, nan, small and large x) is detected,
>> - fall back to scalar for all lanes. */
>> - if (__glibc_unlikely (v_any_u64 (cmp)))
>> - return special_case (x);
>> + /* Mask special lanes and retain a copy of x for passing to special-case
>> + handler. */
>> + float64x2_t xc = x;
>> + x = v_zerofy_f64 (x, cmp);
>> #else
>> cmp = vcagtq_f64 (x, d->scale_big_bound);
>> #endif
>> @@ -120,9 +120,11 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
>> float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly);
>> y = vmulq_f64 (r, y);
>>
>> -#if !WANT_SIMD_EXCEPT
>> if (__glibc_unlikely (v_any_u64 (cmp)))
>> +#if !WANT_SIMD_EXCEPT
>> return special_case (s, y, n, d);
>> +#else
>> + return special_case (xc, vfmaq_f64 (s, s, y), cmp);
>> #endif
>> return vfmaq_f64 (s, s, y);
>> }
>> diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
>> index 9a5a523a10..8a686e3e05 100644
>> --- a/sysdeps/aarch64/fpu/exp2f_sve.c
>> +++ b/sysdeps/aarch64/fpu/exp2f_sve.c
>> @@ -20,6 +20,8 @@
>> #include "sv_math.h"
>> #include "poly_sve_f32.h"
>>
>> +#define Thres 0x1.5d5e2ap+6f
>> +
>> static const struct data
>> {
>> float poly[5];
>> @@ -33,7 +35,7 @@ static const struct data
>> .shift = 0x1.903f8p17f,
>> /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
>> correctly by FEXPA. */
>> - .thres = 0x1.5d5e2ap+6f,
>> + .thres = Thres,
>> };
>>
>> static svfloat32_t NOINLINE
>> diff --git a/sysdeps/aarch64/fpu/exp_advsimd.c b/sysdeps/aarch64/fpu/exp_advsimd.c
>> index fd215f1d2c..5e3a9a0d44 100644
>> --- a/sysdeps/aarch64/fpu/exp_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/exp_advsimd.c
>> @@ -54,7 +54,7 @@ const static volatile struct
>> # define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
>> # define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
>>
>> -static inline float64x2_t VPCS_ATTR
>> +static float64x2_t VPCS_ATTR NOINLINE
>> special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
>> {
>> /* If fenv exceptions are to be triggered correctly, fall back to the scalar
>> @@ -69,7 +69,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
>> # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
>> # define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
>>
>> -static float64x2_t VPCS_ATTR NOINLINE
>> +static inline float64x2_t VPCS_ATTR
>> special_case (float64x2_t s, float64x2_t y, float64x2_t n)
>> {
>> /* 2^(n/N) may overflow, break it up into s1*s2. */
>> diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
>> index 0b85bd06f3..3628398674 100644
>> --- a/sysdeps/aarch64/fpu/expm1_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
>> @@ -23,7 +23,7 @@
>> static const struct data
>> {
>> float64x2_t poly[11];
>> - float64x2_t invln2, ln2_lo, ln2_hi, shift;
>> + float64x2_t invln2, ln2, shift;
>> int64x2_t exponent_bias;
>> #if WANT_SIMD_EXCEPT
>> uint64x2_t thresh, tiny_bound;
>> @@ -38,8 +38,7 @@ static const struct data
>> V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
>> V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
>> .invln2 = V2 (0x1.71547652b82fep0),
>> - .ln2_hi = V2 (0x1.62e42fefa39efp-1),
>> - .ln2_lo = V2 (0x1.abc9e3b39803fp-56),
>> + .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
>> .shift = V2 (0x1.8p52),
>> .exponent_bias = V2 (0x3ff0000000000000),
>> #if WANT_SIMD_EXCEPT
>> @@ -83,7 +82,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
>> x = v_zerofy_f64 (x, special);
>> #else
>> /* Large input, NaNs and Infs. */
>> - uint64x2_t special = vceqzq_u64 (vcaltq_f64 (x, d->oflow_bound));
>> + uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
>> #endif
>>
>> /* Reduce argument to smaller range:
>> @@ -93,8 +92,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
>> where 2^i is exact because i is an integer. */
>> float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
>> int64x2_t i = vcvtq_s64_f64 (n);
>> - float64x2_t f = vfmsq_f64 (x, n, d->ln2_hi);
>> - f = vfmsq_f64 (f, n, d->ln2_lo);
>> + float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0);
>> + f = vfmsq_laneq_f64 (f, n, d->ln2, 1);
>>
>> /* Approximate expm1(f) using polynomial.
>> Taylor expansion for expm1(x) has the form:
>> diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
>> index 8d4c9a2193..93db200f61 100644
>> --- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
>> @@ -23,7 +23,8 @@
>> static const struct data
>> {
>> float32x4_t poly[5];
>> - float32x4_t invln2, ln2_lo, ln2_hi, shift;
>> + float32x4_t invln2_and_ln2;
>> + float32x4_t shift;
>> int32x4_t exponent_bias;
>> #if WANT_SIMD_EXCEPT
>> uint32x4_t thresh;
>> @@ -34,9 +35,8 @@ static const struct data
>> /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
>> .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
>> V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
>> - .invln2 = V4 (0x1.715476p+0f),
>> - .ln2_hi = V4 (0x1.62e4p-1f),
>> - .ln2_lo = V4 (0x1.7f7d1cp-20f),
>> + /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
>> + .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
>> .shift = V4 (0x1.8p23f),
>> .exponent_bias = V4 (0x3f800000),
>> #if !WANT_SIMD_EXCEPT
>> @@ -80,7 +80,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
>> x = v_zerofy_f32 (x, special);
>> #else
>> /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
>> - uint32x4_t special = vceqzq_u32 (vcaltq_f32 (x, d->oflow_bound));
>> + uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
>> #endif
>>
>> /* Reduce argument to smaller range:
>> @@ -88,10 +88,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
>> and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
>> exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
>> where 2^i is exact because i is an integer. */
>> - float32x4_t j = vsubq_f32 (vfmaq_f32 (d->shift, d->invln2, x), d->shift);
>> + float32x4_t j = vsubq_f32 (
>> + vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
>> int32x4_t i = vcvtq_s32_f32 (j);
>> - float32x4_t f = vfmsq_f32 (x, j, d->ln2_hi);
>> - f = vfmsq_f32 (f, j, d->ln2_lo);
>> + float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
>> + f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
>>
>> /* Approximate expm1(f) using polynomial.
>> Taylor expansion for expm1(x) has the form:
>> diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c
>> index 067ae79613..21df61728c 100644
>> --- a/sysdeps/aarch64/fpu/log_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/log_advsimd.c
>> @@ -58,8 +58,13 @@ lookup (uint64x2_t i)
>> uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
>> float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
>> float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
>> +#if __BYTE_ORDER == __LITTLE_ENDIAN
>> e.invc = vuzp1q_f64 (e0, e1);
>> e.logc = vuzp2q_f64 (e0, e1);
>> +#else
>> + e.invc = vuzp1q_f64 (e1, e0);
>> + e.logc = vuzp2q_f64 (e1, e0);
>> +#endif
>> return e;
>> }
>>
>> diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c
>> index efce183e86..a0d9d3b819 100644
>> --- a/sysdeps/aarch64/fpu/sin_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/sin_advsimd.c
>> @@ -75,8 +75,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
>> r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
>> #else
>> r = x;
>> - cmp = vcageq_f64 (d->range_val, x);
>> - cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */
>> + cmp = vcageq_f64 (x, d->range_val);
>> #endif
>>
>> /* n = rint(|x|/pi). */
>> diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c
>> index 60cf3f2ca1..375dfc3331 100644
>> --- a/sysdeps/aarch64/fpu/sinf_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/sinf_advsimd.c
>> @@ -67,8 +67,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
>> r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
>> #else
>> r = x;
>> - cmp = vcageq_f32 (d->range_val, x);
>> - cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */
>> + cmp = vcageq_f32 (x, d->range_val);
>> #endif
>>
>> /* n = rint(|x|/pi) */
>> diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
>> index d7e5ba7b1a..0459821ab2 100644
>> --- a/sysdeps/aarch64/fpu/tan_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/tan_advsimd.c
>> @@ -23,7 +23,7 @@
>> static const struct data
>> {
>> float64x2_t poly[9];
>> - float64x2_t half_pi_hi, half_pi_lo, two_over_pi, shift;
>> + float64x2_t half_pi, two_over_pi, shift;
>> #if !WANT_SIMD_EXCEPT
>> float64x2_t range_val;
>> #endif
>> @@ -34,8 +34,7 @@ static const struct data
>> V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9),
>> V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11),
>> V2 (0x1.4e4fd14147622p-12) },
>> - .half_pi_hi = V2 (0x1.921fb54442d18p0),
>> - .half_pi_lo = V2 (0x1.1a62633145c07p-54),
>> + .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
>> .two_over_pi = V2 (0x1.45f306dc9c883p-1),
>> .shift = V2 (0x1.8p52),
>> #if !WANT_SIMD_EXCEPT
>> @@ -56,15 +55,15 @@ special_case (float64x2_t x)
>>
>> /* Vector approximation for double-precision tan.
>> Maximum measured error is 3.48 ULP:
>> - __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
>> - want -0x1.f6ccd8ecf7deap+37. */
>> + _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
>> + want -0x1.f6ccd8ecf7deap+37. */
>> float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
>> {
>> const struct data *dat = ptr_barrier (&data);
>> - /* Our argument reduction cannot calculate q with sufficient accuracy for very
>> - large inputs. Fall back to scalar routine for all lanes if any are too
>> - large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny
>> - input to avoid underflow. */
>> + /* Our argument reduction cannot calculate q with sufficient accuracy for
>> + very large inputs. Fall back to scalar routine for all lanes if any are
>> + too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
>> + tiny input to avoid underflow. */
>> #if WANT_SIMD_EXCEPT
>> uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
>> /* iax - tiny_bound > range_val - tiny_bound. */
>> @@ -82,8 +81,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
>> /* Use q to reduce x to r in [-pi/4, pi/4], by:
>> r = x - q * pi/2, in extended precision. */
>> float64x2_t r = x;
>> - r = vfmsq_f64 (r, q, dat->half_pi_hi);
>> - r = vfmsq_f64 (r, q, dat->half_pi_lo);
>> + r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
>> + r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
>> /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
>> formula. */
>> r = vmulq_n_f64 (r, 0.5);
>> @@ -106,14 +105,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
>> and reciprocity around pi/2:
>> tan(x) = 1 / (tan(pi/2 - x))
>> to assemble result using change-of-sign and conditional selection of
>> - numerator/denominator, dependent on odd/even-ness of q (hence quadrant). */
>> + numerator/denominator, dependent on odd/even-ness of q (hence quadrant).
>> + */
>> float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p);
>> float64x2_t d = vaddq_f64 (p, p);
>>
>> uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
>>
>> #if !WANT_SIMD_EXCEPT
>> - uint64x2_t special = vceqzq_u64 (vcaleq_f64 (x, dat->range_val));
>> + uint64x2_t special = vcageq_f64 (x, dat->range_val);
>> if (__glibc_unlikely (v_any_u64 (special)))
>> return special_case (x);
>> #endif
>> diff --git a/sysdeps/aarch64/fpu/tanf_advsimd.c b/sysdeps/aarch64/fpu/tanf_advsimd.c
>> index 1f16103f8a..5a7489390a 100644
>> --- a/sysdeps/aarch64/fpu/tanf_advsimd.c
>> +++ b/sysdeps/aarch64/fpu/tanf_advsimd.c
>> @@ -23,7 +23,8 @@
>> static const struct data
>> {
>> float32x4_t poly[6];
>> - float32x4_t neg_half_pi_1, neg_half_pi_2, neg_half_pi_3, two_over_pi, shift;
>> + float32x4_t pi_consts;
>> + float32x4_t shift;
>> #if !WANT_SIMD_EXCEPT
>> float32x4_t range_val;
>> #endif
>> @@ -31,10 +32,9 @@ static const struct data
>> /* Coefficients generated using FPMinimax. */
>> .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f),
>> V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) },
>> - .neg_half_pi_1 = V4 (-0x1.921fb6p+0f),
>> - .neg_half_pi_2 = V4 (0x1.777a5cp-25f),
>> - .neg_half_pi_3 = V4 (0x1.ee59dap-50f),
>> - .two_over_pi = V4 (0x1.45f306p-1f),
>> + /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi. */
>> + .pi_consts
>> + = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
>> .shift = V4 (0x1.8p+23f),
>> #if !WANT_SIMD_EXCEPT
>> .range_val = V4 (0x1p15f),
>> @@ -58,10 +58,11 @@ eval_poly (float32x4_t z, const struct data *d)
>> {
>> float32x4_t z2 = vmulq_f32 (z, z);
>> #if WANT_SIMD_EXCEPT
>> - /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions
>> - are to be triggered correctly, sidestep this by fixing such lanes to 0. */
>> + /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
>> + If fp exceptions are to be triggered correctly,
>> + sidestep this by fixing such lanes to 0. */
>> uint32x4_t will_uflow
>> - = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
>> + = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
>> if (__glibc_unlikely (v_any_u32 (will_uflow)))
>> z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
>> #endif
>> @@ -94,16 +95,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
>> #endif
>>
>> /* n = rint(x/(pi/2)). */
>> - float32x4_t q = vfmaq_f32 (d->shift, d->two_over_pi, x);
>> + float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
>> float32x4_t n = vsubq_f32 (q, d->shift);
>> /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
>> uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
>>
>> /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */
>> float32x4_t r;
>> - r = vfmaq_f32 (x, d->neg_half_pi_1, n);
>> - r = vfmaq_f32 (r, d->neg_half_pi_2, n);
>> - r = vfmaq_f32 (r, d->neg_half_pi_3, n);
>> + r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
>> + r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
>> + r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
>>
>> /* If x lives in an interval, where |tan(x)|
>> - is finite, then use a polynomial approximation of the form
@@ -40,8 +40,8 @@ static const struct data
};
#define AllMask v_u64 (0xffffffffffffffff)
-#define Oneu (0x3ff0000000000000)
-#define Small (0x3e50000000000000) /* 2^-53. */
+#define Oneu 0x3ff0000000000000
+#define Small 0x3e50000000000000 /* 2^-53. */
#if WANT_SIMD_EXCEPT
static float64x2_t VPCS_ATTR NOINLINE
@@ -39,8 +39,8 @@ static const struct data
};
#define AllMask v_u64 (0xffffffffffffffff)
-#define One (0x3ff0000000000000)
-#define Small (0x3e50000000000000) /* 2^-12. */
+#define One 0x3ff0000000000000
+#define Small 0x3e50000000000000 /* 2^-12. */
#if WANT_SIMD_EXCEPT
static float64x2_t VPCS_ATTR NOINLINE
@@ -37,9 +37,6 @@ static const struct data
.pi_over_2 = 0x1.921fb54442d18p+0,
};
-/* Useful constants. */
-#define SignMask sv_u64 (0x8000000000000000)
-
/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
static svfloat64_t NOINLINE
special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret,
@@ -72,14 +69,15 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
svbool_t cmp_y = zeroinfnan (iy, pg);
svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
- svuint64_t sign_x = svand_x (pg, ix, SignMask);
- svuint64_t sign_y = svand_x (pg, iy, SignMask);
- svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
-
svfloat64_t ax = svabs_x (pg, x);
svfloat64_t ay = svabs_x (pg, y);
+ svuint64_t iax = svreinterpret_u64 (ax);
+ svuint64_t iay = svreinterpret_u64 (ay);
+
+ svuint64_t sign_x = sveor_x (pg, ix, iax);
+ svuint64_t sign_y = sveor_x (pg, iy, iay);
+ svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
- svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
/* Set up z for call to atan. */
@@ -88,8 +86,9 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
svfloat64_t z = svdiv_x (pg, n, d);
/* Work out the correct shift. */
- svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0));
- shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
+ svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1));
+ shift = svsel (pred_aygtax, sv_f64 (1.0), shift);
+ shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift)));
shift = svmul_x (pg, shift, data_ptr->pi_over_2);
/* Use split Estrin scheme for P(z^2) with deg(P)=19. */
@@ -109,10 +108,10 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
ret = svadd_m (pg, ret, shift);
/* Account for the sign of x and y. */
- ret = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
-
if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
- return special_case (y, x, ret, cmp_xy);
-
- return ret;
+ return special_case (
+ y, x,
+ svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)),
+ cmp_xy);
+ return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
}
@@ -32,10 +32,8 @@ static const struct data
.pi_over_2 = 0x1.921fb6p+0f,
};
-#define SignMask sv_u32 (0x80000000)
-
/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
-static inline svfloat32_t
+static svfloat32_t NOINLINE
special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret,
const svbool_t cmp)
{
@@ -67,14 +65,15 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
svbool_t cmp_y = zeroinfnan (iy, pg);
svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
- svuint32_t sign_x = svand_x (pg, ix, SignMask);
- svuint32_t sign_y = svand_x (pg, iy, SignMask);
- svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
-
svfloat32_t ax = svabs_x (pg, x);
svfloat32_t ay = svabs_x (pg, y);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t iay = svreinterpret_u32 (ay);
+
+ svuint32_t sign_x = sveor_x (pg, ix, iax);
+ svuint32_t sign_y = sveor_x (pg, iy, iay);
+ svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
- svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
/* Set up z for call to atan. */
@@ -83,11 +82,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
svfloat32_t z = svdiv_x (pg, n, d);
/* Work out the correct shift. */
- svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0));
- shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
+ svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1));
+ shift = svsel (pred_aygtax, sv_f32 (1.0), shift);
+ shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift)));
shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2));
- /* Use split Estrin scheme for P(z^2) with deg(P)=7. */
+ /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */
svfloat32_t z2 = svmul_x (pg, z, z);
svfloat32_t z4 = svmul_x (pg, z2, z2);
svfloat32_t z8 = svmul_x (pg, z4, z4);
@@ -101,10 +101,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
ret = svadd_m (pg, ret, shift);
/* Account for the sign of x and y. */
- ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
- return special_case (y, x, ret, cmp_xy);
+ return special_case (
+ y, x,
+ svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)),
+ cmp_xy);
- return ret;
+ return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
}
@@ -63,8 +63,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
special-case handler later. */
r = vbslq_f64 (cmp, v_f64 (1.0), r);
#else
- cmp = vcageq_f64 (d->range_val, x);
- cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */
+ cmp = vcageq_f64 (x, d->range_val);
r = x;
#endif
@@ -64,8 +64,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
special-case handler later. */
r = vbslq_f32 (cmp, v_f32 (1.0f), r);
#else
- cmp = vcageq_f32 (d->range_val, x);
- cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */
+ cmp = vcageq_f32 (x, d->range_val);
r = x;
#endif
@@ -57,7 +57,7 @@ const static struct data
# define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */
# define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */
-static inline float64x2_t VPCS_ATTR
+static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
{
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
@@ -72,7 +72,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
-static float64x2_t VPCS_ATTR NOINLINE
+static inline float64x2_t VPCS_ATTR
special_case (float64x2_t s, float64x2_t y, float64x2_t n,
const struct data *d)
{
@@ -25,7 +25,8 @@
static const struct data
{
float32x4_t poly[5];
- float32x4_t shift, log10_2, log2_10_hi, log2_10_lo;
+ float32x4_t log10_2_and_inv, shift;
+
#if !WANT_SIMD_EXCEPT
float32x4_t scale_thresh;
#endif
@@ -38,9 +39,9 @@ static const struct data
.poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
.shift = V4 (0x1.8p23f),
- .log10_2 = V4 (0x1.a934fp+1),
- .log2_10_hi = V4 (0x1.344136p-2),
- .log2_10_lo = V4 (-0x1.ec10cp-27),
+
+ /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
+ .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
#if !WANT_SIMD_EXCEPT
.scale_thresh = V4 (ScaleBound)
#endif
@@ -98,24 +99,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
#if WANT_SIMD_EXCEPT
/* asuint(x) - TinyBound >= BigBound - TinyBound. */
uint32x4_t cmp = vcgeq_u32 (
- vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
- TinyBound),
- Thres);
+ vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
float32x4_t xm = x;
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
special case handler to fix special lanes later. This is only necessary if
fenv exceptions are to be triggered correctly. */
if (__glibc_unlikely (v_any_u32 (cmp)))
- x = vbslq_f32 (cmp, v_f32 (1), x);
+ x = v_zerofy_f32 (x, cmp);
#endif
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
- float32x4_t z = vfmaq_f32 (d->shift, x, d->log10_2);
+ float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
float32x4_t n = vsubq_f32 (z, d->shift);
- float32x4_t r = vfmsq_f32 (x, n, d->log2_10_hi);
- r = vfmsq_f32 (r, n, d->log2_10_lo);
+ float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
+ r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
@@ -24,6 +24,7 @@
#define IndexMask (N - 1)
#define BigBound 1022.0
#define UOFlowBound 1280.0
+#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
static const struct data
{
@@ -48,14 +49,13 @@ lookup_sbits (uint64x2_t i)
#if WANT_SIMD_EXCEPT
-# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
/* Call scalar exp2 as a fallback. */
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
{
- return v_call_f64 (exp2, x, x, v_u64 (0xffffffffffffffff));
+ return v_call_f64 (exp2, x, y, is_special);
}
#else
@@ -65,7 +65,7 @@ special_case (float64x2_t x)
# define SpecialBias1 0x7000000000000000 /* 0x1p769. */
# define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
-static float64x2_t VPCS_ATTR
+static inline float64x2_t VPCS_ATTR
special_case (float64x2_t s, float64x2_t y, float64x2_t n,
const struct data *d)
{
@@ -94,10 +94,10 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
#if WANT_SIMD_EXCEPT
uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres));
- /* If any special case (inf, nan, small and large x) is detected,
- fall back to scalar for all lanes. */
- if (__glibc_unlikely (v_any_u64 (cmp)))
- return special_case (x);
+ /* Mask special lanes and retain a copy of x for passing to special-case
+ handler. */
+ float64x2_t xc = x;
+ x = v_zerofy_f64 (x, cmp);
#else
cmp = vcagtq_f64 (x, d->scale_big_bound);
#endif
@@ -120,9 +120,11 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly);
y = vmulq_f64 (r, y);
-#if !WANT_SIMD_EXCEPT
if (__glibc_unlikely (v_any_u64 (cmp)))
+#if !WANT_SIMD_EXCEPT
return special_case (s, y, n, d);
+#else
+ return special_case (xc, vfmaq_f64 (s, s, y), cmp);
#endif
return vfmaq_f64 (s, s, y);
}
@@ -20,6 +20,8 @@
#include "sv_math.h"
#include "poly_sve_f32.h"
+#define Thres 0x1.5d5e2ap+6f
+
static const struct data
{
float poly[5];
@@ -33,7 +35,7 @@ static const struct data
.shift = 0x1.903f8p17f,
/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
correctly by FEXPA. */
- .thres = 0x1.5d5e2ap+6f,
+ .thres = Thres,
};
static svfloat32_t NOINLINE
@@ -54,7 +54,7 @@ const static volatile struct
# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
-static inline float64x2_t VPCS_ATTR
+static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
{
/* If fenv exceptions are to be triggered correctly, fall back to the scalar
@@ -69,7 +69,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
-static float64x2_t VPCS_ATTR NOINLINE
+static inline float64x2_t VPCS_ATTR
special_case (float64x2_t s, float64x2_t y, float64x2_t n)
{
/* 2^(n/N) may overflow, break it up into s1*s2. */
@@ -23,7 +23,7 @@
static const struct data
{
float64x2_t poly[11];
- float64x2_t invln2, ln2_lo, ln2_hi, shift;
+ float64x2_t invln2, ln2, shift;
int64x2_t exponent_bias;
#if WANT_SIMD_EXCEPT
uint64x2_t thresh, tiny_bound;
@@ -38,8 +38,7 @@ static const struct data
V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
.invln2 = V2 (0x1.71547652b82fep0),
- .ln2_hi = V2 (0x1.62e42fefa39efp-1),
- .ln2_lo = V2 (0x1.abc9e3b39803fp-56),
+ .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
.shift = V2 (0x1.8p52),
.exponent_bias = V2 (0x3ff0000000000000),
#if WANT_SIMD_EXCEPT
@@ -83,7 +82,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
x = v_zerofy_f64 (x, special);
#else
/* Large input, NaNs and Infs. */
- uint64x2_t special = vceqzq_u64 (vcaltq_f64 (x, d->oflow_bound));
+ uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
#endif
/* Reduce argument to smaller range:
@@ -93,8 +92,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
where 2^i is exact because i is an integer. */
float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
int64x2_t i = vcvtq_s64_f64 (n);
- float64x2_t f = vfmsq_f64 (x, n, d->ln2_hi);
- f = vfmsq_f64 (f, n, d->ln2_lo);
+ float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0);
+ f = vfmsq_laneq_f64 (f, n, d->ln2, 1);
/* Approximate expm1(f) using polynomial.
Taylor expansion for expm1(x) has the form:
@@ -23,7 +23,8 @@
static const struct data
{
float32x4_t poly[5];
- float32x4_t invln2, ln2_lo, ln2_hi, shift;
+ float32x4_t invln2_and_ln2;
+ float32x4_t shift;
int32x4_t exponent_bias;
#if WANT_SIMD_EXCEPT
uint32x4_t thresh;
@@ -34,9 +35,8 @@ static const struct data
/* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
.poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
- .invln2 = V4 (0x1.715476p+0f),
- .ln2_hi = V4 (0x1.62e4p-1f),
- .ln2_lo = V4 (0x1.7f7d1cp-20f),
+ /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
+ .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
.shift = V4 (0x1.8p23f),
.exponent_bias = V4 (0x3f800000),
#if !WANT_SIMD_EXCEPT
@@ -80,7 +80,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
x = v_zerofy_f32 (x, special);
#else
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
- uint32x4_t special = vceqzq_u32 (vcaltq_f32 (x, d->oflow_bound));
+ uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
#endif
/* Reduce argument to smaller range:
@@ -88,10 +88,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
- float32x4_t j = vsubq_f32 (vfmaq_f32 (d->shift, d->invln2, x), d->shift);
+ float32x4_t j = vsubq_f32 (
+ vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_f32 (x, j, d->ln2_hi);
- f = vfmsq_f32 (f, j, d->ln2_lo);
+ float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
+ f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
/* Approximate expm1(f) using polynomial.
Taylor expansion for expm1(x) has the form:
@@ -58,8 +58,13 @@ lookup (uint64x2_t i)
uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+#if __BYTE_ORDER == __LITTLE_ENDIAN
e.invc = vuzp1q_f64 (e0, e1);
e.logc = vuzp2q_f64 (e0, e1);
+#else
+ e.invc = vuzp1q_f64 (e1, e0);
+ e.logc = vuzp2q_f64 (e1, e0);
+#endif
return e;
}
@@ -75,8 +75,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
#else
r = x;
- cmp = vcageq_f64 (d->range_val, x);
- cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */
+ cmp = vcageq_f64 (x, d->range_val);
#endif
/* n = rint(|x|/pi). */
@@ -67,8 +67,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
#else
r = x;
- cmp = vcageq_f32 (d->range_val, x);
- cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */
+ cmp = vcageq_f32 (x, d->range_val);
#endif
/* n = rint(|x|/pi) */
@@ -23,7 +23,7 @@
static const struct data
{
float64x2_t poly[9];
- float64x2_t half_pi_hi, half_pi_lo, two_over_pi, shift;
+ float64x2_t half_pi, two_over_pi, shift;
#if !WANT_SIMD_EXCEPT
float64x2_t range_val;
#endif
@@ -34,8 +34,7 @@ static const struct data
V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9),
V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11),
V2 (0x1.4e4fd14147622p-12) },
- .half_pi_hi = V2 (0x1.921fb54442d18p0),
- .half_pi_lo = V2 (0x1.1a62633145c07p-54),
+ .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
.two_over_pi = V2 (0x1.45f306dc9c883p-1),
.shift = V2 (0x1.8p52),
#if !WANT_SIMD_EXCEPT
@@ -56,15 +55,15 @@ special_case (float64x2_t x)
/* Vector approximation for double-precision tan.
Maximum measured error is 3.48 ULP:
- __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
- want -0x1.f6ccd8ecf7deap+37. */
+ _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
+ want -0x1.f6ccd8ecf7deap+37. */
float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
{
const struct data *dat = ptr_barrier (&data);
- /* Our argument reduction cannot calculate q with sufficient accuracy for very
- large inputs. Fall back to scalar routine for all lanes if any are too
- large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny
- input to avoid underflow. */
+ /* Our argument reduction cannot calculate q with sufficient accuracy for
+ very large inputs. Fall back to scalar routine for all lanes if any are
+ too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
+ tiny input to avoid underflow. */
#if WANT_SIMD_EXCEPT
uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
/* iax - tiny_bound > range_val - tiny_bound. */
@@ -82,8 +81,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
/* Use q to reduce x to r in [-pi/4, pi/4], by:
r = x - q * pi/2, in extended precision. */
float64x2_t r = x;
- r = vfmsq_f64 (r, q, dat->half_pi_hi);
- r = vfmsq_f64 (r, q, dat->half_pi_lo);
+ r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
+ r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
/* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
formula. */
r = vmulq_n_f64 (r, 0.5);
@@ -106,14 +105,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
and reciprocity around pi/2:
tan(x) = 1 / (tan(pi/2 - x))
to assemble result using change-of-sign and conditional selection of
- numerator/denominator, dependent on odd/even-ness of q (hence quadrant). */
+ numerator/denominator, dependent on odd/even-ness of q (hence quadrant).
+ */
float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p);
float64x2_t d = vaddq_f64 (p, p);
uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
#if !WANT_SIMD_EXCEPT
- uint64x2_t special = vceqzq_u64 (vcaleq_f64 (x, dat->range_val));
+ uint64x2_t special = vcageq_f64 (x, dat->range_val);
if (__glibc_unlikely (v_any_u64 (special)))
return special_case (x);
#endif
@@ -23,7 +23,8 @@
static const struct data
{
float32x4_t poly[6];
- float32x4_t neg_half_pi_1, neg_half_pi_2, neg_half_pi_3, two_over_pi, shift;
+ float32x4_t pi_consts;
+ float32x4_t shift;
#if !WANT_SIMD_EXCEPT
float32x4_t range_val;
#endif
@@ -31,10 +32,9 @@ static const struct data
/* Coefficients generated using FPMinimax. */
.poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f),
V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) },
- .neg_half_pi_1 = V4 (-0x1.921fb6p+0f),
- .neg_half_pi_2 = V4 (0x1.777a5cp-25f),
- .neg_half_pi_3 = V4 (0x1.ee59dap-50f),
- .two_over_pi = V4 (0x1.45f306p-1f),
+ /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi. */
+ .pi_consts
+ = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
.shift = V4 (0x1.8p+23f),
#if !WANT_SIMD_EXCEPT
.range_val = V4 (0x1p15f),
@@ -58,10 +58,11 @@ eval_poly (float32x4_t z, const struct data *d)
{
float32x4_t z2 = vmulq_f32 (z, z);
#if WANT_SIMD_EXCEPT
- /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions
- are to be triggered correctly, sidestep this by fixing such lanes to 0. */
+ /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
+ If fp exceptions are to be triggered correctly,
+ sidestep this by fixing such lanes to 0. */
uint32x4_t will_uflow
- = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
+ = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
if (__glibc_unlikely (v_any_u32 (will_uflow)))
z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
#endif
@@ -94,16 +95,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
#endif
/* n = rint(x/(pi/2)). */
- float32x4_t q = vfmaq_f32 (d->shift, d->two_over_pi, x);
+ float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
float32x4_t n = vsubq_f32 (q, d->shift);
/* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
/* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */
float32x4_t r;
- r = vfmaq_f32 (x, d->neg_half_pi_1, n);
- r = vfmaq_f32 (r, d->neg_half_pi_2, n);
- r = vfmaq_f32 (r, d->neg_half_pi_3, n);
+ r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
+ r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
+ r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
/* If x lives in an interval, where |tan(x)|
- is finite, then use a polynomial approximation of the form