[v3,1/2] aarch64: Add half-width versions of AdvSIMD f32 libmvec routines

Message ID 20231219164401.21756-1-Joe.Ramsay@arm.com
State Committed
Commit cc0d77ba944cd4ce46c5f0e6d426af3057962ca5
Headers
Series [v3,1/2] aarch64: Add half-width versions of AdvSIMD f32 libmvec routines |

Commit Message

Joe Ramsay Dec. 19, 2023, 4:44 p.m. UTC
  Compilers may emit calls to 'half-width' routines (two-lane
single-precision variants). These have been added in the form of
wrappers around the full-width versions, where the two-lane input is
duplicated into both halves of the full-width vector and the low half
of the result is returned. This will perform poorly when one lane
triggers the special-case handler, as there will be a redundant call
to the scalar version; however, this is expected to be rare at -Ofast.
---
Changes from v2:
* Mark full-width versions NOINLINE to keep code size down
* Also filed GCC bug 113084 for the tail calls
 include/libc-symbols.h                        |  2 ++
 sysdeps/aarch64/fpu/Versions                  | 15 ++++++++
 sysdeps/aarch64/fpu/acosf_advsimd.c           |  4 ++-
 sysdeps/aarch64/fpu/advsimd_f32_protos.h      | 34 +++++++++++++++++++
 sysdeps/aarch64/fpu/asinf_advsimd.c           |  4 ++-
 sysdeps/aarch64/fpu/atan2f_advsimd.c          |  4 ++-
 sysdeps/aarch64/fpu/atanf_advsimd.c           |  4 ++-
 sysdeps/aarch64/fpu/cosf_advsimd.c            |  4 ++-
 sysdeps/aarch64/fpu/exp10f_advsimd.c          |  4 ++-
 sysdeps/aarch64/fpu/exp2f_advsimd.c           |  4 ++-
 sysdeps/aarch64/fpu/expf_advsimd.c            |  4 ++-
 sysdeps/aarch64/fpu/expm1f_advsimd.c          |  4 ++-
 sysdeps/aarch64/fpu/log10f_advsimd.c          |  4 ++-
 sysdeps/aarch64/fpu/log1pf_advsimd.c          |  2 ++
 sysdeps/aarch64/fpu/log2f_advsimd.c           |  4 ++-
 sysdeps/aarch64/fpu/logf_advsimd.c            |  4 ++-
 sysdeps/aarch64/fpu/sinf_advsimd.c            |  4 ++-
 sysdeps/aarch64/fpu/tanf_advsimd.c            |  4 ++-
 sysdeps/aarch64/fpu/v_math.h                  | 15 ++++++++
 .../unix/sysv/linux/aarch64/libmvec.abilist   | 15 ++++++++
 20 files changed, 125 insertions(+), 14 deletions(-)
 create mode 100644 sysdeps/aarch64/fpu/advsimd_f32_protos.h
  

Comments

Szabolcs Nagy Dec. 20, 2023, 8:48 a.m. UTC | #1
The 12/19/2023 16:44, Joe Ramsay wrote:
> Compilers may emit calls to 'half-width' routines (two-lane
> single-precision variants). These have been added in the form of
> wrappers around the full-width versions, where the low half of the
> vector is simply duplicated. This will perform poorly when one lane
> triggers the special-case handler, as there will be a redundant call
> to the scalar version, however this is expected to be rare at Ofast.
> ---
> Changes from v2:
> * Mark full-width versions NOINLINE to keep code size down
> * Also filed GCC bug 113084 for the tail calls

thanks, committed.


>  include/libc-symbols.h                        |  2 ++
>  sysdeps/aarch64/fpu/Versions                  | 15 ++++++++
>  sysdeps/aarch64/fpu/acosf_advsimd.c           |  4 ++-
>  sysdeps/aarch64/fpu/advsimd_f32_protos.h      | 34 +++++++++++++++++++
>  sysdeps/aarch64/fpu/asinf_advsimd.c           |  4 ++-
>  sysdeps/aarch64/fpu/atan2f_advsimd.c          |  4 ++-
>  sysdeps/aarch64/fpu/atanf_advsimd.c           |  4 ++-
>  sysdeps/aarch64/fpu/cosf_advsimd.c            |  4 ++-
>  sysdeps/aarch64/fpu/exp10f_advsimd.c          |  4 ++-
>  sysdeps/aarch64/fpu/exp2f_advsimd.c           |  4 ++-
>  sysdeps/aarch64/fpu/expf_advsimd.c            |  4 ++-
>  sysdeps/aarch64/fpu/expm1f_advsimd.c          |  4 ++-
>  sysdeps/aarch64/fpu/log10f_advsimd.c          |  4 ++-
>  sysdeps/aarch64/fpu/log1pf_advsimd.c          |  2 ++
>  sysdeps/aarch64/fpu/log2f_advsimd.c           |  4 ++-
>  sysdeps/aarch64/fpu/logf_advsimd.c            |  4 ++-
>  sysdeps/aarch64/fpu/sinf_advsimd.c            |  4 ++-
>  sysdeps/aarch64/fpu/tanf_advsimd.c            |  4 ++-
>  sysdeps/aarch64/fpu/v_math.h                  | 15 ++++++++
>  .../unix/sysv/linux/aarch64/libmvec.abilist   | 15 ++++++++
>  20 files changed, 125 insertions(+), 14 deletions(-)
>  create mode 100644 sysdeps/aarch64/fpu/advsimd_f32_protos.h
> 
> diff --git a/include/libc-symbols.h b/include/libc-symbols.h
> index 5794614488..a226119295 100644
> --- a/include/libc-symbols.h
> +++ b/include/libc-symbols.h
> @@ -600,8 +600,10 @@ for linking")
>  #endif
>  
>  #if IS_IN (libmvec)
> +# define libmvec_hidden_proto(name, attrs...) hidden_proto (name, ##attrs)
>  # define libmvec_hidden_def(name) hidden_def (name)
>  #else
> +# define libmvec_hidden_proto(name, attrs...)
>  # define libmvec_hidden_def(name)
>  #endif
>  
> diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
> index aaacacaebe..accd101184 100644
> --- a/sysdeps/aarch64/fpu/Versions
> +++ b/sysdeps/aarch64/fpu/Versions
> @@ -18,47 +18,62 @@ libmvec {
>      _ZGVsMxv_sinf;
>    }
>    GLIBC_2.39 {
> +    _ZGVnN2v_cosf;
> +    _ZGVnN2v_expf;
> +    _ZGVnN2v_logf;
> +    _ZGVnN2v_sinf;
>      _ZGVnN4v_acosf;
> +    _ZGVnN2v_acosf;
>      _ZGVnN2v_acos;
>      _ZGVsMxv_acosf;
>      _ZGVsMxv_acos;
>      _ZGVnN4v_asinf;
> +    _ZGVnN2v_asinf;
>      _ZGVnN2v_asin;
>      _ZGVsMxv_asinf;
>      _ZGVsMxv_asin;
>      _ZGVnN4v_atanf;
> +    _ZGVnN2v_atanf;
>      _ZGVnN2v_atan;
>      _ZGVsMxv_atanf;
>      _ZGVsMxv_atan;
>      _ZGVnN4vv_atan2f;
> +    _ZGVnN2vv_atan2f;
>      _ZGVnN2vv_atan2;
>      _ZGVsMxvv_atan2f;
>      _ZGVsMxvv_atan2;
>      _ZGVnN4v_exp10f;
> +    _ZGVnN2v_exp10f;
>      _ZGVnN2v_exp10;
>      _ZGVsMxv_exp10f;
>      _ZGVsMxv_exp10;
>      _ZGVnN4v_exp2f;
> +    _ZGVnN2v_exp2f;
>      _ZGVnN2v_exp2;
>      _ZGVsMxv_exp2f;
>      _ZGVsMxv_exp2;
>      _ZGVnN4v_expm1f;
> +    _ZGVnN2v_expm1f;
>      _ZGVnN2v_expm1;
>      _ZGVsMxv_expm1f;
>      _ZGVsMxv_expm1;
>      _ZGVnN4v_log10f;
> +    _ZGVnN2v_log10f;
>      _ZGVnN2v_log10;
>      _ZGVsMxv_log10f;
>      _ZGVsMxv_log10;
>      _ZGVnN4v_log1pf;
> +    _ZGVnN2v_log1pf;
>      _ZGVnN2v_log1p;
>      _ZGVsMxv_log1pf;
>      _ZGVsMxv_log1p;
>      _ZGVnN4v_log2f;
> +    _ZGVnN2v_log2f;
>      _ZGVnN2v_log2;
>      _ZGVsMxv_log2f;
>      _ZGVsMxv_log2;
>      _ZGVnN4v_tanf;
> +    _ZGVnN2v_tanf;
>      _ZGVnN2v_tan;
>      _ZGVsMxv_tanf;
>      _ZGVsMxv_tan;
> diff --git a/sysdeps/aarch64/fpu/acosf_advsimd.c b/sysdeps/aarch64/fpu/acosf_advsimd.c
> index 7d39e9b805..a73e3b09b2 100644
> --- a/sysdeps/aarch64/fpu/acosf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/acosf_advsimd.c
> @@ -68,7 +68,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
>     The largest observed error in this region is 1.32 ulps,
>     _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
>  			   want 0x1.feb32ep-1.  */
> -float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>  
> @@ -111,3 +111,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
>  
>    return vfmaq_f32 (add, mul, y);
>  }
> +libmvec_hidden_def (V_NAME_F1(acos))
> +HALF_WIDTH_ALIAS_F1 (acos)
> diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
> new file mode 100644
> index 0000000000..b406ad7156
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
> @@ -0,0 +1,34 @@
> +/* Hidden prototypes for single-precision AdvSIMD routines
> +
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +libmvec_hidden_proto (V_NAME_F1(acos));
> +libmvec_hidden_proto (V_NAME_F1(asin));
> +libmvec_hidden_proto (V_NAME_F1(atan));
> +libmvec_hidden_proto (V_NAME_F1(cos));
> +libmvec_hidden_proto (V_NAME_F1(exp10));
> +libmvec_hidden_proto (V_NAME_F1(exp2));
> +libmvec_hidden_proto (V_NAME_F1(exp));
> +libmvec_hidden_proto (V_NAME_F1(expm1));
> +libmvec_hidden_proto (V_NAME_F1(log10));
> +libmvec_hidden_proto (V_NAME_F1(log1p));
> +libmvec_hidden_proto (V_NAME_F1(log2));
> +libmvec_hidden_proto (V_NAME_F1(log));
> +libmvec_hidden_proto (V_NAME_F1(sin));
> +libmvec_hidden_proto (V_NAME_F1(tan));
> +libmvec_hidden_proto (V_NAME_F2(atan2));
> diff --git a/sysdeps/aarch64/fpu/asinf_advsimd.c b/sysdeps/aarch64/fpu/asinf_advsimd.c
> index 3180ae7c8e..9a100e52fe 100644
> --- a/sysdeps/aarch64/fpu/asinf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/asinf_advsimd.c
> @@ -63,7 +63,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
>  
>     The largest observed error in this region is 2.41 ulps,
>       _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1.  */
> -float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>  
> @@ -102,3 +102,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
>    /* Copy sign.  */
>    return vbslq_f32 (v_u32 (AbsMask), y, x);
>  }
> +libmvec_hidden_def (V_NAME_F1 (asin))
> +HALF_WIDTH_ALIAS_F1 (asin)
> diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c
> index 5a5a6202d1..39b682c97a 100644
> --- a/sysdeps/aarch64/fpu/atan2f_advsimd.c
> +++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c
> @@ -56,7 +56,7 @@ zeroinfnan (uint32x4_t i)
>     2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
>     _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
>  						 want 0x1.967f00p-1.  */
> -float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
>  {
>    const struct data *data_ptr = ptr_barrier (&data);
>  
> @@ -114,3 +114,5 @@ float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
>  
>    return ret;
>  }
> +libmvec_hidden_def (V_NAME_F2 (atan2))
> +HALF_WIDTH_ALIAS_F2(atan2)
> diff --git a/sysdeps/aarch64/fpu/atanf_advsimd.c b/sysdeps/aarch64/fpu/atanf_advsimd.c
> index 589b0e8c96..eaa96a8efe 100644
> --- a/sysdeps/aarch64/fpu/atanf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/atanf_advsimd.c
> @@ -53,7 +53,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
>     atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
>     using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps:
>     _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1.  */
> -float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>  
> @@ -107,3 +107,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
>  
>    return y;
>  }
> +libmvec_hidden_def (V_NAME_F1 (atan))
> +HALF_WIDTH_ALIAS_F1 (atan)
> diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
> index f05dd2bcda..b7f78aa8a3 100644
> --- a/sysdeps/aarch64/fpu/cosf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
> @@ -48,7 +48,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
>    return v_call_f32 (cosf, x, y, cmp);
>  }
>  
> -float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>    float32x4_t n, r, r2, r3, y;
> @@ -92,3 +92,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
>      return special_case (x, y, odd, cmp);
>    return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
>  }
> +libmvec_hidden_def (V_NAME_F1 (cos))
> +HALF_WIDTH_ALIAS_F1 (cos)
> diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
> index 9e754c46fa..645462acad 100644
> --- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
> +++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
> @@ -92,7 +92,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
>     Algorithm is accurate to 2.36 ULP.
>     _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
>  				 want 0x1.7e79cp+11.  */
> -float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>  #if WANT_SIMD_EXCEPT
> @@ -138,3 +138,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
>  
>    return vfmaq_f32 (scale, poly, scale);
>  }
> +libmvec_hidden_def (V_NAME_F1 (exp10))
> +HALF_WIDTH_ALIAS_F1 (exp10)
> diff --git a/sysdeps/aarch64/fpu/exp2f_advsimd.c b/sysdeps/aarch64/fpu/exp2f_advsimd.c
> index 70b3ab66c1..549d77a45f 100644
> --- a/sysdeps/aarch64/fpu/exp2f_advsimd.c
> +++ b/sysdeps/aarch64/fpu/exp2f_advsimd.c
> @@ -77,7 +77,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
>  
>  #endif
>  
> -float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>    float32x4_t n, r, r2, scale, p, q, poly;
> @@ -122,3 +122,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
>  
>    return vfmaq_f32 (scale, poly, scale);
>  }
> +libmvec_hidden_def (V_NAME_F1 (exp2))
> +HALF_WIDTH_ALIAS_F1 (exp2)
> diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
> index 69d5d1ea77..9d6455689c 100644
> --- a/sysdeps/aarch64/fpu/expf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/expf_advsimd.c
> @@ -82,7 +82,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
>  
>  #endif
>  
> -float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>    float32x4_t n, r, r2, scale, p, q, poly, z;
> @@ -131,3 +131,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
>  
>    return vfmaq_f32 (scale, poly, scale);
>  }
> +libmvec_hidden_def (V_NAME_F1 (exp))
> +HALF_WIDTH_ALIAS_F1 (exp)
> diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
> index b27b75068a..a6fe5627e5 100644
> --- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
> +++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
> @@ -64,7 +64,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
>     The maximum error is 1.51 ULP:
>     _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
>  				  want 0x1.e2fb94p-2.  */
> -float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>    uint32x4_t ix = vreinterpretq_u32_f32 (x);
> @@ -115,3 +115,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x)
>    /* expm1(x) ~= p * t + (t - 1).  */
>    return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
>  }
> +libmvec_hidden_def (V_NAME_F1 (expm1))
> +HALF_WIDTH_ALIAS_F1 (expm1)
> diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c
> index ba02060bbe..26fe14765f 100644
> --- a/sysdeps/aarch64/fpu/log10f_advsimd.c
> +++ b/sysdeps/aarch64/fpu/log10f_advsimd.c
> @@ -55,7 +55,7 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
>     Maximum error: 3.305ulps (nearest rounding.)
>     _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
>  				 want 0x1.ffe2f4p-4.  */
> -float32x4_t VPCS_ATTR V_NAME_F1 (log10) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>    uint32x4_t u = vreinterpretq_u32_f32 (x);
> @@ -80,3 +80,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (log10) (float32x4_t x)
>      return special_case (x, y, poly, r2, special);
>    return vfmaq_f32 (y, poly, r2);
>  }
> +libmvec_hidden_def (V_NAME_F1 (log10))
> +HALF_WIDTH_ALIAS_F1 (log10)
> diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
> index 3748830de8..0530dc2002 100644
> --- a/sysdeps/aarch64/fpu/log1pf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
> @@ -126,3 +126,5 @@ VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
>      return special_case (special_arg, y, special_cases);
>    return y;
>  }
> +libmvec_hidden_def (V_NAME_F1 (log1p))
> +HALF_WIDTH_ALIAS_F1 (log1p)
> diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c
> index e913bcda18..5b10e85263 100644
> --- a/sysdeps/aarch64/fpu/log2f_advsimd.c
> +++ b/sysdeps/aarch64/fpu/log2f_advsimd.c
> @@ -53,7 +53,7 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
>     Maximum error: 2.48 ULPs
>     _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
>  				want 0x1.a9be8p-2.  */
> -float32x4_t VPCS_ATTR V_NAME_F1 (log2) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>    uint32x4_t u = vreinterpretq_u32_f32 (x);
> @@ -75,3 +75,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (log2) (float32x4_t x)
>      return special_case (x, n, p, r, special);
>    return vfmaq_f32 (n, p, r);
>  }
> +libmvec_hidden_def (V_NAME_F1 (log2))
> +HALF_WIDTH_ALIAS_F1 (log2)
> diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
> index 93903c7962..d9e6498a71 100644
> --- a/sysdeps/aarch64/fpu/logf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/logf_advsimd.c
> @@ -49,7 +49,7 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
>    return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
>  }
>  
> -float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>    float32x4_t n, p, q, r, r2, y;
> @@ -83,3 +83,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
>      return special_case (x, y, r2, p, cmp);
>    return vfmaq_f32 (p, y, r2);
>  }
> +libmvec_hidden_def (V_NAME_F1 (log))
> +HALF_WIDTH_ALIAS_F1 (log)
> diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c
> index 0e78cf55f0..1966bd6e8f 100644
> --- a/sysdeps/aarch64/fpu/sinf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/sinf_advsimd.c
> @@ -52,7 +52,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
>    return v_call_f32 (sinf, x, y, cmp);
>  }
>  
> -float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>    float32x4_t n, r, r2, y;
> @@ -92,3 +92,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
>      return special_case (x, y, odd, cmp);
>    return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
>  }
> +libmvec_hidden_def (V_NAME_F1 (sin))
> +HALF_WIDTH_ALIAS_F1 (sin)
> diff --git a/sysdeps/aarch64/fpu/tanf_advsimd.c b/sysdeps/aarch64/fpu/tanf_advsimd.c
> index 4c8a7f740e..16b39e17a0 100644
> --- a/sysdeps/aarch64/fpu/tanf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/tanf_advsimd.c
> @@ -73,7 +73,7 @@ eval_poly (float32x4_t z, const struct data *d)
>     Maximum error is 3.45 ULP:
>     __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
>  			    want 0x1.ff9850p-1.  */
> -float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
> +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
>  {
>    const struct data *d = ptr_barrier (&data);
>    float32x4_t special_arg = x;
> @@ -127,3 +127,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
>      return special_case (special_arg, vbslq_f32 (pred_alt, inv_y, y), special);
>    return vbslq_f32 (pred_alt, inv_y, y);
>  }
> +libmvec_hidden_def (V_NAME_F1 (tan))
> +HALF_WIDTH_ALIAS_F1 (tan)
> diff --git a/sysdeps/aarch64/fpu/v_math.h b/sysdeps/aarch64/fpu/v_math.h
> index d286eb81b3..e8ac0e2332 100644
> --- a/sysdeps/aarch64/fpu/v_math.h
> +++ b/sysdeps/aarch64/fpu/v_math.h
> @@ -29,6 +29,21 @@
>  #define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
>  #define V_NAME_D2(fun) _ZGVnN2vv_##fun
>  
> +#include "advsimd_f32_protos.h"
> +
> +#define HALF_WIDTH_ALIAS_F1(fun)                                              \
> +  float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x)                     \
> +  {                                                                           \
> +    return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x)));            \
> +  }
> +
> +#define HALF_WIDTH_ALIAS_F2(fun)                                              \
> +  float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y)     \
> +  {                                                                           \
> +    return vget_low_f32 (                                                     \
> +	_ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y)));       \
> +  }
> +
>  /* Shorthand helpers for declaring constants.  */
>  #define V2(X) { X, X }
>  #define V4(X) { X, X, X, X }
> diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> index 2bf4ea6332..580952b4de 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
> @@ -15,16 +15,31 @@ GLIBC_2.38 _ZGVsMxv_logf F
>  GLIBC_2.38 _ZGVsMxv_sin F
>  GLIBC_2.38 _ZGVsMxv_sinf F
>  GLIBC_2.39 _ZGVnN2v_acos F
> +GLIBC_2.39 _ZGVnN2v_acosf F
>  GLIBC_2.39 _ZGVnN2v_asin F
> +GLIBC_2.39 _ZGVnN2v_asinf F
>  GLIBC_2.39 _ZGVnN2v_atan F
> +GLIBC_2.39 _ZGVnN2v_atanf F
> +GLIBC_2.39 _ZGVnN2v_cosf F
>  GLIBC_2.39 _ZGVnN2v_exp10 F
> +GLIBC_2.39 _ZGVnN2v_exp10f F
>  GLIBC_2.39 _ZGVnN2v_exp2 F
> +GLIBC_2.39 _ZGVnN2v_exp2f F
> +GLIBC_2.39 _ZGVnN2v_expf F
>  GLIBC_2.39 _ZGVnN2v_expm1 F
> +GLIBC_2.39 _ZGVnN2v_expm1f F
>  GLIBC_2.39 _ZGVnN2v_log10 F
> +GLIBC_2.39 _ZGVnN2v_log10f F
>  GLIBC_2.39 _ZGVnN2v_log1p F
> +GLIBC_2.39 _ZGVnN2v_log1pf F
>  GLIBC_2.39 _ZGVnN2v_log2 F
> +GLIBC_2.39 _ZGVnN2v_log2f F
> +GLIBC_2.39 _ZGVnN2v_logf F
> +GLIBC_2.39 _ZGVnN2v_sinf F
>  GLIBC_2.39 _ZGVnN2v_tan F
> +GLIBC_2.39 _ZGVnN2v_tanf F
>  GLIBC_2.39 _ZGVnN2vv_atan2 F
> +GLIBC_2.39 _ZGVnN2vv_atan2f F
>  GLIBC_2.39 _ZGVnN4v_acosf F
>  GLIBC_2.39 _ZGVnN4v_asinf F
>  GLIBC_2.39 _ZGVnN4v_atanf F
> -- 
> 2.27.0
>
  

Patch

diff --git a/include/libc-symbols.h b/include/libc-symbols.h
index 5794614488..a226119295 100644
--- a/include/libc-symbols.h
+++ b/include/libc-symbols.h
@@ -600,8 +600,10 @@  for linking")
 #endif
 
 #if IS_IN (libmvec)
+# define libmvec_hidden_proto(name, attrs...) hidden_proto (name, ##attrs)
 # define libmvec_hidden_def(name) hidden_def (name)
 #else
+# define libmvec_hidden_proto(name, attrs...)
 # define libmvec_hidden_def(name)
 #endif
 
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index aaacacaebe..accd101184 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -18,47 +18,62 @@  libmvec {
     _ZGVsMxv_sinf;
   }
   GLIBC_2.39 {
+    _ZGVnN2v_cosf;
+    _ZGVnN2v_expf;
+    _ZGVnN2v_logf;
+    _ZGVnN2v_sinf;
     _ZGVnN4v_acosf;
+    _ZGVnN2v_acosf;
     _ZGVnN2v_acos;
     _ZGVsMxv_acosf;
     _ZGVsMxv_acos;
     _ZGVnN4v_asinf;
+    _ZGVnN2v_asinf;
     _ZGVnN2v_asin;
     _ZGVsMxv_asinf;
     _ZGVsMxv_asin;
     _ZGVnN4v_atanf;
+    _ZGVnN2v_atanf;
     _ZGVnN2v_atan;
     _ZGVsMxv_atanf;
     _ZGVsMxv_atan;
     _ZGVnN4vv_atan2f;
+    _ZGVnN2vv_atan2f;
     _ZGVnN2vv_atan2;
     _ZGVsMxvv_atan2f;
     _ZGVsMxvv_atan2;
     _ZGVnN4v_exp10f;
+    _ZGVnN2v_exp10f;
     _ZGVnN2v_exp10;
     _ZGVsMxv_exp10f;
     _ZGVsMxv_exp10;
     _ZGVnN4v_exp2f;
+    _ZGVnN2v_exp2f;
     _ZGVnN2v_exp2;
     _ZGVsMxv_exp2f;
     _ZGVsMxv_exp2;
     _ZGVnN4v_expm1f;
+    _ZGVnN2v_expm1f;
     _ZGVnN2v_expm1;
     _ZGVsMxv_expm1f;
     _ZGVsMxv_expm1;
     _ZGVnN4v_log10f;
+    _ZGVnN2v_log10f;
     _ZGVnN2v_log10;
     _ZGVsMxv_log10f;
     _ZGVsMxv_log10;
     _ZGVnN4v_log1pf;
+    _ZGVnN2v_log1pf;
     _ZGVnN2v_log1p;
     _ZGVsMxv_log1pf;
     _ZGVsMxv_log1p;
     _ZGVnN4v_log2f;
+    _ZGVnN2v_log2f;
     _ZGVnN2v_log2;
     _ZGVsMxv_log2f;
     _ZGVsMxv_log2;
     _ZGVnN4v_tanf;
+    _ZGVnN2v_tanf;
     _ZGVnN2v_tan;
     _ZGVsMxv_tanf;
     _ZGVsMxv_tan;
diff --git a/sysdeps/aarch64/fpu/acosf_advsimd.c b/sysdeps/aarch64/fpu/acosf_advsimd.c
index 7d39e9b805..a73e3b09b2 100644
--- a/sysdeps/aarch64/fpu/acosf_advsimd.c
+++ b/sysdeps/aarch64/fpu/acosf_advsimd.c
@@ -68,7 +68,7 @@  special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
    The largest observed error in this region is 1.32 ulps,
    _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
 			   want 0x1.feb32ep-1.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
 
@@ -111,3 +111,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
 
   return vfmaq_f32 (add, mul, y);
 }
+libmvec_hidden_def (V_NAME_F1(acos))
+HALF_WIDTH_ALIAS_F1 (acos)
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
new file mode 100644
index 0000000000..b406ad7156
--- /dev/null
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -0,0 +1,34 @@ 
+/* Hidden prototypes for single-precision AdvSIMD routines
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+libmvec_hidden_proto (V_NAME_F1(acos));
+libmvec_hidden_proto (V_NAME_F1(asin));
+libmvec_hidden_proto (V_NAME_F1(atan));
+libmvec_hidden_proto (V_NAME_F1(cos));
+libmvec_hidden_proto (V_NAME_F1(exp10));
+libmvec_hidden_proto (V_NAME_F1(exp2));
+libmvec_hidden_proto (V_NAME_F1(exp));
+libmvec_hidden_proto (V_NAME_F1(expm1));
+libmvec_hidden_proto (V_NAME_F1(log10));
+libmvec_hidden_proto (V_NAME_F1(log1p));
+libmvec_hidden_proto (V_NAME_F1(log2));
+libmvec_hidden_proto (V_NAME_F1(log));
+libmvec_hidden_proto (V_NAME_F1(sin));
+libmvec_hidden_proto (V_NAME_F1(tan));
+libmvec_hidden_proto (V_NAME_F2(atan2));
diff --git a/sysdeps/aarch64/fpu/asinf_advsimd.c b/sysdeps/aarch64/fpu/asinf_advsimd.c
index 3180ae7c8e..9a100e52fe 100644
--- a/sysdeps/aarch64/fpu/asinf_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinf_advsimd.c
@@ -63,7 +63,7 @@  special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
 
    The largest observed error in this region is 2.41 ulps,
      _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
 
@@ -102,3 +102,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
   /* Copy sign.  */
   return vbslq_f32 (v_u32 (AbsMask), y, x);
 }
+libmvec_hidden_def (V_NAME_F1 (asin))
+HALF_WIDTH_ALIAS_F1 (asin)
diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c
index 5a5a6202d1..39b682c97a 100644
--- a/sysdeps/aarch64/fpu/atan2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c
@@ -56,7 +56,7 @@  zeroinfnan (uint32x4_t i)
    2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
    _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
 						 want 0x1.967f00p-1.  */
-float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
 {
   const struct data *data_ptr = ptr_barrier (&data);
 
@@ -114,3 +114,5 @@  float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
 
   return ret;
 }
+libmvec_hidden_def (V_NAME_F2 (atan2))
+HALF_WIDTH_ALIAS_F2(atan2)
diff --git a/sysdeps/aarch64/fpu/atanf_advsimd.c b/sysdeps/aarch64/fpu/atanf_advsimd.c
index 589b0e8c96..eaa96a8efe 100644
--- a/sysdeps/aarch64/fpu/atanf_advsimd.c
+++ b/sysdeps/aarch64/fpu/atanf_advsimd.c
@@ -53,7 +53,7 @@  special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
    atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
    using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps:
    _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
 
@@ -107,3 +107,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
 
   return y;
 }
+libmvec_hidden_def (V_NAME_F1 (atan))
+HALF_WIDTH_ALIAS_F1 (atan)
diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
index f05dd2bcda..b7f78aa8a3 100644
--- a/sysdeps/aarch64/fpu/cosf_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
@@ -48,7 +48,7 @@  special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
   return v_call_f32 (cosf, x, y, cmp);
 }
 
-float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   float32x4_t n, r, r2, r3, y;
@@ -92,3 +92,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
     return special_case (x, y, odd, cmp);
   return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
 }
+libmvec_hidden_def (V_NAME_F1 (cos))
+HALF_WIDTH_ALIAS_F1 (cos)
diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
index 9e754c46fa..645462acad 100644
--- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
@@ -92,7 +92,7 @@  special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
    Algorithm is accurate to 2.36 ULP.
    _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
 				 want 0x1.7e79cp+11.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
 #if WANT_SIMD_EXCEPT
@@ -138,3 +138,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
 
   return vfmaq_f32 (scale, poly, scale);
 }
+libmvec_hidden_def (V_NAME_F1 (exp10))
+HALF_WIDTH_ALIAS_F1 (exp10)
diff --git a/sysdeps/aarch64/fpu/exp2f_advsimd.c b/sysdeps/aarch64/fpu/exp2f_advsimd.c
index 70b3ab66c1..549d77a45f 100644
--- a/sysdeps/aarch64/fpu/exp2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp2f_advsimd.c
@@ -77,7 +77,7 @@  special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
 
 #endif
 
-float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   float32x4_t n, r, r2, scale, p, q, poly;
@@ -122,3 +122,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
 
   return vfmaq_f32 (scale, poly, scale);
 }
+libmvec_hidden_def (V_NAME_F1 (exp2))
+HALF_WIDTH_ALIAS_F1 (exp2)
diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
index 69d5d1ea77..9d6455689c 100644
--- a/sysdeps/aarch64/fpu/expf_advsimd.c
+++ b/sysdeps/aarch64/fpu/expf_advsimd.c
@@ -82,7 +82,7 @@  special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
 
 #endif
 
-float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   float32x4_t n, r, r2, scale, p, q, poly, z;
@@ -131,3 +131,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
 
   return vfmaq_f32 (scale, poly, scale);
 }
+libmvec_hidden_def (V_NAME_F1 (exp))
+HALF_WIDTH_ALIAS_F1 (exp)
diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
index b27b75068a..a6fe5627e5 100644
--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
@@ -64,7 +64,7 @@  special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
    The maximum error is 1.51 ULP:
    _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
 				  want 0x1.e2fb94p-2.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   uint32x4_t ix = vreinterpretq_u32_f32 (x);
@@ -115,3 +115,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x)
   /* expm1(x) ~= p * t + (t - 1).  */
   return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
 }
+libmvec_hidden_def (V_NAME_F1 (expm1))
+HALF_WIDTH_ALIAS_F1 (expm1)
diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c
index ba02060bbe..26fe14765f 100644
--- a/sysdeps/aarch64/fpu/log10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/log10f_advsimd.c
@@ -55,7 +55,7 @@  special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
    Maximum error: 3.305ulps (nearest rounding.)
    _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
 				 want 0x1.ffe2f4p-4.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (log10) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   uint32x4_t u = vreinterpretq_u32_f32 (x);
@@ -80,3 +80,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (log10) (float32x4_t x)
     return special_case (x, y, poly, r2, special);
   return vfmaq_f32 (y, poly, r2);
 }
+libmvec_hidden_def (V_NAME_F1 (log10))
+HALF_WIDTH_ALIAS_F1 (log10)
diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
index 3748830de8..0530dc2002 100644
--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c
+++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
@@ -126,3 +126,5 @@  VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
     return special_case (special_arg, y, special_cases);
   return y;
 }
+libmvec_hidden_def (V_NAME_F1 (log1p))
+HALF_WIDTH_ALIAS_F1 (log1p)
diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c
index e913bcda18..5b10e85263 100644
--- a/sysdeps/aarch64/fpu/log2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/log2f_advsimd.c
@@ -53,7 +53,7 @@  special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
    Maximum error: 2.48 ULPs
    _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
 				want 0x1.a9be8p-2.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (log2) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   uint32x4_t u = vreinterpretq_u32_f32 (x);
@@ -75,3 +75,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (log2) (float32x4_t x)
     return special_case (x, n, p, r, special);
   return vfmaq_f32 (n, p, r);
 }
+libmvec_hidden_def (V_NAME_F1 (log2))
+HALF_WIDTH_ALIAS_F1 (log2)
diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
index 93903c7962..d9e6498a71 100644
--- a/sysdeps/aarch64/fpu/logf_advsimd.c
+++ b/sysdeps/aarch64/fpu/logf_advsimd.c
@@ -49,7 +49,7 @@  special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
   return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
 }
 
-float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   float32x4_t n, p, q, r, r2, y;
@@ -83,3 +83,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
     return special_case (x, y, r2, p, cmp);
   return vfmaq_f32 (p, y, r2);
 }
+libmvec_hidden_def (V_NAME_F1 (log))
+HALF_WIDTH_ALIAS_F1 (log)
diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c
index 0e78cf55f0..1966bd6e8f 100644
--- a/sysdeps/aarch64/fpu/sinf_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinf_advsimd.c
@@ -52,7 +52,7 @@  special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
   return v_call_f32 (sinf, x, y, cmp);
 }
 
-float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   float32x4_t n, r, r2, y;
@@ -92,3 +92,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
     return special_case (x, y, odd, cmp);
   return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
 }
+libmvec_hidden_def (V_NAME_F1 (sin))
+HALF_WIDTH_ALIAS_F1 (sin)
diff --git a/sysdeps/aarch64/fpu/tanf_advsimd.c b/sysdeps/aarch64/fpu/tanf_advsimd.c
index 4c8a7f740e..16b39e17a0 100644
--- a/sysdeps/aarch64/fpu/tanf_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanf_advsimd.c
@@ -73,7 +73,7 @@  eval_poly (float32x4_t z, const struct data *d)
    Maximum error is 3.45 ULP:
    __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
 			    want 0x1.ff9850p-1.  */
-float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   float32x4_t special_arg = x;
@@ -127,3 +127,5 @@  float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
     return special_case (special_arg, vbslq_f32 (pred_alt, inv_y, y), special);
   return vbslq_f32 (pred_alt, inv_y, y);
 }
+libmvec_hidden_def (V_NAME_F1 (tan))
+HALF_WIDTH_ALIAS_F1 (tan)
diff --git a/sysdeps/aarch64/fpu/v_math.h b/sysdeps/aarch64/fpu/v_math.h
index d286eb81b3..e8ac0e2332 100644
--- a/sysdeps/aarch64/fpu/v_math.h
+++ b/sysdeps/aarch64/fpu/v_math.h
@@ -29,6 +29,28 @@ 
 #define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
 #define V_NAME_D2(fun) _ZGVnN2vv_##fun
 
+#include "advsimd_f32_protos.h"
+
+/* Define the two-lane single-precision variant _ZGVnN2v_<fun>f as a wrapper
+   around the four-lane routine _ZGVnN4v_<fun>f.  The argument is duplicated
+   into both halves of a 128-bit vector and only the low half of the result
+   is returned.  If a lane takes the special-case path, the scalar fallback
+   is therefore called redundantly for its duplicate.  */
+#define HALF_WIDTH_ALIAS_F1(fun)                                              \
+  float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x)                     \
+  {                                                                           \
+    return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x)));            \
+  }
+
+/* As HALF_WIDTH_ALIAS_F1, for routines taking two vector arguments:
+   defines _ZGVnN2vv_<fun>f as a wrapper around _ZGVnN4vv_<fun>f.  */
+#define HALF_WIDTH_ALIAS_F2(fun)                                              \
+  float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y)     \
+  {                                                                           \
+    return vget_low_f32 (                                                     \
+	_ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y)));       \
+  }
+
 /* Shorthand helpers for declaring constants.  */
 #define V2(X) { X, X }
 #define V4(X) { X, X, X, X }
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 2bf4ea6332..580952b4de 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -15,16 +15,31 @@  GLIBC_2.38 _ZGVsMxv_logf F
 GLIBC_2.38 _ZGVsMxv_sin F
 GLIBC_2.38 _ZGVsMxv_sinf F
 GLIBC_2.39 _ZGVnN2v_acos F
+GLIBC_2.39 _ZGVnN2v_acosf F
 GLIBC_2.39 _ZGVnN2v_asin F
+GLIBC_2.39 _ZGVnN2v_asinf F
 GLIBC_2.39 _ZGVnN2v_atan F
+GLIBC_2.39 _ZGVnN2v_atanf F
+GLIBC_2.39 _ZGVnN2v_cosf F
 GLIBC_2.39 _ZGVnN2v_exp10 F
+GLIBC_2.39 _ZGVnN2v_exp10f F
 GLIBC_2.39 _ZGVnN2v_exp2 F
+GLIBC_2.39 _ZGVnN2v_exp2f F
+GLIBC_2.39 _ZGVnN2v_expf F
 GLIBC_2.39 _ZGVnN2v_expm1 F
+GLIBC_2.39 _ZGVnN2v_expm1f F
 GLIBC_2.39 _ZGVnN2v_log10 F
+GLIBC_2.39 _ZGVnN2v_log10f F
 GLIBC_2.39 _ZGVnN2v_log1p F
+GLIBC_2.39 _ZGVnN2v_log1pf F
 GLIBC_2.39 _ZGVnN2v_log2 F
+GLIBC_2.39 _ZGVnN2v_log2f F
+GLIBC_2.39 _ZGVnN2v_logf F
+GLIBC_2.39 _ZGVnN2v_sinf F
 GLIBC_2.39 _ZGVnN2v_tan F
+GLIBC_2.39 _ZGVnN2v_tanf F
 GLIBC_2.39 _ZGVnN2vv_atan2 F
+GLIBC_2.39 _ZGVnN2vv_atan2f F
 GLIBC_2.39 _ZGVnN4v_acosf F
 GLIBC_2.39 _ZGVnN4v_asinf F
 GLIBC_2.39 _ZGVnN4v_atanf F