[13/35] arm: further fix overloading of MVE vaddq[_m]_n intrinsic

Message ID 20221117163809.1009526-14-andrea.corallo@arm.com
State Committed
Commit e0dd75fe90ef4cda94f431747d239d6cfcf5656f
Headers
Series arm: rework MVE testsuite and rework backend where necessary (1st chunk) |

Commit Message

Andrea Corallo Nov. 17, 2022, 4:37 p.m. UTC
  From: Stam Markianos-Wright <stam.markianos-wright@arm.com>

It was observed that in tests `vaddq_m_n_[s/u][8/16/32].c`, the _Generic
resolution would fall back to the `__ARM_undef` failure state.

This is a regression since `dc39db873670bea8d8e655444387ceaa53a01a79` and
`6bd4ce64eb48a72eca300cb52773e6101d646004`, but it previously wasn't
identified, because the tests were not checking for this kind of failure.

The above commits changed the definitions of the intrinsics from using
`[u]int[8/16/32]_t` types for the scalar argument to using `int`. This
allowed `int` to be supported in user code through the overloaded
`#defines`, but seems to have broken the `[u]int[8/16/32]_t` types

The solution implemented by this patch is to explicitly use a new
_Generic mapping from all the `[u]int[8/16/32]_t` types for int. With this
change, both `int` and `[u]int[8/16/32]_t` parameters are supported from
user code and are handled by the overloading mechanism correctly.

gcc/ChangeLog:

        * config/arm/arm_mve.h (__arm_vaddq_m_n_s8): Change types.
        (__arm_vaddq_m_n_s32): Likewise.
        (__arm_vaddq_m_n_s16): Likewise.
        (__arm_vaddq_m_n_u8): Likewise.
        (__arm_vaddq_m_n_u32): Likewise.
        (__arm_vaddq_m_n_u16): Likewise.
        (__arm_vaddq_m): Fix Overloading.
        (__ARM_mve_coerce3): New.
---
 gcc/config/arm/arm_mve.h | 78 ++++++++++++++++++++--------------------
 1 file changed, 40 insertions(+), 38 deletions(-)
  

Comments

Kyrylo Tkachov Nov. 18, 2022, 4:49 p.m. UTC | #1
> -----Original Message-----
> From: Andrea Corallo <andrea.corallo@arm.com>
> Sent: Thursday, November 17, 2022 4:38 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Stam Markianos-Wright <Stam.Markianos-
> Wright@arm.com>
> Subject: [PATCH 13/35] arm: further fix overloading of MVE vaddq[_m]_n
> intrinsic
> 
> From: Stam Markianos-Wright <stam.markianos-wright@arm.com>
> 
> It was observed that in tests `vaddq_m_n_[s/u][8/16/32].c`, the _Generic
> resolution would fall back to the `__ARM_undef` failure state.
> 
> This is a regression since `dc39db873670bea8d8e655444387ceaa53a01a79`
> and
> `6bd4ce64eb48a72eca300cb52773e6101d646004`, but it previously wasn't
> identified, because the tests were not checking for this kind of failure.
> 
> The above commits changed the definitions of the intrinsics from using
> `[u]int[8/16/32]_t` types for the scalar argument to using `int`. This
> allowed `int` to be supported in user code through the overloaded
> `#defines`, but seems to have broken the `[u]int[8/16/32]_t` types
> 
> The solution implemented by this patch is to explicitly use a new
> _Generic mapping from all the `[u]int[8/16/32]_t` types for int. With this
> change, both `int` and `[u]int[8/16/32]_t` parameters are supported from
> user code and are handled by the overloading mechanism correctly.
> 
> gcc/ChangeLog:
> 
>         * config/arm/arm_mve.h (__arm_vaddq_m_n_s8): Change types.
>         (__arm_vaddq_m_n_s32): Likewise.
>         (__arm_vaddq_m_n_s16): Likewise.
>         (__arm_vaddq_m_n_u8): Likewise.
>         (__arm_vaddq_m_n_u32): Likewise.
>         (__arm_vaddq_m_n_u16): Likewise.
>         (__arm_vaddq_m): Fix Overloading.
>         (__ARM_mve_coerce3): New.

Ok. Wasn't there a PR in Bugzilla about this that we can cite in the commit message?
Thanks,
Kyrill

> ---
>  gcc/config/arm/arm_mve.h | 78 ++++++++++++++++++++--------------------
>  1 file changed, 40 insertions(+), 38 deletions(-)
> 
> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
> index 684f997520f..951dc25374b 100644
> --- a/gcc/config/arm/arm_mve.h
> +++ b/gcc/config/arm/arm_mve.h
> @@ -9675,42 +9675,42 @@ __arm_vabdq_m_u16 (uint16x8_t __inactive,
> uint16x8_t __a, uint16x8_t __b, mve_pr
> 
>  __extension__ extern __inline int8x16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vaddq_m_n_s8 (int8x16_t __inactive, int8x16_t __a, int __b,
> mve_pred16_t __p)
> +__arm_vaddq_m_n_s8 (int8x16_t __inactive, int8x16_t __a, int8_t __b,
> mve_pred16_t __p)
>  {
>    return __builtin_mve_vaddq_m_n_sv16qi (__inactive, __a, __b, __p);
>  }
> 
>  __extension__ extern __inline int32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vaddq_m_n_s32 (int32x4_t __inactive, int32x4_t __a, int __b,
> mve_pred16_t __p)
> +__arm_vaddq_m_n_s32 (int32x4_t __inactive, int32x4_t __a, int32_t __b,
> mve_pred16_t __p)
>  {
>    return __builtin_mve_vaddq_m_n_sv4si (__inactive, __a, __b, __p);
>  }
> 
>  __extension__ extern __inline int16x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vaddq_m_n_s16 (int16x8_t __inactive, int16x8_t __a, int __b,
> mve_pred16_t __p)
> +__arm_vaddq_m_n_s16 (int16x8_t __inactive, int16x8_t __a, int16_t __b,
> mve_pred16_t __p)
>  {
>    return __builtin_mve_vaddq_m_n_sv8hi (__inactive, __a, __b, __p);
>  }
> 
>  __extension__ extern __inline uint8x16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vaddq_m_n_u8 (uint8x16_t __inactive, uint8x16_t __a, int __b,
> mve_pred16_t __p)
> +__arm_vaddq_m_n_u8 (uint8x16_t __inactive, uint8x16_t __a, uint8_t __b,
> mve_pred16_t __p)
>  {
>    return __builtin_mve_vaddq_m_n_uv16qi (__inactive, __a, __b, __p);
>  }
> 
>  __extension__ extern __inline uint32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vaddq_m_n_u32 (uint32x4_t __inactive, uint32x4_t __a, int __b,
> mve_pred16_t __p)
> +__arm_vaddq_m_n_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32_t
> __b, mve_pred16_t __p)
>  {
>    return __builtin_mve_vaddq_m_n_uv4si (__inactive, __a, __b, __p);
>  }
> 
>  __extension__ extern __inline uint16x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vaddq_m_n_u16 (uint16x8_t __inactive, uint16x8_t __a, int __b,
> mve_pred16_t __p)
> +__arm_vaddq_m_n_u16 (uint16x8_t __inactive, uint16x8_t __a, uint16_t
> __b, mve_pred16_t __p)
>  {
>    return __builtin_mve_vaddq_m_n_uv8hi (__inactive, __a, __b, __p);
>  }
> @@ -26417,42 +26417,42 @@ __arm_vabdq_m (uint16x8_t __inactive,
> uint16x8_t __a, uint16x8_t __b, mve_pred16
> 
>  __extension__ extern __inline int8x16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vaddq_m (int8x16_t __inactive, int8x16_t __a, int __b,
> mve_pred16_t __p)
> +__arm_vaddq_m (int8x16_t __inactive, int8x16_t __a, int8_t __b,
> mve_pred16_t __p)
>  {
>   return __arm_vaddq_m_n_s8 (__inactive, __a, __b, __p);
>  }
> 
>  __extension__ extern __inline int32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vaddq_m (int32x4_t __inactive, int32x4_t __a, int __b,
> mve_pred16_t __p)
> +__arm_vaddq_m (int32x4_t __inactive, int32x4_t __a, int32_t __b,
> mve_pred16_t __p)
>  {
>   return __arm_vaddq_m_n_s32 (__inactive, __a, __b, __p);
>  }
> 
>  __extension__ extern __inline int16x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vaddq_m (int16x8_t __inactive, int16x8_t __a, int __b,
> mve_pred16_t __p)
> +__arm_vaddq_m (int16x8_t __inactive, int16x8_t __a, int16_t __b,
> mve_pred16_t __p)
>  {
>   return __arm_vaddq_m_n_s16 (__inactive, __a, __b, __p);
>  }
> 
>  __extension__ extern __inline uint8x16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vaddq_m (uint8x16_t __inactive, uint8x16_t __a, int __b,
> mve_pred16_t __p)
> +__arm_vaddq_m (uint8x16_t __inactive, uint8x16_t __a, uint8_t __b,
> mve_pred16_t __p)
>  {
>   return __arm_vaddq_m_n_u8 (__inactive, __a, __b, __p);
>  }
> 
>  __extension__ extern __inline uint32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vaddq_m (uint32x4_t __inactive, uint32x4_t __a, int __b,
> mve_pred16_t __p)
> +__arm_vaddq_m (uint32x4_t __inactive, uint32x4_t __a, uint32_t __b,
> mve_pred16_t __p)
>  {
>   return __arm_vaddq_m_n_u32 (__inactive, __a, __b, __p);
>  }
> 
>  __extension__ extern __inline uint16x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vaddq_m (uint16x8_t __inactive, uint16x8_t __a, int __b,
> mve_pred16_t __p)
> +__arm_vaddq_m (uint16x8_t __inactive, uint16x8_t __a, uint16_t __b,
> mve_pred16_t __p)
>  {
>   return __arm_vaddq_m_n_u16 (__inactive, __a, __b, __p);
>  }
> @@ -35657,6 +35657,8 @@ extern void *__ARM_undef;
>      _Generic(param, type: param, const type: param, default: *(type
> *)__ARM_undef)
>  #define __ARM_mve_coerce2(param, type) \
>      _Generic(param, type: param, float16_t: param, float32_t: param, default:
> *(type *)__ARM_undef)
> +#define __ARM_mve_coerce3(param, type) \
> +    _Generic(param, type: param, int8_t: param, int16_t: param, int32_t:
> param, int64_t: param, uint8_t: param, uint16_t: param, uint32_t: param,
> uint64_t: param, default: *(type *)__ARM_undef)
> 
>  #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
> 
> @@ -35871,14 +35873,14 @@ extern void *__ARM_undef;
>    int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
> __arm_vaddq_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
> __ARM_mve_coerce(__p1, uint8x16_t)), \
>    int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
> __arm_vaddq_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
> __ARM_mve_coerce(__p1, uint16x8_t)), \
>    int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
> __arm_vaddq_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
> __ARM_mve_coerce(__p1, uint32x4_t)), \
> -  int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]:
> __arm_vaddq_f16 (__ARM_mve_coerce(p0, float16x8_t),
> __ARM_mve_coerce(p1, float16x8_t)), \
> -  int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]:
> __arm_vaddq_f32 (__ARM_mve_coerce(p0, float32x4_t),
> __ARM_mve_coerce(p1, float32x4_t)), \
> -  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
> __ARM_mve_coerce(__p1, int)), \
> -  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
> __ARM_mve_coerce(__p1, int)), \
> -  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
> __ARM_mve_coerce(__p1, int)), \
> -  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
> __ARM_mve_coerce(__p1, int)), \
> -  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
> __ARM_mve_coerce(__p1, int)), \
> -  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
> __ARM_mve_coerce(__p1, int)), \
> +  int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]:
> __arm_vaddq_f16 (__ARM_mve_coerce(__p0, float16x8_t),
> __ARM_mve_coerce(__p1, float16x8_t)), \
> +  int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]:
> __arm_vaddq_f32 (__ARM_mve_coerce(__p0, float32x4_t),
> __ARM_mve_coerce(__p1, float32x4_t)), \
> +  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
> __ARM_mve_coerce3(p1, int)), \
> +  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
> __ARM_mve_coerce3(p1, int)), \
> +  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
> __ARM_mve_coerce3(p1, int)), \
> +  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
> __ARM_mve_coerce3(p1, int)), \
> +  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
> __ARM_mve_coerce3(p1, int)), \
> +  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
> __ARM_mve_coerce3(p1, int)), \
>    int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]:
> __arm_vaddq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t),
> __ARM_mve_coerce2(__p1, double)), \
>    int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]:
> __arm_vaddq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t),
> __ARM_mve_coerce2(__p1, double)));})
> 
> @@ -37316,12 +37318,12 @@ extern void *__ARM_undef;
>    int
> (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_m
> ve_type_uint32x4_t]: __arm_vaddq_m_u32 (__ARM_mve_coerce(__p0,
> uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t),
> __ARM_mve_coerce(__p2, uint32x4_t), p3), \
>    int
> (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_
> mve_type_float16x8_t]: __arm_vaddq_m_f16 (__ARM_mve_coerce(__p0,
> float16x8_t), __ARM_mve_coerce(__p1, float16x8_t),
> __ARM_mve_coerce(__p2, float16x8_t), p3), \
>    int
> (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_
> mve_type_float32x4_t]: __arm_vaddq_m_f32 (__ARM_mve_coerce(__p0,
> float32x4_t), __ARM_mve_coerce(__p1, float32x4_t),
> __ARM_mve_coerce(__p2, float32x4_t), p3), \
> -  int
> (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve
> _type_int_n]: __arm_vaddq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
> __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int), p3), \
> -  int
> (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve
> _type_int_n]: __arm_vaddq_m_n_s16 (__ARM_mve_coerce(__p0,
> int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2,
> int), p3), \
> -  int
> (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve
> _type_int_n]: __arm_vaddq_m_n_s32 (__ARM_mve_coerce(__p0,
> int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2,
> int), p3), \
> -  int
> (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_m
> ve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0,
> uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t),
> __ARM_mve_coerce(__p2, int), p3), \
> -  int
> (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_m
> ve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0,
> uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t),
> __ARM_mve_coerce(__p2, int), p3), \
> -  int
> (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_m
> ve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0,
> uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t),
> __ARM_mve_coerce(__p2, int), p3), \
> +  int
> (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve
> _type_int_n]: __arm_vaddq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
> __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
> +  int
> (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve
> _type_int_n]: __arm_vaddq_m_n_s16 (__ARM_mve_coerce(__p0,
> int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2,
> int), p3), \
> +  int
> (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve
> _type_int_n]: __arm_vaddq_m_n_s32 (__ARM_mve_coerce(__p0,
> int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2,
> int), p3), \
> +  int
> (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_m
> ve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0,
> uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t),
> __ARM_mve_coerce3(p2, int), p3), \
> +  int
> (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_m
> ve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0,
> uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t),
> __ARM_mve_coerce3(p2, int), p3), \
> +  int
> (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_m
> ve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0,
> uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t),
> __ARM_mve_coerce3(p2, int), p3), \
>    int
> (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_
> mve_type_fp_n]: __arm_vaddq_m_n_f16 (__ARM_mve_coerce(__p0,
> float16x8_t), __ARM_mve_coerce(__p1, float16x8_t),
> __ARM_mve_coerce2(__p2, double), p3), \
>    int
> (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_
> mve_type_fp_n]: __arm_vaddq_m_n_f32 (__ARM_mve_coerce(__p0,
> float32x4_t), __ARM_mve_coerce(__p1, float32x4_t),
> __ARM_mve_coerce2(__p2, double), p3));})
> 
> @@ -38820,12 +38822,12 @@ extern void *__ARM_undef;
>    int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
> __arm_vaddq_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
> __ARM_mve_coerce(__p1, uint8x16_t)), \
>    int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
> __arm_vaddq_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
> __ARM_mve_coerce(__p1, uint16x8_t)), \
>    int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
> __arm_vaddq_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
> __ARM_mve_coerce(__p1, uint32x4_t)), \
> -  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
> __ARM_mve_coerce(__p1, int)), \
> -  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
> __ARM_mve_coerce(__p1, int)), \
> -  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
> __ARM_mve_coerce(__p1, int)), \
> -  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
> __ARM_mve_coerce(__p1, int)), \
> -  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
> __ARM_mve_coerce(__p1, int)), \
> -  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
> __ARM_mve_coerce(__p1, int)));})
> +  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
> __ARM_mve_coerce3(p1, int)), \
> +  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
> __ARM_mve_coerce3(p1, int)), \
> +  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
> __ARM_mve_coerce3(p1, int)), \
> +  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
> __ARM_mve_coerce3(p1, int)), \
> +  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
> __ARM_mve_coerce3(p1, int)), \
> +  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
> __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
> __ARM_mve_coerce3(p1, int)));})
> 
>  #define __arm_vandq(p0,p1) ({ __typeof(p0) __p0 = (p0); \
>    __typeof(p1) __p1 = (p1); \
> @@ -39641,12 +39643,12 @@ extern void *__ARM_undef;
>    __typeof(p1) __p1 = (p1); \
>    __typeof(p2) __p2 = (p2); \
>    _Generic( (int
> (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typ
> eid(__p2)])0, \
> -  int
> (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve
> _type_int_n]: __arm_vaddq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
> __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int), p3), \
> -  int
> (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve
> _type_int_n]: __arm_vaddq_m_n_s16 (__ARM_mve_coerce(__p0,
> int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2,
> int), p3), \
> -  int
> (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve
> _type_int_n]: __arm_vaddq_m_n_s32 (__ARM_mve_coerce(__p0,
> int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2,
> int), p3), \
> -  int
> (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_m
> ve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0,
> uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t),
> __ARM_mve_coerce(__p2, int), p3), \
> -  int
> (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_m
> ve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0,
> uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t),
> __ARM_mve_coerce(__p2, int), p3), \
> -  int
> (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_m
> ve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0,
> uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t),
> __ARM_mve_coerce(__p2, int), p3), \
> +  int
> (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve
> _type_int_n]: __arm_vaddq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
> __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
> +  int
> (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve
> _type_int_n]: __arm_vaddq_m_n_s16 (__ARM_mve_coerce(__p0,
> int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2,
> int), p3), \
> +  int
> (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve
> _type_int_n]: __arm_vaddq_m_n_s32 (__ARM_mve_coerce(__p0,
> int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2,
> int), p3), \
> +  int
> (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_m
> ve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0,
> uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t),
> __ARM_mve_coerce3(p2, int), p3), \
> +  int
> (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_m
> ve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0,
> uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t),
> __ARM_mve_coerce3(p2, int), p3), \
> +  int
> (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_m
> ve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0,
> uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t),
> __ARM_mve_coerce3(p2, int), p3), \
>    int
> (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve
> _type_int8x16_t]: __arm_vaddq_m_s8 (__ARM_mve_coerce(__p0,
> int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2,
> int8x16_t), p3), \
>    int
> (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve
> _type_int16x8_t]: __arm_vaddq_m_s16 (__ARM_mve_coerce(__p0,
> int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2,
> int16x8_t), p3), \
>    int
> (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve
> _type_int32x4_t]: __arm_vaddq_m_s32 (__ARM_mve_coerce(__p0,
> int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2,
> int32x4_t), p3), \
> --
> 2.25.1
  
Stamatis Markianos-Wright Nov. 21, 2022, 10:45 a.m. UTC | #2
On 11/18/22 16:49, Kyrylo Tkachov wrote:
>
>> -----Original Message-----
>> From: Andrea Corallo <andrea.corallo@arm.com>
>> Sent: Thursday, November 17, 2022 4:38 PM
>> To: gcc-patches@gcc.gnu.org
>> Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; Stam Markianos-Wright <Stam.Markianos-
>> Wright@arm.com>
>> Subject: [PATCH 13/35] arm: further fix overloading of MVE vaddq[_m]_n
>> intrinsic
>>
>> From: Stam Markianos-Wright <stam.markianos-wright@arm.com>
>>
>> It was observed that in tests `vaddq_m_n_[s/u][8/16/32].c`, the _Generic
>> resolution would fall back to the `__ARM_undef` failure state.
>>
>> This is a regression since `dc39db873670bea8d8e655444387ceaa53a01a79`
>> and
>> `6bd4ce64eb48a72eca300cb52773e6101d646004`, but it previously wasn't
>> identified, because the tests were not checking for this kind of failure.
>>
>> The above commits changed the definitions of the intrinsics from using
>> `[u]int[8/16/32]_t` types for the scalar argument to using `int`. This
>> allowed `int` to be supported in user code through the overloaded
>> `#defines`, but seems to have broken the `[u]int[8/16/32]_t` types
>>
>> The solution implemented by this patch is to explicitly use a new
>> _Generic mapping from all the `[u]int[8/16/32]_t` types for int. With this
>> change, both `int` and `[u]int[8/16/32]_t` parameters are supported from
>> user code and are handled by the overloading mechanism correctly.
>>
>> gcc/ChangeLog:
>>
>>          * config/arm/arm_mve.h (__arm_vaddq_m_n_s8): Change types.
>>          (__arm_vaddq_m_n_s32): Likewise.
>>          (__arm_vaddq_m_n_s16): Likewise.
>>          (__arm_vaddq_m_n_u8): Likewise.
>>          (__arm_vaddq_m_n_u32): Likewise.
>>          (__arm_vaddq_m_n_u16): Likewise.
>>          (__arm_vaddq_m): Fix Overloading.
>>          (__ARM_mve_coerce3): New.
> Ok. Wasn't there a PR in Bugzilla about this that we can cite in the commit message?
> Thanks,
> Kyrill

Thanks for the review! Ah yes, there was this one:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96795

which was closed last time around.
It does make sense to add it, though, so we'll do that.

Thanks!

>
>> ---
>>   gcc/config/arm/arm_mve.h | 78 ++++++++++++++++++++--------------------
>>   1 file changed, 40 insertions(+), 38 deletions(-)
>>
>> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
>> index 684f997520f..951dc25374b 100644
>> --- a/gcc/config/arm/arm_mve.h
>> +++ b/gcc/config/arm/arm_mve.h
>> @@ -9675,42 +9675,42 @@ __arm_vabdq_m_u16 (uint16x8_t __inactive,
>> uint16x8_t __a, uint16x8_t __b, mve_pr
>>
>>   __extension__ extern __inline int8x16_t
>>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> -__arm_vaddq_m_n_s8 (int8x16_t __inactive, int8x16_t __a, int __b,
>> mve_pred16_t __p)
>> +__arm_vaddq_m_n_s8 (int8x16_t __inactive, int8x16_t __a, int8_t __b,
>> mve_pred16_t __p)
>>   {
>>     return __builtin_mve_vaddq_m_n_sv16qi (__inactive, __a, __b, __p);
>>   }
>>
>>   __extension__ extern __inline int32x4_t
>>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> -__arm_vaddq_m_n_s32 (int32x4_t __inactive, int32x4_t __a, int __b,
>> mve_pred16_t __p)
>> +__arm_vaddq_m_n_s32 (int32x4_t __inactive, int32x4_t __a, int32_t __b,
>> mve_pred16_t __p)
>>   {
>>     return __builtin_mve_vaddq_m_n_sv4si (__inactive, __a, __b, __p);
>>   }
>>
>>   __extension__ extern __inline int16x8_t
>>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> -__arm_vaddq_m_n_s16 (int16x8_t __inactive, int16x8_t __a, int __b,
>> mve_pred16_t __p)
>> +__arm_vaddq_m_n_s16 (int16x8_t __inactive, int16x8_t __a, int16_t __b,
>> mve_pred16_t __p)
>>   {
>>     return __builtin_mve_vaddq_m_n_sv8hi (__inactive, __a, __b, __p);
>>   }
>>
>>   __extension__ extern __inline uint8x16_t
>>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> -__arm_vaddq_m_n_u8 (uint8x16_t __inactive, uint8x16_t __a, int __b,
>> mve_pred16_t __p)
>> +__arm_vaddq_m_n_u8 (uint8x16_t __inactive, uint8x16_t __a, uint8_t __b,
>> mve_pred16_t __p)
>>   {
>>     return __builtin_mve_vaddq_m_n_uv16qi (__inactive, __a, __b, __p);
>>   }
>>
>>   __extension__ extern __inline uint32x4_t
>>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> -__arm_vaddq_m_n_u32 (uint32x4_t __inactive, uint32x4_t __a, int __b,
>> mve_pred16_t __p)
>> +__arm_vaddq_m_n_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32_t
>> __b, mve_pred16_t __p)
>>   {
>>     return __builtin_mve_vaddq_m_n_uv4si (__inactive, __a, __b, __p);
>>   }
>>
>>   __extension__ extern __inline uint16x8_t
>>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> -__arm_vaddq_m_n_u16 (uint16x8_t __inactive, uint16x8_t __a, int __b,
>> mve_pred16_t __p)
>> +__arm_vaddq_m_n_u16 (uint16x8_t __inactive, uint16x8_t __a, uint16_t
>> __b, mve_pred16_t __p)
>>   {
>>     return __builtin_mve_vaddq_m_n_uv8hi (__inactive, __a, __b, __p);
>>   }
>> @@ -26417,42 +26417,42 @@ __arm_vabdq_m (uint16x8_t __inactive,
>> uint16x8_t __a, uint16x8_t __b, mve_pred16
>>
>>   __extension__ extern __inline int8x16_t
>>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> -__arm_vaddq_m (int8x16_t __inactive, int8x16_t __a, int __b,
>> mve_pred16_t __p)
>> +__arm_vaddq_m (int8x16_t __inactive, int8x16_t __a, int8_t __b,
>> mve_pred16_t __p)
>>   {
>>    return __arm_vaddq_m_n_s8 (__inactive, __a, __b, __p);
>>   }
>>
>>   __extension__ extern __inline int32x4_t
>>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> -__arm_vaddq_m (int32x4_t __inactive, int32x4_t __a, int __b,
>> mve_pred16_t __p)
>> +__arm_vaddq_m (int32x4_t __inactive, int32x4_t __a, int32_t __b,
>> mve_pred16_t __p)
>>   {
>>    return __arm_vaddq_m_n_s32 (__inactive, __a, __b, __p);
>>   }
>>
>>   __extension__ extern __inline int16x8_t
>>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> -__arm_vaddq_m (int16x8_t __inactive, int16x8_t __a, int __b,
>> mve_pred16_t __p)
>> +__arm_vaddq_m (int16x8_t __inactive, int16x8_t __a, int16_t __b,
>> mve_pred16_t __p)
>>   {
>>    return __arm_vaddq_m_n_s16 (__inactive, __a, __b, __p);
>>   }
>>
>>   __extension__ extern __inline uint8x16_t
>>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> -__arm_vaddq_m (uint8x16_t __inactive, uint8x16_t __a, int __b,
>> mve_pred16_t __p)
>> +__arm_vaddq_m (uint8x16_t __inactive, uint8x16_t __a, uint8_t __b,
>> mve_pred16_t __p)
>>   {
>>    return __arm_vaddq_m_n_u8 (__inactive, __a, __b, __p);
>>   }
>>
>>   __extension__ extern __inline uint32x4_t
>>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> -__arm_vaddq_m (uint32x4_t __inactive, uint32x4_t __a, int __b,
>> mve_pred16_t __p)
>> +__arm_vaddq_m (uint32x4_t __inactive, uint32x4_t __a, uint32_t __b,
>> mve_pred16_t __p)
>>   {
>>    return __arm_vaddq_m_n_u32 (__inactive, __a, __b, __p);
>>   }
>>
>>   __extension__ extern __inline uint16x8_t
>>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> -__arm_vaddq_m (uint16x8_t __inactive, uint16x8_t __a, int __b,
>> mve_pred16_t __p)
>> +__arm_vaddq_m (uint16x8_t __inactive, uint16x8_t __a, uint16_t __b,
>> mve_pred16_t __p)
>>   {
>>    return __arm_vaddq_m_n_u16 (__inactive, __a, __b, __p);
>>   }
>> @@ -35657,6 +35657,8 @@ extern void *__ARM_undef;
>>       _Generic(param, type: param, const type: param, default: *(type
>> *)__ARM_undef)
>>   #define __ARM_mve_coerce2(param, type) \
>>       _Generic(param, type: param, float16_t: param, float32_t: param, default:
>> *(type *)__ARM_undef)
>> +#define __ARM_mve_coerce3(param, type) \
>> +    _Generic(param, type: param, int8_t: param, int16_t: param, int32_t:
>> param, int64_t: param, uint8_t: param, uint16_t: param, uint32_t: param,
>> uint64_t: param, default: *(type *)__ARM_undef)
>>
>>   #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
>>
>> @@ -35871,14 +35873,14 @@ extern void *__ARM_undef;
>>     int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
>> __arm_vaddq_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
>> __ARM_mve_coerce(__p1, uint8x16_t)), \
>>     int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
>> __arm_vaddq_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
>> __ARM_mve_coerce(__p1, uint16x8_t)), \
>>     int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
>> __arm_vaddq_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
>> __ARM_mve_coerce(__p1, uint32x4_t)), \
>> -  int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]:
>> __arm_vaddq_f16 (__ARM_mve_coerce(p0, float16x8_t),
>> __ARM_mve_coerce(p1, float16x8_t)), \
>> -  int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]:
>> __arm_vaddq_f32 (__ARM_mve_coerce(p0, float32x4_t),
>> __ARM_mve_coerce(p1, float32x4_t)), \
>> -  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
>> __ARM_mve_coerce(__p1, int)), \
>> -  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
>> __ARM_mve_coerce(__p1, int)), \
>> -  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
>> __ARM_mve_coerce(__p1, int)), \
>> -  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
>> __ARM_mve_coerce(__p1, int)), \
>> -  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
>> __ARM_mve_coerce(__p1, int)), \
>> -  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
>> __ARM_mve_coerce(__p1, int)), \
>> +  int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]:
>> __arm_vaddq_f16 (__ARM_mve_coerce(__p0, float16x8_t),
>> __ARM_mve_coerce(__p1, float16x8_t)), \
>> +  int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]:
>> __arm_vaddq_f32 (__ARM_mve_coerce(__p0, float32x4_t),
>> __ARM_mve_coerce(__p1, float32x4_t)), \
>> +  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
>> __ARM_mve_coerce3(p1, int)), \
>> +  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
>> __ARM_mve_coerce3(p1, int)), \
>> +  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
>> __ARM_mve_coerce3(p1, int)), \
>> +  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
>> __ARM_mve_coerce3(p1, int)), \
>> +  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
>> __ARM_mve_coerce3(p1, int)), \
>> +  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
>> __ARM_mve_coerce3(p1, int)), \
>>     int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]:
>> __arm_vaddq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t),
>> __ARM_mve_coerce2(__p1, double)), \
>>     int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]:
>> __arm_vaddq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t),
>> __ARM_mve_coerce2(__p1, double)));})
>>
>> @@ -37316,12 +37318,12 @@ extern void *__ARM_undef;
>>     int
>> (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_m
>> ve_type_uint32x4_t]: __arm_vaddq_m_u32 (__ARM_mve_coerce(__p0,
>> uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t),
>> __ARM_mve_coerce(__p2, uint32x4_t), p3), \
>>     int
>> (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_
>> mve_type_float16x8_t]: __arm_vaddq_m_f16 (__ARM_mve_coerce(__p0,
>> float16x8_t), __ARM_mve_coerce(__p1, float16x8_t),
>> __ARM_mve_coerce(__p2, float16x8_t), p3), \
>>     int
>> (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_
>> mve_type_float32x4_t]: __arm_vaddq_m_f32 (__ARM_mve_coerce(__p0,
>> float32x4_t), __ARM_mve_coerce(__p1, float32x4_t),
>> __ARM_mve_coerce(__p2, float32x4_t), p3), \
>> -  int
>> (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve
>> _type_int_n]: __arm_vaddq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
>> __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int), p3), \
>> -  int
>> (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve
>> _type_int_n]: __arm_vaddq_m_n_s16 (__ARM_mve_coerce(__p0,
>> int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2,
>> int), p3), \
>> -  int
>> (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve
>> _type_int_n]: __arm_vaddq_m_n_s32 (__ARM_mve_coerce(__p0,
>> int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2,
>> int), p3), \
>> -  int
>> (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_m
>> ve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0,
>> uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t),
>> __ARM_mve_coerce(__p2, int), p3), \
>> -  int
>> (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_m
>> ve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0,
>> uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t),
>> __ARM_mve_coerce(__p2, int), p3), \
>> -  int
>> (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_m
>> ve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0,
>> uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t),
>> __ARM_mve_coerce(__p2, int), p3), \
>> +  int
>> (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve
>> _type_int_n]: __arm_vaddq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
>> __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
>> +  int
>> (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve
>> _type_int_n]: __arm_vaddq_m_n_s16 (__ARM_mve_coerce(__p0,
>> int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2,
>> int), p3), \
>> +  int
>> (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve
>> _type_int_n]: __arm_vaddq_m_n_s32 (__ARM_mve_coerce(__p0,
>> int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2,
>> int), p3), \
>> +  int
>> (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_m
>> ve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0,
>> uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t),
>> __ARM_mve_coerce3(p2, int), p3), \
>> +  int
>> (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_m
>> ve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0,
>> uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t),
>> __ARM_mve_coerce3(p2, int), p3), \
>> +  int
>> (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_m
>> ve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0,
>> uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t),
>> __ARM_mve_coerce3(p2, int), p3), \
>>     int
>> (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_
>> mve_type_fp_n]: __arm_vaddq_m_n_f16 (__ARM_mve_coerce(__p0,
>> float16x8_t), __ARM_mve_coerce(__p1, float16x8_t),
>> __ARM_mve_coerce2(__p2, double), p3), \
>>     int
>> (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_
>> mve_type_fp_n]: __arm_vaddq_m_n_f32 (__ARM_mve_coerce(__p0,
>> float32x4_t), __ARM_mve_coerce(__p1, float32x4_t),
>> __ARM_mve_coerce2(__p2, double), p3));})
>>
>> @@ -38820,12 +38822,12 @@ extern void *__ARM_undef;
>>     int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
>> __arm_vaddq_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
>> __ARM_mve_coerce(__p1, uint8x16_t)), \
>>     int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
>> __arm_vaddq_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
>> __ARM_mve_coerce(__p1, uint16x8_t)), \
>>     int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
>> __arm_vaddq_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
>> __ARM_mve_coerce(__p1, uint32x4_t)), \
>> -  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
>> __ARM_mve_coerce(__p1, int)), \
>> -  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
>> __ARM_mve_coerce(__p1, int)), \
>> -  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
>> __ARM_mve_coerce(__p1, int)), \
>> -  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
>> __ARM_mve_coerce(__p1, int)), \
>> -  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
>> __ARM_mve_coerce(__p1, int)), \
>> -  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
>> __ARM_mve_coerce(__p1, int)));})
>> +  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t),
>> __ARM_mve_coerce3(p1, int)), \
>> +  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t),
>> __ARM_mve_coerce3(p1, int)), \
>> +  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t),
>> __ARM_mve_coerce3(p1, int)), \
>> +  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
>> __ARM_mve_coerce3(p1, int)), \
>> +  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t),
>> __ARM_mve_coerce3(p1, int)), \
>> +  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
>> __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t),
>> __ARM_mve_coerce3(p1, int)));})
>>
>>   #define __arm_vandq(p0,p1) ({ __typeof(p0) __p0 = (p0); \
>>     __typeof(p1) __p1 = (p1); \
>> @@ -39641,12 +39643,12 @@ extern void *__ARM_undef;
>>     __typeof(p1) __p1 = (p1); \
>>     __typeof(p2) __p2 = (p2); \
>>     _Generic( (int
>> (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typ
>> eid(__p2)])0, \
>> -  int
>> (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve
>> _type_int_n]: __arm_vaddq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
>> __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int), p3), \
>> -  int
>> (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve
>> _type_int_n]: __arm_vaddq_m_n_s16 (__ARM_mve_coerce(__p0,
>> int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2,
>> int), p3), \
>> -  int
>> (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve
>> _type_int_n]: __arm_vaddq_m_n_s32 (__ARM_mve_coerce(__p0,
>> int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2,
>> int), p3), \
>> -  int
>> (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_m
>> ve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0,
>> uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t),
>> __ARM_mve_coerce(__p2, int), p3), \
>> -  int
>> (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_m
>> ve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0,
>> uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t),
>> __ARM_mve_coerce(__p2, int), p3), \
>> -  int
>> (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_m
>> ve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0,
>> uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t),
>> __ARM_mve_coerce(__p2, int), p3), \
>> +  int
>> (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve
>> _type_int_n]: __arm_vaddq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t),
>> __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
>> +  int
>> (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve
>> _type_int_n]: __arm_vaddq_m_n_s16 (__ARM_mve_coerce(__p0,
>> int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2,
>> int), p3), \
>> +  int
>> (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve
>> _type_int_n]: __arm_vaddq_m_n_s32 (__ARM_mve_coerce(__p0,
>> int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2,
>> int), p3), \
>> +  int
>> (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_m
>> ve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0,
>> uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t),
>> __ARM_mve_coerce3(p2, int), p3), \
>> +  int
>> (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_m
>> ve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0,
>> uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t),
>> __ARM_mve_coerce3(p2, int), p3), \
>> +  int
>> (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_m
>> ve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0,
>> uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t),
>> __ARM_mve_coerce3(p2, int), p3), \
>>     int
>> (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve
>> _type_int8x16_t]: __arm_vaddq_m_s8 (__ARM_mve_coerce(__p0,
>> int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2,
>> int8x16_t), p3), \
>>     int
>> (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve
>> _type_int16x8_t]: __arm_vaddq_m_s16 (__ARM_mve_coerce(__p0,
>> int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2,
>> int16x8_t), p3), \
>>     int
>> (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve
>> _type_int32x4_t]: __arm_vaddq_m_s32 (__ARM_mve_coerce(__p0,
>> int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2,
>> int32x4_t), p3), \
>> --
>> 2.25.1
  

Patch

diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 684f997520f..951dc25374b 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -9675,42 +9675,42 @@  __arm_vabdq_m_u16 (uint16x8_t __inactive, uint16x8_t __a, uint16x8_t __b, mve_pr
 
 __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vaddq_m_n_s8 (int8x16_t __inactive, int8x16_t __a, int __b, mve_pred16_t __p)
+__arm_vaddq_m_n_s8 (int8x16_t __inactive, int8x16_t __a, int8_t __b, mve_pred16_t __p)
 {
   return __builtin_mve_vaddq_m_n_sv16qi (__inactive, __a, __b, __p);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vaddq_m_n_s32 (int32x4_t __inactive, int32x4_t __a, int __b, mve_pred16_t __p)
+__arm_vaddq_m_n_s32 (int32x4_t __inactive, int32x4_t __a, int32_t __b, mve_pred16_t __p)
 {
   return __builtin_mve_vaddq_m_n_sv4si (__inactive, __a, __b, __p);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vaddq_m_n_s16 (int16x8_t __inactive, int16x8_t __a, int __b, mve_pred16_t __p)
+__arm_vaddq_m_n_s16 (int16x8_t __inactive, int16x8_t __a, int16_t __b, mve_pred16_t __p)
 {
   return __builtin_mve_vaddq_m_n_sv8hi (__inactive, __a, __b, __p);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vaddq_m_n_u8 (uint8x16_t __inactive, uint8x16_t __a, int __b, mve_pred16_t __p)
+__arm_vaddq_m_n_u8 (uint8x16_t __inactive, uint8x16_t __a, uint8_t __b, mve_pred16_t __p)
 {
   return __builtin_mve_vaddq_m_n_uv16qi (__inactive, __a, __b, __p);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vaddq_m_n_u32 (uint32x4_t __inactive, uint32x4_t __a, int __b, mve_pred16_t __p)
+__arm_vaddq_m_n_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32_t __b, mve_pred16_t __p)
 {
   return __builtin_mve_vaddq_m_n_uv4si (__inactive, __a, __b, __p);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vaddq_m_n_u16 (uint16x8_t __inactive, uint16x8_t __a, int __b, mve_pred16_t __p)
+__arm_vaddq_m_n_u16 (uint16x8_t __inactive, uint16x8_t __a, uint16_t __b, mve_pred16_t __p)
 {
   return __builtin_mve_vaddq_m_n_uv8hi (__inactive, __a, __b, __p);
 }
@@ -26417,42 +26417,42 @@  __arm_vabdq_m (uint16x8_t __inactive, uint16x8_t __a, uint16x8_t __b, mve_pred16
 
 __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vaddq_m (int8x16_t __inactive, int8x16_t __a, int __b, mve_pred16_t __p)
+__arm_vaddq_m (int8x16_t __inactive, int8x16_t __a, int8_t __b, mve_pred16_t __p)
 {
  return __arm_vaddq_m_n_s8 (__inactive, __a, __b, __p);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vaddq_m (int32x4_t __inactive, int32x4_t __a, int __b, mve_pred16_t __p)
+__arm_vaddq_m (int32x4_t __inactive, int32x4_t __a, int32_t __b, mve_pred16_t __p)
 {
  return __arm_vaddq_m_n_s32 (__inactive, __a, __b, __p);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vaddq_m (int16x8_t __inactive, int16x8_t __a, int __b, mve_pred16_t __p)
+__arm_vaddq_m (int16x8_t __inactive, int16x8_t __a, int16_t __b, mve_pred16_t __p)
 {
  return __arm_vaddq_m_n_s16 (__inactive, __a, __b, __p);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vaddq_m (uint8x16_t __inactive, uint8x16_t __a, int __b, mve_pred16_t __p)
+__arm_vaddq_m (uint8x16_t __inactive, uint8x16_t __a, uint8_t __b, mve_pred16_t __p)
 {
  return __arm_vaddq_m_n_u8 (__inactive, __a, __b, __p);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vaddq_m (uint32x4_t __inactive, uint32x4_t __a, int __b, mve_pred16_t __p)
+__arm_vaddq_m (uint32x4_t __inactive, uint32x4_t __a, uint32_t __b, mve_pred16_t __p)
 {
  return __arm_vaddq_m_n_u32 (__inactive, __a, __b, __p);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vaddq_m (uint16x8_t __inactive, uint16x8_t __a, int __b, mve_pred16_t __p)
+__arm_vaddq_m (uint16x8_t __inactive, uint16x8_t __a, uint16_t __b, mve_pred16_t __p)
 {
  return __arm_vaddq_m_n_u16 (__inactive, __a, __b, __p);
 }
@@ -35657,6 +35657,8 @@  extern void *__ARM_undef;
     _Generic(param, type: param, const type: param, default: *(type *)__ARM_undef)
 #define __ARM_mve_coerce2(param, type) \
     _Generic(param, type: param, float16_t: param, float32_t: param, default: *(type *)__ARM_undef)
+#define __ARM_mve_coerce3(param, type) \
+    _Generic(param, type: param, int8_t: param, int16_t: param, int32_t: param, int64_t: param, uint8_t: param, uint16_t: param, uint32_t: param, uint64_t: param, default: *(type *)__ARM_undef)
 
 #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
 
@@ -35871,14 +35873,14 @@  extern void *__ARM_undef;
   int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vaddq_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t)), \
   int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vaddq_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t)), \
   int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vaddq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t)), \
-  int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vaddq_f16 (__ARM_mve_coerce(p0, float16x8_t), __ARM_mve_coerce(p1, float16x8_t)), \
-  int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vaddq_f32 (__ARM_mve_coerce(p0, float32x4_t), __ARM_mve_coerce(p1, float32x4_t)), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, int)), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, int)), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, int)), \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int)), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int)), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int)), \
+  int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vaddq_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t)), \
+  int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vaddq_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t)), \
+  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce3(p1, int)), \
+  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce3(p1, int)), \
+  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce3(p1, int)), \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce3(p1, int)), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce3(p1, int)), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce3(p1, int)), \
   int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce2(__p1, double)), \
   int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce2(__p1, double)));})
 
@@ -37316,12 +37318,12 @@  extern void *__ARM_undef;
   int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vaddq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3), \
   int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vaddq_m_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \
   int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vaddq_m_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3), \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int), p3), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int), p3), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int), p3), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int), p3), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int), p3), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int), p3), \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce3(p2, int), p3), \
   int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: __arm_vaddq_m_n_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(__p2, double), p3), \
   int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_fp_n]: __arm_vaddq_m_n_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce2(__p2, double), p3));})
 
@@ -38820,12 +38822,12 @@  extern void *__ARM_undef;
   int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vaddq_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t)), \
   int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vaddq_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t)), \
   int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vaddq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t)), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, int)), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, int)), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, int)), \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int)), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int)), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int)));})
+  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce3(p1, int)), \
+  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce3(p1, int)), \
+  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce3(p1, int)), \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce3(p1, int)), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce3(p1, int)), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce3(p1, int)));})
 
 #define __arm_vandq(p0,p1) ({ __typeof(p0) __p0 = (p0); \
   __typeof(p1) __p1 = (p1); \
@@ -39641,12 +39643,12 @@  extern void *__ARM_undef;
   __typeof(p1) __p1 = (p1); \
   __typeof(p2) __p2 = (p2); \
   _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
-  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int), p3), \
-  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int), p3), \
-  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int), p3), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, int), p3), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, int), p3), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, int), p3), \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vaddq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce3(p2, int), p3), \
   int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vaddq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \
   int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vaddq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \
   int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vaddq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \