Enable vectorization for _Float16 floor/ceil/trunc/nearbyint/rint operations.

Message ID 20211025082413.111327-1-hongtao.liu@intel.com
State Committed
Commit 84bcefd5555af6d95e08cd980965098961289215
Headers
Series Enable vectorization for _Float16 floor/ceil/trunc/nearbyint/rint operations.

Commit Message

liuhongt Oct. 25, 2021, 8:24 a.m. UTC
  Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

	PR target/102464
	* config/i386/i386-builtin-types.def (V8HF_FTYPE_V8HF): New
	function type.
	(V16HF_FTYPE_V16HF): Ditto.
	(V32HF_FTYPE_V32HF): Ditto.
	(V8HF_FTYPE_V8HF_ROUND): Ditto.
	(V16HF_FTYPE_V16HF_ROUND): Ditto.
	(V32HF_FTYPE_V32HF_ROUND): Ditto.
	* config/i386/i386-builtin.def (IX86_BUILTIN_FLOORPH,
	IX86_BUILTIN_CEILPH, IX86_BUILTIN_TRUNCPH,
	IX86_BUILTIN_FLOORPH256, IX86_BUILTIN_CEILPH256,
	IX86_BUILTIN_TRUNCPH256, IX86_BUILTIN_FLOORPH512,
	IX86_BUILTIN_CEILPH512, IX86_BUILTIN_TRUNCPH512): New builtins.
	* config/i386/i386-builtins.c
	(ix86_builtin_vectorized_function): Enable vectorization for
	HFmode FLOOR/CEIL/TRUNC operations.
	* config/i386/i386-expand.c (ix86_expand_args_builtin): Handle
	new builtins.
	* config/i386/sse.md (rint<mode>2, nearbyint<mode>2): Extend
	to vector HFmodes.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr102464-vrndscaleph.c: New test.
---
 gcc/config/i386/i386-builtin-types.def        |   7 ++
 gcc/config/i386/i386-builtin.def              |  11 ++
 gcc/config/i386/i386-builtins.c               |  42 +++++++
 gcc/config/i386/i386-expand.c                 |   3 +
 gcc/config/i386/sse.md                        |  12 +-
 .../gcc.target/i386/pr102464-vrndscaleph.c    | 115 ++++++++++++++++++
 6 files changed, 184 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
  

Comments

Hongtao Liu Oct. 28, 2021, 2:26 a.m. UTC | #1
On Mon, Oct 25, 2021 at 4:24 PM liuhongt <hongtao.liu@intel.com> wrote:
>
>   Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
>   Ok for trunk?
>
I'm going to check in this patch if there's no objection.
> gcc/ChangeLog:
>
>         PR target/102464
>         * config/i386/i386-builtin-types.def (V8HF_FTYPE_V8HF): New
>         function type.
>         (V16HF_FTYPE_V16HF): Ditto.
>         (V32HF_FTYPE_V32HF): Ditto.
>         (V8HF_FTYPE_V8HF_ROUND): Ditto.
>         (V16HF_FTYPE_V16HF_ROUND): Ditto.
>         (V32HF_FTYPE_V32HF_ROUND): Ditto.
>         * config/i386/i386-builtin.def ( IX86_BUILTIN_FLOORPH,
>         IX86_BUILTIN_CEILPH, IX86_BUILTIN_TRUNCPH,
>         IX86_BUILTIN_FLOORPH256, IX86_BUILTIN_CEILPH256,
>         IX86_BUILTIN_TRUNCPH256, IX86_BUILTIN_FLOORPH512,
>         IX86_BUILTIN_CEILPH512, IX86_BUILTIN_TRUNCPH512): New builtin.
>         * config/i386/i386-builtins.c
>         (ix86_builtin_vectorized_function): Enable vectorization for
>         HFmode FLOOR/CEIL/TRUNC operation.
>         * config/i386/i386-expand.c (ix86_expand_args_builtin): Handle
>         new builtins.
>         * config/i386/sse.md (rint<mode>2, nearbyint<mode>2): Extend
>         to vector HFmodes.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr102464-vrndscaleph.c: New test.
> ---
>  gcc/config/i386/i386-builtin-types.def        |   7 ++
>  gcc/config/i386/i386-builtin.def              |  11 ++
>  gcc/config/i386/i386-builtins.c               |  42 +++++++
>  gcc/config/i386/i386-expand.c                 |   3 +
>  gcc/config/i386/sse.md                        |  12 +-
>  .../gcc.target/i386/pr102464-vrndscaleph.c    | 115 ++++++++++++++++++
>  6 files changed, 184 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
>
> diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
> index 4c355c587b5..e33f06ab30b 100644
> --- a/gcc/config/i386/i386-builtin-types.def
> +++ b/gcc/config/i386/i386-builtin-types.def
> @@ -1380,3 +1380,10 @@ DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI, INT)
>  DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, UHI, INT)
>  DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT)
>  DEF_FUNCTION_TYPE (V32HF, V32HF, INT, V32HF, USI, INT)
> +
> +DEF_FUNCTION_TYPE (V8HF, V8HF)
> +DEF_FUNCTION_TYPE (V16HF, V16HF)
> +DEF_FUNCTION_TYPE (V32HF, V32HF)
> +DEF_FUNCTION_TYPE_ALIAS (V8HF_FTYPE_V8HF, ROUND)
> +DEF_FUNCTION_TYPE_ALIAS (V16HF_FTYPE_V16HF, ROUND)
> +DEF_FUNCTION_TYPE_ALIAS (V32HF_FTYPE_V32HF, ROUND)
> diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> index 99217d08d37..d9eee3f373c 100644
> --- a/gcc/config/i386/i386-builtin.def
> +++ b/gcc/config/i386/i386-builtin.def
> @@ -958,6 +958,10 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__buil
>  BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF)
>  BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF)
>
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_floorph", IX86_BUILTIN_FLOORPH, (enum rtx_code) ROUND_FLOOR, (int) V8HF_FTYPE_V8HF_ROUND)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_ceilph", IX86_BUILTIN_CEILPH, (enum rtx_code) ROUND_CEIL, (int) V8HF_FTYPE_V8HF_ROUND)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_truncph", IX86_BUILTIN_TRUNCPH, (enum rtx_code) ROUND_TRUNC, (int) V8HF_FTYPE_V8HF_ROUND)
> +
>  BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND)
>  BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND)
>  BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND)
> @@ -1090,6 +1094,10 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia3
>  BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND)
>  BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND)
>
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_floorph256", IX86_BUILTIN_FLOORPH256, (enum rtx_code) ROUND_FLOOR, (int) V16HF_FTYPE_V16HF_ROUND)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_ceilph256", IX86_BUILTIN_CEILPH256, (enum rtx_code) ROUND_CEIL, (int) V16HF_FTYPE_V16HF_ROUND)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_truncph256", IX86_BUILTIN_TRUNCPH256, (enum rtx_code) ROUND_TRUNC, (int) V16HF_FTYPE_V16HF_ROUND)
> +
>  BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND)
>  BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND)
>  BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND)
> @@ -1528,6 +1536,9 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv8df3,  "__builtin_ia32_copy
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_sqrtv8df2, "__builtin_ia32_sqrtpd512", IX86_BUILTIN_SQRTPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sqrtv16sf2, "__builtin_ia32_sqrtps512", IX86_BUILTIN_SQRTPS_NR512, UNKNOWN, (int) V16SF_FTYPE_V16SF)
>  BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_exp2v16sf, "__builtin_ia32_exp2ps", IX86_BUILTIN_EXP2PS, UNKNOWN, (int) V16SF_FTYPE_V16SF)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_floorph512", IX86_BUILTIN_FLOORPH512, (enum rtx_code) ROUND_FLOOR, (int) V32HF_FTYPE_V32HF_ROUND)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_ceilph512", IX86_BUILTIN_CEILPH512, (enum rtx_code) ROUND_CEIL, (int) V32HF_FTYPE_V32HF_ROUND)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_truncph512", IX86_BUILTIN_TRUNCPH512, (enum rtx_code) ROUND_TRUNC, (int) V32HF_FTYPE_V32HF_ROUND)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, "__builtin_ia32_floorps512", IX86_BUILTIN_FLOORPS512, (enum rtx_code) ROUND_FLOOR, (int) V16SF_FTYPE_V16SF_ROUND)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, "__builtin_ia32_ceilps512", IX86_BUILTIN_CEILPS512, (enum rtx_code) ROUND_CEIL, (int) V16SF_FTYPE_V16SF_ROUND)
>  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, "__builtin_ia32_truncps512", IX86_BUILTIN_TRUNCPS512, (enum rtx_code) ROUND_TRUNC, (int) V16SF_FTYPE_V16SF_ROUND)
> diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c
> index 11ce58b2574..0fb14b55712 100644
> --- a/gcc/config/i386/i386-builtins.c
> +++ b/gcc/config/i386/i386-builtins.c
> @@ -1652,6 +1652,20 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
>           else if (out_n == 16 && in_n == 16)
>             return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
>         }
> +      if (out_mode == HFmode && in_mode == HFmode)
> +       {
> +         /* V8HF/V16HF is supported in ix86_vector_mode_supported_p
> +            under TARGET_AVX512FP16, TARGET_AVX512VL is needed here.  */
> +         if (out_n < 32 && !TARGET_AVX512VL)
> +           break;
> +
> +         if (out_n == 8 && in_n == 8)
> +           return ix86_get_builtin (IX86_BUILTIN_FLOORPH);
> +         else if (out_n == 16 && in_n == 16)
> +           return ix86_get_builtin (IX86_BUILTIN_FLOORPH256);
> +         else if (out_n == 32 && in_n == 32)
> +           return ix86_get_builtin (IX86_BUILTIN_FLOORPH512);
> +       }
>        break;
>
>      CASE_CFN_CEIL:
> @@ -1677,6 +1691,20 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
>           else if (out_n == 16 && in_n == 16)
>             return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
>         }
> +      if (out_mode == HFmode && in_mode == HFmode)
> +       {
> +         /* V8HF/V16HF is supported in ix86_vector_mode_supported_p
> +            under TARGET_AVX512FP16, TARGET_AVX512VL is needed here.  */
> +         if (out_n < 32 && !TARGET_AVX512VL)
> +           break;
> +
> +         if (out_n == 8 && in_n == 8)
> +           return ix86_get_builtin (IX86_BUILTIN_CEILPH);
> +         else if (out_n == 16 && in_n == 16)
> +           return ix86_get_builtin (IX86_BUILTIN_CEILPH256);
> +         else if (out_n == 32 && in_n == 32)
> +           return ix86_get_builtin (IX86_BUILTIN_CEILPH512);
> +       }
>        break;
>
>      CASE_CFN_TRUNC:
> @@ -1702,6 +1730,20 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
>           else if (out_n == 16 && in_n == 16)
>             return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
>         }
> +      if (out_mode == HFmode && in_mode == HFmode)
> +       {
> +         /* V8HF/V16HF is supported in ix86_vector_mode_supported_p
> +            under TARGET_AVX512FP16, TARGET_AVX512VL is needed here.  */
> +         if (out_n < 32 && !TARGET_AVX512VL)
> +           break;
> +
> +         if (out_n == 8 && in_n == 8)
> +           return ix86_get_builtin (IX86_BUILTIN_TRUNCPH);
> +         else if (out_n == 16 && in_n == 16)
> +           return ix86_get_builtin (IX86_BUILTIN_TRUNCPH256);
> +         else if (out_n == 32 && in_n == 32)
> +           return ix86_get_builtin (IX86_BUILTIN_TRUNCPH512);
> +       }
>        break;
>
>      CASE_CFN_FMA:
> diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> index 56dd99b5511..15c4bc375d5 100644
> --- a/gcc/config/i386/i386-expand.c
> +++ b/gcc/config/i386/i386-expand.c
> @@ -9423,6 +9423,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
>      case V4SF_FTYPE_V4SF_ROUND:
>      case V8SF_FTYPE_V8SF_ROUND:
>      case V16SF_FTYPE_V16SF_ROUND:
> +    case V8HF_FTYPE_V8HF_ROUND:
> +    case V16HF_FTYPE_V16HF_ROUND:
> +    case V32HF_FTYPE_V32HF_ROUND:
>      case V4SI_FTYPE_V4SF_ROUND:
>      case V8SI_FTYPE_V8SF_ROUND:
>      case V16SI_FTYPE_V16SF_ROUND:
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index fbf056bf9e6..00ee7b58ef3 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -21758,18 +21758,18 @@ (define_insn "ptesttf2"
>     (set_attr "mode" "TI")])
>
>  (define_expand "nearbyint<mode>2"
> -  [(set (match_operand:VF 0 "register_operand")
> -       (unspec:VF
> -         [(match_operand:VF 1 "vector_operand")
> +  [(set (match_operand:VFH 0 "register_operand")
> +       (unspec:VFH
> +         [(match_operand:VFH 1 "vector_operand")
>            (match_dup 2)]
>           UNSPEC_ROUND))]
>    "TARGET_SSE4_1"
>    "operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);")
>
>  (define_expand "rint<mode>2"
> -  [(set (match_operand:VF 0 "register_operand")
> -       (unspec:VF
> -         [(match_operand:VF 1 "vector_operand")
> +  [(set (match_operand:VFH 0 "register_operand")
> +       (unspec:VFH
> +         [(match_operand:VFH 1 "vector_operand")
>            (match_dup 2)]
>           UNSPEC_ROUND))]
>    "TARGET_SSE4_1"
> diff --git a/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
> new file mode 100644
> index 00000000000..a76d9e7e376
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
> @@ -0,0 +1,115 @@
> +/* PR target/102464.  */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -mavx512fp16 -mavx512vl -mprefer-vector-width=512" } */
> +#include<math.h>
> +void
> +foo (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 8; i++)
> +      a[i] = floor (b[i]);
> +}
> +
> +void
> +foo1 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 8; i++)
> +      a[i] = ceil (b[i]);
> +}
> +
> +void
> +foo2 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 8; i++)
> +      a[i] = trunc (b[i]);
> +}
> +
> +void
> +foo3 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 8; i++)
> +      a[i] = nearbyint (b[i]);
> +}
> +
> +void
> +foo4 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 8; i++)
> +      a[i] = rint (b[i]);
> +}
> +
> +void
> +foo5 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 16; i++)
> +      a[i] = floor (b[i]);
> +}
> +
> +void
> +foo6 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 16; i++)
> +      a[i] = ceil (b[i]);
> +}
> +
> +void
> +foo7 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 16; i++)
> +      a[i] = trunc (b[i]);
> +}
> +
> +void
> +foo8 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 16; i++)
> +      a[i] = nearbyint (b[i]);
> +}
> +
> +void
> +foo9 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 16; i++)
> +      a[i] = rint (b[i]);
> +}
> +
> +void
> +foo10 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 32; i++)
> +      a[i] = floor (b[i]);
> +}
> +
> +void
> +foo11 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 32; i++)
> +      a[i] = ceil (b[i]);
> +}
> +
> +void
> +foo12 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 32; i++)
> +      a[i] = trunc (b[i]);
> +}
> +
> +void
> +foo13 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 32; i++)
> +      a[i] = nearbyint (b[i]);
> +}
> +
> +void
> +foo14 (_Float16* __restrict a, _Float16* b)
> +{
> +    for (int i = 0; i != 32; i++)
> +      a[i] = rint (b[i]);
> +}
> +
> +/* { dg-final { scan-assembler-not "vcvtsh2s\[sd\]" } } */
> +/* { dg-final { scan-assembler-not "vcvtph2p\[sd\]" } } */
> +/* { dg-final { scan-assembler-not "extendhfxf" } } */
> +/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*xmm\[0-9\]" 5 } } */
> +/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*ymm\[0-9\]" 5 } } */
> +/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*zmm\[0-9\]" 5 } } */
> --
> 2.18.1
>
  
Hongtao Liu Oct. 29, 2021, 1:53 a.m. UTC | #2
On Thu, Oct 28, 2021 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, Oct 25, 2021 at 4:24 PM liuhongt <hongtao.liu@intel.com> wrote:
> >
> >   Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> >   Ok for trunk?
> >
> I'm going to check in this patch if there's no objection.
Committed.
> > gcc/ChangeLog:
> >
> >         PR target/102464
> >         * config/i386/i386-builtin-types.def (V8HF_FTYPE_V8HF): New
> >         function type.
> >         (V16HF_FTYPE_V16HF): Ditto.
> >         (V32HF_FTYPE_V32HF): Ditto.
> >         (V8HF_FTYPE_V8HF_ROUND): Ditto.
> >         (V16HF_FTYPE_V16HF_ROUND): Ditto.
> >         (V32HF_FTYPE_V32HF_ROUND): Ditto.
> >         * config/i386/i386-builtin.def ( IX86_BUILTIN_FLOORPH,
> >         IX86_BUILTIN_CEILPH, IX86_BUILTIN_TRUNCPH,
> >         IX86_BUILTIN_FLOORPH256, IX86_BUILTIN_CEILPH256,
> >         IX86_BUILTIN_TRUNCPH256, IX86_BUILTIN_FLOORPH512,
> >         IX86_BUILTIN_CEILPH512, IX86_BUILTIN_TRUNCPH512): New builtin.
> >         * config/i386/i386-builtins.c
> >         (ix86_builtin_vectorized_function): Enable vectorization for
> >         HFmode FLOOR/CEIL/TRUNC operation.
> >         * config/i386/i386-expand.c (ix86_expand_args_builtin): Handle
> >         new builtins.
> >         * config/i386/sse.md (rint<mode>2, nearbyint<mode>2): Extend
> >         to vector HFmodes.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr102464-vrndscaleph.c: New test.
> > ---
> >  gcc/config/i386/i386-builtin-types.def        |   7 ++
> >  gcc/config/i386/i386-builtin.def              |  11 ++
> >  gcc/config/i386/i386-builtins.c               |  42 +++++++
> >  gcc/config/i386/i386-expand.c                 |   3 +
> >  gcc/config/i386/sse.md                        |  12 +-
> >  .../gcc.target/i386/pr102464-vrndscaleph.c    | 115 ++++++++++++++++++
> >  6 files changed, 184 insertions(+), 6 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
> >
> > diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
> > index 4c355c587b5..e33f06ab30b 100644
> > --- a/gcc/config/i386/i386-builtin-types.def
> > +++ b/gcc/config/i386/i386-builtin-types.def
> > @@ -1380,3 +1380,10 @@ DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI, INT)
> >  DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, UHI, INT)
> >  DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT)
> >  DEF_FUNCTION_TYPE (V32HF, V32HF, INT, V32HF, USI, INT)
> > +
> > +DEF_FUNCTION_TYPE (V8HF, V8HF)
> > +DEF_FUNCTION_TYPE (V16HF, V16HF)
> > +DEF_FUNCTION_TYPE (V32HF, V32HF)
> > +DEF_FUNCTION_TYPE_ALIAS (V8HF_FTYPE_V8HF, ROUND)
> > +DEF_FUNCTION_TYPE_ALIAS (V16HF_FTYPE_V16HF, ROUND)
> > +DEF_FUNCTION_TYPE_ALIAS (V32HF_FTYPE_V32HF, ROUND)
> > diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> > index 99217d08d37..d9eee3f373c 100644
> > --- a/gcc/config/i386/i386-builtin.def
> > +++ b/gcc/config/i386/i386-builtin.def
> > @@ -958,6 +958,10 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__buil
> >  BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF)
> >  BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF)
> >
> > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_floorph", IX86_BUILTIN_FLOORPH, (enum rtx_code) ROUND_FLOOR, (int) V8HF_FTYPE_V8HF_ROUND)
> > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_ceilph", IX86_BUILTIN_CEILPH, (enum rtx_code) ROUND_CEIL, (int) V8HF_FTYPE_V8HF_ROUND)
> > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_truncph", IX86_BUILTIN_TRUNCPH, (enum rtx_code) ROUND_TRUNC, (int) V8HF_FTYPE_V8HF_ROUND)
> > +
> >  BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND)
> >  BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND)
> >  BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND)
> > @@ -1090,6 +1094,10 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia3
> >  BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND)
> >  BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND)
> >
> > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_floorph256", IX86_BUILTIN_FLOORPH256, (enum rtx_code) ROUND_FLOOR, (int) V16HF_FTYPE_V16HF_ROUND)
> > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_ceilph256", IX86_BUILTIN_CEILPH256, (enum rtx_code) ROUND_CEIL, (int) V16HF_FTYPE_V16HF_ROUND)
> > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_truncph256", IX86_BUILTIN_TRUNCPH256, (enum rtx_code) ROUND_TRUNC, (int) V16HF_FTYPE_V16HF_ROUND)
> > +
> >  BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND)
> >  BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND)
> >  BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND)
> > @@ -1528,6 +1536,9 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv8df3,  "__builtin_ia32_copy
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_sqrtv8df2, "__builtin_ia32_sqrtpd512", IX86_BUILTIN_SQRTPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF)
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sqrtv16sf2, "__builtin_ia32_sqrtps512", IX86_BUILTIN_SQRTPS_NR512, UNKNOWN, (int) V16SF_FTYPE_V16SF)
> >  BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_exp2v16sf, "__builtin_ia32_exp2ps", IX86_BUILTIN_EXP2PS, UNKNOWN, (int) V16SF_FTYPE_V16SF)
> > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_floorph512", IX86_BUILTIN_FLOORPH512, (enum rtx_code) ROUND_FLOOR, (int) V32HF_FTYPE_V32HF_ROUND)
> > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_ceilph512", IX86_BUILTIN_CEILPH512, (enum rtx_code) ROUND_CEIL, (int) V32HF_FTYPE_V32HF_ROUND)
> > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_truncph512", IX86_BUILTIN_TRUNCPH512, (enum rtx_code) ROUND_TRUNC, (int) V32HF_FTYPE_V32HF_ROUND)
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, "__builtin_ia32_floorps512", IX86_BUILTIN_FLOORPS512, (enum rtx_code) ROUND_FLOOR, (int) V16SF_FTYPE_V16SF_ROUND)
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, "__builtin_ia32_ceilps512", IX86_BUILTIN_CEILPS512, (enum rtx_code) ROUND_CEIL, (int) V16SF_FTYPE_V16SF_ROUND)
> >  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, "__builtin_ia32_truncps512", IX86_BUILTIN_TRUNCPS512, (enum rtx_code) ROUND_TRUNC, (int) V16SF_FTYPE_V16SF_ROUND)
> > diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c
> > index 11ce58b2574..0fb14b55712 100644
> > --- a/gcc/config/i386/i386-builtins.c
> > +++ b/gcc/config/i386/i386-builtins.c
> > @@ -1652,6 +1652,20 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
> >           else if (out_n == 16 && in_n == 16)
> >             return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
> >         }
> > +      if (out_mode == HFmode && in_mode == HFmode)
> > +       {
> > +         /* V8HF/V16HF is supported in ix86_vector_mode_supported_p
> > +            under TARGET_AVX512FP16, TARGET_AVX512VL is needed here.  */
> > +         if (out_n < 32 && !TARGET_AVX512VL)
> > +           break;
> > +
> > +         if (out_n == 8 && in_n == 8)
> > +           return ix86_get_builtin (IX86_BUILTIN_FLOORPH);
> > +         else if (out_n == 16 && in_n == 16)
> > +           return ix86_get_builtin (IX86_BUILTIN_FLOORPH256);
> > +         else if (out_n == 32 && in_n == 32)
> > +           return ix86_get_builtin (IX86_BUILTIN_FLOORPH512);
> > +       }
> >        break;
> >
> >      CASE_CFN_CEIL:
> > @@ -1677,6 +1691,20 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
> >           else if (out_n == 16 && in_n == 16)
> >             return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
> >         }
> > +      if (out_mode == HFmode && in_mode == HFmode)
> > +       {
> > +         /* V8HF/V16HF is supported in ix86_vector_mode_supported_p
> > +            under TARGET_AVX512FP16, TARGET_AVX512VL is needed here.  */
> > +         if (out_n < 32 && !TARGET_AVX512VL)
> > +           break;
> > +
> > +         if (out_n == 8 && in_n == 8)
> > +           return ix86_get_builtin (IX86_BUILTIN_CEILPH);
> > +         else if (out_n == 16 && in_n == 16)
> > +           return ix86_get_builtin (IX86_BUILTIN_CEILPH256);
> > +         else if (out_n == 32 && in_n == 32)
> > +           return ix86_get_builtin (IX86_BUILTIN_CEILPH512);
> > +       }
> >        break;
> >
> >      CASE_CFN_TRUNC:
> > @@ -1702,6 +1730,20 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
> >           else if (out_n == 16 && in_n == 16)
> >             return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
> >         }
> > +      if (out_mode == HFmode && in_mode == HFmode)
> > +       {
> > +         /* V8HF/V16HF is supported in ix86_vector_mode_supported_p
> > +            under TARGET_AVX512FP16, TARGET_AVX512VL is needed here.  */
> > +         if (out_n < 32 && !TARGET_AVX512VL)
> > +           break;
> > +
> > +         if (out_n == 8 && in_n == 8)
> > +           return ix86_get_builtin (IX86_BUILTIN_TRUNCPH);
> > +         else if (out_n == 16 && in_n == 16)
> > +           return ix86_get_builtin (IX86_BUILTIN_TRUNCPH256);
> > +         else if (out_n == 32 && in_n == 32)
> > +           return ix86_get_builtin (IX86_BUILTIN_TRUNCPH512);
> > +       }
> >        break;
> >
> >      CASE_CFN_FMA:
> > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> > index 56dd99b5511..15c4bc375d5 100644
> > --- a/gcc/config/i386/i386-expand.c
> > +++ b/gcc/config/i386/i386-expand.c
> > @@ -9423,6 +9423,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
> >      case V4SF_FTYPE_V4SF_ROUND:
> >      case V8SF_FTYPE_V8SF_ROUND:
> >      case V16SF_FTYPE_V16SF_ROUND:
> > +    case V8HF_FTYPE_V8HF_ROUND:
> > +    case V16HF_FTYPE_V16HF_ROUND:
> > +    case V32HF_FTYPE_V32HF_ROUND:
> >      case V4SI_FTYPE_V4SF_ROUND:
> >      case V8SI_FTYPE_V8SF_ROUND:
> >      case V16SI_FTYPE_V16SF_ROUND:
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index fbf056bf9e6..00ee7b58ef3 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -21758,18 +21758,18 @@ (define_insn "ptesttf2"
> >     (set_attr "mode" "TI")])
> >
> >  (define_expand "nearbyint<mode>2"
> > -  [(set (match_operand:VF 0 "register_operand")
> > -       (unspec:VF
> > -         [(match_operand:VF 1 "vector_operand")
> > +  [(set (match_operand:VFH 0 "register_operand")
> > +       (unspec:VFH
> > +         [(match_operand:VFH 1 "vector_operand")
> >            (match_dup 2)]
> >           UNSPEC_ROUND))]
> >    "TARGET_SSE4_1"
> >    "operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);")
> >
> >  (define_expand "rint<mode>2"
> > -  [(set (match_operand:VF 0 "register_operand")
> > -       (unspec:VF
> > -         [(match_operand:VF 1 "vector_operand")
> > +  [(set (match_operand:VFH 0 "register_operand")
> > +       (unspec:VFH
> > +         [(match_operand:VFH 1 "vector_operand")
> >            (match_dup 2)]
> >           UNSPEC_ROUND))]
> >    "TARGET_SSE4_1"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
> > new file mode 100644
> > index 00000000000..a76d9e7e376
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
> > @@ -0,0 +1,115 @@
> > +/* PR target/102464.  */
> > +/* { dg-do compile } */
> > +/* { dg-options "-Ofast -mavx512fp16 -mavx512vl -mprefer-vector-width=512" } */
> > +#include<math.h>
> > +void
> > +foo (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 8; i++)
> > +      a[i] = floor (b[i]);
> > +}
> > +
> > +void
> > +foo1 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 8; i++)
> > +      a[i] = ceil (b[i]);
> > +}
> > +
> > +void
> > +foo2 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 8; i++)
> > +      a[i] = trunc (b[i]);
> > +}
> > +
> > +void
> > +foo3 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 8; i++)
> > +      a[i] = nearbyint (b[i]);
> > +}
> > +
> > +void
> > +foo4 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 8; i++)
> > +      a[i] = rint (b[i]);
> > +}
> > +
> > +void
> > +foo5 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 16; i++)
> > +      a[i] = floor (b[i]);
> > +}
> > +
> > +void
> > +foo6 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 16; i++)
> > +      a[i] = ceil (b[i]);
> > +}
> > +
> > +void
> > +foo7 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 16; i++)
> > +      a[i] = trunc (b[i]);
> > +}
> > +
> > +void
> > +foo8 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 16; i++)
> > +      a[i] = nearbyint (b[i]);
> > +}
> > +
> > +void
> > +foo9 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 16; i++)
> > +      a[i] = rint (b[i]);
> > +}
> > +
> > +void
> > +foo10 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 32; i++)
> > +      a[i] = floor (b[i]);
> > +}
> > +
> > +void
> > +foo11 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 32; i++)
> > +      a[i] = ceil (b[i]);
> > +}
> > +
> > +void
> > +foo12 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 32; i++)
> > +      a[i] = trunc (b[i]);
> > +}
> > +
> > +void
> > +foo13 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 32; i++)
> > +      a[i] = nearbyint (b[i]);
> > +}
> > +
> > +void
> > +foo14 (_Float16* __restrict a, _Float16* b)
> > +{
> > +    for (int i = 0; i != 32; i++)
> > +      a[i] = rint (b[i]);
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "vcvtsh2s\[sd\]" } } */
> > +/* { dg-final { scan-assembler-not "vcvtph2p\[sd\]" } } */
> > +/* { dg-final { scan-assembler-not "extendhfxf" } } */
> > +/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*xmm\[0-9\]" 5 } } */
> > +/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*ymm\[0-9\]" 5 } } */
> > +/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*zmm\[0-9\]" 5 } } */
> > --
> > 2.18.1
> >
>
>
> --
> BR,
> Hongtao
  

Patch

diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 4c355c587b5..e33f06ab30b 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -1380,3 +1380,10 @@  DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI, INT)
 DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, UHI, INT)
 DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT)
 DEF_FUNCTION_TYPE (V32HF, V32HF, INT, V32HF, USI, INT)
+
+DEF_FUNCTION_TYPE (V8HF, V8HF)
+DEF_FUNCTION_TYPE (V16HF, V16HF)
+DEF_FUNCTION_TYPE (V32HF, V32HF)
+DEF_FUNCTION_TYPE_ALIAS (V8HF_FTYPE_V8HF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V16HF_FTYPE_V16HF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V32HF_FTYPE_V32HF, ROUND)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 99217d08d37..d9eee3f373c 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -958,6 +958,10 @@  BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__buil
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF)
 
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_floorph", IX86_BUILTIN_FLOORPH, (enum rtx_code) ROUND_FLOOR, (int) V8HF_FTYPE_V8HF_ROUND)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_ceilph", IX86_BUILTIN_CEILPH, (enum rtx_code) ROUND_CEIL, (int) V8HF_FTYPE_V8HF_ROUND)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_truncph", IX86_BUILTIN_TRUNCPH, (enum rtx_code) ROUND_TRUNC, (int) V8HF_FTYPE_V8HF_ROUND)
+
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND)
@@ -1090,6 +1094,10 @@  BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia3
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND)
 
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_floorph256", IX86_BUILTIN_FLOORPH256, (enum rtx_code) ROUND_FLOOR, (int) V16HF_FTYPE_V16HF_ROUND)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_ceilph256", IX86_BUILTIN_CEILPH256, (enum rtx_code) ROUND_CEIL, (int) V16HF_FTYPE_V16HF_ROUND)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_truncph256", IX86_BUILTIN_TRUNCPH256, (enum rtx_code) ROUND_TRUNC, (int) V16HF_FTYPE_V16HF_ROUND)
+
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND)
@@ -1528,6 +1536,9 @@  BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv8df3,  "__builtin_ia32_copy
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_sqrtv8df2, "__builtin_ia32_sqrtpd512", IX86_BUILTIN_SQRTPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sqrtv16sf2, "__builtin_ia32_sqrtps512", IX86_BUILTIN_SQRTPS_NR512, UNKNOWN, (int) V16SF_FTYPE_V16SF)
 BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_exp2v16sf, "__builtin_ia32_exp2ps", IX86_BUILTIN_EXP2PS, UNKNOWN, (int) V16SF_FTYPE_V16SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_floorph512", IX86_BUILTIN_FLOORPH512, (enum rtx_code) ROUND_FLOOR, (int) V32HF_FTYPE_V32HF_ROUND)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_ceilph512", IX86_BUILTIN_CEILPH512, (enum rtx_code) ROUND_CEIL, (int) V32HF_FTYPE_V32HF_ROUND)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_truncph512", IX86_BUILTIN_TRUNCPH512, (enum rtx_code) ROUND_TRUNC, (int) V32HF_FTYPE_V32HF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, "__builtin_ia32_floorps512", IX86_BUILTIN_FLOORPS512, (enum rtx_code) ROUND_FLOOR, (int) V16SF_FTYPE_V16SF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, "__builtin_ia32_ceilps512", IX86_BUILTIN_CEILPS512, (enum rtx_code) ROUND_CEIL, (int) V16SF_FTYPE_V16SF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, "__builtin_ia32_truncps512", IX86_BUILTIN_TRUNCPS512, (enum rtx_code) ROUND_TRUNC, (int) V16SF_FTYPE_V16SF_ROUND)
diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c
index 11ce58b2574..0fb14b55712 100644
--- a/gcc/config/i386/i386-builtins.c
+++ b/gcc/config/i386/i386-builtins.c
@@ -1652,6 +1652,20 @@  ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
 	  else if (out_n == 16 && in_n == 16)
 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
 	}
+      if (out_mode == HFmode && in_mode == HFmode)
+	{
+	  /* V8HF/V16HF are supported in ix86_vector_mode_supported_p
+	     under TARGET_AVX512FP16, but TARGET_AVX512VL is needed here.  */
+	  if (out_n < 32 && !TARGET_AVX512VL)
+	    break;
+
+	  if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPH);
+	  else if (out_n == 16 && in_n == 16)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPH256);
+	  else if (out_n == 32 && in_n == 32)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPH512);
+	}
       break;
 
     CASE_CFN_CEIL:
@@ -1677,6 +1691,20 @@  ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
 	  else if (out_n == 16 && in_n == 16)
 	    return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
 	}
+      if (out_mode == HFmode && in_mode == HFmode)
+	{
+	  /* V8HF/V16HF are supported in ix86_vector_mode_supported_p
+	     under TARGET_AVX512FP16, but TARGET_AVX512VL is needed here.  */
+	  if (out_n < 32 && !TARGET_AVX512VL)
+	    break;
+
+	  if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPH);
+	  else if (out_n == 16 && in_n == 16)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPH256);
+	  else if (out_n == 32 && in_n == 32)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPH512);
+	}
       break;
 
     CASE_CFN_TRUNC:
@@ -1702,6 +1730,20 @@  ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
 	  else if (out_n == 16 && in_n == 16)
 	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
 	}
+      if (out_mode == HFmode && in_mode == HFmode)
+	{
+	  /* V8HF/V16HF are supported in ix86_vector_mode_supported_p
+	     under TARGET_AVX512FP16, but TARGET_AVX512VL is needed here.  */
+	  if (out_n < 32 && !TARGET_AVX512VL)
+	    break;
+
+	  if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPH);
+	  else if (out_n == 16 && in_n == 16)
+	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPH256);
+	  else if (out_n == 32 && in_n == 32)
+	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPH512);
+	}
       break;
 
     CASE_CFN_FMA:
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 56dd99b5511..15c4bc375d5 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -9423,6 +9423,9 @@  ix86_expand_args_builtin (const struct builtin_description *d,
     case V4SF_FTYPE_V4SF_ROUND:
     case V8SF_FTYPE_V8SF_ROUND:
     case V16SF_FTYPE_V16SF_ROUND:
+    case V8HF_FTYPE_V8HF_ROUND:
+    case V16HF_FTYPE_V16HF_ROUND:
+    case V32HF_FTYPE_V32HF_ROUND:
     case V4SI_FTYPE_V4SF_ROUND:
     case V8SI_FTYPE_V8SF_ROUND:
     case V16SI_FTYPE_V16SF_ROUND:
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index fbf056bf9e6..00ee7b58ef3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -21758,18 +21758,18 @@  (define_insn "ptesttf2"
    (set_attr "mode" "TI")])
 
 (define_expand "nearbyint<mode>2"
-  [(set (match_operand:VF 0 "register_operand")
-	(unspec:VF
-	  [(match_operand:VF 1 "vector_operand")
+  [(set (match_operand:VFH 0 "register_operand")
+	(unspec:VFH
+	  [(match_operand:VFH 1 "vector_operand")
 	   (match_dup 2)]
 	  UNSPEC_ROUND))]
   "TARGET_SSE4_1"
   "operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);")
 
 (define_expand "rint<mode>2"
-  [(set (match_operand:VF 0 "register_operand")
-	(unspec:VF
-	  [(match_operand:VF 1 "vector_operand")
+  [(set (match_operand:VFH 0 "register_operand")
+	(unspec:VFH
+	  [(match_operand:VFH 1 "vector_operand")
 	   (match_dup 2)]
 	  UNSPEC_ROUND))]
   "TARGET_SSE4_1"
diff --git a/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
new file mode 100644
index 00000000000..a76d9e7e376
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
@@ -0,0 +1,115 @@ 
+/* PR target/102464.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx512fp16 -mavx512vl -mprefer-vector-width=512" } */
+#include<math.h>
+void
+foo (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 8; i++)
+      a[i] = floor (b[i]);
+}
+
+void
+foo1 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 8; i++)
+      a[i] = ceil (b[i]);
+}
+
+void
+foo2 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 8; i++)
+      a[i] = trunc (b[i]);
+}
+
+void
+foo3 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 8; i++)
+      a[i] = nearbyint (b[i]);
+}
+
+void
+foo4 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 8; i++)
+      a[i] = rint (b[i]);
+}
+
+void
+foo5 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 16; i++)
+      a[i] = floor (b[i]);
+}
+
+void
+foo6 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 16; i++)
+      a[i] = ceil (b[i]);
+}
+
+void
+foo7 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 16; i++)
+      a[i] = trunc (b[i]);
+}
+
+void
+foo8 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 16; i++)
+      a[i] = nearbyint (b[i]);
+}
+
+void
+foo9 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 16; i++)
+      a[i] = rint (b[i]);
+}
+
+void
+foo10 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 32; i++)
+      a[i] = floor (b[i]);
+}
+
+void
+foo11 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 32; i++)
+      a[i] = ceil (b[i]);
+}
+
+void
+foo12 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 32; i++)
+      a[i] = trunc (b[i]);
+}
+
+void
+foo13 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 32; i++)
+      a[i] = nearbyint (b[i]);
+}
+
+void
+foo14 (_Float16* __restrict a, _Float16* b)
+{
+    for (int i = 0; i != 32; i++)
+      a[i] = rint (b[i]);
+}
+
+/* { dg-final { scan-assembler-not "vcvtsh2s\[sd\]" } } */
+/* { dg-final { scan-assembler-not "vcvtph2p\[sd\]" } } */
+/* { dg-final { scan-assembler-not "extendhfxf" } } */
+/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*xmm\[0-9\]" 5 } } */
+/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*ymm\[0-9\]" 5 } } */
+/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*zmm\[0-9\]" 5 } } */