diff mbox series

AVX512FP16: Optimize _Float16 reciprocal for div and sqrt

Message ID	20211026095120.29899-1-hongyu.wang@intel.com
State	Committed
Commit	5720c450fab749664b32dbcd14d0a66f8ba57e5f
Headers	DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 2A9FF3858410 To: hongtao.liu@intel.com Subject: [PATCH] AVX512FP16: Optimize _Float16 reciprocal for div and sqrt Date: Tue, 26 Oct 2021 17:51:20 +0800 Message-Id: <20211026095120.29899-1-hongyu.wang@intel.com> Precedence: list From: Hongyu Wang via Gcc-patches <gcc-patches@gcc.gnu.org> Reply-To: Hongyu Wang <hongyu.wang@intel.com> Cc: gcc-patches@gcc.gnu.org Errors-To: gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org Sender: "Gcc-patches" <gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org>
Series	AVX512FP16: Optimize _Float16 reciprocal for div and sqrt \| AVX512FP16: Optimize _Float16 reciprocal for div and sqrt

Commit Message

Hongyu Wang Oct. 26, 2021, 9:51 a.m. UTC

  Hi,

For _Float16 type, add insn and expanders to optimize x / y to
x * rcp (y), and x / sqrt (y) to x * rsqrt (y).
As half-precision float has only a minor precision difference between div and
mul * rcp, there is no need for Newton-Raphson approximation.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
Ok for master?

gcc/ChangeLog:

	* config/i386/i386.c (use_rsqrt_p): Add mode parameter, enable
	  HFmode rsqrt without TARGET_SSE_MATH.
	(ix86_optab_supported_p): Refactor rint, adjust floor, ceil,
	btrunc condition to be restricted by -ftrapping-math, adjust
	use_rsqrt_p function call.
	* config/i386/i386.md (rcphf2): New define_insn.
	(rsqrthf2): Likewise.
	* config/i386/sse.md (div<mode>3): Change VF2H to VF2.
	(div<mode>3): New expander for HF mode.
	(rsqrt<mode>2): Likewise.
	(*avx512fp16_vmrcpv8hf2): New define_insn for rpad pass.
	(*avx512fp16_vmrsqrtv8hf2): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512fp16-recip-1.c: New test.
	* gcc.target/i386/avx512fp16-recip-2.c: Ditto.
	* gcc.target/i386/pr102464.c: Add -fno-trapping-math.
---
 gcc/config/i386/i386.c                        | 29 +++---
 gcc/config/i386/i386.md                       | 44 ++++++++-
 gcc/config/i386/sse.md                        | 63 +++++++++++-
 .../gcc.target/i386/avx512fp16-recip-1.c      | 43 ++++++++
 .../gcc.target/i386/avx512fp16-recip-2.c      | 97 +++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr102464.c      |  2 +-
 6 files changed, 258 insertions(+), 20 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c

Comments

Hongtao Liu Oct. 28, 2021, 1:25 a.m. UTC | #1

On Tue, Oct 26, 2021 at 5:51 PM Hongyu Wang via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> For _Float16 type, add insn and expanders to optimize x / y to
> x * rcp (y), and x / sqrt (y) to x * rsqrt (y).
> As Half float only have minor precision difference between div and
> mul * rcp, there is no need for Newton-Rhapson approximation.
>
> Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> Ok for master?
Ok.
>
> gcc/ChangeLog:
>
>         * config/i386/i386.c (use_rsqrt_p): Add mode parameter, enable
>           HFmode rsqrt without TARGET_SSE_MATH.
>         (ix86_optab_supported_p): Refactor rint, adjust floor, ceil,
>         btrunc condition to be restricted by -ftrapping-math, adjust
>         use_rsqrt_p function call.
>         * config/i386/i386.md (rcphf2): New define_insn.
>         (rsqrthf2): Likewise.
>         * config/i386/sse.md (div<mode>3): Change VF2H to VF2.
>         (div<mode>3): New expander for HF mode.
>         (rsqrt<mode>2): Likewise.
>         (*avx512fp16_vmrcpv8hf2): New define_insn for rpad pass.
>         (*avx512fp16_vmrsqrtv8hf2): Likewise.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx512fp16-recip-1.c: New test.
>         * gcc.target/i386/avx512fp16-recip-2.c: Ditto.
>         * gcc.target/i386/pr102464.c: Add -fno-trapping-math.
> ---
>  gcc/config/i386/i386.c                        | 29 +++---
>  gcc/config/i386/i386.md                       | 44 ++++++++-
>  gcc/config/i386/sse.md                        | 63 +++++++++++-
>  .../gcc.target/i386/avx512fp16-recip-1.c      | 43 ++++++++
>  .../gcc.target/i386/avx512fp16-recip-2.c      | 97 +++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr102464.c      |  2 +-
>  6 files changed, 258 insertions(+), 20 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 299e1ab2621..c5789365d3b 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -18905,9 +18905,10 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
>     1.0/sqrt.  */
>
>  static bool
> -use_rsqrt_p ()
> +use_rsqrt_p (machine_mode mode)
>  {
> -  return (TARGET_SSE && TARGET_SSE_MATH
> +  return ((mode == HFmode
> +          || (TARGET_SSE && TARGET_SSE_MATH))
>           && flag_finite_math_only
>           && !flag_trapping_math
>           && flag_unsafe_math_optimizations);
> @@ -23603,29 +23604,27 @@ ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
>        return opt_type == OPTIMIZE_FOR_SPEED;
>
>      case rint_optab:
> -      if (mode1 == HFmode)
> -       return true;
> -      else if (SSE_FLOAT_MODE_P (mode1)
> -              && TARGET_SSE_MATH
> -              && !flag_trapping_math
> -              && !TARGET_SSE4_1)
> +      if (SSE_FLOAT_MODE_P (mode1)
> +         && TARGET_SSE_MATH
> +         && !flag_trapping_math
> +         && !TARGET_SSE4_1
> +         && mode1 != HFmode)
>         return opt_type == OPTIMIZE_FOR_SPEED;
>        return true;
>
>      case floor_optab:
>      case ceil_optab:
>      case btrunc_optab:
> -      if (mode1 == HFmode)
> -       return true;
> -      else if (SSE_FLOAT_MODE_P (mode1)
> -              && TARGET_SSE_MATH
> -              && !flag_trapping_math
> -              && TARGET_SSE4_1)
> +      if (((SSE_FLOAT_MODE_P (mode1)
> +           && TARGET_SSE_MATH
> +           && TARGET_SSE4_1)
> +          || mode1 == HFmode)
> +         && !flag_trapping_math)
>         return true;
>        return opt_type == OPTIMIZE_FOR_SPEED;
>
>      case rsqrt_optab:
> -      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
> +      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
>
>      default:
>        return true;
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index e733a40fc90..11535df5425 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -8417,11 +8417,27 @@
>                 (match_operand:XF 2 "register_operand")))]
>    "TARGET_80387")
>
> +/* There is no more precision loss than Newton-Rhapson approximation
> +  when using HFmode rcp/rsqrt, so do the transformation directly under
> +  TARGET_RECIP_DIV and fast-math.  */
>  (define_expand "divhf3"
>    [(set (match_operand:HF 0 "register_operand")
>         (div:HF (match_operand:HF 1 "register_operand")
>                    (match_operand:HF 2 "nonimmediate_operand")))]
> -  "TARGET_AVX512FP16")
> +  "TARGET_AVX512FP16"
> +{
> +  if (TARGET_RECIP_DIV
> +      && optimize_insn_for_speed_p ()
> +      && flag_finite_math_only && !flag_trapping_math
> +      && flag_unsafe_math_optimizations)
> +    {
> +      rtx op = gen_reg_rtx (HFmode);
> +      operands[2] = force_reg (HFmode, operands[2]);
> +      emit_insn (gen_rcphf2 (op, operands[2]));
> +      emit_insn (gen_mulhf3 (operands[0], operands[1], op));
> +      DONE;
> +    }
> +})
>
>  (define_expand "div<mode>3"
>    [(set (match_operand:MODEF 0 "register_operand")
> @@ -16973,6 +16989,19 @@
>             ]
>             (symbol_ref "true")))])
>
> +(define_insn "rcphf2"
> +  [(set (match_operand:HF 0 "register_operand" "=v,v")
> +       (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "v,m")]
> +                  UNSPEC_RCP))]
> +  "TARGET_AVX512FP16"
> +  "@
> +   vrcpsh\t{%d1, %0|%0, %d1}
> +   vrcpsh\t{%1, %d0|%d0, %1}"
> +  [(set_attr "type" "sse")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "HF")
> +   (set_attr "avx_partial_xmm_update" "false,true")])
> +
>  (define_insn "*fop_xf_1_i387"
>    [(set (match_operand:XF 0 "register_operand" "=f,f")
>         (match_operator:XF 3 "binary_fp_operator"
> @@ -17230,6 +17259,19 @@
>    DONE;
>  })
>
> +(define_insn "rsqrthf2"
> +  [(set (match_operand:HF 0 "register_operand" "=v,v")
> +       (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "v,m")]
> +                  UNSPEC_RSQRT))]
> +  "TARGET_AVX512FP16"
> +  "@
> +   vrsqrtsh\t{%d1, %0|%0, %d1}
> +   vrsqrtsh\t{%1, %d0|%d0, %1}"
> +  [(set_attr "type" "sse")
> +   (set_attr "prefix" "evex")
> +   (set_attr "avx_partial_xmm_update" "false,true")
> +   (set_attr "mode" "HF")])
> +
>  (define_insn "sqrthf2"
>    [(set (match_operand:HF 0 "register_operand" "=v,v")
>         (sqrt:HF
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 431236ab3a4..0d87aeb75a1 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -2306,11 +2306,33 @@
>     (set_attr "mode" "<ssescalarmode>")])
>
>  (define_expand "div<mode>3"
> -  [(set (match_operand:VF2H 0 "register_operand")
> -       (div:VF2H (match_operand:VF2H 1 "register_operand")
> -                 (match_operand:VF2H 2 "vector_operand")))]
> +  [(set (match_operand:VF2 0 "register_operand")
> +       (div:VF2 (match_operand:VF2 1 "register_operand")
> +                 (match_operand:VF2 2 "vector_operand")))]
>    "TARGET_SSE2")
>
> +(define_expand "div<mode>3"
> +  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
> +       (div:VF_AVX512FP16VL
> +         (match_operand:VF_AVX512FP16VL 1 "register_operand")
> +         (match_operand:VF_AVX512FP16VL 2 "vector_operand")))]
> +  "TARGET_AVX512FP16"
> +{
> +  /* Transform HF vector div to vector mul/rcp.  */
> +  if (GET_MODE_INNER (<MODE>mode) == HFmode
> +      && TARGET_RECIP_VEC_DIV
> +      && optimize_insn_for_speed_p ()
> +      && flag_finite_math_only && !flag_trapping_math
> +      && flag_unsafe_math_optimizations)
> +    {
> +      rtx op = gen_reg_rtx (<MODE>mode);
> +      operands[2] = force_reg (<MODE>mode, operands[2]);
> +      emit_insn (gen_avx512fp16_rcp<mode>2 (op, operands[2]));
> +      emit_insn (gen_mul<mode>3 (operands[0], operands[1], op));
> +      DONE;
> +    }
> +})
> +
>  (define_expand "div<mode>3"
>    [(set (match_operand:VF1 0 "register_operand")
>         (div:VF1 (match_operand:VF1 1 "register_operand")
> @@ -2433,6 +2455,20 @@
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "HF")])
>
> +(define_insn "*avx512fp16_vmrcpv8hf2"
> +  [(set (match_operand:V8HF 0 "register_operand" "=v")
> +       (vec_merge:V8HF
> +         (vec_duplicate:V8HF
> +           (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "vm")]
> +                        UNSPEC_RCP))
> +         (match_operand:V8HF 2 "register_operand" "v")
> +         (const_int 1)))]
> +  "TARGET_AVX512FP16"
> +  "vrcpsh\t{%1, %2, %0|%0, %2, %w1}"
> +  [(set_attr "type" "sse")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "HF")])
> +
>  (define_insn "<mask_codefor>rcp14<mode><mask_name>"
>    [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
>         (unspec:VF_AVX512VL
> @@ -2558,6 +2594,13 @@
>    DONE;
>  })
>
> +(define_expand "rsqrt<mode>2"
> +  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
> +       (unspec:VF_AVX512FP16VL
> +         [(match_operand:VF_AVX512FP16VL 1 "vector_operand")]
> +         UNSPEC_RSQRT))]
> +  "TARGET_AVX512FP16")
> +
>  (define_insn "<sse>_rsqrt<mode>2"
>    [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
>         (unspec:VF1_128_256
> @@ -2666,6 +2709,20 @@
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "HF")])
>
> +(define_insn "*avx512fp16_vmrsqrtv8hf2"
> +  [(set (match_operand:V8HF 0 "register_operand" "=v")
> +       (vec_merge:V8HF
> +         (vec_duplicate:V8HF
> +           (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "vm")]
> +                      UNSPEC_RSQRT))
> +         (match_operand:V8HF 2 "register_operand" "v")
> +         (const_int 1)))]
> +  "TARGET_AVX512FP16"
> +  "vrsqrtsh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %w1}"
> +  [(set_attr "type" "sse")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "HF")])
> +
>  (define_expand "cond_<code><mode>"
>    [(set (match_operand:VFH 0 "register_operand")
>         (vec_merge:VFH
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c
> new file mode 100644
> index 00000000000..bc7cbbc11b9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c
> @@ -0,0 +1,43 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512fp16 -mavx512vl -ffast-math" } */
> +/* { dg-final { scan-assembler "vrcpsh.*\n.*vmulsh" } } */
> +/* { dg-final { scan-assembler "vrcpph.*\n.*vmulph" } } */
> +/* { dg-final { scan-assembler "vrsqrtsh.*\n.*vmulsh" } } */
> +/* { dg-final { scan-assembler "vrsqrtph.*\n.*vmulph" } } */
> +/* { dg-final { scan-assembler-not "vsqrtsh" } } */
> +/* { dg-final { scan-assembler-not "vsqrtph" } } */
> +/* { dg-final { scan-assembler-not "vdivsh" } } */
> +/* { dg-final { scan-assembler-not "vdivph" } } */
> +
> +#define FAST_ATTR \
> +  __attribute__((noinline, noclone, optimize("fast-math"), target("recip")))
> +
> +_Float16 FAST_ATTR
> +scalar_hf_rcp_fast (_Float16 a, _Float16 b)
> +{
> +  return a / b;
> +}
> +
> +_Float16 FAST_ATTR
> +scalar_hf_rsqrt_fast (_Float16 a, _Float16 b)
> +{
> +  return a / __builtin_sqrtf16 (b);
> +}
> +
> +void FAST_ATTR
> +vector_hf_rcp_fast (_Float16 * restrict a, _Float16 * restrict b,
> +                   _Float16 * restrict c, int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    c[i] = a[i] / b[i];
> +}
> +
> +void FAST_ATTR
> +vector_hf_rsqrt_fast (_Float16 * restrict a, _Float16 * restrict b,
> +                   _Float16 * restrict c, int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    c[i] = a[i] / __builtin_sqrtf16(b[i]);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c
> new file mode 100644
> index 00000000000..ed7e0a2225f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c
> @@ -0,0 +1,97 @@
> +/* { dg-do run { target avx512fp16 } } */
> +/* { dg-options "-O3 -mavx512fp16 -mavx512vl -ffast-math" } */
> +
> +static void recip_op_test (void);
> +#define DO_TEST recip_op_test
> +#define AVX512FP16
> +#define AVX512VL
> +#include "avx512f-check.h"
> +#include "avx512fp16-recip-1.c"
> +
> +_Float16 a[32], b[32], vexp[32], vref[32], sa, sb, sexp, sref;
> +
> +#define NO_FAST_ATTR  \
> +  __attribute__((noinline, noclone, \
> +                optimize("fast-math,trapping-math")))
> +
> +_Float16 NO_FAST_ATTR
> +scalar_hf_rcp_no_fast (_Float16 a, _Float16 b)
> +{
> +  return a / b;
> +}
> +
> +_Float16 NO_FAST_ATTR
> +scalar_hf_rsqrt_no_fast (_Float16 a, _Float16 b)
> +{
> +  return a / __builtin_sqrtf16 (b);
> +}
> +
> +void NO_FAST_ATTR
> +vector_hf_rcp_no_fast (_Float16 * restrict a, _Float16 * restrict b,
> +                   _Float16 * restrict c, int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    c[i] = a[i] / b[i];
> +}
> +
> +void NO_FAST_ATTR
> +vector_hf_rsqrt_no_fast (_Float16 * restrict a, _Float16 * restrict b,
> +                   _Float16 * restrict c, int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    c[i] = a[i] / __builtin_sqrtf16 (b[i]);
> +}
> +
> +void init()
> +{
> +  int i;
> +  sa = 3.75;
> +  sb = 6.25;
> +  sexp = sref = 2.75;
> +  for (i = 0; i < 32; i++)
> +    {
> +      a[i] = i + 0.5;
> +      b[i] = i * 1.5;
> +      vexp[i] = vref[i] = 2.75 * i;
> +    }
> +}
> +
> +int check_cond(void *a, void *b, int size)
> +{
> +  int i;
> +  unsigned short *pa = (unsigned short *)a,
> +                *pb = (unsigned short *)b;
> +  for (i = 0; i < size; i++)
> +    if (pa[i] != pb[i])
> +      return 0;
> +  return 1;
> +}
> +
> +static void recip_op_test()
> +{
> +  init ();
> +  sexp = scalar_hf_rcp_fast (sa, sb);
> +  sref = scalar_hf_rcp_no_fast (sa, sb);
> +  if (!check_cond (&sexp, &sref, 1))
> +    abort ();
> +
> +  init ();
> +  sexp = scalar_hf_rsqrt_fast (sa, sb);
> +  sref = scalar_hf_rsqrt_no_fast (sa, sb);
> +  if (!check_cond (&sexp, &sref, 1))
> +    abort ();
> +
> +  init ();
> +  vector_hf_rcp_fast (a, b, vexp, 32);
> +  vector_hf_rcp_no_fast (a, b, vref, 32);
> +  if (!check_cond (vexp, vref, 1))
> +    abort ();
> +
> +  init ();
> +  vector_hf_rsqrt_fast (a, b, vexp, 32);
> +  vector_hf_rsqrt_no_fast (a, b, vref, 32);
> +  if (!check_cond (vexp, vref, 1))
> +    abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr102464.c b/gcc/testsuite/gcc.target/i386/pr102464.c
> index e3e060ee80b..7e1fbdccf02 100644
> --- a/gcc/testsuite/gcc.target/i386/pr102464.c
> +++ b/gcc/testsuite/gcc.target/i386/pr102464.c
> @@ -1,6 +1,6 @@
>  /* PR target/102464.  */
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -mavx512fp16" } */
> +/* { dg-options "-O2 -mavx512fp16 -fno-trapping-math" } */
>
>  #define FOO(FUNC,SUFFIX)                       \
>    _Float16                                     \
> --
> 2.27.1
>

diff mbox series

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 299e1ab2621..c5789365d3b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18905,9 +18905,10 @@  ix86_vectorize_builtin_scatter (const_tree vectype,
    1.0/sqrt.  */
 
 static bool
-use_rsqrt_p ()
+use_rsqrt_p (machine_mode mode)
 {
-  return (TARGET_SSE && TARGET_SSE_MATH
+  return ((mode == HFmode
+	   || (TARGET_SSE && TARGET_SSE_MATH))
 	  && flag_finite_math_only
 	  && !flag_trapping_math
 	  && flag_unsafe_math_optimizations);
@@ -23603,29 +23604,27 @@  ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
       return opt_type == OPTIMIZE_FOR_SPEED;
 
     case rint_optab:
-      if (mode1 == HFmode)
-	return true;
-      else if (SSE_FLOAT_MODE_P (mode1)
-	       && TARGET_SSE_MATH
-	       && !flag_trapping_math
-	       && !TARGET_SSE4_1)
+      if (SSE_FLOAT_MODE_P (mode1)
+	  && TARGET_SSE_MATH
+	  && !flag_trapping_math
+	  && !TARGET_SSE4_1
+	  && mode1 != HFmode)
 	return opt_type == OPTIMIZE_FOR_SPEED;
       return true;
 
     case floor_optab:
     case ceil_optab:
     case btrunc_optab:
-      if (mode1 == HFmode)
-	return true;
-      else if (SSE_FLOAT_MODE_P (mode1)
-	       && TARGET_SSE_MATH
-	       && !flag_trapping_math
-	       && TARGET_SSE4_1)
+      if (((SSE_FLOAT_MODE_P (mode1)
+	    && TARGET_SSE_MATH
+	    && TARGET_SSE4_1)
+	   || mode1 == HFmode)
+	  && !flag_trapping_math)
 	return true;
       return opt_type == OPTIMIZE_FOR_SPEED;
 
     case rsqrt_optab:
-      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
+      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
 
     default:
       return true;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e733a40fc90..11535df5425 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -8417,11 +8417,27 @@ 
 		(match_operand:XF 2 "register_operand")))]
   "TARGET_80387")
 
+/* There is no more precision loss than Newton-Raphson approximation
+  when using HFmode rcp/rsqrt, so do the transformation directly under
+  TARGET_RECIP_DIV and fast-math.  */
 (define_expand "divhf3"
   [(set (match_operand:HF 0 "register_operand")
 	(div:HF (match_operand:HF 1 "register_operand")
 		   (match_operand:HF 2 "nonimmediate_operand")))]
-  "TARGET_AVX512FP16")
+  "TARGET_AVX512FP16"
+{
+  if (TARGET_RECIP_DIV
+      && optimize_insn_for_speed_p ()
+      && flag_finite_math_only && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      rtx op = gen_reg_rtx (HFmode);
+      operands[2] = force_reg (HFmode, operands[2]);
+      emit_insn (gen_rcphf2 (op, operands[2]));
+      emit_insn (gen_mulhf3 (operands[0], operands[1], op));
+      DONE;
+    }
+})
 
 (define_expand "div<mode>3"
   [(set (match_operand:MODEF 0 "register_operand")
@@ -16973,6 +16989,19 @@ 
 	    ]
 	    (symbol_ref "true")))])
 
+(define_insn "rcphf2"
+  [(set (match_operand:HF 0 "register_operand" "=v,v")
+	(unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "v,m")]
+		   UNSPEC_RCP))]
+  "TARGET_AVX512FP16"
+  "@
+   vrcpsh\t{%d1, %0|%0, %d1}
+   vrcpsh\t{%1, %d0|%d0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "HF")
+   (set_attr "avx_partial_xmm_update" "false,true")])
+
 (define_insn "*fop_xf_1_i387"
   [(set (match_operand:XF 0 "register_operand" "=f,f")
 	(match_operator:XF 3 "binary_fp_operator"
@@ -17230,6 +17259,19 @@ 
   DONE;
 })
 
+(define_insn "rsqrthf2"
+  [(set (match_operand:HF 0 "register_operand" "=v,v")
+	(unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "v,m")]
+		   UNSPEC_RSQRT))]
+  "TARGET_AVX512FP16"
+  "@
+   vrsqrtsh\t{%d1, %0|%0, %d1}
+   vrsqrtsh\t{%1, %d0|%d0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "avx_partial_xmm_update" "false,true")
+   (set_attr "mode" "HF")])
+
 (define_insn "sqrthf2"
   [(set (match_operand:HF 0 "register_operand" "=v,v")
 	(sqrt:HF
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 431236ab3a4..0d87aeb75a1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2306,11 +2306,33 @@ 
    (set_attr "mode" "<ssescalarmode>")])
 
 (define_expand "div<mode>3"
-  [(set (match_operand:VF2H 0 "register_operand")
-	(div:VF2H (match_operand:VF2H 1 "register_operand")
-		  (match_operand:VF2H 2 "vector_operand")))]
+  [(set (match_operand:VF2 0 "register_operand")
+	(div:VF2 (match_operand:VF2 1 "register_operand")
+		  (match_operand:VF2 2 "vector_operand")))]
   "TARGET_SSE2")
 
+(define_expand "div<mode>3"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+	(div:VF_AVX512FP16VL
+	  (match_operand:VF_AVX512FP16VL 1 "register_operand")
+	  (match_operand:VF_AVX512FP16VL 2 "vector_operand")))]
+  "TARGET_AVX512FP16"
+{
+  /* Transform HF vector div to vector mul/rcp.  */
+  if (GET_MODE_INNER (<MODE>mode) == HFmode
+      && TARGET_RECIP_VEC_DIV
+      && optimize_insn_for_speed_p ()
+      && flag_finite_math_only && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      rtx op = gen_reg_rtx (<MODE>mode);
+      operands[2] = force_reg (<MODE>mode, operands[2]);
+      emit_insn (gen_avx512fp16_rcp<mode>2 (op, operands[2]));
+      emit_insn (gen_mul<mode>3 (operands[0], operands[1], op));
+      DONE;
+    }
+})
+
 (define_expand "div<mode>3"
   [(set (match_operand:VF1 0 "register_operand")
 	(div:VF1 (match_operand:VF1 1 "register_operand")
@@ -2433,6 +2455,20 @@ 
    (set_attr "prefix" "evex")
    (set_attr "mode" "HF")])
 
+(define_insn "*avx512fp16_vmrcpv8hf2"
+  [(set (match_operand:V8HF 0 "register_operand" "=v")
+	(vec_merge:V8HF
+	  (vec_duplicate:V8HF
+	    (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "vm")]
+			 UNSPEC_RCP))
+	  (match_operand:V8HF 2 "register_operand" "v")
+	  (const_int 1)))]
+  "TARGET_AVX512FP16"
+  "vrcpsh\t{%1, %2, %0|%0, %2, %w1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "HF")])
+
 (define_insn "<mask_codefor>rcp14<mode><mask_name>"
   [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
 	(unspec:VF_AVX512VL
@@ -2558,6 +2594,13 @@ 
   DONE;
 })
 
+(define_expand "rsqrt<mode>2"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+	(unspec:VF_AVX512FP16VL
+	  [(match_operand:VF_AVX512FP16VL 1 "vector_operand")]
+	  UNSPEC_RSQRT))]
+  "TARGET_AVX512FP16")
+
 (define_insn "<sse>_rsqrt<mode>2"
   [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
 	(unspec:VF1_128_256
@@ -2666,6 +2709,20 @@ 
    (set_attr "prefix" "evex")
    (set_attr "mode" "HF")])
 
+(define_insn "*avx512fp16_vmrsqrtv8hf2"
+  [(set (match_operand:V8HF 0 "register_operand" "=v")
+	(vec_merge:V8HF
+	  (vec_duplicate:V8HF
+	    (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "vm")]
+		       UNSPEC_RSQRT))
+	  (match_operand:V8HF 2 "register_operand" "v")
+	  (const_int 1)))]
+  "TARGET_AVX512FP16"
+  "vrsqrtsh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %w1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "HF")])
+
 (define_expand "cond_<code><mode>"
   [(set (match_operand:VFH 0 "register_operand")
 	(vec_merge:VFH
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c
new file mode 100644
index 00000000000..bc7cbbc11b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c
@@ -0,0 +1,43 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512fp16 -mavx512vl -ffast-math" } */
+/* { dg-final { scan-assembler "vrcpsh.*\n.*vmulsh" } } */
+/* { dg-final { scan-assembler "vrcpph.*\n.*vmulph" } } */
+/* { dg-final { scan-assembler "vrsqrtsh.*\n.*vmulsh" } } */
+/* { dg-final { scan-assembler "vrsqrtph.*\n.*vmulph" } } */
+/* { dg-final { scan-assembler-not "vsqrtsh" } } */
+/* { dg-final { scan-assembler-not "vsqrtph" } } */
+/* { dg-final { scan-assembler-not "vdivsh" } } */
+/* { dg-final { scan-assembler-not "vdivph" } } */
+
+#define FAST_ATTR \
+  __attribute__((noinline, noclone, optimize("fast-math"), target("recip")))
+
+_Float16 FAST_ATTR
+scalar_hf_rcp_fast (_Float16 a, _Float16 b)
+{
+  return a / b;
+}
+
+_Float16 FAST_ATTR
+scalar_hf_rsqrt_fast (_Float16 a, _Float16 b)
+{
+  return a / __builtin_sqrtf16 (b);
+}
+
+void FAST_ATTR
+vector_hf_rcp_fast (_Float16 * restrict a, _Float16 * restrict b,
+		    _Float16 * restrict c, int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    c[i] = a[i] / b[i];
+}
+
+void FAST_ATTR
+vector_hf_rsqrt_fast (_Float16 * restrict a, _Float16 * restrict b,
+		    _Float16 * restrict c, int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    c[i] = a[i] / __builtin_sqrtf16(b[i]);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c
new file mode 100644
index 00000000000..ed7e0a2225f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c
@@ -0,0 +1,97 @@ 
+/* { dg-do run { target avx512fp16 } } */
+/* { dg-options "-O3 -mavx512fp16 -mavx512vl -ffast-math" } */
+
+static void recip_op_test (void);
+#define DO_TEST recip_op_test
+#define AVX512FP16
+#define AVX512VL
+#include "avx512f-check.h"
+#include "avx512fp16-recip-1.c"
+
+_Float16 a[32], b[32], vexp[32], vref[32], sa, sb, sexp, sref;
+
+#define NO_FAST_ATTR  \
+  __attribute__((noinline, noclone, \
+		 optimize("fast-math,trapping-math")))
+
+_Float16 NO_FAST_ATTR
+scalar_hf_rcp_no_fast (_Float16 a, _Float16 b)
+{
+  return a / b;
+}
+
+_Float16 NO_FAST_ATTR
+scalar_hf_rsqrt_no_fast (_Float16 a, _Float16 b)
+{
+  return a / __builtin_sqrtf16 (b);
+}
+
+void NO_FAST_ATTR
+vector_hf_rcp_no_fast (_Float16 * restrict a, _Float16 * restrict b,
+		    _Float16 * restrict c, int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    c[i] = a[i] / b[i];
+}
+
+void NO_FAST_ATTR
+vector_hf_rsqrt_no_fast (_Float16 * restrict a, _Float16 * restrict b,
+		    _Float16 * restrict c, int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    c[i] = a[i] / __builtin_sqrtf16 (b[i]);
+}
+
+void init()
+{
+  int i;
+  sa = 3.75;
+  sb = 6.25;
+  sexp = sref = 2.75;
+  for (i = 0; i < 32; i++)
+    {
+      a[i] = i + 0.5; 
+      b[i] = i * 1.5;
+      vexp[i] = vref[i] = 2.75 * i;
+    }
+}
+
+int check_cond(void *a, void *b, int size)
+{
+  int i;
+  unsigned short *pa = (unsigned short *)a,
+		 *pb = (unsigned short *)b;
+  for (i = 0; i < size; i++)
+    if (pa[i] != pb[i])
+      return 0;
+  return 1;
+}
+
+static void recip_op_test()
+{
+  init ();
+  sexp = scalar_hf_rcp_fast (sa, sb);
+  sref = scalar_hf_rcp_no_fast (sa, sb);
+  if (!check_cond (&sexp, &sref, 1))
+    abort ();
+
+  init ();
+  sexp = scalar_hf_rsqrt_fast (sa, sb);
+  sref = scalar_hf_rsqrt_no_fast (sa, sb);
+  if (!check_cond (&sexp, &sref, 1))
+    abort ();
+
+  init ();
+  vector_hf_rcp_fast (a, b, vexp, 32);
+  vector_hf_rcp_no_fast (a, b, vref, 32);
+  if (!check_cond (vexp, vref, 1))
+    abort ();
+
+  init ();
+  vector_hf_rsqrt_fast (a, b, vexp, 32);
+  vector_hf_rsqrt_no_fast (a, b, vref, 32);
+  if (!check_cond (vexp, vref, 1))
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr102464.c b/gcc/testsuite/gcc.target/i386/pr102464.c
index e3e060ee80b..7e1fbdccf02 100644
--- a/gcc/testsuite/gcc.target/i386/pr102464.c
+++ b/gcc/testsuite/gcc.target/i386/pr102464.c
@@ -1,6 +1,6 @@ 
 /* PR target/102464.  */
 /* { dg-do compile } */
-/* { dg-options "-O2 -mavx512fp16" } */
+/* { dg-options "-O2 -mavx512fp16 -fno-trapping-math" } */
 
 #define FOO(FUNC,SUFFIX)                       \
   _Float16                                     \