Optimize for V{8,16,32}HFmode vec_set/extract/init.

Message ID 20210915093624.1305991-1-hongtao.liu@intel.com
State New
Headers
Series Optimize for V{8,16,32}HFmode vec_set/extract/init. |

Commit Message

liuhongt Sept. 15, 2021, 9:36 a.m. UTC
  Hi:
  The optimization is described in the PR.

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  All avx512fp16 runtest cases passed on SPR.

gcc/ChangeLog:

	PR target/102327
	* config/i386/i386-expand.c
	(ix86_expand_vector_init_interleave): Use punpcklwd to pack 2
	HFmodes.
	(ix86_expand_vector_set): Use blendw instead of pinsrw.
	* config/i386/i386.c (ix86_can_change_mode_class): Adjust for
	AVX512FP16 which supports 16bit vector load.
	* config/i386/sse.md (avx512bw_interleave_highv32hi<mask_name>):
	Rename to ..
	(avx512bw_interleave_high<mode><mask_name>): .. this, and
	extend to V32HFmode.
	(avx2_interleave_highv16hi<mask_name>): Rename to ..
	(avx2_interleave_high<mode><mask_name>): .. this, and extend
	to V16HFmode.
	(vec_interleave_highv8hi<mask_name>): Rename to ..
	(vec_interleave_high<mode><mask_name>): .. this, and extend to V8HFmode.
	(<mask_codefor>avx512bw_interleave_lowv32hi<mask_name>):
	Rename to ..
	(<mask_codefor>avx512bw_interleave_low<mode><mask_name>):
	.. this, and extend to V32HFmode.
	(avx2_interleave_lowv16hi<mask_name>): Rename to ..
	(avx2_interleave_low<mode><mask_name>): .. this, and extend to V16HFmode.
	(vec_interleave_lowv8hi<mask_name>): Rename to ..
	(vec_interleave_low<mode><mask_name>): .. this, and extend to V8HFmode.
	(sse4_1_pblendw): Rename to ..
	(sse4_1_pblend<blendsuf>): .. this, and extend to V8HFmode.
	(avx2_pblendph): New define_expand.
	(<sse2p4_1>_pinsr<ssemodesuffix>): Refactor, use
	sseintmodesuffix instead of ssemodesuffix.
	(blendsuf): New mode attr.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr102327-1.c: New test.
	* gcc.target/i386/pr102327-2.c: New test.
	* gcc.target/i386/avx512fp16-1c.c: Adjust testcase.
---
 gcc/config/i386/i386-expand.c                 |  95 ++++++----
 gcc/config/i386/i386.c                        |   7 +-
 gcc/config/i386/sse.md                        | 176 ++++++++++++------
 gcc/testsuite/gcc.target/i386/avx512fp16-1c.c |   6 +-
 gcc/testsuite/gcc.target/i386/pr102327-1.c    |  65 +++++++
 gcc/testsuite/gcc.target/i386/pr102327-2.c    |  95 ++++++++++
 6 files changed, 343 insertions(+), 101 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102327-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102327-2.c
  

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index e117afb16b8..c82b6accf1b 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -14054,7 +14054,7 @@  ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
 	  tmp1 = gen_reg_rtx (SImode);
 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
 
-	  /* Insert the SImode value as low element of a V4SImode vector. */
+	  /* Insert the SImode value as low element of a V4SImode vector.  */
 	  tmp2 = gen_reg_rtx (V4SImode);
 	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
 	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
@@ -14638,7 +14638,7 @@  ix86_expand_vector_init_interleave (machine_mode mode,
   switch (mode)
     {
     case E_V8HFmode:
-      gen_load_even = gen_vec_setv8hf;
+      gen_load_even = gen_vec_interleave_lowv8hf;
       gen_interleave_first_low = gen_vec_interleave_lowv4si;
       gen_interleave_second_low = gen_vec_interleave_lowv2di;
       inner_mode = HFmode;
@@ -14673,35 +14673,40 @@  ix86_expand_vector_init_interleave (machine_mode mode,
       op = ops [i + i];
       if (inner_mode == HFmode)
 	{
-	  /* Convert HFmode to HImode.  */
-	  op1 = gen_reg_rtx (HImode);
-	  op1 = gen_rtx_SUBREG (HImode, force_reg (HFmode, op), 0);
-	  op = gen_reg_rtx (HImode);
-	  emit_move_insn (op, op1);
+	  rtx even, odd;
+	  /* Use vpunpcklwd to pack 2 HFmode.  */
+	  op0 = gen_reg_rtx (V8HFmode);
+	  even = lowpart_subreg (V8HFmode, force_reg (HFmode, op), HFmode);
+	  odd = lowpart_subreg (V8HFmode,
+				force_reg (HFmode, ops[i + i + 1]),
+				HFmode);
+	  emit_insn (gen_load_even (op0, even, odd));
 	}
+      else
+	{
+	  /* Extend the odd element to SImode using a paradoxical SUBREG.  */
+	  op0 = gen_reg_rtx (SImode);
+	  emit_move_insn (op0, gen_lowpart (SImode, op));
 
-      /* Extend the odd elment to SImode using a paradoxical SUBREG.  */
-      op0 = gen_reg_rtx (SImode);
-      emit_move_insn (op0, gen_lowpart (SImode, op));
-
-      /* Insert the SImode value as low element of V4SImode vector. */
-      op1 = gen_reg_rtx (V4SImode);
-      op0 = gen_rtx_VEC_MERGE (V4SImode,
-			       gen_rtx_VEC_DUPLICATE (V4SImode,
-						      op0),
-			       CONST0_RTX (V4SImode),
-			       const1_rtx);
-      emit_insn (gen_rtx_SET (op1, op0));
+	  /* Insert the SImode value as low element of V4SImode vector.  */
+	  op1 = gen_reg_rtx (V4SImode);
+	  op0 = gen_rtx_VEC_MERGE (V4SImode,
+				   gen_rtx_VEC_DUPLICATE (V4SImode,
+							  op0),
+				   CONST0_RTX (V4SImode),
+				   const1_rtx);
+	  emit_insn (gen_rtx_SET (op1, op0));
 
-      /* Cast the V4SImode vector back to a vector in orignal mode.  */
-      op0 = gen_reg_rtx (mode);
-      emit_move_insn (op0, gen_lowpart (mode, op1));
+	  /* Cast the V4SImode vector back to a vector in original mode.  */
+	  op0 = gen_reg_rtx (mode);
+	  emit_move_insn (op0, gen_lowpart (mode, op1));
 
-      /* Load even elements into the second position.  */
-      emit_insn (gen_load_even (op0,
-				force_reg (inner_mode,
-					   ops [i + i + 1]),
-				const1_rtx));
+	  /* Load even elements into the second position.  */
+	  emit_insn (gen_load_even (op0,
+				    force_reg (inner_mode,
+					       ops[i + i + 1]),
+				    const1_rtx));
+	}
 
       /* Cast vector to FIRST_IMODE vector.  */
       ops[i] = gen_reg_rtx (first_imode);
@@ -15182,6 +15187,7 @@  ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
   machine_mode inner_mode = GET_MODE_INNER (mode);
   machine_mode half_mode;
   bool use_vec_merge = false;
+  bool blendm_const = false;
   rtx tmp;
   static rtx (*gen_extract[7][2]) (rtx, rtx)
     = {
@@ -15369,7 +15375,14 @@  ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
       return;
 
     case E_V8HFmode:
-      use_vec_merge = true;
+      if (TARGET_AVX2)
+	{
+	  mmode = SImode;
+	  gen_blendm = gen_sse4_1_pblendph;
+	  blendm_const = true;
+	}
+      else
+	use_vec_merge = true;
       break;
 
     case E_V8HImode:
@@ -15396,10 +15409,20 @@  ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
       goto half;
 
     case E_V16HFmode:
-      half_mode = V8HFmode;
-      j = 6;
-      n = 8;
-      goto half;
+      if (TARGET_AVX2)
+	{
+	  mmode = SImode;
+	  gen_blendm = gen_avx2_pblendph;
+	  blendm_const = true;
+	  break;
+	}
+      else
+	{
+	  half_mode = V8HFmode;
+	  j = 6;
+	  n = 8;
+	  goto half;
+	}
 
     case E_V16HImode:
       half_mode = V8HImode;
@@ -15560,15 +15583,15 @@  quarter:
     {
       tmp = gen_reg_rtx (mode);
       emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
+      rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
       /* The avx512*_blendm<mode> expanders have different operand order
 	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
 	 elements where the mask is set and second input operand otherwise,
 	 in {sse,avx}*_*blend* the first input operand is used for elements
 	 where the mask is clear and second input operand otherwise.  */
-      emit_insn (gen_blendm (target, target, tmp,
-			     force_reg (mmode,
-					gen_int_mode (HOST_WIDE_INT_1U << elt,
-						      mmode))));
+      if (!blendm_const)
+	merge_mask = force_reg (mmode, merge_mask);
+      emit_insn (gen_blendm (target, target, tmp, merge_mask));
     }
   else if (use_vec_merge)
     {
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7b173bc0beb..d7abff0f396 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19443,8 +19443,11 @@  ix86_can_change_mode_class (machine_mode from, machine_mode to,
       /* Vector registers do not support QI or HImode loads.  If we don't
 	 disallow a change to these modes, reload will assume it's ok to
 	 drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
-	 the vec_dupv4hi pattern.  */
-      if (GET_MODE_SIZE (from) < 4)
+	 the vec_dupv4hi pattern.
+	 NB: AVX512FP16 supports vmovw which can load 16bit data to sse
+	 register.  */
+      int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_AVX512FP16 ? 2 : 4;
+      if (GET_MODE_SIZE (from) < mov_size)
 	return false;
     }
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 516eb4544bc..9c3a4a9809e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -806,6 +806,7 @@  (define_mode_iterator VF_AVX512
    (V8SF "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL")
    V16SF V8DF])
 
+(define_mode_iterator V8_128 [V8HI V8HF])
 (define_mode_iterator V16_256 [V16HI V16HF])
 (define_mode_iterator V32_512 [V32HI V32HF])
 
@@ -9891,16 +9892,33 @@  (define_insn_and_split "*vec_extract<mode>_0"
   "operands[1] = gen_lowpart (HFmode, operands[1]);")
 
 (define_insn "*vec_extracthf"
-  [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=r,m")
+  [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=*r,m,x,v")
 	(vec_select:HF
-	  (match_operand:V8HF 1 "register_operand" "v,v")
+	  (match_operand:V8HF 1 "register_operand" "v,v,0,v")
 	  (parallel
 	    [(match_operand:SI 2 "const_0_to_7_operand")])))]
   "TARGET_SSE2"
-  "@
-   vpextrw\t{%2, %1, %k0|%k0, %1, %2}
-   vpextrw\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "sselog1")
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "vpextrw\t{%2, %1, %k0|%k0, %1, %2}";
+    case 1:
+      return "vpextrw\t{%2, %1, %0|%0, %1, %2}";
+
+    case 2:
+      operands[2] = GEN_INT (INTVAL (operands[2]) * 2);
+      return "psrldq\t{%2, %0|%0, %2}";
+    case 3:
+      operands[2] = GEN_INT (INTVAL (operands[2]) * 2);
+      return "vpsrldq\t{%2, %1, %0|%0, %1, %2}";
+
+    default:
+      gcc_unreachable ();
+   }
+}
+  [(set_attr "isa" "*,*,noavx,avx")
+   (set_attr "type" "sselog1,sselog1,sseishft1,sseishft1")
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "TI")])
 
@@ -15359,12 +15377,12 @@  (define_insn "vec_interleave_lowv16qi<mask_name>"
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
-(define_insn "avx512bw_interleave_highv32hi<mask_name>"
-  [(set (match_operand:V32HI 0 "register_operand" "=v")
-	(vec_select:V32HI
-	  (vec_concat:V64HI
-	    (match_operand:V32HI 1 "register_operand" "v")
-	    (match_operand:V32HI 2 "nonimmediate_operand" "vm"))
+(define_insn "avx512bw_interleave_high<mode><mask_name>"
+  [(set (match_operand:V32_512 0 "register_operand" "=v")
+	(vec_select:V32_512
+	  (vec_concat:<ssedoublevecmode>
+	    (match_operand:V32_512 1 "register_operand" "v")
+	    (match_operand:V32_512 2 "nonimmediate_operand" "vm"))
 	  (parallel [(const_int 4) (const_int 36)
 		     (const_int 5) (const_int 37)
 		     (const_int 6) (const_int 38)
@@ -15387,12 +15405,12 @@  (define_insn "avx512bw_interleave_highv32hi<mask_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "XI")])
 
-(define_insn "avx2_interleave_highv16hi<mask_name>"
-  [(set (match_operand:V16HI 0 "register_operand" "=Yw")
-	(vec_select:V16HI
-	  (vec_concat:V32HI
-	    (match_operand:V16HI 1 "register_operand" "Yw")
-	    (match_operand:V16HI 2 "nonimmediate_operand" "Ywm"))
+(define_insn "avx2_interleave_high<mode><mask_name>"
+  [(set (match_operand:V16_256 0 "register_operand" "=Yw")
+	(vec_select:V16_256
+	  (vec_concat:<ssedoublevecmode>
+	    (match_operand:V16_256 1 "register_operand" "Yw")
+	    (match_operand:V16_256 2 "nonimmediate_operand" "Ywm"))
 	  (parallel [(const_int 4) (const_int 20)
 		     (const_int 5) (const_int 21)
 		     (const_int 6) (const_int 22)
@@ -15407,12 +15425,12 @@  (define_insn "avx2_interleave_highv16hi<mask_name>"
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "OI")])
 
-(define_insn "vec_interleave_highv8hi<mask_name>"
-  [(set (match_operand:V8HI 0 "register_operand" "=x,Yw")
-	(vec_select:V8HI
-	  (vec_concat:V16HI
-	    (match_operand:V8HI 1 "register_operand" "0,Yw")
-	    (match_operand:V8HI 2 "vector_operand" "xBm,Ywm"))
+(define_insn "vec_interleave_high<mode><mask_name>"
+  [(set (match_operand:V8_128 0 "register_operand" "=x,Yw")
+	(vec_select:V8_128
+	  (vec_concat:<ssedoublevecmode>
+	    (match_operand:V8_128 1 "register_operand" "0,Yw")
+	    (match_operand:V8_128 2 "vector_operand" "xBm,Ywm"))
 	  (parallel [(const_int 4) (const_int 12)
 		     (const_int 5) (const_int 13)
 		     (const_int 6) (const_int 14)
@@ -15427,12 +15445,12 @@  (define_insn "vec_interleave_highv8hi<mask_name>"
    (set_attr "prefix" "orig,maybe_vex")
    (set_attr "mode" "TI")])
 
-(define_insn "<mask_codefor>avx512bw_interleave_lowv32hi<mask_name>"
-  [(set (match_operand:V32HI 0 "register_operand" "=v")
-	(vec_select:V32HI
-	  (vec_concat:V64HI
-	    (match_operand:V32HI 1 "register_operand" "v")
-	    (match_operand:V32HI 2 "nonimmediate_operand" "vm"))
+(define_insn "<mask_codefor>avx512bw_interleave_low<mode><mask_name>"
+  [(set (match_operand:V32_512 0 "register_operand" "=v")
+	(vec_select:V32_512
+	  (vec_concat:<ssedoublevecmode>
+	    (match_operand:V32_512 1 "register_operand" "v")
+	    (match_operand:V32_512 2 "nonimmediate_operand" "vm"))
 	  (parallel [(const_int 0) (const_int 32)
 		     (const_int 1) (const_int 33)
 		     (const_int 2) (const_int 34)
@@ -15455,12 +15473,12 @@  (define_insn "<mask_codefor>avx512bw_interleave_lowv32hi<mask_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "XI")])
 
-(define_insn "avx2_interleave_lowv16hi<mask_name>"
-  [(set (match_operand:V16HI 0 "register_operand" "=Yw")
-	(vec_select:V16HI
-	  (vec_concat:V32HI
-	    (match_operand:V16HI 1 "register_operand" "Yw")
-	    (match_operand:V16HI 2 "nonimmediate_operand" "Ywm"))
+(define_insn "avx2_interleave_low<mode><mask_name>"
+  [(set (match_operand:V16_256 0 "register_operand" "=Yw")
+	(vec_select:V16_256
+	  (vec_concat:<ssedoublevecmode>
+	    (match_operand:V16_256 1 "register_operand" "Yw")
+	    (match_operand:V16_256 2 "nonimmediate_operand" "Ywm"))
 	  (parallel [(const_int 0) (const_int 16)
 		     (const_int 1) (const_int 17)
 		     (const_int 2) (const_int 18)
@@ -15475,12 +15493,12 @@  (define_insn "avx2_interleave_lowv16hi<mask_name>"
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "OI")])
 
-(define_insn "vec_interleave_lowv8hi<mask_name>"
-  [(set (match_operand:V8HI 0 "register_operand" "=x,Yw")
-	(vec_select:V8HI
-	  (vec_concat:V16HI
-	    (match_operand:V8HI 1 "register_operand" "0,Yw")
-	    (match_operand:V8HI 2 "vector_operand" "xBm,Ywm"))
+(define_insn "vec_interleave_low<mode><mask_name>"
+  [(set (match_operand:V8_128 0 "register_operand" "=x,Yw")
+	(vec_select:V8_128
+	  (vec_concat:<ssedoublevecmode>
+	    (match_operand:V8_128 1 "register_operand" "0,Yw")
+	    (match_operand:V8_128 2 "vector_operand" "xBm,Ywm"))
 	  (parallel [(const_int 0) (const_int 8)
 		     (const_int 1) (const_int 9)
 		     (const_int 2) (const_int 10)
@@ -15655,6 +15673,7 @@  (define_mode_attr pinsr_evex_isa
    (V4SI "avx512dq") (V2DI "avx512dq")])
 
 ;; sse4_1_pinsrd must come before sse2_loadld since it is preferred.
+;; For V8HFmode and TARGET_AVX2, broadcastw + pblendw should be better.
 (define_insn "<sse2p4_1>_pinsr<ssemodesuffix>"
   [(set (match_operand:PINSR_MODE 0 "register_operand" "=x,x,x,x,v,v")
 	(vec_merge:PINSR_MODE
@@ -15664,7 +15683,8 @@  (define_insn "<sse2p4_1>_pinsr<ssemodesuffix>"
 	  (match_operand:SI 3 "const_int_operand")))]
   "TARGET_SSE2
    && ((unsigned) exact_log2 (INTVAL (operands[3]))
-       < GET_MODE_NUNITS (<MODE>mode))"
+       < GET_MODE_NUNITS (<MODE>mode))
+   && !(<MODE>mode == V8HFmode && TARGET_AVX2)"
 {
   operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])));
 
@@ -15672,26 +15692,18 @@  (define_insn "<sse2p4_1>_pinsr<ssemodesuffix>"
     {
     case 0:
       if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode))
-	return "pinsr<ssemodesuffix>\t{%3, %k2, %0|%0, %k2, %3}";
+	return "pinsr<sseintmodesuffix>\t{%3, %k2, %0|%0, %k2, %3}";
       /* FALLTHRU */
     case 1:
-      return "pinsr<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}";
+      return "pinsr<sseintmodesuffix>\t{%3, %2, %0|%0, %2, %3}";
     case 2:
     case 4:
       if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode))
-	{
-	  if (<MODE>mode == V8HFmode)
-	    return "vpinsrw\t{%3, %k2, %1, %0|%0, %1, %k2, %3}";
-	  else
-	    return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}";
-	}
+	return "vpinsr<sseintmodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}";
       /* FALLTHRU */
     case 3:
     case 5:
-      if (<MODE>mode == V8HFmode)
-	return "vpinsrw\t{%3, %2, %1, %0|%0, %1, %2, %3}";
-      else
-	return "vpinsr<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+      return "vpinsr<sseintmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
     default:
       gcc_unreachable ();
     }
@@ -19179,11 +19191,14 @@  (define_insn_and_split "*<sse4_1_avx2>_pblendvb_lt_subreg_not"
 	  (lt:VI1_AVX2 (match_dup 3) (match_dup 4))] UNSPEC_BLENDV))]
   "operands[3] = gen_lowpart (<MODE>mode, operands[3]);")
 
-(define_insn "sse4_1_pblendw"
-  [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,x")
-	(vec_merge:V8HI
-	  (match_operand:V8HI 2 "vector_operand" "YrBm,*xBm,xm")
-	  (match_operand:V8HI 1 "register_operand" "0,0,x")
+(define_mode_attr blendsuf
+  [(V8HI "w") (V8HF "ph")])
+
+(define_insn "sse4_1_pblend<blendsuf>"
+  [(set (match_operand:V8_128 0 "register_operand" "=Yr,*x,x")
+	(vec_merge:V8_128
+	  (match_operand:V8_128 2 "vector_operand" "YrBm,*xBm,xm")
+	  (match_operand:V8_128 1 "register_operand" "0,0,x")
 	  (match_operand:SI 3 "const_0_to_255_operand" "n,n,n")))]
   "TARGET_SSE4_1"
   "@
@@ -19210,6 +19225,47 @@  (define_expand "avx2_pblendw"
   operands[3] = GEN_INT (val << 8 | val);
 })
 
+(define_expand "avx2_pblendph"
+  [(set (match_operand:V16HF 0 "register_operand")
+	(vec_merge:V16HF
+	  (match_operand:V16HF 2 "register_operand")
+	  (match_operand:V16HF 1 "register_operand")
+	  (match_operand:SI 3 "const_int_operand")))]
+  "TARGET_AVX2
+  && !((INTVAL (operands[3]) & 0xff) && (INTVAL (operands[3]) & 0xff00))"
+{
+  int mask = INTVAL (operands[3]);
+  /* Mask bits select operand 2; with none set the result is operand 1.  */
+  if (mask == 0)
+    emit_move_insn (operands[0], operands[1]);
+  else
+   {
+     rtx tmp = gen_reg_rtx (V16HImode);
+     rtx blendw_idx, blendd_idx;
+     /* The 8-bit word mask is applied to both lanes; pblendd keeps one.  */
+     if (mask & 0xff)
+       {
+	 blendw_idx = GEN_INT (mask & 0xff);
+	 blendd_idx = GEN_INT (15);
+       }
+     else
+       {
+	 blendw_idx = GEN_INT (mask >> 8 & 0xff);
+	 blendd_idx = GEN_INT (240);
+       }
+     operands[1] = lowpart_subreg (V16HImode, operands[1], V16HFmode);
+     operands[2] = lowpart_subreg (V16HImode, operands[2], V16HFmode);
+     emit_insn (gen_avx2_pblendw (tmp, operands[1], operands[2], blendw_idx));
+     operands[0] = lowpart_subreg (V8SImode, operands[0], V16HFmode);
+     tmp = lowpart_subreg (V8SImode, tmp, V16HImode);
+     operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);
+     emit_insn (gen_avx2_pblenddv8si (operands[0], operands[1],
+				      tmp, blendd_idx));
+  }
+
+  DONE;
+})
+
 (define_insn "*avx2_pblendw"
   [(set (match_operand:V16HI 0 "register_operand" "=x")
 	(vec_merge:V16HI
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-1c.c
index 49fc2aa42e2..b41a90b7c9d 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-1c.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-1c.c
@@ -1,8 +1,8 @@ 
 /* { dg-do compile } */
 /* { dg-options "-mavx512fp16 -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovsh|vmovw)" 2 { target { ! ia32 } } } }  */
-/* { dg-final { scan-assembler-times "vpinsrw" 1 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vpinsrw" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovsh" 1 } }  */
+/* { dg-final { scan-assembler-times "vpblendw" 1 } } */
+/* { dg-final { scan-assembler "vpbroadcastw" } }  */
 
 typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
 typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__));
diff --git a/gcc/testsuite/gcc.target/i386/pr102327-1.c b/gcc/testsuite/gcc.target/i386/pr102327-1.c
new file mode 100644
index 00000000000..47439261b61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102327-1.c
@@ -0,0 +1,65 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
+
+typedef _Float16 v8hf __attribute__((vector_size (16)));
+typedef _Float16 v16hf __attribute__((vector_size (32)));
+typedef _Float16 v32hf __attribute__((vector_size (64)));
+
+#define VEC_EXTRACT(V,S,IDX)			\
+  S						\
+  __attribute__((noipa))			\
+  vec_extract_##V##_##IDX (V v)			\
+  {						\
+    return v[IDX];				\
+  }
+
+#define VEC_SET(V,S,IDX)			\
+  V						\
+  __attribute__((noipa))			\
+  vec_set_##V##_##IDX (V v, S s)		\
+  {						\
+    v[IDX] = s;				\
+    return v;					\
+  }
+
+v8hf
+vec_init_v8hf (_Float16 a1, _Float16 a2, _Float16 a3, _Float16 a4, _Float16 a5,
+_Float16 a6, _Float16 a7, _Float16 a8)
+{
+    return __extension__ (v8hf) {a1, a2, a3, a4, a5, a6, a7, a8};
+}
+
+/* { dg-final { scan-assembler-times "vpunpcklwd" 4 } } */
+/* { dg-final { scan-assembler-times "vpunpckldq" 2 } } */
+/* { dg-final { scan-assembler-times "vpunpcklqdq" 1 } } */
+
+VEC_EXTRACT (v8hf, _Float16, 4);
+VEC_EXTRACT (v16hf, _Float16, 3);
+VEC_EXTRACT (v16hf, _Float16, 8);
+VEC_EXTRACT (v16hf, _Float16, 15);
+VEC_EXTRACT (v32hf, _Float16, 5);
+VEC_EXTRACT (v32hf, _Float16, 8);
+VEC_EXTRACT (v32hf, _Float16, 14);
+VEC_EXTRACT (v32hf, _Float16, 16);
+VEC_EXTRACT (v32hf, _Float16, 24);
+VEC_EXTRACT (v32hf, _Float16, 28);
+/* { dg-final { scan-assembler-times "vpsrldq\[\t ]*\\\$8" 2 } } */
+/* { dg-final { scan-assembler-times "vpsrldq\[\t ]*\\\$6" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrldq\[\t ]*\\\$14" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrldq\[\t ]*\\\$10" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrldq\[\t ]*\\\$12" 1 } } */
+/* { dg-final { scan-assembler-times "vextract" 9 } } */
+
+VEC_SET (v8hf, _Float16, 4);
+VEC_SET (v16hf, _Float16, 3);
+VEC_SET (v16hf, _Float16, 8);
+VEC_SET (v16hf, _Float16, 15);
+VEC_SET (v32hf, _Float16, 5);
+VEC_SET (v32hf, _Float16, 8);
+VEC_SET (v32hf, _Float16, 14);
+VEC_SET (v32hf, _Float16, 16);
+VEC_SET (v32hf, _Float16, 24);
+VEC_SET (v32hf, _Float16, 28);
+/* { dg-final { scan-assembler-times "vpbroadcastw" 10 } } */
+/* { dg-final { scan-assembler-times "vpblendw" 4 } } */
+/* { dg-final { scan-assembler-times "vpblendd" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102327-2.c b/gcc/testsuite/gcc.target/i386/pr102327-2.c
new file mode 100644
index 00000000000..363e4b65404
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102327-2.c
@@ -0,0 +1,95 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512vl -mavx512fp16" } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#define AVX512VL
+#define AVX512FP16
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+#include "pr102327-1.c"
+
+#define RUNCHECK_VEC_EXTRACT(U,V,S,IDX)		\
+  do						\
+    {						\
+      S tmp = vec_extract_##V##_##IDX ((V)U.x);	\
+      if (tmp != U.a[IDX])			\
+	abort();				\
+    }						\
+  while (0)
+
+#define RUNCHECK_VEC_SET(UTYPE,U,V,S,IDX,NUM)		\
+  do							\
+    {							\
+      S tmp = 3.0f;					\
+      UTYPE res;					\
+      res.x = vec_set_##V##_##IDX ((V)U.x, tmp);	\
+      for (int i = 0; i != NUM; i++)			\
+	if (i == IDX)					\
+	  {						\
+	    if (res.a[i] != tmp)			\
+	      abort ();					\
+	  }						\
+	else if (res.a[i] != U.a[i])			\
+	  abort();					\
+    }							\
+  while (0)
+
+void
+test_256 (void)
+{
+  union512h g1;
+  union256h t1;
+  union128h x1;
+  int sign = 1;
+
+  int i = 0;
+  for (i = 0; i < 32; i++)
+    {
+      g1.a[i] = 56.78 * (i - 30) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i != 16; i++)
+    {
+      t1.a[i] = 90.12 * (i + 40) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i != 8; i++)
+    {
+      x1.a[i] = 90.12 * (i + 40) * sign;
+      sign = -sign;
+    }
+
+  RUNCHECK_VEC_EXTRACT (x1, v8hf, _Float16, 4);
+  RUNCHECK_VEC_EXTRACT (t1, v16hf, _Float16, 3);
+  RUNCHECK_VEC_EXTRACT (t1, v16hf, _Float16, 8);
+  RUNCHECK_VEC_EXTRACT (t1, v16hf, _Float16, 15);
+  RUNCHECK_VEC_EXTRACT (g1, v32hf, _Float16, 5);
+  RUNCHECK_VEC_EXTRACT (g1, v32hf, _Float16, 8);
+  RUNCHECK_VEC_EXTRACT (g1, v32hf, _Float16, 14);
+  RUNCHECK_VEC_EXTRACT (g1, v32hf, _Float16, 16);
+  RUNCHECK_VEC_EXTRACT (g1, v32hf, _Float16, 24);
+  RUNCHECK_VEC_EXTRACT (g1, v32hf, _Float16, 28);
+
+  RUNCHECK_VEC_SET (union128h, x1, v8hf, _Float16, 4, 8);
+  RUNCHECK_VEC_SET (union256h, t1, v16hf, _Float16, 3, 16);
+  RUNCHECK_VEC_SET (union256h, t1, v16hf, _Float16, 8, 16);
+  RUNCHECK_VEC_SET (union256h, t1, v16hf, _Float16, 15, 16);
+  RUNCHECK_VEC_SET (union512h, g1, v32hf, _Float16, 5, 32);
+  RUNCHECK_VEC_SET (union512h, g1, v32hf, _Float16, 8, 32);
+  RUNCHECK_VEC_SET (union512h, g1, v32hf, _Float16, 14, 32);
+  RUNCHECK_VEC_SET (union512h, g1, v32hf, _Float16, 16, 32);
+  RUNCHECK_VEC_SET (union512h, g1, v32hf, _Float16, 24, 32);
+  RUNCHECK_VEC_SET (union512h, g1, v32hf, _Float16, 28, 32);
+}
+
+void
+test_128()
+{
+}