i386: Implement VxHF vector set/insert/extract with lower ABI levels

Message ID CAFULd4aPot3LxcLVTr=AH_C87HGCnWF9uitMyE8Kti3q85a4vw@mail.gmail.com
State New
Headers
Series i386: Implement VxHF vector set/insert/extract with lower ABI levels |

Commit Message

Uros Bizjak Dec. 14, 2021, 5:35 p.m. UTC
  This is a preparation patch that moves VxHF vector set/insert/extract
expansions from the AVX512FP16 ABI to lower ABIs.  There are no functional
changes for -mavx512fp16, and a follow-up patch is needed to actually
enable VxHF vector modes for lower ABIs.

2021-12-14  Uroš Bizjak  <ubizjak@gmail.com>

gcc/ChangeLog:

    PR target/103571
    * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate)
    <case E_V8HFmode>: Implement for TARGET_SSE2.
    <case E_V16HFmode>: Implement for TARGET_AVX.
    <case E_V32HFmode>: Implement for TARGET_AVX512F.
    (ix86_expand_vector_set_var): Handle V32HFmode
    without TARGET_AVX512BW.
    (ix86_expand_vector_extract)
    <case E_V8HFmode>: Implement for TARGET_SSE2.
    <case E_V16HFmode>: Implement for TARGET_AVX.
    <case E_V32HFmode>: Implement for TARGET_AVX512BW.
    (expand_vec_perm_broadcast_1) <case E_V8HFmode>: New.
    * config/i386/sse.md (VI12HF_AVX512VL): Remove
    TARGET_AVX512FP16 condition.
    (V): Ditto.
    (V_256_512): Ditto.
    (avx_vbroadcastf128_<mode>): Use V_256H mode iterator.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Pushed to master.

Uros.
  

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 2bbb28e5317..7013c20a97a 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -14855,6 +14855,7 @@  ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
       goto widen;
 
     case E_V8HImode:
+    case E_V8HFmode:
       if (TARGET_AVX2)
 	return ix86_vector_duplicate_value (mode, target, val);
 
@@ -14871,15 +14872,22 @@  ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
 	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
 	  dperm.one_operand_p = true;
 
-	  /* Extend to SImode using a paradoxical SUBREG.  */
-	  tmp1 = gen_reg_rtx (SImode);
-	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
-
-	  /* Insert the SImode value as low element of a V4SImode vector.  */
-	  tmp2 = gen_reg_rtx (V4SImode);
-	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
-	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
+	  if (mode == V8HFmode)
+	    tmp1 = lowpart_subreg (V8HFmode, force_reg (HFmode, val), HFmode);
+	  else
+	    {
+	      /* Extend to SImode using a paradoxical SUBREG.  */
+	      tmp1 = gen_reg_rtx (SImode);
+	      emit_move_insn (tmp1, gen_lowpart (SImode, val));
+
+	      /* Insert the SImode value as
+		 low element of a V4SImode vector.  */
+	      tmp2 = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
+	      tmp1 = gen_lowpart (mode, tmp2);
+	    }
 
+	  emit_move_insn (dperm.op0, tmp1);
 	  ok = (expand_vec_perm_1 (&dperm)
 		|| expand_vec_perm_broadcast_1 (&dperm));
 	  gcc_assert (ok);
@@ -14926,12 +14934,15 @@  ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
       }
 
     case E_V16HImode:
+    case E_V16HFmode:
     case E_V32QImode:
       if (TARGET_AVX2)
 	return ix86_vector_duplicate_value (mode, target, val);
       else
 	{
-	  machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
+	  machine_mode hvmode = (mode == V16HImode ? V8HImode
+				 : mode == V16HFmode ? V8HFmode
+				 : V16QImode);
 	  rtx x = gen_reg_rtx (hvmode);
 
 	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
@@ -14942,13 +14953,16 @@  ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
 	}
       return true;
 
-    case E_V64QImode:
     case E_V32HImode:
+    case E_V32HFmode:
+    case E_V64QImode:
       if (TARGET_AVX512BW)
 	return ix86_vector_duplicate_value (mode, target, val);
       else
 	{
-	  machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
+	  machine_mode hvmode = (mode == V32HImode ? V16HImode
+				 : mode == V32HFmode ? V16HFmode
+				 : V32QImode);
 	  rtx x = gen_reg_rtx (hvmode);
 
 	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
@@ -14959,11 +14973,6 @@  ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
 	}
       return true;
 
-    case E_V8HFmode:
-    case E_V16HFmode:
-    case E_V32HFmode:
-      return ix86_vector_duplicate_value (mode, target, val);
-
     default:
       return false;
     }
@@ -15912,7 +15921,8 @@  ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
   /* 512-bits vector byte/word broadcast and comparison only available
      under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
      when without TARGET_AVX512BW.  */
-  if ((mode == V32HImode || mode == V64QImode) && !TARGET_AVX512BW)
+  if ((mode == V32HImode || mode == V32HFmode || mode == V64QImode)
+      && !TARGET_AVX512BW)
     {
       gcc_assert (TARGET_AVX512F);
       rtx vhi, vlo, idx_hi;
@@ -15926,6 +15936,12 @@  ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
 	  extract_hi = gen_vec_extract_hi_v32hi;
 	  extract_lo = gen_vec_extract_lo_v32hi;
 	}
+      else if (mode == V32HFmode)
+	{
+	  half_mode = V16HFmode;
+	  extract_hi = gen_vec_extract_hi_v32hf;
+	  extract_lo = gen_vec_extract_lo_v32hf;
+	}
       else
 	{
 	  half_mode = V32QImode;
@@ -15973,7 +15989,6 @@  ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
 	case E_V16SFmode:
 	  cmp_mode = V16SImode;
 	  break;
-	/* TARGET_AVX512FP16 implies TARGET_AVX512BW.  */
 	case E_V8HFmode:
 	  cmp_mode = V8HImode;
 	  break;
@@ -16538,6 +16553,7 @@  ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
       break;
 
     case E_V8HImode:
+    case E_V8HFmode:
     case E_V2HImode:
       use_vec_extr = TARGET_SSE2;
       break;
@@ -16704,25 +16720,29 @@  ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
       return;
 
     case E_V32HFmode:
-      tmp = gen_reg_rtx (V16HFmode);
-      if (elt < 16)
-	emit_insn (gen_vec_extract_lo_v32hf (tmp, vec));
-      else
-	emit_insn (gen_vec_extract_hi_v32hf (tmp, vec));
-      ix86_expand_vector_extract (false, target, tmp, elt & 15);
-      return;
+      if (TARGET_AVX512BW)
+	{
+	  tmp = gen_reg_rtx (V16HFmode);
+	  if (elt < 16)
+	    emit_insn (gen_vec_extract_lo_v32hf (tmp, vec));
+	  else
+	    emit_insn (gen_vec_extract_hi_v32hf (tmp, vec));
+	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
+	  return;
+	}
+      break;
 
     case E_V16HFmode:
-      tmp = gen_reg_rtx (V8HFmode);
-      if (elt < 8)
-	emit_insn (gen_vec_extract_lo_v16hf (tmp, vec));
-      else
-	emit_insn (gen_vec_extract_hi_v16hf (tmp, vec));
-      ix86_expand_vector_extract (false, target, tmp, elt & 7);
-      return;
-
-    case E_V8HFmode:
-      use_vec_extr = true;
+      if (TARGET_AVX)
+	{
+	  tmp = gen_reg_rtx (V8HFmode);
+	  if (elt < 8)
+	    emit_insn (gen_vec_extract_lo_v16hf (tmp, vec));
+	  else
+	    emit_insn (gen_vec_extract_hi_v16hf (tmp, vec));
+	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
+	  return;
+	}
       break;
 
     case E_V8QImode:
@@ -21443,6 +21463,34 @@  expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
       emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
       return true;
 
+    case E_V8HFmode:
+      /* This can be implemented via interleave and pshufd.  */
+      if (d->testing_p)
+	return true;
+
+      if (elt >= nelt2)
+	{
+	  gen = gen_vec_interleave_highv8hf;
+	  elt -= nelt2;
+	}
+      else
+	gen = gen_vec_interleave_lowv8hf;
+      nelt2 /= 2;
+
+      dest = gen_reg_rtx (vmode);
+      emit_insn (gen (dest, op0, op0));
+
+      vmode = V4SImode;
+      op0 = gen_lowpart (vmode, dest);
+
+      memset (perm2, elt, 4);
+      dest = gen_reg_rtx (vmode);
+      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
+      gcc_assert (ok);
+
+      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+      return true;
+
     case E_V32QImode:
     case E_V16HImode:
     case E_V8SImode:
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5421fb51684..929eef54055 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -266,9 +266,7 @@ 
 (define_mode_iterator VI12HF_AVX512VL
   [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")
    V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")
-   (V32HF "TARGET_AVX512FP16")
-   (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
-   (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")])
+   V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")])
 
 ;; Same iterator, but without supposed TARGET_AVX512BW
 (define_mode_iterator VI12_AVX512VLBW
@@ -285,8 +283,7 @@ 
    (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
    (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
    (V8DI "TARGET_AVX512F")  (V4DI "TARGET_AVX") V2DI
-   (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16")
-   (V8HF "TARGET_AVX512FP16")
+   (V32HF "TARGET_AVX512F") (V16HF "TARGET_AVX") V8HF
    (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
    (V8DF "TARGET_AVX512F")  (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
 
@@ -311,10 +308,10 @@ 
 
 ;; All 256bit and 512bit vector modes
 (define_mode_iterator V_256_512
-  [V32QI V16HI V8SI V4DI V8SF V4DF
-   (V64QI "TARGET_AVX512F") (V32HI "TARGET_AVX512F") (V16SI "TARGET_AVX512F")
-   (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
-   (V16HF "TARGET_AVX512FP16") (V32HF "TARGET_AVX512FP16")])
+  [V32QI V16HI V16HF V8SI V4DI V8SF V4DF
+   (V64QI "TARGET_AVX512F") (V32HI "TARGET_AVX512F") (V32HF "TARGET_AVX512F")
+   (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
+   (V8DF "TARGET_AVX512F")])
 
 ;; All vector float modes
 (define_mode_iterator VF
@@ -24892,8 +24889,8 @@ 
   "operands[2] = gen_lowpart (<ssehalfvecmode>mode, operands[0]);")
 
 (define_insn "avx_vbroadcastf128_<mode>"
-  [(set (match_operand:V_256 0 "register_operand" "=x,x,x,v,v,v,v")
-	(vec_concat:V_256
+  [(set (match_operand:V_256H 0 "register_operand" "=x,x,x,v,v,v,v")
+	(vec_concat:V_256H
 	  (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "m,0,?x,m,0,m,0")
 	  (match_dup 1)))]
   "TARGET_AVX"