i386: Introduce V2QImode vectorized shifts [PR103861]

Message ID CAFULd4arfNrTT2wciFWHL3-2povvqQNuMz=SfrSpNajSyPi8dw@mail.gmail.com
State Committed
Commit 7a7d8c3f6167fd45658ddbfa32adcfd2acc98eb4
Headers
Series i386: Introduce V2QImode vectorized shifts [PR103861] |

Commit Message

Uros Bizjak Jan. 13, 2022, 7:50 p.m. UTC
  Add V2QImode shift operations and split them after reload into two
synthesized QImode shifts, one on the high byte and one on the low byte
of an integer register.

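Also robustify the V2QImode arithmetic (negate, plus/minus) split patterns
by adding the "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
check, and TARGET_SSE2 where it was missing, to their split conditions.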

2022-01-13  Uroš Bizjak  <ubizjak@gmail.com>

gcc/ChangeLog:

    PR target/103861
    * config/i386/i386.md (*ashlqi_ext<mode>_2): New insn pattern.
    (*<any_shiftrt:insn>qi_ext<mode>_2): Ditto.
    * config/i386/mmx.md (<any_shift:insn>v2qi3):
    New insn_and_split pattern.

gcc/testsuite/ChangeLog:

    PR target/103861
    * gcc.target/i386/pr103861.c (shl, ashr, lshr): New tests.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Pushed to master.

Uros.
  

Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index bcaaa4993b1..c2acb1dbd90 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12413,6 +12413,54 @@ 
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
+(define_insn "*ashlqi_ext<mode>_2"
+  [(set (zero_extract:SWI248
+	  (match_operand:SWI248 0 "register_operand" "+Q")
+	  (const_int 8)
+	  (const_int 8))
+	(subreg:SWI248
+	  (ashift:QI
+	    (subreg:QI
+	      (zero_extract:SWI248
+	        (match_operand:SWI248 1 "register_operand" "0")
+		(const_int 8)
+		(const_int 8)) 0)
+	    (match_operand:QI 2 "nonmemory_operand" "cI")) 0))
+  (clobber (reg:CC FLAGS_REG))]
+  "/* FIXME: without this LRA can't reload this pattern, see PR82524.  */
+   rtx_equal_p (operands[0], operands[1])"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ALU:
+      gcc_assert (operands[2] == const1_rtx);
+      return "add{b}\t%h0, %h0";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "sal{b}\t%h0";
+      else
+	return "sal{b}\t{%2, %h0|%h0, %2}";
+    }
+}
+  [(set (attr "type")
+     (cond [(and (match_test "TARGET_DOUBLE_WITH_ADD")
+		 (match_operand 2 "const1_operand"))
+	      (const_string "alu")
+	   ]
+	   (const_string "ishift")))
+   (set (attr "length_immediate")
+     (if_then_else
+       (ior (eq_attr "type" "alu")
+	    (and (eq_attr "type" "ishift")
+		 (and (match_operand 2 "const1_operand")
+		      (ior (match_test "TARGET_SHIFT1")
+			   (match_test "optimize_function_for_size_p (cfun)")))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "QI")])
+
 ;; See comment above `ashl<mode>3' about how this works.
 
 (define_expand "<insn><mode>3"
@@ -13143,6 +13191,39 @@ 
        (const_string "0")
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
+
+(define_insn "*<insn>qi_ext<mode>_2"
+  [(set (zero_extract:SWI248
+	  (match_operand:SWI248 0 "register_operand" "+Q")
+	  (const_int 8)
+	  (const_int 8))
+	(subreg:SWI248
+	  (any_shiftrt:QI
+	    (subreg:QI
+	      (zero_extract:SWI248
+	        (match_operand:SWI248 1 "register_operand" "0")
+		(const_int 8)
+		(const_int 8)) 0)
+	    (match_operand:QI 2 "nonmemory_operand" "cI")) 0))
+  (clobber (reg:CC FLAGS_REG))]
+  "/* FIXME: without this LRA can't reload this pattern, see PR82524.  */
+   rtx_equal_p (operands[0], operands[1])"
+{
+  if (operands[2] == const1_rtx
+      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+    return "<shift>{b}\t%h0";
+  else
+    return "<shift>{b}\t{%2, %h0|%h0, %2}";
+}
+  [(set_attr "type" "ishift")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (match_operand 2 "const1_operand")
+	    (ior (match_test "TARGET_SHIFT1")
+		 (match_test "optimize_function_for_size_p (cfun)")))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "QI")])
 
 ;; Rotate instructions
 
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3d99a5e851b..782da220f98 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1657,7 +1657,8 @@ 
         (neg:V2QI
 	  (match_operand:V2QI 1 "general_reg_operand")))
    (clobber (reg:CC FLAGS_REG))]
-  "reload_completed"
+  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && reload_completed"
   [(parallel
      [(set (strict_low_part (match_dup 0))
 	   (neg:QI (match_dup 1)))
@@ -1683,7 +1684,8 @@ 
         (neg:V2QI
 	  (match_operand:V2QI 1 "sse_reg_operand")))
    (clobber (reg:CC FLAGS_REG))]
-  "reload_completed"
+  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && TARGET_SSE2 && reload_completed"
   [(set (match_dup 0) (match_dup 2))
    (set (match_dup 0)
 	(minus:V16QI (match_dup 0) (match_dup 1)))]
@@ -1757,7 +1759,8 @@ 
 	  (match_operand:V2QI 1 "general_reg_operand")
 	  (match_operand:V2QI 2 "general_reg_operand")))
    (clobber (reg:CC FLAGS_REG))]
-  "reload_completed"
+  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && reload_completed"
   [(parallel
      [(set (strict_low_part (match_dup 0))
 	   (plusminus:QI (match_dup 1) (match_dup 2)))
@@ -1790,7 +1793,8 @@ 
 	  (match_operand:V2QI 1 "sse_reg_operand")
 	  (match_operand:V2QI 2 "sse_reg_operand")))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_SSE2 && reload_completed"
+  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && TARGET_SSE2 && reload_completed"
   [(set (match_dup 0)
         (plusminus:V16QI (match_dup 1) (match_dup 2)))]
 {
@@ -2387,6 +2391,38 @@ 
        (const_string "0")))
    (set_attr "mode" "TI")])
 
+(define_insn_and_split "<insn>v2qi3"
+  [(set (match_operand:V2QI 0 "register_operand" "=Q")
+        (any_shift:V2QI
+	  (match_operand:V2QI 1 "register_operand" "0")
+	  (match_operand:QI 2 "nonmemory_operand" "cI")))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "#"
+  "&& reload_completed"
+  [(parallel
+     [(set (zero_extract:HI (match_dup 3) (const_int 8) (const_int 8))
+	   (subreg:HI
+	     (any_shift:QI
+	       (subreg:QI
+	         (zero_extract:HI (match_dup 4)
+			          (const_int 8)
+				  (const_int 8)) 0)
+	       (match_dup 2)) 0))
+      (clobber (reg:CC FLAGS_REG))])
+   (parallel
+     [(set (strict_low_part (match_dup 0))
+	   (any_shift:QI (match_dup 1) (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[4] = lowpart_subreg (HImode, operands[1], V2QImode);
+  operands[3] = lowpart_subreg (HImode, operands[0], V2QImode);
+  operands[1] = lowpart_subreg (QImode, operands[1], V2QImode);
+  operands[0] = lowpart_subreg (QImode, operands[0], V2QImode);
+}
+  [(set_attr "type" "multi")
+   (set_attr "mode" "QI")])
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel integral comparisons
diff --git a/gcc/testsuite/gcc.target/i386/pr103861.c b/gcc/testsuite/gcc.target/i386/pr103861.c
index 158717645b6..064b617774b 100644
--- a/gcc/testsuite/gcc.target/i386/pr103861.c
+++ b/gcc/testsuite/gcc.target/i386/pr103861.c
@@ -3,6 +3,7 @@ 
 /* { dg-options "-O2 -dp" } */
 
 typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2)));
 
 __v2qi and (__v2qi a, __v2qi b) { return a & b; };
 
@@ -20,4 +21,10 @@  __v2qi minus  (__v2qi a, __v2qi b) { return a - b; };
 
 __v2qi neg  (__v2qi a) { return -a; };
 
+__v2qi shl (__v2qi a, int b) { return a << b; };
+
+__v2qi ashr (__v2qi a, int b) { return a >> b; };
+
+__v2qu lshr  (__v2qu a, int b) { return a >> b; };
+
 /* { dg-final { scan-assembler-not "insvhi" } } */