i386: Introduce V2QImode minmax, abs and uavgv2qi3_ceil [PR103861]

Message ID CAFULd4bE-Jed-pOosjuB7mvivWoqsyCKUrGqYF55EVsfzXNnJQ@mail.gmail.com
State Committed
Commit c166632bd22d7da66354121502019fc9c92ef07f
Headers
Series i386: Introduce V2QImode minmax, abs and uavgv2qi3_ceil [PR103861] |

Commit Message

Uros Bizjak Jan. 5, 2022, 10:26 p.m. UTC
  Add V2QImode minmax, abs and uavgv2qi3_ceil operations with SSE registers.

2022-01-05  Uroš Bizjak  <ubizjak@gmail.com>

gcc/ChangeLog:

    PR target/103861
    * config/i386/mmx.md (VI_16_32): New mode iterator.
    (VI1_16_32): Ditto.
    (mmxvecsize): Handle V2QI mode.
    (<smaxmin:code><mode>3): Rename from <smaxmin:code>v4qi3.
    Use VI1_16_32 mode iterator.
    (<umaxmin:code><mode>3): Rename from <umaxmin:code>v4qi3.
    Use VI1_16_32 mode iterator.
    (abs<mode>2): Use VI_16_32 mode iterator.
    (uavgv2qi3_ceil): New insn pattern.

gcc/testsuite/ChangeLog:

    PR target/103861
    * gcc.target/i386/pr103861-3.c: New test.
    * g++.dg/vect/slp-pr98855.cc (dg-final): Check that
    no vectorization using SLP was performed.

I have changed the scan-tree-dump patterns in g++.dg/vect/slp-pr98855.cc
to check that no SLP vectorization was performed. The existing
scan-tree-dump-times check was too fragile, since it expected an exact
count of a message that is output for every vectorization mode.

Another issue is the missing vectorization for uavgv2qi3_ceil, which is
why the pavgb scan in the new test is xfailed. I will open a PR for that.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Pushed to master.

Uros.
  

Patch

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 96d85a54e10..a409bb7c6c6 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -63,6 +63,12 @@ 
 ;; 4-byte integer vector modes
 (define_mode_iterator VI_32 [V4QI V2HI])
 
+;; 4-byte and 2-byte integer vector modes
+(define_mode_iterator VI_16_32 [V4QI V2QI V2HI])
+
+;; 4-byte and 2-byte QImode vector modes
+(define_mode_iterator VI1_16_32 [V4QI V2QI])
+
 ;; V2S* modes
 (define_mode_iterator V2FI [V2SF V2SI])
 
@@ -71,7 +77,8 @@ 
 
 ;; Mapping from integer vector mode to mnemonic suffix
 (define_mode_attr mmxvecsize
-  [(V8QI "b") (V4QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
+  [(V8QI "b") (V4QI "b") (V2QI "b")
+   (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
@@ -2140,11 +2147,11 @@ 
 	  (match_operand:V4HI 2 "register_operand")))]
   "TARGET_MMX_WITH_SSE")
 
-(define_insn "<code>v4qi3"
-  [(set (match_operand:V4QI 0 "register_operand" "=Yr,*x,Yv")
-	(smaxmin:V4QI
-	  (match_operand:V4QI 1 "register_operand" "%0,0,Yv")
-	  (match_operand:V4QI 2 "register_operand" "Yr,*x,Yv")))]
+(define_insn "<code><mode>3"
+  [(set (match_operand:VI1_16_32 0 "register_operand" "=Yr,*x,Yv")
+	(smaxmin:VI1_16_32
+	  (match_operand:VI1_16_32 1 "register_operand" "%0,0,Yv")
+	  (match_operand:VI1_16_32 2 "register_operand" "Yr,*x,Yv")))]
   "TARGET_SSE4_1"
   "@
    p<maxmin_int>b\t{%2, %0|%0, %2}
@@ -2218,11 +2225,11 @@ 
 	  (match_operand:V8QI 2 "register_operand")))]
   "TARGET_MMX_WITH_SSE")
 
-(define_insn "<code>v4qi3"
-  [(set (match_operand:V4QI 0 "register_operand" "=x,Yw")
-        (umaxmin:V4QI
-	  (match_operand:V4QI 1 "register_operand" "%0,Yw")
-	  (match_operand:V4QI 2 "register_operand" "x,Yw")))]
+(define_insn "<code><mode>3"
+  [(set (match_operand:VI1_16_32 0 "register_operand" "=x,Yw")
+        (umaxmin:VI1_16_32
+	  (match_operand:VI1_16_32 1 "register_operand" "%0,Yw")
+	  (match_operand:VI1_16_32 2 "register_operand" "x,Yw")))]
   "TARGET_SSE2"
   "@
    p<maxmin_int>b\t{%2, %0|%0, %2}
@@ -2269,9 +2276,9 @@ 
   "TARGET_SSSE3 && TARGET_MMX_WITH_SSE")
 
 (define_insn "abs<mode>2"
-  [(set (match_operand:VI_32 0 "register_operand" "=Yv")
-	(abs:VI_32
-	  (match_operand:VI_32 1 "register_operand" "Yv")))]
+  [(set (match_operand:VI_16_32 0 "register_operand" "=Yv")
+	(abs:VI_16_32
+	  (match_operand:VI_16_32 1 "register_operand" "Yv")))]
   "TARGET_SSSE3"
   "%vpabs<mmxvecsize>\t{%1, %0|%0, %1}"
   [(set_attr "type" "sselog1")
@@ -4351,6 +4358,26 @@ 
    (set_attr "type" "sseiadd")
    (set_attr "mode" "TI")])
 
+(define_insn "uavgv2qi3_ceil"
+  [(set (match_operand:V2QI 0 "register_operand" "=x,Yw")
+	(truncate:V2QI
+	  (lshiftrt:V2HI
+	    (plus:V2HI
+	      (plus:V2HI
+		(zero_extend:V2HI
+		  (match_operand:V2QI 1 "register_operand" "%0,Yw"))
+		(zero_extend:V2HI
+		  (match_operand:V2QI 2 "register_operand" "x,Yw")))
+	      (const_vector:V2HI [(const_int 1) (const_int 1)]))
+	    (const_int 1))))]
+  "TARGET_SSE2"
+  "@
+   pavgb\t{%2, %0|%0, %2}
+   vpavgb\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "mode" "TI")])
+
 (define_insn "uavgv2hi3_ceil"
   [(set (match_operand:V2HI 0 "register_operand" "=x,Yw")
 	(truncate:V2HI
diff --git a/gcc/testsuite/g++.dg/vect/slp-pr98855.cc b/gcc/testsuite/g++.dg/vect/slp-pr98855.cc
index b1010326698..ff59eb95aca 100644
--- a/gcc/testsuite/g++.dg/vect/slp-pr98855.cc
+++ b/gcc/testsuite/g++.dg/vect/slp-pr98855.cc
@@ -81,6 +81,5 @@  void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, uint32_t *EK)
     }
 }
 
-// This used to work on { target x86_64-*-* i?86-*-* } but a fix in SLP
-// discovery makes us trip over the threshold again.
-// { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 2 "slp1" { xfail *-*-* } } }
+// { dg-final { scan-tree-dump "not vectorized: vectorization is not profitable" "slp1" } }
+// { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" "slp1" } }
diff --git a/gcc/testsuite/gcc.target/i386/pr103861-3.c b/gcc/testsuite/gcc.target/i386/pr103861-3.c
new file mode 100644
index 00000000000..e5099ea0a83
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103861-3.c
@@ -0,0 +1,66 @@ 
+/* PR target/103861 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse4" } */
+
+char r[2], a[2], b[2];
+unsigned char ur[2], ua[2], ub[2];
+
+void maxs (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    r[i] = a[i] > b[i] ? a[i] : b[i];
+}
+
+/* { dg-final { scan-assembler "pmaxsb" } } */
+
+void maxu (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    ur[i] = ua[i] > ub[i] ? ua[i] : ub[i];
+}
+
+/* { dg-final { scan-assembler "pmaxub" } } */
+
+void mins (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    r[i] = a[i] < b[i] ? a[i] : b[i];
+}
+
+/* { dg-final { scan-assembler "pminsb" } } */
+
+void minu (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    ur[i] = ua[i] < ub[i] ? ua[i] : ub[i];
+}
+
+/* { dg-final { scan-assembler "pminub" } } */
+
+void _abs (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    r[i] = a[i] < 0 ? -a[i] : a[i];
+}
+
+/* { dg-final { scan-assembler "pabsb" } } */
+
+void avgu (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    ur[i] = (ua[i] + ub[i] + 1) >> 1;
+}
+
+/* { dg-final { scan-assembler "pavgb" { xfail *-*-* } } } */