[2/2] Add SVE fallback case using sdot for usdot

Message ID YqsKrV7XiJf33bNU@arm.com
State Dropped
Series [1/2] AArch64 Add fallback case using sdot for usdot

Commit Message

Tamar Christina June 16, 2022, 10:49 a.m. UTC
  Hi All,

The usdot operation is common in video encoders and decoders, including some
of the most widely used ones.

This patch adds an SVE fallback for the usdot optab, for when you have sdot
but not usdot available.

The fallback works by adding a bias to the unsigned argument to convert it to
a signed value and then correcting for the bias later on.

Essentially it relies on (x - 128)y + 128y == xy, where x is unsigned and y
is signed (assuming both are 8-bit values).  Because the range of a signed
byte only goes up to 127, we split the bias correction into:

   (x - 128)y + 127y + y

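To make the identity concrete, here is a scalar sketch in C (mine, not part
of the patch) that checks the rewrite over all 8-bit input pairs:

#include <stdint.h>
#include <assert.h>

/* usdot multiplies an unsigned byte by a signed byte; the fallback
   rewrites it as three signed multiplies using the identity
   (x - 128)y + 127y + y == xy.  */
static int32_t
usdot_via_sdot (uint8_t x, int8_t y)
{
  int8_t biased = (int8_t) (x - 128);	/* bias x into the signed range */
  return (int32_t) biased * y		/* (x - 128)y */
	 + 127 * y			/* 127y */
	 + y;				/* + y: together they restore 128y */
}

int
main (void)
{
  for (int x = 0; x <= 255; x++)
    for (int y = -128; y <= 127; y++)
      assert (usdot_via_sdot ((uint8_t) x, (int8_t) y) == x * y);
  return 0;
}
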
Concretely for:

#define N 480
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned

SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
   SIGNEDNESS_4 char *restrict b)
{
  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
    {
      int av = a[i];
      int bv = b[i];
      SIGNEDNESS_2 short mult = av * bv;
      res += mult;
    }
  return res;
}

we generate:

f:
	...
        mov     z6.b, #0
        mov     z5.b, #127
        mov     z4.b, #1
        mov     z3.b, #-128
        ptrue   p1.b, all
        movi    v0.4s, 0
.L2:
        ld1b    z2.b, p0/z, [x1, x3]
        ld1b    z1.b, p0/z, [x2, x3]
        incb    x3
        sel     z1.b, p0, z1.b, z6.b
        whilelo p0.b, w3, w4
        sub     z1.b, z1.b, z3.b
        sdot    z0.s, z1.b, z2.b
        sdot    z0.s, z5.b, z2.b
        sdot    z0.s, z4.b, z2.b
        b.any   .L2

instead of:

f:
        ...
.L2:
        ld1sb   z2.h, p0/z, [x1, x3]
        punpklo p1.h, p0.b
        ld1b    z0.h, p0/z, [x2, x3]
        add     x3, x3, x5
        mul     z0.h, p2/m, z0.h, z2.h
        sunpklo z2.s, z0.h
        sunpkhi z0.s, z0.h
        add     z1.s, p1/m, z1.s, z2.s
        punpkhi p1.h, p0.b
        whilelo p0.h, w3, w4
        add     z1.s, p1/m, z1.s, z0.s
        b.any   .L2

The new sequence is significantly faster, as the operations it uses are
well optimized.  Note that execution tests are already in the mid-end
testsuite.
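
As a reading aid (my annotation, not taken from the patch): z2 holds the
signed operand a, z1 the unsigned operand b, and z0 the 32-bit accumulators;
z3, z5 and z4 hold the constants -128, 127 and 1.  Per lane, the loop body
computes:

#include <stdint.h>

int32_t
lane (int32_t acc, int8_t a, uint8_t b)
{
  int8_t biased = (int8_t) (b - 128);	/* sub  z1.b, z1.b, z3.b */
  acc += (int32_t) biased * a;		/* sdot z0.s, z1.b, z2.b */
  acc += 127 * a;			/* sdot z0.s, z5.b, z2.b */
  acc += 1 * a;				/* sdot z0.s, z4.b, z2.b */
  return acc;
}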

Thanks to James Greenhalgh for the tip-off.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar


gcc/ChangeLog:

	* config/aarch64/aarch64-sve.md (@<sur>dot_prod<vsi2qi>): Generate
	fallback or call original insn ...
	(@<sur>dot_prod<vsi2qi>_insn): ...here.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/vusdot-autovec_2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index bd60e65b0c3f05f1c931f03807170f3b9d699de5..ca60416e7d7b1d8848f4ec5a624ae479a12ae5bc 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -6887,7 +6887,7 @@ (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>"
   [(set_attr "movprfx" "*,yes")]
 )
 
-(define_insn "@<sur>dot_prod<vsi2qi>"
+(define_insn "@<sur>dot_prod<vsi2qi>_insn"
   [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w")
         (plus:VNx4SI_ONLY
 	  (unspec:VNx4SI_ONLY
@@ -6902,6 +6902,43 @@ (define_insn "@<sur>dot_prod<vsi2qi>"
    [(set_attr "movprfx" "*,yes")]
 )
 
+(define_expand "@<sur>dot_prod<vsi2qi>"
+  [(set (match_operand:VNx4SI_ONLY 0 "register_operand")
+        (plus:VNx4SI_ONLY
+	  (unspec:VNx4SI_ONLY
+	    [(match_operand:<VSI2QI> 1 "register_operand")
+	     (match_operand:<VSI2QI> 2 "register_operand")]
+	    DOTPROD_US_ONLY)
+	  (match_operand:VNx4SI_ONLY 3 "register_operand")))]
+  "TARGET_SVE || TARGET_SVE_I8MM"
+{
+  if (TARGET_SVE_I8MM)
+    {
+      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
+                                              operands[2], operands[3]));
+      DONE;
+    }
+
+  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
+  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+  rtx signbit = gen_int_mode (val, elemmode);
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
+  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
+                                    gen_int_mode (val - 1, elemmode));
+  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
+  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
+  c1 = force_reg (<VSI2QI>mode, c1);
+  c2 = force_reg (<VSI2QI>mode, c2);
+  dup = force_reg (<VSI2QI>mode, dup);
+  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
+  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
+  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
+  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
+  DONE;
+})
+
 (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>"
   [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w")
 	(plus:VNx4SI_ONLY
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..cbe6b7eb7bef5a5c4b8e5ac823ebdf1d309f8490
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#pragma GCC target "+noi8mm"
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-not {\tusdot\t} } } */
+/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */