[1/2] AArch64 Add fallback case using sdot for usdot

Message ID patch-15821-tamar@arm.com
State Dropped
Series [1/2] AArch64 Add fallback case using sdot for usdot

Commit Message

Tamar Christina June 16, 2022, 10:48 a.m. UTC
Hi All,

The usdot operation is common in video encoders and decoders, including some
of the most widely used ones.

This patch adds a +dotprod version of the optab as a fallback for when sdot
is available but usdot is not.

The fallback works by adding a bias to the unsigned argument to convert it to
a signed value and then correcting for the bias later on.

Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is
signed (assuming both are 8-bit values).  Because the range of a signed byte
only goes up to 127, we split the bias correction into:

   (x - 128)y + 127y + y
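
As a minimal scalar sketch (the function name is just for illustration;
each product below stands in for one lane of the corresponding sdot):

int
usdot_via_sdot (unsigned char x, signed char y, int acc)
{
  signed char xs = x - 128;   /* bias x into signed range; the sub insn */
  acc += xs * y;              /* first sdot:  (x - 128)y */
  acc += 127 * y;             /* second sdot: 127y */
  acc += 1 * y;               /* third sdot:  y, so 127y + y == 128y */
  return acc;
}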

Concretely for:

#define N 480
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned

SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
   SIGNEDNESS_4 char *restrict b)
{
  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
    {
      int av = a[i];
      int bv = b[i];
      SIGNEDNESS_2 short mult = av * bv;
      res += mult;
    }
  return res;
}

we generate:

        movi    v5.16b, 0x7f
        mov     x3, 0
        movi    v4.16b, 0x1
        movi    v3.16b, 0xffffffffffffff80
        movi    v0.4s, 0
.L2:
        ldr     q2, [x2, x3]
        ldr     q1, [x1, x3]
        add     x3, x3, 16
        sub     v2.16b, v2.16b, v3.16b
        sdot    v0.4s, v2.16b, v1.16b
        sdot    v0.4s, v5.16b, v1.16b
        sdot    v0.4s, v4.16b, v1.16b
        cmp     x3, 480
        bne     .L2

instead of:

        movi    v0.4s, 0
        mov     x3, 0
.L2:
        ldr     q2, [x1, x3]
        ldr     q1, [x2, x3]
        add     x3, x3, 16
        sxtl    v4.8h, v2.8b
        sxtl2   v3.8h, v2.16b
        uxtl    v2.8h, v1.8b
        uxtl2   v1.8h, v1.16b
        mul     v2.8h, v2.8h, v4.8h
        mul     v1.8h, v1.8h, v3.8h
        saddw   v0.4s, v0.4s, v2.4h
        saddw2  v0.4s, v0.4s, v2.8h
        saddw   v0.4s, v0.4s, v1.4h
        saddw2  v0.4s, v0.4s, v1.8h
        cmp     x3, 480
        bne     .L2

The new sequence is significantly faster as the operations it uses are well
optimized.  Note that execution tests are already in the mid-end testsuite.

Thanks to James Greenhalgh for the tip-off.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Generate fallback
	or call the original insn ...
	(usdot_prod<vsi2qi>_insn): ...here.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
 
 ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
 ;; (vector) Dot Product operation and the vectorized optab.
-(define_insn "usdot_prod<vsi2qi>"
+(define_insn "usdot_prod<vsi2qi>_insn"
   [(set (match_operand:VS 0 "register_operand" "=w")
 	(plus:VS
 	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
@@ -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
   [(set_attr "type" "neon_dot<q>")]
 )
 
+;; usdot auto-vec fallback code
+(define_expand "usdot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand")
+	(plus:VS
+	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+		      (match_operand:<VSI2QI> 2 "register_operand")]
+	  UNSPEC_USDOT)
+	  (match_operand:VS 3 "register_operand")))]
+  "TARGET_DOTPROD || TARGET_I8MM"
+{
+  if (TARGET_I8MM)
+    {
+      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
+					      operands[2], operands[3]));
+      DONE;
+    }
+
+  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
+  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+  rtx signbit = gen_int_mode (val, elemmode);
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
+  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
+				    gen_int_mode (val - 1, elemmode));
+  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
+  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
+  c1 = force_reg (<VSI2QI>mode, c1);
+  c2 = force_reg (<VSI2QI>mode, c2);
+  dup = force_reg (<VSI2QI>mode, dup);
+  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
+  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
+  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
+  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
+  DONE;
+})
+
 ;; These instructions map to the __builtins for the Dot Product
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-not {\tusdot\t} } } */
+/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */
  

Comments

Richard Sandiford June 16, 2022, 4:09 p.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> The usdot operation is common in video encoder and decoders including some of
> the most widely used ones.
>
> This patch adds a +dotprod version of the optab as a fallback for when you do
> have sdot but not usdot available.
>
> The fallback works by adding a bias to the unsigned argument to convert it to
> a signed value and then correcting for the bias later on.
>
> Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is
> signed (assuming both are 8-bit values).  Because the range of a signed byte is
> only to 127 we split the bias correction into:
>
>    (x - 128)y + 127y + y

I bet you knew this question was coming, but: this technique
isn't target-specific, so wouldn't it be better to handle it in
tree-vect-patterns.cc instead?

Thanks,
Richard

> Concretely for:
>
> #define N 480
> #define SIGNEDNESS_1 unsigned
> #define SIGNEDNESS_2 signed
> #define SIGNEDNESS_3 signed
> #define SIGNEDNESS_4 unsigned
>
> SIGNEDNESS_1 int __attribute__ ((noipa))
> f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
>    SIGNEDNESS_4 char *restrict b)
> {
>   for (__INTPTR_TYPE__ i = 0; i < N; ++i)
>     {
>       int av = a[i];
>       int bv = b[i];
>       SIGNEDNESS_2 short mult = av * bv;
>       res += mult;
>     }
>   return res;
> }
>
> we generate:
>
>         movi    v5.16b, 0x7f
>         mov     x3, 0
>         movi    v4.16b, 0x1
>         movi    v3.16b, 0xffffffffffffff80
>         movi    v0.4s, 0
> .L2:
>         ldr     q2, [x2, x3]
>         ldr     q1, [x1, x3]
>         add     x3, x3, 16
>         sub     v2.16b, v2.16b, v3.16b
>         sdot    v0.4s, v2.16b, v1.16b
>         sdot    v0.4s, v5.16b, v1.16b
>         sdot    v0.4s, v4.16b, v1.16b
>         cmp     x3, 480
>         bne     .L2
>
> instead of:
>
>         movi    v0.4s, 0
>         mov     x3, 0
> .L2:
>         ldr     q2, [x1, x3]
>         ldr     q1, [x2, x3]
>         add     x3, x3, 16
>         sxtl    v4.8h, v2.8b
>         sxtl2   v3.8h, v2.16b
>         uxtl    v2.8h, v1.8b
>         uxtl2   v1.8h, v1.16b
>         mul     v2.8h, v2.8h, v4.8h
>         mul     v1.8h, v1.8h, v3.8h
>         saddw   v0.4s, v0.4s, v2.4h
>         saddw2  v0.4s, v0.4s, v2.8h
>         saddw   v0.4s, v0.4s, v1.4h
>         saddw2  v0.4s, v0.4s, v1.8h
>         cmp     x3, 480
>         bne     .L2
>
> The new sequence is significantly faster as the operations it uses are well
> optimized.  Note that execution tests are already in the mid-end testsuite.
>
> Thanks to James Greenhalgh for the tip-off.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Generate fallback
> 	or call original isns ...
> 	(usdot_prod<vsi2qi>_insn): ...here.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
>  
>  ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
>  ;; (vector) Dot Product operation and the vectorized optab.
> -(define_insn "usdot_prod<vsi2qi>"
> +(define_insn "usdot_prod<vsi2qi>_insn"
>    [(set (match_operand:VS 0 "register_operand" "=w")
>  	(plus:VS
>  	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
> @@ -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
>    [(set_attr "type" "neon_dot<q>")]
>  )
>  
> +;; usdot auto-vec fallback code
> +(define_expand "usdot_prod<vsi2qi>"
> +  [(set (match_operand:VS 0 "register_operand")
> +	(plus:VS
> +	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
> +		      (match_operand:<VSI2QI> 2 "register_operand")]
> +	  UNSPEC_USDOT)
> +	  (match_operand:VS 3 "register_operand")))]
> +  "TARGET_DOTPROD || TARGET_I8MM"
> +{
> +  if (TARGET_I8MM)
> +    {
> +      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
> +					      operands[2], operands[3]));
> +      DONE;
> +    }
> +
> +  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
> +  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
> +  rtx signbit = gen_int_mode (val, elemmode);
> +  rtx t1 = gen_reg_rtx (<MODE>mode);
> +  rtx t2 = gen_reg_rtx (<MODE>mode);
> +  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
> +  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
> +				    gen_int_mode (val - 1, elemmode));
> +  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
> +  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
> +  c1 = force_reg (<VSI2QI>mode, c1);
> +  c2 = force_reg (<VSI2QI>mode, c2);
> +  dup = force_reg (<VSI2QI>mode, dup);
> +  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
> +  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
> +  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
> +  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
> +  DONE;
> +})
> +
>  ;; These instructions map to the __builtins for the Dot Product
>  ;; indexed operations.
>  (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
> +
> +#define N 480
> +#define SIGNEDNESS_1 unsigned
> +#define SIGNEDNESS_2 signed
> +#define SIGNEDNESS_3 signed
> +#define SIGNEDNESS_4 unsigned
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
> +   SIGNEDNESS_4 char *restrict b)
> +{
> +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> +    {
> +      int av = a[i];
> +      int bv = b[i];
> +      SIGNEDNESS_2 short mult = av * bv;
> +      res += mult;
> +    }
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-not {\tusdot\t} } } */
> +/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */
  
Richard Sandiford June 16, 2022, 6:53 p.m. UTC | #2
Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Tamar Christina <tamar.christina@arm.com> writes:
>> Hi All,
>>
>> The usdot operation is common in video encoder and decoders including some of
>> the most widely used ones.
>>
>> This patch adds a +dotprod version of the optab as a fallback for when you do
>> have sdot but not usdot available.
>>
>> The fallback works by adding a bias to the unsigned argument to convert it to
>> a signed value and then correcting for the bias later on.
>>
>> Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is
>> signed (assuming both are 8-bit values).  Because the range of a signed byte is
>> only to 127 we split the bias correction into:
>>
>>    (x - 128)y + 127y + y
>
> I bet you knew this question was coming, but: this technique
> isn't target-specific, so wouldn't it be better to handle it in
> tree-vect-patterns.cc instead?

Also, how about doing (x - 128)y + 64y + 64y instead, to reduce
the number of hoisted constants?
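
As a scalar sketch of that variant (illustration only; 64 fits in a
signed byte, so a single constant can serve both correction terms):

int
usdot_via_sdot_64 (unsigned char x, signed char y, int acc)
{
  acc += (signed char) (x - 128) * y;  /* biased sdot as before */
  acc += 64 * y;                       /* the same 0x40 constant ... */
  acc += 64 * y;                       /* ... used twice: 64y + 64y == 128y */
  return acc;
}

That would need two hoisted vector constants (0x80 and 0x40) rather than
the three (0x80, 0x7f and 0x01) in the posted sequence.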

Thanks,
Richard

> Thanks,
> Richard
>
>> Concretely for:
>>
>> #define N 480
>> #define SIGNEDNESS_1 unsigned
>> #define SIGNEDNESS_2 signed
>> #define SIGNEDNESS_3 signed
>> #define SIGNEDNESS_4 unsigned
>>
>> SIGNEDNESS_1 int __attribute__ ((noipa))
>> f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
>>    SIGNEDNESS_4 char *restrict b)
>> {
>>   for (__INTPTR_TYPE__ i = 0; i < N; ++i)
>>     {
>>       int av = a[i];
>>       int bv = b[i];
>>       SIGNEDNESS_2 short mult = av * bv;
>>       res += mult;
>>     }
>>   return res;
>> }
>>
>> we generate:
>>
>>         movi    v5.16b, 0x7f
>>         mov     x3, 0
>>         movi    v4.16b, 0x1
>>         movi    v3.16b, 0xffffffffffffff80
>>         movi    v0.4s, 0
>> .L2:
>>         ldr     q2, [x2, x3]
>>         ldr     q1, [x1, x3]
>>         add     x3, x3, 16
>>         sub     v2.16b, v2.16b, v3.16b
>>         sdot    v0.4s, v2.16b, v1.16b
>>         sdot    v0.4s, v5.16b, v1.16b
>>         sdot    v0.4s, v4.16b, v1.16b
>>         cmp     x3, 480
>>         bne     .L2
>>
>> instead of:
>>
>>         movi    v0.4s, 0
>>         mov     x3, 0
>> .L2:
>>         ldr     q2, [x1, x3]
>>         ldr     q1, [x2, x3]
>>         add     x3, x3, 16
>>         sxtl    v4.8h, v2.8b
>>         sxtl2   v3.8h, v2.16b
>>         uxtl    v2.8h, v1.8b
>>         uxtl2   v1.8h, v1.16b
>>         mul     v2.8h, v2.8h, v4.8h
>>         mul     v1.8h, v1.8h, v3.8h
>>         saddw   v0.4s, v0.4s, v2.4h
>>         saddw2  v0.4s, v0.4s, v2.8h
>>         saddw   v0.4s, v0.4s, v1.4h
>>         saddw2  v0.4s, v0.4s, v1.8h
>>         cmp     x3, 480
>>         bne     .L2
>>
>> The new sequence is significantly faster as the operations it uses are well
>> optimized.  Note that execution tests are already in the mid-end testsuite.
>>
>> Thanks to James Greenhalgh for the tip-off.
>>
>> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>>
>> Ok for master?
>>
>> Thanks,
>> Tamar
>>
>> gcc/ChangeLog:
>>
>> 	* config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Generate fallback
>> 	or call original isns ...
>> 	(usdot_prod<vsi2qi>_insn): ...here.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	* gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.
>>
>> --- inline copy of patch -- 
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
>>  
>>  ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
>>  ;; (vector) Dot Product operation and the vectorized optab.
>> -(define_insn "usdot_prod<vsi2qi>"
>> +(define_insn "usdot_prod<vsi2qi>_insn"
>>    [(set (match_operand:VS 0 "register_operand" "=w")
>>  	(plus:VS
>>  	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
>> @@ -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
>>    [(set_attr "type" "neon_dot<q>")]
>>  )
>>  
>> +;; usdot auto-vec fallback code
>> +(define_expand "usdot_prod<vsi2qi>"
>> +  [(set (match_operand:VS 0 "register_operand")
>> +	(plus:VS
>> +	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
>> +		      (match_operand:<VSI2QI> 2 "register_operand")]
>> +	  UNSPEC_USDOT)
>> +	  (match_operand:VS 3 "register_operand")))]
>> +  "TARGET_DOTPROD || TARGET_I8MM"
>> +{
>> +  if (TARGET_I8MM)
>> +    {
>> +      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
>> +					      operands[2], operands[3]));
>> +      DONE;
>> +    }
>> +
>> +  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
>> +  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
>> +  rtx signbit = gen_int_mode (val, elemmode);
>> +  rtx t1 = gen_reg_rtx (<MODE>mode);
>> +  rtx t2 = gen_reg_rtx (<MODE>mode);
>> +  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
>> +  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
>> +				    gen_int_mode (val - 1, elemmode));
>> +  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
>> +  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
>> +  c1 = force_reg (<VSI2QI>mode, c1);
>> +  c2 = force_reg (<VSI2QI>mode, c2);
>> +  dup = force_reg (<VSI2QI>mode, dup);
>> +  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
>> +  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
>> +  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
>> +  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
>> +  DONE;
>> +})
>> +
>>  ;; These instructions map to the __builtins for the Dot Product
>>  ;; indexed operations.
>>  (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
>> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
>> @@ -0,0 +1,25 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
>> +
>> +#define N 480
>> +#define SIGNEDNESS_1 unsigned
>> +#define SIGNEDNESS_2 signed
>> +#define SIGNEDNESS_3 signed
>> +#define SIGNEDNESS_4 unsigned
>> +
>> +SIGNEDNESS_1 int __attribute__ ((noipa))
>> +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
>> +   SIGNEDNESS_4 char *restrict b)
>> +{
>> +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
>> +    {
>> +      int av = a[i];
>> +      int bv = b[i];
>> +      SIGNEDNESS_2 short mult = av * bv;
>> +      res += mult;
>> +    }
>> +  return res;
>> +}
>> +
>> +/* { dg-final { scan-assembler-not {\tusdot\t} } } */
>> +/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */
  
Tamar Christina June 27, 2022, 5:24 a.m. UTC | #3
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Thursday, June 16, 2022 7:54 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH 1/2]AArch64 Add fallback case using sdot for usdot
> 
> Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > Tamar Christina <tamar.christina@arm.com> writes:
> >> Hi All,
> >>
> >> The usdot operation is common in video encoder and decoders including
> >> some of the most widely used ones.
> >>
> >> This patch adds a +dotprod version of the optab as a fallback for
> >> when you do have sdot but not usdot available.
> >>
> >> The fallback works by adding a bias to the unsigned argument to
> >> convert it to a signed value and then correcting for the bias later on.
> >>
> >> Essentially it relies on (x - 128)y + 128y == xy where x is unsigned
> >> and y is signed (assuming both are 8-bit values).  Because the range
> >> of a signed byte is only to 127 we split the bias correction into:
> >>
> >>    (x - 128)y + 127y + y
> >
> > I bet you knew this question was coming, but: this technique isn't
> > target-specific, so wouldn't it be better to handle it in
> > tree-vect-patterns.cc instead?

Ok, so after many hours of trying I don't know how to make this work.
DOT_PROD_EXPR is a reduction, but emitting the extra DOT_PROD_EXPRs as
additional pattern statements doesn't work because they'll be marked as
internal_def rather than reduction_def.  I tried marking the new
vec_stmt_info that I create explicitly as reduction_def, but this gets
overwritten during analysis.

I then looked into handling it as a vectorizable_operation, but that has the
obvious problem that it's no longer treated as a reduction and so gets
decomposed into hi/lo.

I then looked into treating additional patterns from a reduction as
reductions themselves, but this is obviously wrong as non-reduction
statements also get marked as reductions.

The conclusion is that I don't think the vectorizer allows additional reductions to be
emitted from patterns.

> Also, how about doing (x - 128)y + 64y + 64y instead, to reduce the number
> of hoisted constants?
> 
> Thanks,
> Richard
> 
> > Thanks,
> > Richard
> >
> >> Concretely for:
> >>
> >> #define N 480
> >> #define SIGNEDNESS_1 unsigned
> >> #define SIGNEDNESS_2 signed
> >> #define SIGNEDNESS_3 signed
> >> #define SIGNEDNESS_4 unsigned
> >>
> >> SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res,
> >> SIGNEDNESS_3 char *restrict a,
> >>    SIGNEDNESS_4 char *restrict b)
> >> {
> >>   for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> >>     {
> >>       int av = a[i];
> >>       int bv = b[i];
> >>       SIGNEDNESS_2 short mult = av * bv;
> >>       res += mult;
> >>     }
> >>   return res;
> >> }
> >>
> >> we generate:
> >>
> >>         movi    v5.16b, 0x7f
> >>         mov     x3, 0
> >>         movi    v4.16b, 0x1
> >>         movi    v3.16b, 0xffffffffffffff80
> >>         movi    v0.4s, 0
> >> .L2:
> >>         ldr     q2, [x2, x3]
> >>         ldr     q1, [x1, x3]
> >>         add     x3, x3, 16
> >>         sub     v2.16b, v2.16b, v3.16b
> >>         sdot    v0.4s, v2.16b, v1.16b
> >>         sdot    v0.4s, v5.16b, v1.16b
> >>         sdot    v0.4s, v4.16b, v1.16b
> >>         cmp     x3, 480
> >>         bne     .L2
> >>
> >> instead of:
> >>
> >>         movi    v0.4s, 0
> >>         mov     x3, 0
> >> .L2:
> >>         ldr     q2, [x1, x3]
> >>         ldr     q1, [x2, x3]
> >>         add     x3, x3, 16
> >>         sxtl    v4.8h, v2.8b
> >>         sxtl2   v3.8h, v2.16b
> >>         uxtl    v2.8h, v1.8b
> >>         uxtl2   v1.8h, v1.16b
> >>         mul     v2.8h, v2.8h, v4.8h
> >>         mul     v1.8h, v1.8h, v3.8h
> >>         saddw   v0.4s, v0.4s, v2.4h
> >>         saddw2  v0.4s, v0.4s, v2.8h
> >>         saddw   v0.4s, v0.4s, v1.4h
> >>         saddw2  v0.4s, v0.4s, v1.8h
> >>         cmp     x3, 480
> >>         bne     .L2
> >>
> >> The new sequence is significantly faster as the operations it uses
> >> are well optimized.  Note that execution tests are already in the mid-end
> testsuite.
> >>
> >> Thanks to James Greenhalgh for the tip-off.
> >>
> >> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >>
> >> Ok for master?
> >>
> >> Thanks,
> >> Tamar
> >>
> >> gcc/ChangeLog:
> >>
> >> 	* config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Generate
> fallback
> >> 	or call original isns ...
> >> 	(usdot_prod<vsi2qi>_insn): ...here.
> >>
> >> gcc/testsuite/ChangeLog:
> >>
> >> 	* gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.
> >>
> >> --- inline copy of patch --
> >> diff --git a/gcc/config/aarch64/aarch64-simd.md
> >> b/gcc/config/aarch64/aarch64-simd.md
> >> index
> >>
> cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8
> >> 564e9cf643a74 100644
> >> --- a/gcc/config/aarch64/aarch64-simd.md
> >> +++ b/gcc/config/aarch64/aarch64-simd.md
> >> @@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
> >>
> >>  ;; These instructions map to the __builtins for the Armv8.6-a I8MM
> >> usdot  ;; (vector) Dot Product operation and the vectorized optab.
> >> -(define_insn "usdot_prod<vsi2qi>"
> >> +(define_insn "usdot_prod<vsi2qi>_insn"
> >>    [(set (match_operand:VS 0 "register_operand" "=w")
> >>  	(plus:VS
> >>  	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
> @@
> >> -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
> >>    [(set_attr "type" "neon_dot<q>")]
> >>  )
> >>
> >> +;; usdot auto-vec fallback code
> >> +(define_expand "usdot_prod<vsi2qi>"
> >> +  [(set (match_operand:VS 0 "register_operand")
> >> +	(plus:VS
> >> +	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
> >> +		      (match_operand:<VSI2QI> 2 "register_operand")]
> >> +	  UNSPEC_USDOT)
> >> +	  (match_operand:VS 3 "register_operand")))]
> >> +  "TARGET_DOTPROD || TARGET_I8MM"
> >> +{
> >> +  if (TARGET_I8MM)
> >> +    {
> >> +      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
> >> +					      operands[2], operands[3]));
> >> +      DONE;
> >> +    }
> >> +
> >> +  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
> >> +  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE
> (elemmode).to_constant
> >> +() - 1);
> >> +  rtx signbit = gen_int_mode (val, elemmode);
> >> +  rtx t1 = gen_reg_rtx (<MODE>mode);
> >> +  rtx t2 = gen_reg_rtx (<MODE>mode);
> >> +  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
> >> +  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
> >> +				    gen_int_mode (val - 1, elemmode));
> >> +  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1,
> >> +elemmode));
> >> +  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
> >> +  c1 = force_reg (<VSI2QI>mode, c1);
> >> +  c2 = force_reg (<VSI2QI>mode, c2);
> >> +  dup = force_reg (<VSI2QI>mode, dup);
> >> +  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
> >> +  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2],
> >> +operands[3]));
> >> +  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
> >> +  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2],
> >> +t2));
> >> +  DONE;
> >> +})
> >> +
> >>  ;; These instructions map to the __builtins for the Dot Product  ;;
> >> indexed operations.
> >>  (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
> >> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> >> b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> >> new file mode 100644
> >> index
> >>
> 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1
> 4
> >> 67a696750ac3e
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> >> @@ -0,0 +1,25 @@
> >> +/* { dg-do compile } */
> >> +/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
> >> +
> >> +#define N 480
> >> +#define SIGNEDNESS_1 unsigned
> >> +#define SIGNEDNESS_2 signed
> >> +#define SIGNEDNESS_3 signed
> >> +#define SIGNEDNESS_4 unsigned
> >> +
> >> +SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res,
> >> +SIGNEDNESS_3 char *restrict a,
> >> +   SIGNEDNESS_4 char *restrict b)
> >> +{
> >> +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> >> +    {
> >> +      int av = a[i];
> >> +      int bv = b[i];
> >> +      SIGNEDNESS_2 short mult = av * bv;
> >> +      res += mult;
> >> +    }
> >> +  return res;
> >> +}
> >> +
> >> +/* { dg-final { scan-assembler-not {\tusdot\t} } } */
> >> +/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */
  
Richard Biener June 27, 2022, 6:09 a.m. UTC | #4
On Mon, Jun 27, 2022 at 7:25 AM Tamar Christina via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> > -----Original Message-----
> > From: Richard Sandiford <richard.sandiford@arm.com>
> > Sent: Thursday, June 16, 2022 7:54 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> > Subject: Re: [PATCH 1/2]AArch64 Add fallback case using sdot for usdot
> >
> > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > > Tamar Christina <tamar.christina@arm.com> writes:
> > >> Hi All,
> > >>
> > >> The usdot operation is common in video encoder and decoders including
> > >> some of the most widely used ones.
> > >>
> > >> This patch adds a +dotprod version of the optab as a fallback for
> > >> when you do have sdot but not usdot available.
> > >>
> > >> The fallback works by adding a bias to the unsigned argument to
> > >> convert it to a signed value and then correcting for the bias later on.
> > >>
> > >> Essentially it relies on (x - 128)y + 128y == xy where x is unsigned
> > >> and y is signed (assuming both are 8-bit values).  Because the range
> > >> of a signed byte is only to 127 we split the bias correction into:
> > >>
> > >>    (x - 128)y + 127y + y
> > >
> > > I bet you knew this question was coming, but: this technique isn't
> > > target-specific, so wouldn't it be better to handle it in
> > > tree-vect-patterns.cc instead?
>
> Ok, so after many hours of trying I don't know how to make this work.
> DOT_PROD_EXPR is a reduction, but emitting them as additional pattern
> statement doesn't work because they'll be marked as internal_def rather than
> reduction_def.  I tried marking the new vec_stmt_info that I create explicitly as
> reduction_def but this gets overwritten during analysis.
>
> I then looked into getting it as a vectorizable_operation but has this obvious problems
> In that it no longer treats it as a reduction and so tries to decompose into hi/lo.
>
> I then looked into treating additional patterns from  a reduction as reductions themselves
> but this is obviously wrong as non-reduction statements also get marked as reductions.
>
> The conclusion is that I don't think the vectorizer allows additional reductions to be
> emitted from patterns.

Indeed.  DOT_PROD is a weird beast and it doesn't define which lanes are
reduced to which so it's only usable when the result is reduced to a
single lane.
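
For instance, both of the following are valid lane mappings for a 4-lane
DOT_PROD (hypothetical sketch in scalar C; the two only agree after the
final cross-lane sum):

/* Lane i accumulates a contiguous group of four products...  */
void
dot_prod_grouped (int acc[4], unsigned char x[16], signed char y[16])
{
  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 4; j++)
      acc[i] += x[4 * i + j] * y[4 * i + j];
}

/* ...or every fourth product; DOT_PROD doesn't say which.  */
void
dot_prod_strided (int acc[4], unsigned char x[16], signed char y[16])
{
  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 4; j++)
      acc[i] += x[i + 4 * j] * y[i + 4 * j];
}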

An SLP pattern might work if you use reduc-plus for the reduced lanes and keep
the multiply separate?

Richard.

> > Also, how about doing (x - 128)y + 64y + 64y instead, to reduce the number
> > of hoisted constants?
> >
> > Thanks,
> > Richard
> >
> > > Thanks,
> > > Richard
> > >
> > >> Concretely for:
> > >>
> > >> #define N 480
> > >> #define SIGNEDNESS_1 unsigned
> > >> #define SIGNEDNESS_2 signed
> > >> #define SIGNEDNESS_3 signed
> > >> #define SIGNEDNESS_4 unsigned
> > >>
> > >> SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res,
> > >> SIGNEDNESS_3 char *restrict a,
> > >>    SIGNEDNESS_4 char *restrict b)
> > >> {
> > >>   for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> > >>     {
> > >>       int av = a[i];
> > >>       int bv = b[i];
> > >>       SIGNEDNESS_2 short mult = av * bv;
> > >>       res += mult;
> > >>     }
> > >>   return res;
> > >> }
> > >>
> > >> we generate:
> > >>
> > >>         movi    v5.16b, 0x7f
> > >>         mov     x3, 0
> > >>         movi    v4.16b, 0x1
> > >>         movi    v3.16b, 0xffffffffffffff80
> > >>         movi    v0.4s, 0
> > >> .L2:
> > >>         ldr     q2, [x2, x3]
> > >>         ldr     q1, [x1, x3]
> > >>         add     x3, x3, 16
> > >>         sub     v2.16b, v2.16b, v3.16b
> > >>         sdot    v0.4s, v2.16b, v1.16b
> > >>         sdot    v0.4s, v5.16b, v1.16b
> > >>         sdot    v0.4s, v4.16b, v1.16b
> > >>         cmp     x3, 480
> > >>         bne     .L2
> > >>
> > >> instead of:
> > >>
> > >>         movi    v0.4s, 0
> > >>         mov     x3, 0
> > >> .L2:
> > >>         ldr     q2, [x1, x3]
> > >>         ldr     q1, [x2, x3]
> > >>         add     x3, x3, 16
> > >>         sxtl    v4.8h, v2.8b
> > >>         sxtl2   v3.8h, v2.16b
> > >>         uxtl    v2.8h, v1.8b
> > >>         uxtl2   v1.8h, v1.16b
> > >>         mul     v2.8h, v2.8h, v4.8h
> > >>         mul     v1.8h, v1.8h, v3.8h
> > >>         saddw   v0.4s, v0.4s, v2.4h
> > >>         saddw2  v0.4s, v0.4s, v2.8h
> > >>         saddw   v0.4s, v0.4s, v1.4h
> > >>         saddw2  v0.4s, v0.4s, v1.8h
> > >>         cmp     x3, 480
> > >>         bne     .L2
> > >>
> > >> The new sequence is significantly faster as the operations it uses
> > >> are well optimized.  Note that execution tests are already in the mid-end
> > testsuite.
> > >>
> > >> Thanks to James Greenhalgh for the tip-off.
> > >>
> > >> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >>
> > >> Ok for master?
> > >>
> > >> Thanks,
> > >> Tamar
> > >>
> > >> gcc/ChangeLog:
> > >>
> > >>    * config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Generate
> > fallback
> > >>    or call original isns ...
> > >>    (usdot_prod<vsi2qi>_insn): ...here.
> > >>
> > >> gcc/testsuite/ChangeLog:
> > >>
> > >>    * gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.
> > >>
> > >> --- inline copy of patch --
> > >> diff --git a/gcc/config/aarch64/aarch64-simd.md
> > >> b/gcc/config/aarch64/aarch64-simd.md
> > >> index
> > >>
> > cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8
> > >> 564e9cf643a74 100644
> > >> --- a/gcc/config/aarch64/aarch64-simd.md
> > >> +++ b/gcc/config/aarch64/aarch64-simd.md
> > >> @@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
> > >>
> > >>  ;; These instructions map to the __builtins for the Armv8.6-a I8MM
> > >> usdot  ;; (vector) Dot Product operation and the vectorized optab.
> > >> -(define_insn "usdot_prod<vsi2qi>"
> > >> +(define_insn "usdot_prod<vsi2qi>_insn"
> > >>    [(set (match_operand:VS 0 "register_operand" "=w")
> > >>    (plus:VS
> > >>      (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
> > @@
> > >> -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
> > >>    [(set_attr "type" "neon_dot<q>")]
> > >>  )
> > >>
> > >> +;; usdot auto-vec fallback code
> > >> +(define_expand "usdot_prod<vsi2qi>"
> > >> +  [(set (match_operand:VS 0 "register_operand")
> > >> +  (plus:VS
> > >> +    (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
> > >> +                (match_operand:<VSI2QI> 2 "register_operand")]
> > >> +    UNSPEC_USDOT)
> > >> +    (match_operand:VS 3 "register_operand")))]
> > >> +  "TARGET_DOTPROD || TARGET_I8MM"
> > >> +{
> > >> +  if (TARGET_I8MM)
> > >> +    {
> > >> +      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
> > >> +                                        operands[2], operands[3]));
> > >> +      DONE;
> > >> +    }
> > >> +
> > >> +  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
> > >> +  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE
> > (elemmode).to_constant
> > >> +() - 1);
> > >> +  rtx signbit = gen_int_mode (val, elemmode);
> > >> +  rtx t1 = gen_reg_rtx (<MODE>mode);
> > >> +  rtx t2 = gen_reg_rtx (<MODE>mode);
> > >> +  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
> > >> +  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
> > >> +                              gen_int_mode (val - 1, elemmode));
> > >> +  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1,
> > >> +elemmode));
> > >> +  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
> > >> +  c1 = force_reg (<VSI2QI>mode, c1);
> > >> +  c2 = force_reg (<VSI2QI>mode, c2);
> > >> +  dup = force_reg (<VSI2QI>mode, dup);
> > >> +  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
> > >> +  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2],
> > >> +operands[3]));
> > >> +  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
> > >> +  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2],
> > >> +t2));
> > >> +  DONE;
> > >> +})
> > >> +
> > >>  ;; These instructions map to the __builtins for the Dot Product  ;;
> > >> indexed operations.
> > >>  (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
> > >> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> > >> b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> > >> new file mode 100644
> > >> index
> > >>
> > 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1
> > 4
> > >> 67a696750ac3e
> > >> --- /dev/null
> > >> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> > >> @@ -0,0 +1,25 @@
> > >> +/* { dg-do compile } */
> > >> +/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
> > >> +
> > >> +#define N 480
> > >> +#define SIGNEDNESS_1 unsigned
> > >> +#define SIGNEDNESS_2 signed
> > >> +#define SIGNEDNESS_3 signed
> > >> +#define SIGNEDNESS_4 unsigned
> > >> +
> > >> +SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res,
> > >> +SIGNEDNESS_3 char *restrict a,
> > >> +   SIGNEDNESS_4 char *restrict b)
> > >> +{
> > >> +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> > >> +    {
> > >> +      int av = a[i];
> > >> +      int bv = b[i];
> > >> +      SIGNEDNESS_2 short mult = av * bv;
> > >> +      res += mult;
> > >> +    }
> > >> +  return res;
> > >> +}
> > >> +
> > >> +/* { dg-final { scan-assembler-not {\tusdot\t} } } */
> > >> +/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */
  
Tamar Christina June 28, 2022, 3:54 p.m. UTC | #5
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Monday, June 27, 2022 7:10 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; gcc-
> patches@gcc.gnu.org; Marcus Shawcroft <Marcus.Shawcroft@arm.com>
> Subject: Re: [PATCH 1/2]AArch64 Add fallback case using sdot for usdot
> 
> On Mon, Jun 27, 2022 at 7:25 AM Tamar Christina via Gcc-patches <gcc-
> patches@gcc.gnu.org> wrote:
> >
> > > -----Original Message-----
> > > From: Richard Sandiford <richard.sandiford@arm.com>
> > > Sent: Thursday, June 16, 2022 7:54 PM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> > > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> > > Subject: Re: [PATCH 1/2]AArch64 Add fallback case using sdot for
> > > usdot
> > >
> > > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > > > Tamar Christina <tamar.christina@arm.com> writes:
> > > >> Hi All,
> > > >>
> > > >> The usdot operation is common in video encoder and decoders
> > > >> including some of the most widely used ones.
> > > >>
> > > >> This patch adds a +dotprod version of the optab as a fallback for
> > > >> when you do have sdot but not usdot available.
> > > >>
> > > >> The fallback works by adding a bias to the unsigned argument to
> > > >> convert it to a signed value and then correcting for the bias later on.
> > > >>
> > > >> Essentially it relies on (x - 128)y + 128y == xy where x is
> > > >> unsigned and y is signed (assuming both are 8-bit values).
> > > >> Because the range of a signed byte is only to 127 we split the bias
> correction into:
> > > >>
> > > >>    (x - 128)y + 127y + y
> > > >
> > > > I bet you knew this question was coming, but: this technique isn't
> > > > target-specific, so wouldn't it be better to handle it in
> > > > tree-vect-patterns.cc instead?
> >
> > Ok, so after many hours of trying I don't know how to make this work.
> > DOT_PROD_EXPR is a reduction, but emitting them as additional pattern
> > statement doesn't work because they'll be marked as internal_def
> > rather than reduction_def.  I tried marking the new vec_stmt_info that
> > I create explicitly as reduction_def but this gets overwritten during analysis.
> >
> > I then looked into getting it as a vectorizable_operation but has this
> > obvious problems In that it no longer treats it as a reduction and so tries to
> decompose into hi/lo.
> >
> > I then looked into treating additional patterns from  a reduction as
> > reductions themselves but this is obviously wrong as non-reduction
> statements also get marked as reductions.
> >
> > The conclusion is that I don't think the vectorizer allows additional
> > reductions to be emitted from patterns.
> 
> Indeed.  DOT_PROD is a weird beast and it doesn't define which lanes are
> reduced to which so it's only usable when the result is reduced to a single
> lane.
> 
> An SLP pattern might work if you use reduc-plus for the reduced lanes and
> keep the multiply separate?

Unfortunately I can't seem to get it to handle the reduction in SLP.  It seems
to always use the non-SLP-aware loop vectorizer here.  The suggested unroll
factor is always 1, and even trying to force it gets it to bail out later,
presumably because it's reducing into a scalar that's used outside the loop?

Thanks,
Tamar

> 
> Richard.
> 
> > > Also, how about doing (x - 128)y + 64y + 64y instead, to reduce the
> > > number of hoisted constants?
> > >
> > > Thanks,
> > > Richard
> > >
> > > > Thanks,
> > > > Richard
> > > >
> > > >> Concretely for:
> > > >>
> > > >> #define N 480
> > > >> #define SIGNEDNESS_1 unsigned
> > > >> #define SIGNEDNESS_2 signed
> > > >> #define SIGNEDNESS_3 signed
> > > >> #define SIGNEDNESS_4 unsigned
> > > >>
> > > >> SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res,
> > > >> SIGNEDNESS_3 char *restrict a,
> > > >>    SIGNEDNESS_4 char *restrict b) {
> > > >>   for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> > > >>     {
> > > >>       int av = a[i];
> > > >>       int bv = b[i];
> > > >>       SIGNEDNESS_2 short mult = av * bv;
> > > >>       res += mult;
> > > >>     }
> > > >>   return res;
> > > >> }
> > > >>
> > > >> we generate:
> > > >>
> > > >>         movi    v5.16b, 0x7f
> > > >>         mov     x3, 0
> > > >>         movi    v4.16b, 0x1
> > > >>         movi    v3.16b, 0xffffffffffffff80
> > > >>         movi    v0.4s, 0
> > > >> .L2:
> > > >>         ldr     q2, [x2, x3]
> > > >>         ldr     q1, [x1, x3]
> > > >>         add     x3, x3, 16
> > > >>         sub     v2.16b, v2.16b, v3.16b
> > > >>         sdot    v0.4s, v2.16b, v1.16b
> > > >>         sdot    v0.4s, v5.16b, v1.16b
> > > >>         sdot    v0.4s, v4.16b, v1.16b
> > > >>         cmp     x3, 480
> > > >>         bne     .L2
> > > >>
> > > >> instead of:
> > > >>
> > > >>         movi    v0.4s, 0
> > > >>         mov     x3, 0
> > > >> .L2:
> > > >>         ldr     q2, [x1, x3]
> > > >>         ldr     q1, [x2, x3]
> > > >>         add     x3, x3, 16
> > > >>         sxtl    v4.8h, v2.8b
> > > >>         sxtl2   v3.8h, v2.16b
> > > >>         uxtl    v2.8h, v1.8b
> > > >>         uxtl2   v1.8h, v1.16b
> > > >>         mul     v2.8h, v2.8h, v4.8h
> > > >>         mul     v1.8h, v1.8h, v3.8h
> > > >>         saddw   v0.4s, v0.4s, v2.4h
> > > >>         saddw2  v0.4s, v0.4s, v2.8h
> > > >>         saddw   v0.4s, v0.4s, v1.4h
> > > >>         saddw2  v0.4s, v0.4s, v1.8h
> > > >>         cmp     x3, 480
> > > >>         bne     .L2
> > > >>
> > > >> The new sequence is significantly faster as the operations it
> > > >> uses are well optimized.  Note that execution tests are already
> > > >> in the mid-end
> > > testsuite.
> > > >>
> > > >> Thanks to James Greenhalgh for the tip-off.
> > > >>
> > > >> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > >>
> > > >> Ok for master?
> > > >>
> > > >> Thanks,
> > > >> Tamar
> > > >>
> > > >> gcc/ChangeLog:
> > > >>
> > > >>    * config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>):
> > > >> Generate
> > > fallback
> > > >>    or call original isns ...
> > > >>    (usdot_prod<vsi2qi>_insn): ...here.
> > > >>
> > > >> gcc/testsuite/ChangeLog:
> > > >>
> > > >>    * gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.
> > > >>
> > > >> --- inline copy of patch --
> > > >> diff --git a/gcc/config/aarch64/aarch64-simd.md
> > > >> b/gcc/config/aarch64/aarch64-simd.md
> > > >> index
> > > >>
> > >
> cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e
> > > 8
> > > >> 564e9cf643a74 100644
> > > >> --- a/gcc/config/aarch64/aarch64-simd.md
> > > >> +++ b/gcc/config/aarch64/aarch64-simd.md
> > > >> @@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
> > > >>
> > > >>  ;; These instructions map to the __builtins for the Armv8.6-a
> > > >> I8MM usdot  ;; (vector) Dot Product operation and the vectorized
> optab.
> > > >> -(define_insn "usdot_prod<vsi2qi>"
> > > >> +(define_insn "usdot_prod<vsi2qi>_insn"
> > > >>    [(set (match_operand:VS 0 "register_operand" "=w")
> > > >>    (plus:VS
> > > >>      (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand"
> > > >> "w")
> > > @@
> > > >> -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
> > > >>    [(set_attr "type" "neon_dot<q>")]
> > > >>  )
> > > >>
> > > >> +;; usdot auto-vec fallback code
> > > >> +(define_expand "usdot_prod<vsi2qi>"
> > > >> +  [(set (match_operand:VS 0 "register_operand")
> > > >> +  (plus:VS
> > > >> +    (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
> > > >> +                (match_operand:<VSI2QI> 2 "register_operand")]
> > > >> +    UNSPEC_USDOT)
> > > >> +    (match_operand:VS 3 "register_operand")))]
> > > >> +  "TARGET_DOTPROD || TARGET_I8MM"
> > > >> +{
> > > >> +  if (TARGET_I8MM)
> > > >> +    {
> > > >> +      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0],
> operands[1],
> > > >> +                                        operands[2], operands[3]));
> > > >> +      DONE;
> > > >> +    }
> > > >> +
> > > >> +  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
> > > >> + HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE
> > > (elemmode).to_constant
> > > >> +() - 1);
> > > >> +  rtx signbit = gen_int_mode (val, elemmode);
> > > >> +  rtx t1 = gen_reg_rtx (<MODE>mode);
> > > >> +  rtx t2 = gen_reg_rtx (<MODE>mode);
> > > >> +  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
> > > >> +  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
> > > >> +                              gen_int_mode (val - 1, elemmode));
> > > >> +  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode
> > > >> +(1, elemmode));
> > > >> +  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
> > > >> +  c1 = force_reg (<VSI2QI>mode, c1);
> > > >> +  c2 = force_reg (<VSI2QI>mode, c2);
> > > >> +  dup = force_reg (<VSI2QI>mode, dup);
> > > >> +  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
> > > >> +  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2],
> > > >> +operands[3]));
> > > >> +  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
> > > >> +  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2,
> > > >> +operands[2], t2));
> > > >> +  DONE;
> > > >> +})
> > > >> +
> > > >>  ;; These instructions map to the __builtins for the Dot Product
> > > >> ;; indexed operations.
> > > >>  (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
> > > >> diff --git
> > > >> a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> > > >> b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> > > >> new file mode 100644
> > > >> index
> > > >>
> > >
> 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1
> > > 4
> > > >> 67a696750ac3e
> > > >> --- /dev/null
> > > >> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> > > >> @@ -0,0 +1,25 @@
> > > >> +/* { dg-do compile } */
> > > >> +/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
> > > >> +
> > > >> +#define N 480
> > > >> +#define SIGNEDNESS_1 unsigned
> > > >> +#define SIGNEDNESS_2 signed
> > > >> +#define SIGNEDNESS_3 signed
> > > >> +#define SIGNEDNESS_4 unsigned
> > > >> +
> > > >> +SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int
> > > >> +res,
> > > >> +SIGNEDNESS_3 char *restrict a,
> > > >> +   SIGNEDNESS_4 char *restrict b) {
> > > >> +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> > > >> +    {
> > > >> +      int av = a[i];
> > > >> +      int bv = b[i];
> > > >> +      SIGNEDNESS_2 short mult = av * bv;
> > > >> +      res += mult;
> > > >> +    }
> > > >> +  return res;
> > > >> +}
> > > >> +
> > > >> +/* { dg-final { scan-assembler-not {\tusdot\t} } } */
> > > >> +/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */
  
Richard Biener June 29, 2022, 9:33 a.m. UTC | #6
On Tue, Jun 28, 2022 at 5:54 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> > -----Original Message-----
> > From: Richard Biener <richard.guenther@gmail.com>
> > Sent: Monday, June 27, 2022 7:10 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Richard Earnshaw
> > <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; gcc-
> > patches@gcc.gnu.org; Marcus Shawcroft <Marcus.Shawcroft@arm.com>
> > Subject: Re: [PATCH 1/2]AArch64 Add fallback case using sdot for usdot
> >
> > On Mon, Jun 27, 2022 at 7:25 AM Tamar Christina via Gcc-patches <gcc-
> > patches@gcc.gnu.org> wrote:
> > >
> > > > -----Original Message-----
> > > > From: Richard Sandiford <richard.sandiford@arm.com>
> > > > Sent: Thursday, June 16, 2022 7:54 PM
> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> > > > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > > > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>
> > > > Subject: Re: [PATCH 1/2]AArch64 Add fallback case using sdot for
> > > > usdot
> > > >
> > > > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > > > > Tamar Christina <tamar.christina@arm.com> writes:
> > > > >> Hi All,
> > > > >>
> > > > >> The usdot operation is common in video encoder and decoders
> > > > >> including some of the most widely used ones.
> > > > >>
> > > > >> This patch adds a +dotprod version of the optab as a fallback for
> > > > >> when you do have sdot but not usdot available.
> > > > >>
> > > > >> The fallback works by adding a bias to the unsigned argument to
> > > > >> convert it to a signed value and then correcting for the bias later on.
> > > > >>
> > > > >> Essentially it relies on (x - 128)y + 128y == xy where x is
> > > > >> unsigned and y is signed (assuming both are 8-bit values).
> > > > >> Because the range of a signed byte is only to 127 we split the bias
> > correction into:
> > > > >>
> > > > >>    (x - 128)y + 127y + y
> > > > >
> > > > > I bet you knew this question was coming, but: this technique isn't
> > > > > target-specific, so wouldn't it be better to handle it in
> > > > > tree-vect-patterns.cc instead?
> > >
> > > Ok, so after many hours of trying I don't know how to make this work.
> > > DOT_PROD_EXPR is a reduction, but emitting them as additional pattern
> > > statement doesn't work because they'll be marked as internal_def
> > > rather than reduction_def.  I tried marking the new vec_stmt_info that
> > > I create explicitly as reduction_def but this gets overwritten during analysis.
> > >
> > > I then looked into getting it as a vectorizable_operation but has this
> > > obvious problems In that it no longer treats it as a reduction and so tries to
> > decompose into hi/lo.
> > >
> > > I then looked into treating additional patterns from  a reduction as
> > > reductions themselves but this is obviously wrong as non-reduction
> > statements also get marked as reductions.
> > >
> > > The conclusion is that I don't think the vectorizer allows additional
> > > reductions to be emitted from patterns.
> >
> > Indeed.  DOT_PROD is a weird beast and it doesn't define which lanes are
> > reduced to which so it's only usable when the result is reduced to a single
> > lane.
> >
> > An SLP pattern might work if you use reduc-plus for the reduced lanes and
> > keep the multiply separate?
>
> Unfortunately I can't seem to get it to handle the reduction in SLP.  It seems to always
> use the non-SLP aware loop vectorizer here.  The suggested unroll factor is always 1 and
> even trying to force it gets it to bail out later, presumable because it's reducing into a
> scalar that's used outside the loop?

Yes, it possibly needs 1-lane SLP support.

> Thanks,
> Tamar
>
> >
> > Richard.
> >
> > > > Also, how about doing (x - 128)y + 64y + 64y instead, to reduce the
> > > > number of hoisted constants?
> > > >
> > > > Thanks,
> > > > Richard
> > > >
> > > > > Thanks,
> > > > > Richard
> > > > >
> > > > >> Concretely for:
> > > > >>
> > > > >> #define N 480
> > > > >> #define SIGNEDNESS_1 unsigned
> > > > >> #define SIGNEDNESS_2 signed
> > > > >> #define SIGNEDNESS_3 signed
> > > > >> #define SIGNEDNESS_4 unsigned
> > > > >>
> > > > >> SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res,
> > > > >> SIGNEDNESS_3 char *restrict a,
> > > > >>    SIGNEDNESS_4 char *restrict b) {
> > > > >>   for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> > > > >>     {
> > > > >>       int av = a[i];
> > > > >>       int bv = b[i];
> > > > >>       SIGNEDNESS_2 short mult = av * bv;
> > > > >>       res += mult;
> > > > >>     }
> > > > >>   return res;
> > > > >> }
> > > > >>
> > > > >> we generate:
> > > > >>
> > > > >>         movi    v5.16b, 0x7f
> > > > >>         mov     x3, 0
> > > > >>         movi    v4.16b, 0x1
> > > > >>         movi    v3.16b, 0xffffffffffffff80
> > > > >>         movi    v0.4s, 0
> > > > >> .L2:
> > > > >>         ldr     q2, [x2, x3]
> > > > >>         ldr     q1, [x1, x3]
> > > > >>         add     x3, x3, 16
> > > > >>         sub     v2.16b, v2.16b, v3.16b
> > > > >>         sdot    v0.4s, v2.16b, v1.16b
> > > > >>         sdot    v0.4s, v5.16b, v1.16b
> > > > >>         sdot    v0.4s, v4.16b, v1.16b
> > > > >>         cmp     x3, 480
> > > > >>         bne     .L2
> > > > >>
> > > > >> instead of:
> > > > >>
> > > > >>         movi    v0.4s, 0
> > > > >>         mov     x3, 0
> > > > >> .L2:
> > > > >>         ldr     q2, [x1, x3]
> > > > >>         ldr     q1, [x2, x3]
> > > > >>         add     x3, x3, 16
> > > > >>         sxtl    v4.8h, v2.8b
> > > > >>         sxtl2   v3.8h, v2.16b
> > > > >>         uxtl    v2.8h, v1.8b
> > > > >>         uxtl2   v1.8h, v1.16b
> > > > >>         mul     v2.8h, v2.8h, v4.8h
> > > > >>         mul     v1.8h, v1.8h, v3.8h
> > > > >>         saddw   v0.4s, v0.4s, v2.4h
> > > > >>         saddw2  v0.4s, v0.4s, v2.8h
> > > > >>         saddw   v0.4s, v0.4s, v1.4h
> > > > >>         saddw2  v0.4s, v0.4s, v1.8h
> > > > >>         cmp     x3, 480
> > > > >>         bne     .L2
> > > > >>
> > > > >> The new sequence is significantly faster as the operations it
> > > > >> uses are well optimized.  Note that execution tests are already
> > > > >> in the mid-end
> > > > testsuite.
> > > > >>
> > > > >> Thanks to James Greenhalgh for the tip-off.
> > > > >>
> > > > >> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > > >>
> > > > >> Ok for master?
> > > > >>
> > > > >> Thanks,
> > > > >> Tamar
> > > > >>
> > > > >> gcc/ChangeLog:
> > > > >>
> > > > >>    * config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>):
> > > > >> Generate
> > > > fallback
> > > > >>    or call original isns ...
> > > > >>    (usdot_prod<vsi2qi>_insn): ...here.
> > > > >>
> > > > >> gcc/testsuite/ChangeLog:
> > > > >>
> > > > >>    * gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.
> > > > >>
> > > > >> --- inline copy of patch --
> > > > >> diff --git a/gcc/config/aarch64/aarch64-simd.md
> > > > >> b/gcc/config/aarch64/aarch64-simd.md
> > > > >> index
> > > > >>
> > > >
> > cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e
> > > > 8
> > > > >> 564e9cf643a74 100644
> > > > >> --- a/gcc/config/aarch64/aarch64-simd.md
> > > > >> +++ b/gcc/config/aarch64/aarch64-simd.md
> > > > >> @@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
> > > > >>
> > > > >>  ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
> > > > >>  ;; (vector) Dot Product operation and the vectorized optab.
> > > > >> -(define_insn "usdot_prod<vsi2qi>"
> > > > >> +(define_insn "usdot_prod<vsi2qi>_insn"
> > > > >>    [(set (match_operand:VS 0 "register_operand" "=w")
> > > > >> 	(plus:VS
> > > > >> 	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
> > > > >> @@ -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
> > > > >>    [(set_attr "type" "neon_dot<q>")]
> > > > >>  )
> > > > >>
> > > > >> +;; usdot auto-vec fallback code
> > > > >> +(define_expand "usdot_prod<vsi2qi>"
> > > > >> +  [(set (match_operand:VS 0 "register_operand")
> > > > >> +	(plus:VS
> > > > >> +	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
> > > > >> +		      (match_operand:<VSI2QI> 2 "register_operand")]
> > > > >> +	  UNSPEC_USDOT)
> > > > >> +	  (match_operand:VS 3 "register_operand")))]
> > > > >> +  "TARGET_DOTPROD || TARGET_I8MM"
> > > > >> +{
> > > > >> +  if (TARGET_I8MM)
> > > > >> +    {
> > > > >> +      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
> > > > >> +					      operands[2], operands[3]));
> > > > >> +      DONE;
> > > > >> +    }
> > > > >> +
> > > > >> +  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
> > > > >> +  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
> > > > >> +  rtx signbit = gen_int_mode (val, elemmode);
> > > > >> +  rtx t1 = gen_reg_rtx (<MODE>mode);
> > > > >> +  rtx t2 = gen_reg_rtx (<MODE>mode);
> > > > >> +  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
> > > > >> +  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
> > > > >> +				    gen_int_mode (val - 1, elemmode));
> > > > >> +  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
> > > > >> +  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
> > > > >> +  c1 = force_reg (<VSI2QI>mode, c1);
> > > > >> +  c2 = force_reg (<VSI2QI>mode, c2);
> > > > >> +  dup = force_reg (<VSI2QI>mode, dup);
> > > > >> +  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
> > > > >> +  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
> > > > >> +  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
> > > > >> +  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
> > > > >> +  DONE;
> > > > >> +})
> > > > >> +
> > > > >>  ;; These instructions map to the __builtins for the Dot Product
> > > > >> ;; indexed operations.
> > > > >>  (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
> > > > >> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> > > > >> b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> > > > >> new file mode 100644
> > > > >> index 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e
> > > > >> --- /dev/null
> > > > >> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
> > > > >> @@ -0,0 +1,25 @@
> > > > >> +/* { dg-do compile } */
> > > > >> +/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
> > > > >> +
> > > > >> +#define N 480
> > > > >> +#define SIGNEDNESS_1 unsigned
> > > > >> +#define SIGNEDNESS_2 signed
> > > > >> +#define SIGNEDNESS_3 signed
> > > > >> +#define SIGNEDNESS_4 unsigned
> > > > >> +
> > > > >> +SIGNEDNESS_1 int __attribute__ ((noipa))
> > > > >> +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
> > > > >> +   SIGNEDNESS_4 char *restrict b)
> > > > >> +{
> > > > >> +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> > > > >> +    {
> > > > >> +      int av = a[i];
> > > > >> +      int bv = b[i];
> > > > >> +      SIGNEDNESS_2 short mult = av * bv;
> > > > >> +      res += mult;
> > > > >> +    }
> > > > >> +  return res;
> > > > >> +}
> > > > >> +
> > > > >> +/* { dg-final { scan-assembler-not {\tusdot\t} } } */
> > > > >> +/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */
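For reference, the correction constants in the quoted expand -- c1 = val - 1
and c2 = 1, i.e. 127 and 1 for 8-bit elements -- implement the
(x - 128)y + 127y + y split described at the top of the patch.  A minimal
standalone C check of that identity (an editorial sketch, not part of the
patch):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  for (int x = 0; x <= 255; ++x)	/* unsigned 8-bit operand */
    for (int y = -128; y <= 127; ++y)	/* signed 8-bit operand */
      {
	/* Bias x into signed range, as the sub+sdot sequence does.  */
	int8_t xs = (int8_t) (x - 128);
	/* Every multiplicand (xs, 127, 1) fits in a signed byte.  */
	assert (xs * y + 127 * y + 1 * y == x * y);
      }
  return 0;
}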
  
Richard Sandiford June 29, 2022, 2:35 p.m. UTC | #7
Richard Biener <richard.guenther@gmail.com> writes:
> On Tue, Jun 28, 2022 at 5:54 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
>>
>> > -----Original Message-----
>> > From: Richard Biener <richard.guenther@gmail.com>
>> > Sent: Monday, June 27, 2022 7:10 AM
>> > To: Tamar Christina <Tamar.Christina@arm.com>
>> > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Richard Earnshaw
>> > <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; gcc-
>> > patches@gcc.gnu.org; Marcus Shawcroft <Marcus.Shawcroft@arm.com>
>> > Subject: Re: [PATCH 1/2]AArch64 Add fallback case using sdot for usdot
>> >
>> > On Mon, Jun 27, 2022 at 7:25 AM Tamar Christina via Gcc-patches <gcc-
>> > patches@gcc.gnu.org> wrote:
>> > >
>> > > > -----Original Message-----
>> > > > From: Richard Sandiford <richard.sandiford@arm.com>
>> > > > Sent: Thursday, June 16, 2022 7:54 PM
>> > > > To: Tamar Christina <Tamar.Christina@arm.com>
>> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> > > > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> > > > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> > <Kyrylo.Tkachov@arm.com>
>> > > > Subject: Re: [PATCH 1/2]AArch64 Add fallback case using sdot for
>> > > > usdot
>> > > >
>> > > > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> > > > > Tamar Christina <tamar.christina@arm.com> writes:
>> > > > >> Hi All,
>> > > > >>
>> > > > >> The usdot operation is common in video encoder and decoders
>> > > > >> including some of the most widely used ones.
>> > > > >>
>> > > > >> This patch adds a +dotprod version of the optab as a fallback for
>> > > > >> when you do have sdot but not usdot available.
>> > > > >>
>> > > > >> The fallback works by adding a bias to the unsigned argument to
>> > > > >> convert it to a signed value and then correcting for the bias later on.
>> > > > >>
>> > > > >> Essentially it relies on (x - 128)y + 128y == xy where x is
>> > > > >> unsigned and y is signed (assuming both are 8-bit values).
>> > > > >> Because the range of a signed byte is only to 127 we split the bias
>> > correction into:
>> > > > >>
>> > > > >>    (x - 128)y + 127y + y
>> > > > >
>> > > > > I bet you knew this question was coming, but: this technique isn't
>> > > > > target-specific, so wouldn't it be better to handle it in
>> > > > > tree-vect-patterns.cc instead?
>> > >
>> > > Ok, so after many hours of trying I don't know how to make this work.
>> > > DOT_PROD_EXPR is a reduction, but emitting them as additional pattern
>> > > statements doesn't work because they'll be marked as internal_def
>> > > rather than reduction_def.  I tried marking the new vec_stmt_info that
>> > > I create explicitly as reduction_def but this gets overwritten during analysis.
>> > >
>> > > I then looked into getting it as a vectorizable_operation but this has
>> > > the obvious problem that it no longer treats it as a reduction and so
>> > > tries to decompose into hi/lo.
>> > >
>> > > I then looked into treating additional patterns from a reduction as
>> > > reductions themselves but this is obviously wrong as non-reduction
>> > > statements also get marked as reductions.
>> > >
>> > > The conclusion is that I don't think the vectorizer allows additional
>> > > reductions to be emitted from patterns.
>> >
>> > Indeed.  DOT_PROD is a weird beast and it doesn't define which lanes are
>> > reduced to which so it's only usable when the result is reduced to a single
>> > lane.
>> >
>> > An SLP pattern might work if you use reduc-plus for the reduced lanes and
>> > keep the multiply separate?
>>
>> Unfortunately I can't seem to get it to handle the reduction in SLP.  It seems to always
>> use the non-SLP aware loop vectorizer here.  The suggested unroll factor is always 1 and
>> even trying to force it gets it to bail out later, presumably because it's reducing into a
>> scalar that's used outside the loop?
>
> Yes, it possibly needs 1-lane SLP support.

As I mentioned to Tamar off-list, I feel like I've been wasting
people's time recently by spewing out ideas that might or might not work
(usually "not work"), so I wanted to get some confidence that the next
suggestion made sense.  In the end I needed most of an implementation
to do that, so it seemed easiest just to finish it off rather than post
it in a half-complete state.  Sorry for the duplication. :-(

The patch certainly isn't pretty, but I think it's the best we can
do under the current infrastructure, and it should at least make
the costs reasonably accurate.  (Actually, that said, we probably
need to patch the reduction latency calculation in the aarch64
vector code -- didn't think of that until now.)

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  WDYT?

Thanks,
Richard

----------------

Following a suggestion from Tamar, this patch adds a fallback
implementation of usdot using sdot.  Specifically, for 8-bit
input types:

   acc_2 = DOT_PROD_EXPR <a_unsigned, b_signed, acc_1>;

becomes:

   tmp_1 = DOT_PROD_EXPR <64, b_signed, acc_1>;
   tmp_2 = DOT_PROD_EXPR <64, b_signed, tmp_1>;
   acc_2 = DOT_PROD_EXPR <a_unsigned - 128, b_signed, tmp_2>;

on the basis that x*y == (x-128)*y + 64*y + 64*y.  Doing the two 64*y
operations first should give more time for x to be calculated,
on the off chance that that's useful.
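As a scalar model of the rewrite above (a sketch for illustration only;
the real code builds vector GIMPLE statements), each accumulation step can
be checked exhaustively over all 8-bit inputs:

#include <assert.h>
#include <stdint.h>

/* Scalar model of one emulated step: acc + (unsigned)x * (signed)y using
   only signed-by-signed products, in the order the patch emits them.  */
static int32_t
emulated_usdot (int32_t acc, uint8_t x, int8_t y)
{
  int8_t sub_res = (int8_t) (x - 128);	/* x biased into signed range */
  int32_t stage1 = 64 * y + acc;	/* tmp_1 = DOT_PROD_EXPR <64, y, acc> */
  int32_t stage2 = 64 * y + stage1;	/* tmp_2 = DOT_PROD_EXPR <64, y, tmp_1> */
  return sub_res * y + stage2;		/* acc_2 = DOT_PROD_EXPR <x - 128, y, tmp_2> */
}

int
main (void)
{
  for (int x = 0; x <= 255; ++x)
    for (int y = -128; y <= 127; ++y)
      assert (emulated_usdot (5, (uint8_t) x, (int8_t) y) == 5 + x * y);
  return 0;
}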

gcc/
	* tree-vect-patterns.cc (vect_recog_dot_prod_pattern): If usdot
	isn't available, try sdot instead.
	* tree-vect-loop.cc (vect_is_emulated_mixed_dot_prod): New function.
	(vect_model_reduction_cost): Model the cost of implementing usdot
	using sdot.
	(vectorizable_reduction): Likewise.  Skip target support test
	for lane reductions.
	(vect_emulate_mixed_dot_prod): New function.
	(vect_transform_reduction): Use it to emulate usdot via sdot.

gcc/testsuite/
	* gcc.dg/vect/vect-reduc-dot-9.c: Reduce target requirements
	from i8mm to dotprod.
	* gcc.dg/vect/vect-reduc-dot-10.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-11.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-12.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-13.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-14.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-15.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-16.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-17.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-18.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-19.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-20.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-21.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-22.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c |   4 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c |   4 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c |   4 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c |   4 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c  |   6 +-
 gcc/tree-vect-loop.cc                         | 160 ++++++++++++++++--
 gcc/tree-vect-patterns.cc                     |  14 +-
 16 files changed, 196 insertions(+), 54 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c
index 7ce86965ea9..34e25ab7fb0 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 unsigned
 #define SIGNEDNESS_2 unsigned
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c
index 0f7cbbb87ef..3af8df54cf9 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 unsigned
 #define SIGNEDNESS_2 signed
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c
index 08412614fc6..77ceef3643b 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 unsigned
 #define SIGNEDNESS_2 signed
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c
index 7ee0f45f642..d3c0c86f529 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 signed
 #define SIGNEDNESS_2 unsigned
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c
index 2de1434528b..86a5c85753c 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 signed
 #define SIGNEDNESS_2 unsigned
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c
index dc48f95a32b..25de0940a65 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 signed
 #define SIGNEDNESS_2 signed
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
index aec62878936..4a1dec0677e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 signed
 #define SIGNEDNESS_2 signed
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
index 38f86fe458a..90d21188b76 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
@@ -50,4 +50,4 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
index 2e86ebe3c6c..81ecb158d29 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
@@ -50,4 +50,4 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
index d00f24aae4c..cbcd4f120a5 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
index 17adbca83a0..e81ed1da5a4 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
index 6cc6a4f2e92..81ce5cdaffb 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
index e13d3d5c4da..b8c9d3ca53b 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
index d1049c96bf1..e0b132f6b35 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
@@ -50,4 +50,4 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 78dfe8519aa..3a70c15b593 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -4566,6 +4566,31 @@ have_whole_vector_shift (machine_mode mode)
   return true;
 }
 
+/* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
+   multiplication operands have differing signs and (b) we intend
+   to emulate the operation using a series of signed DOT_PROD_EXPRs.
+   See vect_emulate_mixed_dot_prod for the actual sequence used.  */
+
+static bool
+vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
+				 stmt_vec_info stmt_info)
+{
+  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
+  if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
+    return false;
+
+  tree rhs1 = gimple_assign_rhs1 (assign);
+  tree rhs2 = gimple_assign_rhs2 (assign);
+  if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
+    return false;
+
+  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
+  gcc_assert (reduc_info->is_reduc_info);
+  return !directly_supported_p (DOT_PROD_EXPR,
+				STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
+				optab_vector_mixed_sign);
+}
+
 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
    functions. Design better to avoid maintenance issues.  */
 
@@ -4601,6 +4626,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
     gcc_unreachable ();
 
+  bool emulated_mixed_dot_prod
+    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
   if (reduction_type == EXTRACT_LAST_REDUCTION)
     /* No extra instructions are needed in the prologue.  The loop body
        operations are costed in vectorizable_condition.  */
@@ -4628,11 +4655,20 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
     }
   else
     {
-      /* Add in cost for initial definition.
-	 For cond reduction we have four vectors: initial index, step,
-	 initial result of the data reduction, initial value of the index
-	 reduction.  */
-      int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
+      /* Add in the cost of the initial definitions.  */
+      int prologue_stmts;
+      if (reduction_type == COND_REDUCTION)
+	/* For cond reductions we have four vectors: initial index, step,
+	   initial result of the data reduction, initial value of the index
+	   reduction.  */
+	prologue_stmts = 4;
+      else if (emulated_mixed_dot_prod)
+	/* We need the initial reduction value and two invariants:
+	   one that contains the minimum signed value and one that
+	   contains half of its negative.  */
+	prologue_stmts = 3;
+      else
+	prologue_stmts = 1;
       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
 					 scalar_to_vec, stmt_info, 0,
 					 vect_prologue);
@@ -6797,11 +6833,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
 			    || op.code == WIDEN_SUM_EXPR
 			    || op.code == SAD_EXPR);
-  enum optab_subtype optab_query_kind = optab_vector;
-  if (op.code == DOT_PROD_EXPR
-      && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
-	  != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
-    optab_query_kind = optab_vector_mixed_sign;
 
   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
       && !SCALAR_FLOAT_TYPE_P (op.type))
@@ -7328,9 +7359,17 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       /* 4. Supportable by target?  */
       bool ok = true;
 
-      /* 4.1. check support for the operation in the loop  */
+      /* 4.1. check support for the operation in the loop
+
+	 This isn't necessary for the lane reduction codes, since they
+	 can only be produced by pattern matching, and it's up to the
+	 pattern matcher to test for support.  The main reason for
+	 specifically skipping this step is to avoid rechecking whether
+	 mixed-sign dot-products can be implemented using signed
+	 dot-products.  */
       machine_mode vec_mode = TYPE_MODE (vectype_in);
-      if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
+      if (!lane_reduc_code_p
+	  && !directly_supported_p (op.code, vectype_in))
         {
           if (dump_enabled_p ())
             dump_printf (MSG_NOTE, "op not supported by target.\n");
@@ -7398,7 +7437,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
      vect_transform_reduction.  Otherwise this is costed by the
      separate vectorizable_* routines.  */
   if (single_defuse_cycle || lane_reduc_code_p)
-    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
+    {
+      int factor = 1;
+      if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
+	/* Three dot-products and a subtraction.  */
+	factor = 4;
+      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
+			stmt_info, 0, vect_body);
+    }
 
   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
@@ -7457,6 +7503,81 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   return true;
 }
 
+/* STMT_INFO is a dot-product reduction whose multiplication operands
+   have different signs.  Emit a sequence to emulate the operation
+   using a series of signed DOT_PROD_EXPRs and return the last
+   statement generated.  VEC_DEST is the result of the vector operation
+   and VOP lists its inputs.  */
+
+static gassign *
+vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+			     gimple_stmt_iterator *gsi, tree vec_dest,
+			     tree vop[3])
+{
+  tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
+  tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
+  tree narrow_elttype = TREE_TYPE (narrow_vectype);
+  gimple *new_stmt;
+
+  /* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
+  if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
+    std::swap (vop[0], vop[1]);
+
+  /* Convert all inputs to signed types.  */
+  for (int i = 0; i < 3; ++i)
+    if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
+      {
+	tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
+	new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
+	vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+	vop[i] = tmp;
+      }
+
+  /* In the comments below we assume 8-bit inputs for simplicity,
+     but the approach works for any full integer type.  */
+
+  /* Create a vector of -128.  */
+  tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
+  tree min_narrow = build_vector_from_val (narrow_vectype,
+					   min_narrow_elttype);
+
+  /* Create a vector of 64.  */
+  auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
+  tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
+  half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
+
+  /* Emit: SUB_RES = VOP[0] - 128.  */
+  tree sub_res = make_ssa_name (narrow_vectype);
+  new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+
+  /* Emit:
+
+       STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
+       STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
+       STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
+
+     on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
+     Doing the two 64 * y steps first allows more time to compute x.  */
+  tree stage1 = make_ssa_name (wide_vectype);
+  new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
+				  vop[1], half_narrow, vop[2]);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+
+  tree stage2 = make_ssa_name (wide_vectype);
+  new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
+				  vop[1], half_narrow, stage1);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+
+  tree stage3 = make_ssa_name (wide_vectype);
+  new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
+				  sub_res, vop[1], stage2);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+
+  /* Convert STAGE3 to the reduction type.  */
+  return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
+}
+
 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
    value.  */
 
@@ -7563,12 +7684,17 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 					: &vec_oprnds2));
     }
 
+  bool emulated_mixed_dot_prod
+    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
     {
       gimple *new_stmt;
       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
       if (masked_loop_p && !mask_by_cond_expr)
 	{
+	  /* No conditional ifns have been defined for dot-product yet.  */
+	  gcc_assert (code != DOT_PROD_EXPR);
+
 	  /* Make sure that the reduction accumulator is vop[0].  */
 	  if (reduc_index == 1)
 	    {
@@ -7597,8 +7723,12 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	      build_vect_cond_expr (code, vop, mask, gsi);
 	    }
 
-	  new_stmt = gimple_build_assign (vec_dest, code,
-					  vop[0], vop[1], vop[2]);
+	  if (emulated_mixed_dot_prod)
+	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
+						    vec_dest, vop);
+	  else
+	    new_stmt = gimple_build_assign (vec_dest, code,
+					    vop[0], vop[1], vop[2]);
 	  new_temp = make_ssa_name (vec_dest, new_stmt);
 	  gimple_assign_set_lhs (new_stmt, new_temp);
 	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 8f624863971..b336f12e6be 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -1148,7 +1148,19 @@ vect_recog_dot_prod_pattern (vec_info *vinfo,
   tree half_vectype;
   if (!vect_supportable_direct_optab_p (vinfo, type, DOT_PROD_EXPR, half_type,
 					type_out, &half_vectype, subtype))
-    return NULL;
+    {
+      /* We can emulate a mixed-sign dot-product using a sequence of
+	 signed dot-products; see vect_emulate_mixed_dot_prod for details.  */
+      if (subtype != optab_vector_mixed_sign
+	  || !vect_supportable_direct_optab_p (vinfo, signed_type_for (type),
+					       DOT_PROD_EXPR, half_type,
+					       type_out, &half_vectype,
+					       optab_vector))
+	return NULL;
+
+      *type_out = signed_or_unsigned_type_for (TYPE_UNSIGNED (type),
+					       *type_out);
+    }
 
   /* Get the inputs in the appropriate types.  */
   tree mult_oprnd[2];
  
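One non-obvious detail in vect_emulate_mixed_dot_prod above is where the
constants come from: the 64 is a logical right shift of the narrow type's
minimum (the wi::lrshift call), and the bias subtraction is emitted as a
PLUS_EXPR of that minimum.  A small standalone check of both facts for
8-bit elements (an editorial sketch; assumes the usual two's-complement
wrapping):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* INT8_MIN has bit pattern 0x80; a logical (unsigned) shift right by
     one gives 0x40 == 64, the correction constant used twice.  */
  int8_t min_narrow = INT8_MIN;
  uint8_t half = (uint8_t) min_narrow >> 1;
  assert (half == 64);

  /* Adding the minimum equals subtracting the bias in wrapping 8-bit
     arithmetic: x + (-128) == x - 128 as a signed byte.  */
  for (int x = 0; x <= 255; ++x)
    assert ((int8_t) (uint8_t) (x + 128) == (int8_t) (x - 128));
  return 0;
}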
Richard Biener June 30, 2022, 6:45 a.m. UTC | #8
On Wed, Jun 29, 2022 at 4:35 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> [...]
> The patch certainly isn't pretty, but I think it's the best we can
> do under the current infrastructure, and it should at least make
> the costs reasonably accurate.  (Actually, that said, we probably
> need to patch the reduction latency calculation in the aarch64
> vector code -- didn't think of that until now.)
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  WDYT?

Looks reasonable - does this end up in OKish code generation as well?

Thanks,
Richard.

> Thanks,
> Richard
>
> ----------------
>
> Following a suggestion from Tamar, this patch adds a fallback
> implementation of usdot using sdot.  Specifically, for 8-bit
> input types:
>
>    acc_2 = DOT_PROD_EXPR <a_unsigned, b_signed, acc_1>;
>
> becomes:
>
>    tmp_1 = DOT_PROD_EXPR <64, b_signed, acc_1>;
>    tmp_2 = DOT_PROD_EXPR <64, b_signed, tmp_1>;
>    acc_2 = DOT_PROD_EXPR <a_unsigned - 128, b_signed, tmp_2>;
>
> on the basis that x*y == (x-128)*y + 64*y + 64*y.  Doing the two 64*y
> operations first should give more time for x to be calculated,
> on the off chance that that's useful.
>
> gcc/
>         * tree-vect-patterns.cc (vect_recog_dot_prod_pattern): If usdot
>         isn't available, try sdot instead.
>         * tree-vect-loop.cc (vect_is_emulated_mixed_dot_prod): New function.
>         (vect_model_reduction_cost): Model the cost of implementing usdot
>         using sdot.
>         (vectorizable_reduction): Likewise.  Skip target support test
>         for lane reductions.
>         (vect_emulate_mixed_dot_prod): New function.
>         (vect_transform_reduction): Use it to emulate usdot via sdot.
>
> gcc/testsuite/
>         * gcc.dg/vect/vect-reduc-dot-9.c: Reduce target requirements
>         from i8mm to dotprod.
>         * gcc.dg/vect/vect-reduc-dot-10.c: Likewise.
>         * gcc.dg/vect/vect-reduc-dot-11.c: Likewise.
>         * gcc.dg/vect/vect-reduc-dot-12.c: Likewise.
>         * gcc.dg/vect/vect-reduc-dot-13.c: Likewise.
>         * gcc.dg/vect/vect-reduc-dot-14.c: Likewise.
>         * gcc.dg/vect/vect-reduc-dot-15.c: Likewise.
>         * gcc.dg/vect/vect-reduc-dot-16.c: Likewise.
>         * gcc.dg/vect/vect-reduc-dot-17.c: Likewise.
>         * gcc.dg/vect/vect-reduc-dot-18.c: Likewise.
>         * gcc.dg/vect/vect-reduc-dot-19.c: Likewise.
>         * gcc.dg/vect/vect-reduc-dot-20.c: Likewise.
>         * gcc.dg/vect/vect-reduc-dot-21.c: Likewise.
>         * gcc.dg/vect/vect-reduc-dot-22.c: Likewise.
> ---
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c |   6 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c |   6 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c |   6 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c |   6 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c |   6 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c |   6 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c |   6 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c |   6 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c |   6 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c |   4 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c |   4 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c |   4 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c |   4 +-
>  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c  |   6 +-
>  gcc/tree-vect-loop.cc                         | 160 ++++++++++++++++--
>  gcc/tree-vect-patterns.cc                     |  14 +-
>  16 files changed, 196 insertions(+), 54 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c
> index 7ce86965ea9..34e25ab7fb0 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #define SIGNEDNESS_1 unsigned
>  #define SIGNEDNESS_2 unsigned
> @@ -10,4 +10,4 @@
>  #include "vect-reduc-dot-9.c"
>
>  /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c
> index 0f7cbbb87ef..3af8df54cf9 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #define SIGNEDNESS_1 unsigned
>  #define SIGNEDNESS_2 signed
> @@ -10,4 +10,4 @@
>  #include "vect-reduc-dot-9.c"
>
>  /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c
> index 08412614fc6..77ceef3643b 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #define SIGNEDNESS_1 unsigned
>  #define SIGNEDNESS_2 signed
> @@ -10,4 +10,4 @@
>  #include "vect-reduc-dot-9.c"
>
>  /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c
> index 7ee0f45f642..d3c0c86f529 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #define SIGNEDNESS_1 signed
>  #define SIGNEDNESS_2 unsigned
> @@ -10,4 +10,4 @@
>  #include "vect-reduc-dot-9.c"
>
>  /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c
> index 2de1434528b..86a5c85753c 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #define SIGNEDNESS_1 signed
>  #define SIGNEDNESS_2 unsigned
> @@ -10,4 +10,4 @@
>  #include "vect-reduc-dot-9.c"
>
>  /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c
> index dc48f95a32b..25de0940a65 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #define SIGNEDNESS_1 signed
>  #define SIGNEDNESS_2 signed
> @@ -10,4 +10,4 @@
>  #include "vect-reduc-dot-9.c"
>
>  /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
> index aec62878936..4a1dec0677e 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #define SIGNEDNESS_1 signed
>  #define SIGNEDNESS_2 signed
> @@ -10,4 +10,4 @@
>  #include "vect-reduc-dot-9.c"
>
>  /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
> index 38f86fe458a..90d21188b76 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> @@ -50,4 +50,4 @@ main (void)
>  }
>
>  /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
> index 2e86ebe3c6c..81ecb158d29 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> @@ -50,4 +50,4 @@ main (void)
>  }
>
>  /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
> index d00f24aae4c..cbcd4f120a5 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
> index 17adbca83a0..e81ed1da5a4 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
> index 6cc6a4f2e92..81ce5cdaffb 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
> index e13d3d5c4da..b8c9d3ca53b 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
> index d1049c96bf1..e0b132f6b35 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> @@ -50,4 +50,4 @@ main (void)
>  }
>
>  /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 78dfe8519aa..3a70c15b593 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -4566,6 +4566,31 @@ have_whole_vector_shift (machine_mode mode)
>    return true;
>  }
>
> +/* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
> +   multiplication operands have differing signs and (b) we intend
> +   to emulate the operation using a series of signed DOT_PROD_EXPRs.
> +   See vect_emulate_mixed_dot_prod for the actual sequence used.  */
> +
> +static bool
> +vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
> +                                stmt_vec_info stmt_info)
> +{
> +  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
> +  if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
> +    return false;
> +
> +  tree rhs1 = gimple_assign_rhs1 (assign);
> +  tree rhs2 = gimple_assign_rhs2 (assign);
> +  if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
> +    return false;
> +
> +  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
> +  gcc_assert (reduc_info->is_reduc_info);
> +  return !directly_supported_p (DOT_PROD_EXPR,
> +                               STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
> +                               optab_vector_mixed_sign);
> +}
> +
>  /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
>     functions. Design better to avoid maintenance issues.  */
>
> @@ -4601,6 +4626,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>    if (!gimple_extract_op (orig_stmt_info->stmt, &op))
>      gcc_unreachable ();
>
> +  bool emulated_mixed_dot_prod
> +    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
>    if (reduction_type == EXTRACT_LAST_REDUCTION)
>      /* No extra instructions are needed in the prologue.  The loop body
>         operations are costed in vectorizable_condition.  */
> @@ -4628,11 +4655,20 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>      }
>    else
>      {
> -      /* Add in cost for initial definition.
> -        For cond reduction we have four vectors: initial index, step,
> -        initial result of the data reduction, initial value of the index
> -        reduction.  */
> -      int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
> +      /* Add in the cost of the initial definitions.  */
> +      int prologue_stmts;
> +      if (reduction_type == COND_REDUCTION)
> +       /* For cond reductions we have four vectors: initial index, step,
> +          initial result of the data reduction, initial value of the index
> +          reduction.  */
> +       prologue_stmts = 4;
> +      else if (emulated_mixed_dot_prod)
> +       /* We need the initial reduction value and two invariants:
> +          one that contains the minimum signed value and one that
> +          contains half of its negative.  */
> +       prologue_stmts = 3;
> +      else
> +       prologue_stmts = 1;
>        prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
>                                          scalar_to_vec, stmt_info, 0,
>                                          vect_prologue);
> @@ -6797,11 +6833,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
>                             || op.code == WIDEN_SUM_EXPR
>                             || op.code == SAD_EXPR);
> -  enum optab_subtype optab_query_kind = optab_vector;
> -  if (op.code == DOT_PROD_EXPR
> -      && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
> -         != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
> -    optab_query_kind = optab_vector_mixed_sign;
>
>    if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
>        && !SCALAR_FLOAT_TYPE_P (op.type))
> @@ -7328,9 +7359,17 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        /* 4. Supportable by target?  */
>        bool ok = true;
>
> -      /* 4.1. check support for the operation in the loop  */
> +      /* 4.1. check support for the operation in the loop
> +
> +        This isn't necessary for the lane reduction codes, since they
> +        can only be produced by pattern matching, and it's up to the
> +        pattern matcher to test for support.  The main reason for
> +        specifically skipping this step is to avoid rechecking whether
> +        mixed-sign dot-products can be implemented using signed
> +        dot-products.  */
>        machine_mode vec_mode = TYPE_MODE (vectype_in);
> -      if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
> +      if (!lane_reduc_code_p
> +         && !directly_supported_p (op.code, vectype_in))
>          {
>            if (dump_enabled_p ())
>              dump_printf (MSG_NOTE, "op not supported by target.\n");
> @@ -7398,7 +7437,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>       vect_transform_reduction.  Otherwise this is costed by the
>       separate vectorizable_* routines.  */
>    if (single_defuse_cycle || lane_reduc_code_p)
> -    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
> +    {
> +      int factor = 1;
> +      if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
> +       /* Three dot-products and a subtraction.  */
> +       factor = 4;
> +      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
> +                       stmt_info, 0, vect_body);
> +    }
>
>    if (dump_enabled_p ()
>        && reduction_type == FOLD_LEFT_REDUCTION)
> @@ -7457,6 +7503,81 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    return true;
>  }
>
> +/* STMT_INFO is a dot-product reduction whose multiplication operands
> +   have different signs.  Emit a sequence to emulate the operation
> +   using a series of signed DOT_PROD_EXPRs and return the last
> +   statement generated.  VEC_DEST is the result of the vector operation
> +   and VOP lists its inputs.  */
> +
> +static gassign *
> +vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
> +                            gimple_stmt_iterator *gsi, tree vec_dest,
> +                            tree vop[3])
> +{
> +  tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
> +  tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
> +  tree narrow_elttype = TREE_TYPE (narrow_vectype);
> +  gimple *new_stmt;
> +
> +  /* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
> +  if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
> +    std::swap (vop[0], vop[1]);
> +
> +  /* Convert all inputs to signed types.  */
> +  for (int i = 0; i < 3; ++i)
> +    if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
> +      {
> +       tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
> +       new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
> +       vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> +       vop[i] = tmp;
> +      }
> +
> +  /* In the comments below we assume 8-bit inputs for simplicity,
> +     but the approach works for any full integer type.  */
> +
> +  /* Create a vector of -128.  */
> +  tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
> +  tree min_narrow = build_vector_from_val (narrow_vectype,
> +                                          min_narrow_elttype);
> +
> +  /* Create a vector of 64.  */
> +  auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
> +  tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
> +  half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
> +
> +  /* Emit: SUB_RES = VOP[0] - 128.  */
> +  tree sub_res = make_ssa_name (narrow_vectype);
> +  new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
> +  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> +
> +  /* Emit:
> +
> +       STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
> +       STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
> +       STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
> +
> +     on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
> +     Doing the two 64 * y steps first allows more time to compute x.  */
> +  tree stage1 = make_ssa_name (wide_vectype);
> +  new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
> +                                 vop[1], half_narrow, vop[2]);
> +  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> +
> +  tree stage2 = make_ssa_name (wide_vectype);
> +  new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
> +                                 vop[1], half_narrow, stage1);
> +  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> +
> +  tree stage3 = make_ssa_name (wide_vectype);
> +  new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
> +                                 sub_res, vop[1], stage2);
> +  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> +
> +  /* Convert STAGE3 to the reduction type.  */
> +  return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
> +}
> +
>  /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
>     value.  */
>
> @@ -7563,12 +7684,17 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>                                         : &vec_oprnds2));
>      }
>
> +  bool emulated_mixed_dot_prod
> +    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
>    FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
>      {
>        gimple *new_stmt;
>        tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
>        if (masked_loop_p && !mask_by_cond_expr)
>         {
> +         /* No conditional ifns have been defined for dot-product yet.  */
> +         gcc_assert (code != DOT_PROD_EXPR);
> +
>           /* Make sure that the reduction accumulator is vop[0].  */
>           if (reduc_index == 1)
>             {
> @@ -7597,8 +7723,12 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>               build_vect_cond_expr (code, vop, mask, gsi);
>             }
>
> -         new_stmt = gimple_build_assign (vec_dest, code,
> -                                         vop[0], vop[1], vop[2]);
> +         if (emulated_mixed_dot_prod)
> +           new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
> +                                                   vec_dest, vop);
> +         else
> +           new_stmt = gimple_build_assign (vec_dest, code,
> +                                           vop[0], vop[1], vop[2]);
>           new_temp = make_ssa_name (vec_dest, new_stmt);
>           gimple_assign_set_lhs (new_stmt, new_temp);
>           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index 8f624863971..b336f12e6be 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -1148,7 +1148,19 @@ vect_recog_dot_prod_pattern (vec_info *vinfo,
>    tree half_vectype;
>    if (!vect_supportable_direct_optab_p (vinfo, type, DOT_PROD_EXPR, half_type,
>                                         type_out, &half_vectype, subtype))
> -    return NULL;
> +    {
> +      /* We can emulate a mixed-sign dot-product using a sequence of
> +        signed dot-products; see vect_emulate_mixed_dot_prod for details.  */
> +      if (subtype != optab_vector_mixed_sign
> +         || !vect_supportable_direct_optab_p (vinfo, signed_type_for (type),
> +                                              DOT_PROD_EXPR, half_type,
> +                                              type_out, &half_vectype,
> +                                              optab_vector))
> +       return NULL;
> +
> +      *type_out = signed_or_unsigned_type_for (TYPE_UNSIGNED (type),
> +                                              *type_out);
> +    }
>
>    /* Get the inputs in the appropriate types.  */
>    tree mult_oprnd[2];
> --
> 2.25.1
>
>
  
Richard Sandiford July 5, 2022, 6:08 a.m. UTC | #9
Richard Biener <richard.guenther@gmail.com> writes:
> On Wed, Jun 29, 2022 at 4:35 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Richard Biener <richard.guenther@gmail.com> writes:
>> > On Tue, Jun 28, 2022 at 5:54 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
>> >>
>> >> > -----Original Message-----
>> >> > From: Richard Biener <richard.guenther@gmail.com>
>> >> > Sent: Monday, June 27, 2022 7:10 AM
>> >> > To: Tamar Christina <Tamar.Christina@arm.com>
>> >> > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Richard Earnshaw
>> >> > <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; gcc-
>> >> > patches@gcc.gnu.org; Marcus Shawcroft <Marcus.Shawcroft@arm.com>
>> >> > Subject: Re: [PATCH 1/2]AArch64 Add fallback case using sdot for usdot
>> >> >
>> >> > On Mon, Jun 27, 2022 at 7:25 AM Tamar Christina via Gcc-patches <gcc-
>> >> > patches@gcc.gnu.org> wrote:
>> >> > >
>> >> > > > -----Original Message-----
>> >> > > > From: Richard Sandiford <richard.sandiford@arm.com>
>> >> > > > Sent: Thursday, June 16, 2022 7:54 PM
>> >> > > > To: Tamar Christina <Tamar.Christina@arm.com>
>> >> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> >> > > > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> >> > > > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> >> > <Kyrylo.Tkachov@arm.com>
>> >> > > > Subject: Re: [PATCH 1/2]AArch64 Add fallback case using sdot for
>> >> > > > usdot
>> >> > > >
>> >> > > > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> >> > > > > Tamar Christina <tamar.christina@arm.com> writes:
>> >> > > > >> Hi All,
>> >> > > > >>
>> >> > > > >> The usdot operation is common in video encoder and decoders
>> >> > > > >> including some of the most widely used ones.
>> >> > > > >>
>> >> > > > >> This patch adds a +dotprod version of the optab as a fallback for
>> >> > > > >> when you do have sdot but not usdot available.
>> >> > > > >>
>> >> > > > >> The fallback works by adding a bias to the unsigned argument to
>> >> > > > >> convert it to a signed value and then correcting for the bias later on.
>> >> > > > >>
>> >> > > > >> Essentially it relies on (x - 128)y + 128y == xy where x is
>> >> > > > >> unsigned and y is signed (assuming both are 8-bit values).
>> >> > > > >> Because the range of a signed byte is only to 127 we split the bias
>> >> > correction into:
>> >> > > > >>
>> >> > > > >>    (x - 128)y + 127y + y
>> >> > > > >
>> >> > > > > I bet you knew this question was coming, but: this technique isn't
>> >> > > > > target-specific, so wouldn't it be better to handle it in
>> >> > > > > tree-vect-patterns.cc instead?
>> >> > >
>> >> > > Ok, so after many hours of trying I don't know how to make this work.
>> >> > > DOT_PROD_EXPR is a reduction, but emitting them as additional pattern
>> >> > > statements doesn't work because they'll be marked as internal_def
>> >> > > rather than reduction_def.  I tried marking the new vec_stmt_info that
>> >> > > I create explicitly as reduction_def but this gets overwritten during analysis.
>> >> > >
>> >> > > I then looked into handling it as a vectorizable_operation, but that has
>> >> > > the obvious problem that it no longer treats it as a reduction and so tries to
>> >> > decompose into hi/lo.
>> >> > >
>> >> > > I then looked into treating additional patterns from a reduction as
>> >> > > reductions themselves but this is obviously wrong as non-reduction
>> >> > statements also get marked as reductions.
>> >> > >
>> >> > > The conclusion is that I don't think the vectorizer allows additional
>> >> > > reductions to be emitted from patterns.
>> >> >
>> >> > Indeed.  DOT_PROD is a weird beast and it doesn't define which lanes are
>> >> > reduced to which so it's only usable when the result is reduced to a single
>> >> > lane.
>> >> >
>> >> > An SLP pattern might work if you use reduc-plus for the reduced lanes and
>> >> > keep the multiply separate?
>> >>
>> >> Unfortunately I can't seem to get it to handle the reduction in SLP.  It seems to always
>> >> use the non-SLP aware loop vectorizer here.  The suggested unroll factor is always 1 and
>> >> even trying to force it gets it to bail out later, presumably because it's reducing into a
>> >> scalar that's used outside the loop?
>> >
>> > Yes, it possibly needs 1-lane SLP support.
>>
>> As I mentioned to Tamar off-list, I feel like I've been wasting
>> people's time recently by spewing out ideas that might or might not work
>> (usually "not work"), so I wanted to get some confidence that the next
>> suggestion made sense.  In the end I needed most of an implementation
>> to do that, so it seemed easiest just to finish it off rather than post
>> it in a half-complete state.  Sorry for the duplication. :-(
>>
>> The patch certainly isn't pretty, but I think it's the best we can
>> do under the current infrastructure, and it should at least make
>> the costs reasonably accurate.  (Actually, that said, we probably
>> need to patch the reduction latency calculation in the aarch64
>> vector code -- didn't think of that until now.)
>>
>> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  WDYT?

Turned out I needed another change for this to fire on x86.  Previously
the input type (half_type) had an arbitrary sign for mixed-sign dotprods,
which was OK for the existing code, but meant that we could sometimes
query for unsigned dotprod instead of signed dotprod when considering
the fallback.  Fixed in the version below (which canonicalises on
using the signed type).
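
The canonicalisation itself is small; as a rough sketch of the idea
(the real change is in vect_recog_dot_prod_pattern in the patch below):

  /* HALF_TYPE used to inherit the sign of whichever operand it was
     taken from; forcing it to be signed means that the fallback query
     always asks for sdot rather than udot.  */
  if (subtype == optab_vector_mixed_sign)
    half_type = signed_type_for (half_type);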

> Looks reasonable - does this end up in OKish code generation as well?

Seems OK for aarch64.  The Advanced SIMD version of vect-reduc-dot-11.c is:

.L7:
        ldr     q2, [x1, x3]
        ldr     q1, [x2, x3]
        sdot    v0.4s, v1.16b, v3.16b
        add     x3, x3, 16
        sdot    v0.4s, v1.16b, v3.16b
        add     v2.16b, v2.16b, v4.16b
        sdot    v0.4s, v1.16b, v2.16b
        cmp     x3, 48
        bne     .L7

and the SVE version is:

.L7:
        ld1b    z1.b, p0/z, [x2, x3]
        ld1b    z2.b, p0/z, [x1, x3]
        sel     z1.b, p0, z1.b, z4.b
        add     x3, x3, x5
        add     z2.b, z2.b, #128
        sdot    z0.s, z1.b, z3.b
        whilelo p0.b, w3, w4
        sdot    z0.s, z1.b, z3.b
        sdot    z0.s, z1.b, z2.b
        b.any   .L7

(with the extra SEL handling a final partial vector).
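
At the GIMPLE level, both loops come from a sequence along these lines
(a sketch based on vect_emulate_mixed_dot_prod in the patch below;
the SSA names are made up and 8-bit elements are assumed):

  sub_res_1 = vop0_2 + { -128, ... };                       /* x - 128 */
  stage1_3 = DOT_PROD_EXPR <vop1_4, { 64, ... }, acc_5>;    /* + 64 * y */
  stage2_6 = DOT_PROD_EXPR <vop1_4, { 64, ... }, stage1_3>; /* + 64 * y */
  stage3_7 = DOT_PROD_EXPR <sub_res_1, vop1_4, stage2_6>;   /* + (x - 128) * y */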

On x86, for -mavx:

int
f (int res, unsigned short *restrict a, short *restrict b)
{
  for (int i = 0; i < 256; ++i)
    res += a[i] * b[i];
  return res;
}

previously generated:

.L2:
        vmovdqu (%rsi,%rax), %xmm1
        vmovdqu (%rdx,%rax), %xmm0
        addq    $16, %rax
        vpmovsxwd       %xmm0, %xmm3
        vpsrldq $8, %xmm0, %xmm0
        vpmovzxwd       %xmm1, %xmm4
        vpsrldq $8, %xmm1, %xmm1
        vpmulld %xmm4, %xmm3, %xmm3
        vpmovsxwd       %xmm0, %xmm0
        vpmovzxwd       %xmm1, %xmm1
        vpmulld %xmm1, %xmm0, %xmm0
        vpaddd  %xmm2, %xmm3, %xmm2
        vpaddd  %xmm2, %xmm0, %xmm2
        cmpq    $512, %rax
        jne     .L2

whereas now it generates:

.L2:
        vpmaddwd        (%rdx,%rax), %xmm3, %xmm2
        vpaddw  (%rsi,%rax), %xmm4, %xmm1
        vpmaddwd        (%rdx,%rax), %xmm1, %xmm1
        addq    $16, %rax
        vpaddd  %xmm2, %xmm0, %xmm0
        vpaddd  %xmm2, %xmm0, %xmm0
        vpaddd  %xmm1, %xmm0, %xmm0
        cmpq    $512, %rax
        jne     .L2

I don't know x86 well enough to be sure that's an improvement though.
The length of the loop carry dependency has increased from 2 to 3
VPADDDs.
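
As a scalar sanity check of the identity that the fallback relies on
(a throwaway test of mine, not part of the patch; it uses 8-bit
operands as in the aarch64 examples):

  #include <stdint.h>
  #include <assert.h>

  int
  main (void)
  {
    /* Check x * y == (x - 128) * y + 64 * y + 64 * y for every
       unsigned 8-bit x and signed 8-bit y.  */
    for (int x = 0; x < 256; ++x)
      for (int y = -128; y < 128; ++y)
	{
	  /* The wrapping add of -128, as on the vector side.  */
	  int8_t biased = (int8_t) (x - 128);
	  assert (x * y == biased * y + 64 * y + 64 * y);
	}
    return 0;
  }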

Tested on aarch64-linux-gnu and x86_64-linux-gnu.

Richard


gcc/
	* tree-vect-patterns.cc (vect_convert_input): Expect the input
	type to be signed for optab_vector_mixed_sign.  Update the vectype
	at the same time as type.
	(vect_recog_dot_prod_pattern): Update accordingly.  If usdot isn't
	available, try sdot instead.
	* tree-vect-loop.cc (vect_is_emulated_mixed_dot_prod): New function.
	(vect_model_reduction_cost): Model the cost of implementing usdot
	using sdot.
	(vectorizable_reduction): Likewise.  Skip target support test
	for lane reductions.
	(vect_emulate_mixed_dot_prod): New function.
	(vect_transform_reduction): Use it to emulate usdot via sdot.

gcc/testsuite/
	* gcc.dg/vect/vect-reduc-dot-9.c: Reduce target requirements
	from i8mm to dotprod.
	* gcc.dg/vect/vect-reduc-dot-10.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-11.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-12.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-13.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-14.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-15.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-16.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-17.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-18.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-19.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-20.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-21.c: Likewise.
	* gcc.dg/vect/vect-reduc-dot-22.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c |   6 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c |   4 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c |   4 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c |   4 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c |   4 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c  |   6 +-
 gcc/tree-vect-loop.cc                         | 160 ++++++++++++++++--
 gcc/tree-vect-patterns.cc                     |  38 ++++-
 16 files changed, 213 insertions(+), 61 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c
index 7ce86965ea9..34e25ab7fb0 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-10.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 unsigned
 #define SIGNEDNESS_2 unsigned
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c
index 0f7cbbb87ef..3af8df54cf9 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-11.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 unsigned
 #define SIGNEDNESS_2 signed
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c
index 08412614fc6..77ceef3643b 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-12.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 unsigned
 #define SIGNEDNESS_2 signed
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c
index 7ee0f45f642..d3c0c86f529 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-13.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 signed
 #define SIGNEDNESS_2 unsigned
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c
index 2de1434528b..86a5c85753c 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-14.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 signed
 #define SIGNEDNESS_2 unsigned
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c
index dc48f95a32b..25de0940a65 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-15.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 signed
 #define SIGNEDNESS_2 signed
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
index aec62878936..4a1dec0677e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #define SIGNEDNESS_1 signed
 #define SIGNEDNESS_2 signed
@@ -10,4 +10,4 @@
 #include "vect-reduc-dot-9.c"
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
index 38f86fe458a..90d21188b76 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
@@ -50,4 +50,4 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
index 2e86ebe3c6c..81ecb158d29 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
@@ -50,4 +50,4 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
index d00f24aae4c..cbcd4f120a5 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
index 17adbca83a0..e81ed1da5a4 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
index 6cc6a4f2e92..81ce5cdaffb 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
index e13d3d5c4da..b8c9d3ca53b 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
index d1049c96bf1..e0b132f6b35 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target vect_int } */
-/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
-/* { dg-add-options arm_v8_2a_i8mm }  */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
 
 #include "tree-vect.h"
 
@@ -50,4 +50,4 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 78dfe8519aa..3a70c15b593 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -4566,6 +4566,31 @@ have_whole_vector_shift (machine_mode mode)
   return true;
 }
 
+/* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
+   multiplication operands have differing signs and (b) we intend
+   to emulate the operation using a series of signed DOT_PROD_EXPRs.
+   See vect_emulate_mixed_dot_prod for the actual sequence used.  */
+
+static bool
+vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
+				 stmt_vec_info stmt_info)
+{
+  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
+  if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
+    return false;
+
+  tree rhs1 = gimple_assign_rhs1 (assign);
+  tree rhs2 = gimple_assign_rhs2 (assign);
+  if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
+    return false;
+
+  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
+  gcc_assert (reduc_info->is_reduc_info);
+  return !directly_supported_p (DOT_PROD_EXPR,
+				STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
+				optab_vector_mixed_sign);
+}
+
 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
    functions. Design better to avoid maintenance issues.  */
 
@@ -4601,6 +4626,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
     gcc_unreachable ();
 
+  bool emulated_mixed_dot_prod
+    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
   if (reduction_type == EXTRACT_LAST_REDUCTION)
     /* No extra instructions are needed in the prologue.  The loop body
        operations are costed in vectorizable_condition.  */
@@ -4628,11 +4655,20 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
     }
   else
     {
-      /* Add in cost for initial definition.
-	 For cond reduction we have four vectors: initial index, step,
-	 initial result of the data reduction, initial value of the index
-	 reduction.  */
-      int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
+      /* Add in the cost of the initial definitions.  */
+      int prologue_stmts;
+      if (reduction_type == COND_REDUCTION)
+	/* For cond reductions we have four vectors: initial index, step,
+	   initial result of the data reduction, initial value of the index
+	   reduction.  */
+	prologue_stmts = 4;
+      else if (emulated_mixed_dot_prod)
+	/* We need the initial reduction value and two invariants:
+	   one that contains the minimum signed value and one that
+	   contains half of its negative.  */
+	prologue_stmts = 3;
+      else
+	prologue_stmts = 1;
       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
 					 scalar_to_vec, stmt_info, 0,
 					 vect_prologue);
@@ -6797,11 +6833,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
 			    || op.code == WIDEN_SUM_EXPR
 			    || op.code == SAD_EXPR);
-  enum optab_subtype optab_query_kind = optab_vector;
-  if (op.code == DOT_PROD_EXPR
-      && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
-	  != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
-    optab_query_kind = optab_vector_mixed_sign;
 
   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
       && !SCALAR_FLOAT_TYPE_P (op.type))
@@ -7328,9 +7359,17 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       /* 4. Supportable by target?  */
       bool ok = true;
 
-      /* 4.1. check support for the operation in the loop  */
+      /* 4.1. check support for the operation in the loop
+
+	 This isn't necessary for the lane reduction codes, since they
+	 can only be produced by pattern matching, and it's up to the
+	 pattern matcher to test for support.  The main reason for
+	 specifically skipping this step is to avoid rechecking whether
+	 mixed-sign dot-products can be implemented using signed
+	 dot-products.  */
       machine_mode vec_mode = TYPE_MODE (vectype_in);
-      if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
+      if (!lane_reduc_code_p
+	  && !directly_supported_p (op.code, vectype_in))
         {
           if (dump_enabled_p ())
             dump_printf (MSG_NOTE, "op not supported by target.\n");
@@ -7398,7 +7437,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
      vect_transform_reduction.  Otherwise this is costed by the
      separate vectorizable_* routines.  */
   if (single_defuse_cycle || lane_reduc_code_p)
-    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
+    {
+      int factor = 1;
+      if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
+	/* Three dot-products and a subtraction.  */
+	factor = 4;
+      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
+			stmt_info, 0, vect_body);
+    }
 
   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
@@ -7457,6 +7503,81 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   return true;
 }
 
+/* STMT_INFO is a dot-product reduction whose multiplication operands
+   have different signs.  Emit a sequence to emulate the operation
+   using a series of signed DOT_PROD_EXPRs and return the last
+   statement generated.  VEC_DEST is the result of the vector operation
+   and VOP lists its inputs.  */
+
+static gassign *
+vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+			     gimple_stmt_iterator *gsi, tree vec_dest,
+			     tree vop[3])
+{
+  tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
+  tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
+  tree narrow_elttype = TREE_TYPE (narrow_vectype);
+  gimple *new_stmt;
+
+  /* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
+  if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
+    std::swap (vop[0], vop[1]);
+
+  /* Convert all inputs to signed types.  */
+  for (int i = 0; i < 3; ++i)
+    if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
+      {
+	tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
+	new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
+	vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+	vop[i] = tmp;
+      }
+
+  /* In the comments below we assume 8-bit inputs for simplicity,
+     but the approach works for any full integer type.  */
+
+  /* Create a vector of -128.  */
+  tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
+  tree min_narrow = build_vector_from_val (narrow_vectype,
+					   min_narrow_elttype);
+
+  /* Create a vector of 64.  */
+  auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
+  tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
+  half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
+
+  /* Emit: SUB_RES = VOP[0] - 128.  */
+  tree sub_res = make_ssa_name (narrow_vectype);
+  new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+
+  /* Emit:
+
+       STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
+       STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
+       STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
+
+     on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
+     Doing the two 64 * y steps first allows more time to compute x.  */
+  tree stage1 = make_ssa_name (wide_vectype);
+  new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
+				  vop[1], half_narrow, vop[2]);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+
+  tree stage2 = make_ssa_name (wide_vectype);
+  new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
+				  vop[1], half_narrow, stage1);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+
+  tree stage3 = make_ssa_name (wide_vectype);
+  new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
+				  sub_res, vop[1], stage2);
+  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+
+  /* Convert STAGE3 to the reduction type.  */
+  return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
+}
+
 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
    value.  */
 
@@ -7563,12 +7684,17 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 					: &vec_oprnds2));
     }
 
+  bool emulated_mixed_dot_prod
+    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
     {
       gimple *new_stmt;
       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
       if (masked_loop_p && !mask_by_cond_expr)
 	{
+	  /* No conditional ifns have been defined for dot-product yet.  */
+	  gcc_assert (code != DOT_PROD_EXPR);
+
 	  /* Make sure that the reduction accumulator is vop[0].  */
 	  if (reduc_index == 1)
 	    {
@@ -7597,8 +7723,12 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	      build_vect_cond_expr (code, vop, mask, gsi);
 	    }
 
-	  new_stmt = gimple_build_assign (vec_dest, code,
-					  vop[0], vop[1], vop[2]);
+	  if (emulated_mixed_dot_prod)
+	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
+						    vec_dest, vop);
+	  else
+	    new_stmt = gimple_build_assign (vec_dest, code,
+					    vop[0], vop[1], vop[2]);
 	  new_temp = make_ssa_name (vec_dest, new_stmt);
 	  gimple_assign_set_lhs (new_stmt, new_temp);
 	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 8f624863971..dfbfb71b3c6 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -760,12 +760,16 @@ vect_convert_input (vec_info *vinfo, stmt_vec_info stmt_info, tree type,
 		    vect_unpromoted_value *unprom, tree vectype,
 		    enum optab_subtype subtype = optab_default)
 {
-
   /* Update the type if the signs differ.  */
-  if (subtype == optab_vector_mixed_sign
-      && TYPE_SIGN (type) != TYPE_SIGN (TREE_TYPE (unprom->op)))
-    type = build_nonstandard_integer_type (TYPE_PRECISION (type),
-					   TYPE_SIGN (unprom->type));
+  if (subtype == optab_vector_mixed_sign)
+    {
+      gcc_assert (!TYPE_UNSIGNED (type));
+      if (TYPE_UNSIGNED (TREE_TYPE (unprom->op)))
+	{
+	  type = unsigned_type_for (type);
+	  vectype = unsigned_type_for (vectype);
+	}
+    }
 
   /* Check for a no-op conversion.  */
   if (types_compatible_p (type, TREE_TYPE (unprom->op)))
@@ -1139,16 +1143,34 @@ vect_recog_dot_prod_pattern (vec_info *vinfo,
      is signed; otherwise, the result has the same sign as the operands.  */
   if (TYPE_PRECISION (unprom_mult.type) != TYPE_PRECISION (type)
       && (subtype == optab_vector_mixed_sign
-	? TYPE_UNSIGNED (unprom_mult.type)
-	: TYPE_SIGN (unprom_mult.type) != TYPE_SIGN (half_type)))
+	  ? TYPE_UNSIGNED (unprom_mult.type)
+	  : TYPE_SIGN (unprom_mult.type) != TYPE_SIGN (half_type)))
     return NULL;
 
   vect_pattern_detected ("vect_recog_dot_prod_pattern", last_stmt);
 
+  /* If the inputs have mixed signs, canonicalize on using the signed
+     input type for analysis.  This also helps when emulating mixed-sign
+     operations using signed operations.  */
+  if (subtype == optab_vector_mixed_sign)
+    half_type = signed_type_for (half_type);
+
   tree half_vectype;
   if (!vect_supportable_direct_optab_p (vinfo, type, DOT_PROD_EXPR, half_type,
 					type_out, &half_vectype, subtype))
-    return NULL;
+    {
+      /* We can emulate a mixed-sign dot-product using a sequence of
+	 signed dot-products; see vect_emulate_mixed_dot_prod for details.  */
+      if (subtype != optab_vector_mixed_sign
+	  || !vect_supportable_direct_optab_p (vinfo, signed_type_for (type),
+					       DOT_PROD_EXPR, half_type,
+					       type_out, &half_vectype,
+					       optab_vector))
+	return NULL;
+
+      *type_out = signed_or_unsigned_type_for (TYPE_UNSIGNED (type),
+					       *type_out);
+    }
 
   /* Get the inputs in the appropriate types.  */
   tree mult_oprnd[2];
  
Richard Biener July 5, 2022, 7:41 a.m. UTC | #10
On Tue, Jul 5, 2022 at 8:08 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> > On Wed, Jun 29, 2022 at 4:35 PM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Richard Biener <richard.guenther@gmail.com> writes:
> >> > On Tue, Jun 28, 2022 at 5:54 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
> >> >>
> >> >> > -----Original Message-----
> >> >> > From: Richard Biener <richard.guenther@gmail.com>
> >> >> > Sent: Monday, June 27, 2022 7:10 AM
> >> >> > To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> > Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Richard Earnshaw
> >> >> > <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; gcc-
> >> >> > patches@gcc.gnu.org; Marcus Shawcroft <Marcus.Shawcroft@arm.com>
> >> >> > Subject: Re: [PATCH 1/2]AArch64 Add fallback case using sdot for usdot
> >> >> >
> >> >> > On Mon, Jun 27, 2022 at 7:25 AM Tamar Christina via Gcc-patches <gcc-
> >> >> > patches@gcc.gnu.org> wrote:
> >> >> > >
> >> >> > > > -----Original Message-----
> >> >> > > > From: Richard Sandiford <richard.sandiford@arm.com>
> >> >> > > > Sent: Thursday, June 16, 2022 7:54 PM
> >> >> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> >> > > > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> >> > > > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> >> >> > <Kyrylo.Tkachov@arm.com>
> >> >> > > > Subject: Re: [PATCH 1/2]AArch64 Add fallback case using sdot for
> >> >> > > > usdot
> >> >> > > >
> >> >> > > > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> >> >> > > > > Tamar Christina <tamar.christina@arm.com> writes:
> >> >> > > > >> Hi All,
> >> >> > > > >>
> >> >> > > > >> The usdot operation is common in video encoder and decoders
> >> >> > > > >> including some of the most widely used ones.
> >> >> > > > >>
> >> >> > > > >> This patch adds a +dotprod version of the optab as a fallback for
> >> >> > > > >> when you do have sdot but not usdot available.
> >> >> > > > >>
> >> >> > > > >> The fallback works by adding a bias to the unsigned argument to
> >> >> > > > >> convert it to a signed value and then correcting for the bias later on.
> >> >> > > > >>
> >> >> > > > >> Essentially it relies on (x - 128)y + 128y == xy where x is
> >> >> > > > >> unsigned and y is signed (assuming both are 8-bit values).
> >> >> > > > >> Because the range of a signed byte is only to 127 we split the bias
> >> >> > correction into:
> >> >> > > > >>
> >> >> > > > >>    (x - 128)y + 127y + y
> >> >> > > > >
> >> >> > > > > I bet you knew this question was coming, but: this technique isn't
> >> >> > > > > target-specific, so wouldn't it be better to handle it in
> >> >> > > > > tree-vect-patterns.cc instead?
> >> >> > >
> >> >> > > Ok, so after many hours of trying I don't know how to make this work.
> >> >> > > DOT_PROD_EXPR is a reduction, but emitting them as additional pattern
> >> >> > > statements doesn't work because they'll be marked as internal_def
> >> >> > > rather than reduction_def.  I tried marking the new vec_stmt_info that
> >> >> > > I create explicitly as reduction_def but this gets overwritten during analysis.
> >> >> > >
> >> >> > > I then looked into handling it as a vectorizable_operation, but that has
> >> >> > > the obvious problem that it no longer treats it as a reduction and so tries to
> >> >> > decompose into hi/lo.
> >> >> > >
> >> >> > > I then looked into treating additional patterns from a reduction as
> >> >> > > reductions themselves but this is obviously wrong as non-reduction
> >> >> > statements also get marked as reductions.
> >> >> > >
> >> >> > > The conclusion is that I don't think the vectorizer allows additional
> >> >> > > reductions to be emitted from patterns.
> >> >> >
> >> >> > Indeed.  DOT_PROD is a weird beast and it doesn't define which lanes are
> >> >> > reduced to which so it's only usable when the result is reduced to a single
> >> >> > lane.
> >> >> >
> >> >> > An SLP pattern might work if you use reduc-plus for the reduced lanes and
> >> >> > keep the multiply separate?
> >> >>
> >> >> Unfortunately I can't seem to get it to handle the reduction in SLP.  It seems to always
> >> >> use the non-SLP aware loop vectorizer here.  The suggested unroll factor is always 1 and
> >> >> even trying to force it gets it to bail out later, presumably because it's reducing into a
> >> >> scalar that's used outside the loop?
> >> >
> >> > Yes, it possibly needs 1-lane SLP support.
> >>
> >> As I mentioned to Tamar off-list, I feel like I've been wasting
> >> people's time recently by spewing out ideas that might or might not work
> >> (usually "not work"), so I wanted to get some confidence that the next
> >> suggestion made sense.  In the end I needed most of an implementation
> >> to do that, so it seemed easiest just to finish it off rather than post
> >> it in a half-complete state.  Sorry for the duplication. :-(
> >>
> >> The patch certainly isn't pretty, but I think it's the best we can
> >> do under the current infrastructure, and it should at least make
> >> the costs reasonably accurate.  (Actually, that said, we probably
> >> need to patch the reduction latency calculation in the aarch64
> >> vector code -- didn't think of that until now.)
> >>
> >> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  WDYT?
>
> Turned out I needed another change for this to fire on x86.  Previously
> the input type (half_type) had an arbitrary sign for mixed-sign dotprods,
> which was OK for the existing code, but meant that we could sometimes
> query for unsigned dotprod instead of signed dotprod when considering
> the fallback.  Fixed in the version below (which canonicalises on
> using the signed type).
>
> > Looks reasonable - does this end up in OKish code generation as well?
>
> Seems OK for aarch64.  The Advanced SIMD version of vect-reduc-dot-11.c is:
>
> .L7:
>         ldr     q2, [x1, x3]
>         ldr     q1, [x2, x3]
>         sdot    v0.4s, v1.16b, v3.16b
>         add     x3, x3, 16
>         sdot    v0.4s, v1.16b, v3.16b
>         add     v2.16b, v2.16b, v4.16b
>         sdot    v0.4s, v1.16b, v2.16b
>         cmp     x3, 48
>         bne     .L7
>
> and the SVE version is:
>
> .L7:
>         ld1b    z1.b, p0/z, [x2, x3]
>         ld1b    z2.b, p0/z, [x1, x3]
>         sel     z1.b, p0, z1.b, z4.b
>         add     x3, x3, x5
>         add     z2.b, z2.b, #128
>         sdot    z0.s, z1.b, z3.b
>         whilelo p0.b, w3, w4
>         sdot    z0.s, z1.b, z3.b
>         sdot    z0.s, z1.b, z2.b
>         b.any   .L7
>
> (with the extra SEL handling a final partial vector).
>
> On x86, for -mavx:
>
> int
> f (int res, unsigned short *restrict a, short *restrict b)
> {
>   for (int i = 0; i < 256; ++i)
>     res += a[i] * b[i];
>   return res;
> }
>
> previously generated:
>
> .L2:
>         vmovdqu (%rsi,%rax), %xmm1
>         vmovdqu (%rdx,%rax), %xmm0
>         addq    $16, %rax
>         vpmovsxwd       %xmm0, %xmm3
>         vpsrldq $8, %xmm0, %xmm0
>         vpmovzxwd       %xmm1, %xmm4
>         vpsrldq $8, %xmm1, %xmm1
>         vpmulld %xmm4, %xmm3, %xmm3
>         vpmovsxwd       %xmm0, %xmm0
>         vpmovzxwd       %xmm1, %xmm1
>         vpmulld %xmm1, %xmm0, %xmm0
>         vpaddd  %xmm2, %xmm3, %xmm2
>         vpaddd  %xmm2, %xmm0, %xmm2
>         cmpq    $512, %rax
>         jne     .L2
>
> whereas now it generates:
>
> .L2:
>         vpmaddwd        (%rdx,%rax), %xmm3, %xmm2
>         vpaddw  (%rsi,%rax), %xmm4, %xmm1
>         vpmaddwd        (%rdx,%rax), %xmm1, %xmm1
>         addq    $16, %rax
>         vpaddd  %xmm2, %xmm0, %xmm0
>         vpaddd  %xmm2, %xmm0, %xmm0
>         vpaddd  %xmm1, %xmm0, %xmm0
>         cmpq    $512, %rax
>         jne     .L2
>
> I don't know x86 well enough to be sure that's an improvement though.
> The length of the loop carry dependency has increased from 2 to 3
> VPADDDs.

I think that should be OK.

>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.

OK.

Thanks,
Richard.

> [...]
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #define SIGNEDNESS_1 signed
>  #define SIGNEDNESS_2 signed
> @@ -10,4 +10,4 @@
>  #include "vect-reduc-dot-9.c"
>
>  /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
> index aec62878936..4a1dec0677e 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-16.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #define SIGNEDNESS_1 signed
>  #define SIGNEDNESS_2 signed
> @@ -10,4 +10,4 @@
>  #include "vect-reduc-dot-9.c"
>
>  /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
> index 38f86fe458a..90d21188b76 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-17.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> @@ -50,4 +50,4 @@ main (void)
>  }
>
>  /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
> index 2e86ebe3c6c..81ecb158d29 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-18.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> @@ -50,4 +50,4 @@ main (void)
>  }
>
>  /* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
> index d00f24aae4c..cbcd4f120a5 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-19.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
> index 17adbca83a0..e81ed1da5a4 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-20.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
> index 6cc6a4f2e92..81ce5cdaffb 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-21.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
> index e13d3d5c4da..b8c9d3ca53b 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-22.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
> index d1049c96bf1..e0b132f6b35 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-9.c
> @@ -1,6 +1,6 @@
>  /* { dg-require-effective-target vect_int } */
> -/* { dg-require-effective-target arm_v8_2a_i8mm_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> -/* { dg-add-options arm_v8_2a_i8mm }  */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
>
>  #include "tree-vect.h"
>
> @@ -50,4 +50,4 @@ main (void)
>  }
>
>  /* { dg-final { scan-tree-dump-not "vect_recog_dot_prod_pattern: detected" "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_usdot_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 78dfe8519aa..3a70c15b593 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -4566,6 +4566,31 @@ have_whole_vector_shift (machine_mode mode)
>    return true;
>  }
>
> +/* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
> +   multiplication operands have differing signs and (b) we intend
> +   to emulate the operation using a series of signed DOT_PROD_EXPRs.
> +   See vect_emulate_mixed_dot_prod for the actual sequence used.  */
> +
> +static bool
> +vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
> +                                stmt_vec_info stmt_info)
> +{
> +  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
> +  if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
> +    return false;
> +
> +  tree rhs1 = gimple_assign_rhs1 (assign);
> +  tree rhs2 = gimple_assign_rhs2 (assign);
> +  if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
> +    return false;
> +
> +  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
> +  gcc_assert (reduc_info->is_reduc_info);
> +  return !directly_supported_p (DOT_PROD_EXPR,
> +                               STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
> +                               optab_vector_mixed_sign);
> +}
> +
>  /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
>     functions. Design better to avoid maintenance issues.  */
>
> @@ -4601,6 +4626,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>    if (!gimple_extract_op (orig_stmt_info->stmt, &op))
>      gcc_unreachable ();
>
> +  bool emulated_mixed_dot_prod
> +    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
>    if (reduction_type == EXTRACT_LAST_REDUCTION)
>      /* No extra instructions are needed in the prologue.  The loop body
>         operations are costed in vectorizable_condition.  */
> @@ -4628,11 +4655,20 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>      }
>    else
>      {
> -      /* Add in cost for initial definition.
> -        For cond reduction we have four vectors: initial index, step,
> -        initial result of the data reduction, initial value of the index
> -        reduction.  */
> -      int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
> +      /* Add in the cost of the initial definitions.  */
> +      int prologue_stmts;
> +      if (reduction_type == COND_REDUCTION)
> +       /* For cond reductions we have four vectors: initial index, step,
> +          initial result of the data reduction, initial value of the index
> +          reduction.  */
> +       prologue_stmts = 4;
> +      else if (emulated_mixed_dot_prod)
> +       /* We need the initial reduction value and two invariants:
> +          one that contains the minimum signed value and one that
> +          contains half of its negative.  */
> +       prologue_stmts = 3;
> +      else
> +       prologue_stmts = 1;
>        prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
>                                          scalar_to_vec, stmt_info, 0,
>                                          vect_prologue);
> @@ -6797,11 +6833,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
>                             || op.code == WIDEN_SUM_EXPR
>                             || op.code == SAD_EXPR);
> -  enum optab_subtype optab_query_kind = optab_vector;
> -  if (op.code == DOT_PROD_EXPR
> -      && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
> -         != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
> -    optab_query_kind = optab_vector_mixed_sign;
>
>    if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
>        && !SCALAR_FLOAT_TYPE_P (op.type))
> @@ -7328,9 +7359,17 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        /* 4. Supportable by target?  */
>        bool ok = true;
>
> -      /* 4.1. check support for the operation in the loop  */
> +      /* 4.1. check support for the operation in the loop
> +
> +        This isn't necessary for the lane reduction codes, since they
> +        can only be produced by pattern matching, and it's up to the
> +        pattern matcher to test for support.  The main reason for
> +        specifically skipping this step is to avoid rechecking whether
> +        mixed-sign dot-products can be implemented using signed
> +        dot-products.  */
>        machine_mode vec_mode = TYPE_MODE (vectype_in);
> -      if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
> +      if (!lane_reduc_code_p
> +         && !directly_supported_p (op.code, vectype_in))
>          {
>            if (dump_enabled_p ())
>              dump_printf (MSG_NOTE, "op not supported by target.\n");
> @@ -7398,7 +7437,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>       vect_transform_reduction.  Otherwise this is costed by the
>       separate vectorizable_* routines.  */
>    if (single_defuse_cycle || lane_reduc_code_p)
> -    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
> +    {
> +      int factor = 1;
> +      if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
> +       /* Three dot-products and a subtraction.  */
> +       factor = 4;
> +      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
> +                       stmt_info, 0, vect_body);
> +    }
>
>    if (dump_enabled_p ()
>        && reduction_type == FOLD_LEFT_REDUCTION)
> @@ -7457,6 +7503,81 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    return true;
>  }
>
> +/* STMT_INFO is a dot-product reduction whose multiplication operands
> +   have different signs.  Emit a sequence to emulate the operation
> +   using a series of signed DOT_PROD_EXPRs and return the last
> +   statement generated.  VEC_DEST is the result of the vector operation
> +   and VOP lists its inputs.  */
> +
> +static gassign *
> +vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
> +                            gimple_stmt_iterator *gsi, tree vec_dest,
> +                            tree vop[3])
> +{
> +  tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
> +  tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
> +  tree narrow_elttype = TREE_TYPE (narrow_vectype);
> +  gimple *new_stmt;
> +
> +  /* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
> +  if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
> +    std::swap (vop[0], vop[1]);
> +
> +  /* Convert all inputs to signed types.  */
> +  for (int i = 0; i < 3; ++i)
> +    if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
> +      {
> +       tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
> +       new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
> +       vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> +       vop[i] = tmp;
> +      }
> +
> +  /* In the comments below we assume 8-bit inputs for simplicity,
> +     but the approach works for any full integer type.  */
> +
> +  /* Create a vector of -128.  */
> +  tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
> +  tree min_narrow = build_vector_from_val (narrow_vectype,
> +                                          min_narrow_elttype);
> +
> +  /* Create a vector of 64.  */
> +  auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
> +  tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
> +  half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
> +
> +  /* Emit: SUB_RES = VOP[0] - 128.  */
> +  tree sub_res = make_ssa_name (narrow_vectype);
> +  new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
> +  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> +
> +  /* Emit:
> +
> +       STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
> +       STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
> +       STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
> +
> +     on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
> +     Doing the two 64 * y steps first allows more time to compute x.  */
> +  tree stage1 = make_ssa_name (wide_vectype);
> +  new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
> +                                 vop[1], half_narrow, vop[2]);
> +  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> +
> +  tree stage2 = make_ssa_name (wide_vectype);
> +  new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
> +                                 vop[1], half_narrow, stage1);
> +  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> +
> +  tree stage3 = make_ssa_name (wide_vectype);
> +  new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
> +                                 sub_res, vop[1], stage2);
> +  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> +
> +  /* Convert STAGE3 to the reduction type.  */
> +  return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
> +}
> +
>  /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
>     value.  */
>
> @@ -7563,12 +7684,17 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>                                         : &vec_oprnds2));
>      }
>
> +  bool emulated_mixed_dot_prod
> +    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
>    FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
>      {
>        gimple *new_stmt;
>        tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
>        if (masked_loop_p && !mask_by_cond_expr)
>         {
> +         /* No conditional ifns have been defined for dot-product yet.  */
> +         gcc_assert (code != DOT_PROD_EXPR);
> +
>           /* Make sure that the reduction accumulator is vop[0].  */
>           if (reduc_index == 1)
>             {
> @@ -7597,8 +7723,12 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>               build_vect_cond_expr (code, vop, mask, gsi);
>             }
>
> -         new_stmt = gimple_build_assign (vec_dest, code,
> -                                         vop[0], vop[1], vop[2]);
> +         if (emulated_mixed_dot_prod)
> +           new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
> +                                                   vec_dest, vop);
> +         else
> +           new_stmt = gimple_build_assign (vec_dest, code,
> +                                           vop[0], vop[1], vop[2]);
>           new_temp = make_ssa_name (vec_dest, new_stmt);
>           gimple_assign_set_lhs (new_stmt, new_temp);
>           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index 8f624863971..dfbfb71b3c6 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -760,12 +760,16 @@ vect_convert_input (vec_info *vinfo, stmt_vec_info stmt_info, tree type,
>                     vect_unpromoted_value *unprom, tree vectype,
>                     enum optab_subtype subtype = optab_default)
>  {
> -
>    /* Update the type if the signs differ.  */
> -  if (subtype == optab_vector_mixed_sign
> -      && TYPE_SIGN (type) != TYPE_SIGN (TREE_TYPE (unprom->op)))
> -    type = build_nonstandard_integer_type (TYPE_PRECISION (type),
> -                                          TYPE_SIGN (unprom->type));
> +  if (subtype == optab_vector_mixed_sign)
> +    {
> +      gcc_assert (!TYPE_UNSIGNED (type));
> +      if (TYPE_UNSIGNED (TREE_TYPE (unprom->op)))
> +       {
> +         type = unsigned_type_for (type);
> +         vectype = unsigned_type_for (vectype);
> +       }
> +    }
>
>    /* Check for a no-op conversion.  */
>    if (types_compatible_p (type, TREE_TYPE (unprom->op)))
> @@ -1139,16 +1143,34 @@ vect_recog_dot_prod_pattern (vec_info *vinfo,
>       is signed; otherwise, the result has the same sign as the operands.  */
>    if (TYPE_PRECISION (unprom_mult.type) != TYPE_PRECISION (type)
>        && (subtype == optab_vector_mixed_sign
> -       ? TYPE_UNSIGNED (unprom_mult.type)
> -       : TYPE_SIGN (unprom_mult.type) != TYPE_SIGN (half_type)))
> +         ? TYPE_UNSIGNED (unprom_mult.type)
> +         : TYPE_SIGN (unprom_mult.type) != TYPE_SIGN (half_type)))
>      return NULL;
>
>    vect_pattern_detected ("vect_recog_dot_prod_pattern", last_stmt);
>
> +  /* If the inputs have mixed signs, canonicalize on using the signed
> +     input type for analysis.  This also helps when emulating mixed-sign
> +     operations using signed operations.  */
> +  if (subtype == optab_vector_mixed_sign)
> +    half_type = signed_type_for (half_type);
> +
>    tree half_vectype;
>    if (!vect_supportable_direct_optab_p (vinfo, type, DOT_PROD_EXPR, half_type,
>                                         type_out, &half_vectype, subtype))
> -    return NULL;
> +    {
> +      /* We can emulate a mixed-sign dot-product using a sequence of
> +        signed dot-products; see vect_emulate_mixed_dot_prod for details.  */
> +      if (subtype != optab_vector_mixed_sign
> +         || !vect_supportable_direct_optab_p (vinfo, signed_type_for (type),
> +                                              DOT_PROD_EXPR, half_type,
> +                                              type_out, &half_vectype,
> +                                              optab_vector))
> +       return NULL;
> +
> +      *type_out = signed_or_unsigned_type_for (TYPE_UNSIGNED (type),
> +                                              *type_out);
> +    }
>
>    /* Get the inputs in the appropriate types.  */
>    tree mult_oprnd[2];
> --
> 2.25.1
>
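
For readers skimming the thread: the correctness of the emulation in
vect_emulate_mixed_dot_prod above rests on an 8-bit identity that can be
checked exhaustively on any host.  Below is a quick host-side sanity check
of that identity -- it is not part of either patch, and the variable names
are invented for the example:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* For unsigned 8-bit x and signed 8-bit y:
       x * y == (x - 128) * y + 64 * y + 64 * y,
     where (x - 128) is reinterpreted as a signed 8-bit value.  */
  for (int x = 0; x <= 255; ++x)
    for (int y = -128; y <= 127; ++y)
      {
	int8_t biased = (int8_t) ((uint8_t) x - 128);
	int emulated = biased * y + 64 * y + 64 * y;
	assert (emulated == x * y);
      }
  return 0;
}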
  

Patch

--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -623,7 +623,7 @@  (define_insn "<sur>dot_prod<vsi2qi>"
 
 ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
 ;; (vector) Dot Product operation and the vectorized optab.
-(define_insn "usdot_prod<vsi2qi>"
+(define_insn "usdot_prod<vsi2qi>_insn"
   [(set (match_operand:VS 0 "register_operand" "=w")
 	(plus:VS
 	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
@@ -635,6 +635,43 @@  (define_insn "usdot_prod<vsi2qi>"
   [(set_attr "type" "neon_dot<q>")]
 )
 
+;; usdot auto-vectorization fallback: emulate with sdot when I8MM is unavailable.
+(define_expand "usdot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand")
+	(plus:VS
+	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+		      (match_operand:<VSI2QI> 2 "register_operand")]
+	  UNSPEC_USDOT)
+	  (match_operand:VS 3 "register_operand")))]
+  "TARGET_DOTPROD || TARGET_I8MM"
+{
+  if (TARGET_I8MM)
+    {
+      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
+					      operands[2], operands[3]));
+      DONE;
+    }
+
+  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
+  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+  rtx signbit = gen_int_mode (val, elemmode);
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
+  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
+				    gen_int_mode (val - 1, elemmode));
+  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
+  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
+  c1 = force_reg (<VSI2QI>mode, c1);
+  c2 = force_reg (<VSI2QI>mode, c2);
+  dup = force_reg (<VSI2QI>mode, dup);
+  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
+  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
+  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
+  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
+  DONE;
+})
+
 ;; These instructions map to the __builtins for the Dot Product
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-not {\tusdot\t} } } */
+/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */
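
The constants in the expander (c1 = 127, c2 = 1, and the 128 bias in dup)
split the correction differently from the mid-end sequence quoted above,
and each constant fits in a signed byte, so every step maps to one sdot --
which is also why the test expects exactly three of them.  A host-side
check of that variant, again illustrative rather than part of the patch:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* Mirrors the expander: bias the unsigned operand by 128, then
     correct with two further dot products against 127 and 1.  */
  for (int x = 0; x <= 255; ++x)
    for (int y = -128; y <= 127; ++y)
      {
	int8_t biased = (int8_t) ((uint8_t) x - 128);
	assert (biased * y + 127 * y + 1 * y == x * y);
      }
  return 0;
}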