[AArch32] : correct usdot-product RTL patterns.
Commit Message
Hi All,
There was a bug in the ACLE specication for dot product which has now
been fixed[1]. This means some intrinsics were missing and are added by this
patch.
Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues.
Ok for master?
[1] https://github.com/ARM-software/acle/releases/tag/r2021Q3
Thanks,
Tamar
gcc/ChangeLog:
* config/arm/arm_neon.h (vusdotq_s32, vusdot_laneq_s32,
vusdotq_laneq_s32, vsudot_laneq_s32, vsudotq_laneq_s32): New
* config/arm/arm_neon_builtins.def (usdot): Add V16QI.
(usdot_laneq, sudot_laneq): New.
* config/arm/neon.md (neon_<sup>dot_laneq<vsi2qi>): New.
(neon_<sup>dot_lane<vsi2qi>): Remote unneeded code.
gcc/testsuite/ChangeLog:
* gcc.target/arm/simd/vdot-2-1.c: Add new tests.
* gcc.target/arm/simd/vdot-2-2.c: Likewise and fix output.
--- inline copy of patch --
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index af6ac63dc3b47830d92f199d93153ff510f658e9..2255d600549a2a1e5dbcebc03f7d6a63bab9f5aa 100644
--
Comments
ping
> -----Original Message-----
> From: Tamar Christina
> Sent: Tuesday, December 21, 2021 12:32 PM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> Subject: [PATCH][AArch32]: correct usdot-product RTL patterns.
>
> Hi All,
>
> There was a bug in the ACLE specication for dot product which has now been
> fixed[1]. This means some intrinsics were missing and are added by this
> patch.
>
> Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues.
>
> Ok for master?
>
> [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/arm/arm_neon.h (vusdotq_s32, vusdot_laneq_s32,
> vusdotq_laneq_s32, vsudot_laneq_s32, vsudotq_laneq_s32): New
> * config/arm/arm_neon_builtins.def (usdot): Add V16QI.
> (usdot_laneq, sudot_laneq): New.
> * config/arm/neon.md (neon_<sup>dot_laneq<vsi2qi>): New.
> (neon_<sup>dot_lane<vsi2qi>): Remote unneeded code.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/arm/simd/vdot-2-1.c: Add new tests.
> * gcc.target/arm/simd/vdot-2-2.c: Likewise and fix output.
>
> --- inline copy of patch --
> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index
> af6ac63dc3b47830d92f199d93153ff510f658e9..2255d600549a2a1e5dbcebc03f
> 7d6a63bab9f5aa 100644
> --- a/gcc/config/arm/arm_neon.h
> +++ b/gcc/config/arm/arm_neon.h
> @@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a,
> int8x8_t __b)
> return __builtin_neon_usdotv8qi_ssus (__r, __a, __b); }
>
> +__extension__ extern __inline int32x4_t __attribute__
> +((__always_inline__, __gnu_inline__, __artificial__))
> +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) {
> + return __builtin_neon_usdotv16qi_ssus (__r, __a, __b); }
> +
> __extension__ extern __inline int32x2_t __attribute__
> ((__always_inline__, __gnu_inline__, __artificial__))
> vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, @@ -18962,6 +18969,38 @@
> vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a,
> return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b, __index); }
>
> +__extension__ extern __inline int32x2_t __attribute__
> +((__always_inline__, __gnu_inline__, __artificial__))
> +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a,
> + int8x16_t __b, const int __index)
> +{
> + return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline int32x4_t __attribute__
> +((__always_inline__, __gnu_inline__, __artificial__))
> +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a,
> + int8x16_t __b, const int __index)
> +{
> + return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b,
> +__index); }
> +
> +__extension__ extern __inline int32x2_t __attribute__
> +((__always_inline__, __gnu_inline__, __artificial__))
> +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a,
> + uint8x16_t __b, const int __index)
> +{
> + return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline int32x4_t __attribute__
> +((__always_inline__, __gnu_inline__, __artificial__))
> +vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a,
> + uint8x16_t __b, const int __index) {
> + return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b,
> +__index); }
> +
> #pragma GCC pop_options
>
> #pragma GCC pop_options
> diff --git a/gcc/config/arm/arm_neon_builtins.def
> b/gcc/config/arm/arm_neon_builtins.def
> index
> f83dd4327c16c0af68f72eb6d9ca8cf21e2e56b5..1c150ed3b650a003b44901b4d
> 160a7d6f595f057 100644
> --- a/gcc/config/arm/arm_neon_builtins.def
> +++ b/gcc/config/arm/arm_neon_builtins.def
> @@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
> VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi)
> VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi)
>
> -VAR1 (USTERNOP, usdot, v8qi)
> +VAR2 (USTERNOP, usdot, v8qi, v16qi)
> VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi)
> VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi)
> +VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi)
> +VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi)
>
> VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf)
> VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf) diff --git
> a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index
> 848166311b5f82c5facb66e97c2260a5aba5d302..1707d8e625079b83497a3db44
> db5e33405bb5fa1 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -2977,9 +2977,33 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>"
> DOTPROD_I8MM)
> (match_operand:VCVTI 1 "register_operand" "0")))]
> "TARGET_I8MM"
> + "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"
> + [(set_attr "type" "neon_dot<q>")]
> +)
> +
> +;; These instructions map to the __builtins for the Dot Product ;;
> +indexed operations in the v8.6 I8MM extension.
> +(define_insn "neon_<sup>dot_laneq<vsi2qi>"
> + [(set (match_operand:VCVTI 0 "register_operand" "=w")
> + (plus:VCVTI
> + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand"
> "w")
> + (match_operand:V16QI 3 "register_operand" "t")
> + (match_operand:SI 4 "immediate_operand" "i")]
> + DOTPROD_I8MM)
> + (match_operand:VCVTI 1 "register_operand" "0")))]
> + "TARGET_I8MM"
> {
> - operands[4] = GEN_INT (INTVAL (operands[4]));
> - return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
> + int lane = INTVAL (operands[4]);
> + if (lane > GET_MODE_NUNITS (V2SImode) - 1)
> + {
> + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode));
> + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
> + }
> + else
> + {
> + operands[4] = GEN_INT (lane);
> + return
> "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
> + }
> }
> [(set_attr "type" "neon_dot<q>")]
> )
> diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
> b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
> index
> 88b80cff2329d9c502f40a31bbef70d26251c909..35d713f6a60d3d5880ddc8b43e
> 238b7403b4f135 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
> @@ -2,7 +2,7 @@
> /* { dg-require-effective-target arm_hard_ok } */
> /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
> /* { dg-add-options arm_v8_2a_i8mm } */
> -/* { dg-additional-options "-O -save-temps -mfloat-abi=hard" } */
> +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mfpu=auto"
> +} */
> /* { dg-final { check-function-bodies "**" "" } } */
>
> #include <arm_neon.h>
> @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y)
> return vusdot_s32 (r, x, y);
> }
>
> +/*
> +**usfooq:
> +** ...
> +** vusdot\.s8 q0, q1, q2
> +** bx lr
> +*/
> +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) {
> + return vusdotq_s32 (r, x, y);
> +}
> +
> /*
> **usfoo_lane:
> ** ...
> @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x,
> uint8x8_t y)
> return vsudotq_lane_s32 (r, x, y, 1); }
>
> +/*
> +**usfoo_laneq:
> +** ...
> +** vusdot\.s8 d0, d1, d3\[0\]
> +** bx lr
> +*/
> +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) {
> + return vusdot_laneq_s32 (r, x, y, 2); }
> +
> +/*
> +**usfooq_laneq:
> +** ...
> +** vusdot\.s8 q0, q1, d5\[1\]
> +** bx lr
> +*/
> +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) {
> + return vusdotq_laneq_s32 (r, x, y, 3); }
> +
> +/* Signed-Unsigned Dot Product instructions. */
> +
> +/*
> +**sfoo_laneq:
> +** ...
> +** vsudot\.u8 d0, d1, d3\[0\]
> +** bx lr
> +*/
> +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) {
> + return vsudot_laneq_s32 (r, x, y, 2); }
> +
> +/*
> +**sfooq_laneq:
> +** ...
> +** vsudot\.u8 q0, q1, d5\[1\]
> +** bx lr
> +*/
> +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) {
> + return vsudotq_laneq_s32 (r, x, y, 3); }
> +
> /*
> **usfoo_untied:
> ** ...
> diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
> b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
> index
> 1c74718ca5644be05b4d4839c3a7ea40bff11e40..c57dd423dbc45b2f9f7890ada0
> f081f80381b05c 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
> @@ -2,7 +2,7 @@
> /* { dg-require-effective-target arm_hard_ok } */
> /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
> /* { dg-add-options arm_v8_2a_i8mm } */
> -/* { dg-additional-options "-O -save-temps -mbig-endian -mfloat-abi=hard" }
> */
> +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard
> +-mbig-endian -mfpu=auto" } */
> /* { dg-final { check-function-bodies "**" "" } } */
>
> #include <arm_neon.h>
> @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y)
> return vusdot_s32 (r, x, y);
> }
>
> +/*
> +**usfooq:
> +** ...
> +** vusdot\.s8 q0, q1, q2
> +** bx lr
> +*/
> +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) {
> + return vusdotq_s32 (r, x, y);
> +}
> +
> /*
> **usfoo_lane:
> ** ...
> @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x,
> uint8x8_t y)
> return vsudotq_lane_s32 (r, x, y, 1); }
>
> +/*
> +**usfoo_laneq:
> +** ...
> +** vusdot\.s8 d0, d1, d3\[0\]
> +** bx lr
> +*/
> +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) {
> + return vusdot_laneq_s32 (r, x, y, 2); }
> +
> +/*
> +**usfooq_laneq:
> +** ...
> +** vusdot\.s8 q0, q1, d5\[1\]
> +** bx lr
> +*/
> +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) {
> + return vusdotq_laneq_s32 (r, x, y, 3); }
> +
> +/* Signed-Unsigned Dot Product instructions. */
> +
> +/*
> +**sfoo_laneq:
> +** ...
> +** vsudot\.u8 d0, d1, d3\[0\]
> +** bx lr
> +*/
> +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) {
> + return vsudot_laneq_s32 (r, x, y, 2); }
> +
> +/*
> +**sfooq_laneq:
> +** ...
> +** vsudot\.u8 q0, q1, d5\[1\]
> +** bx lr
> +*/
> +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) {
> + return vsudotq_laneq_s32 (r, x, y, 3); }
> +
> /*
> **usfoo_untied:
> ** ...
> @@ -89,3 +146,4 @@ int32x2_t usfoo_lane_untied (int32x2_t unused,
> int32x2_t r, uint8x8_t x, int8x8_ {
> return vusdot_lane_s32 (r, x, y, 0);
> }
> +
>
>
> --
Ping x3
> -----Original Message-----
> From: Tamar Christina
> Sent: Tuesday, January 11, 2022 7:10 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> Subject: RE: [PATCH][AArch32]: correct usdot-product RTL patterns.
>
> ping
>
> > -----Original Message-----
> > From: Tamar Christina
> > Sent: Tuesday, December 21, 2021 12:32 PM
> > To: gcc-patches@gcc.gnu.org
> > Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> > <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> > <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>
> > Subject: [PATCH][AArch32]: correct usdot-product RTL patterns.
> >
> > Hi All,
> >
> > There was a bug in the ACLE specication for dot product which has now
> > been fixed[1]. This means some intrinsics were missing and are added
> > by this patch.
> >
> > Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues.
> >
> > Ok for master?
> >
> > [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > * config/arm/arm_neon.h (vusdotq_s32, vusdot_laneq_s32,
> > vusdotq_laneq_s32, vsudot_laneq_s32, vsudotq_laneq_s32): New
> > * config/arm/arm_neon_builtins.def (usdot): Add V16QI.
> > (usdot_laneq, sudot_laneq): New.
> > * config/arm/neon.md (neon_<sup>dot_laneq<vsi2qi>): New.
> > (neon_<sup>dot_lane<vsi2qi>): Remote unneeded code.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/arm/simd/vdot-2-1.c: Add new tests.
> > * gcc.target/arm/simd/vdot-2-2.c: Likewise and fix output.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> > index
> >
> af6ac63dc3b47830d92f199d93153ff510f658e9..2255d600549a2a1e5dbcebc03f
> > 7d6a63bab9f5aa 100644
> > --- a/gcc/config/arm/arm_neon.h
> > +++ b/gcc/config/arm/arm_neon.h
> > @@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a,
> > int8x8_t __b)
> > return __builtin_neon_usdotv8qi_ssus (__r, __a, __b); }
> >
> > +__extension__ extern __inline int32x4_t __attribute__
> > +((__always_inline__, __gnu_inline__, __artificial__))
> > +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) {
> > + return __builtin_neon_usdotv16qi_ssus (__r, __a, __b); }
> > +
> > __extension__ extern __inline int32x2_t __attribute__
> > ((__always_inline__, __gnu_inline__, __artificial__))
> > vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, @@ -18962,6 +18969,38
> > @@
> > vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a,
> > return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b,
> > __index); }
> >
> > +__extension__ extern __inline int32x2_t __attribute__
> > +((__always_inline__, __gnu_inline__, __artificial__))
> > +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a,
> > + int8x16_t __b, const int __index) {
> > + return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b,
> > +__index); }
> > +
> > +__extension__ extern __inline int32x4_t __attribute__
> > +((__always_inline__, __gnu_inline__, __artificial__))
> > +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a,
> > + int8x16_t __b, const int __index) {
> > + return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b,
> > +__index); }
> > +
> > +__extension__ extern __inline int32x2_t __attribute__
> > +((__always_inline__, __gnu_inline__, __artificial__))
> > +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a,
> > + uint8x16_t __b, const int __index) {
> > + return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b,
> > +__index); }
> > +
> > +__extension__ extern __inline int32x4_t __attribute__
> > +((__always_inline__, __gnu_inline__, __artificial__))
> > +vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a,
> > + uint8x16_t __b, const int __index) {
> > + return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b,
> > +__index); }
> > +
> > #pragma GCC pop_options
> >
> > #pragma GCC pop_options
> > diff --git a/gcc/config/arm/arm_neon_builtins.def
> > b/gcc/config/arm/arm_neon_builtins.def
> > index
> >
> f83dd4327c16c0af68f72eb6d9ca8cf21e2e56b5..1c150ed3b650a003b44901b4d
> > 160a7d6f595f057 100644
> > --- a/gcc/config/arm/arm_neon_builtins.def
> > +++ b/gcc/config/arm/arm_neon_builtins.def
> > @@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
> > VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi)
> > VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi)
> >
> > -VAR1 (USTERNOP, usdot, v8qi)
> > +VAR2 (USTERNOP, usdot, v8qi, v16qi)
> > VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi)
> > VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi)
> > +VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi)
> > +VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi)
> >
> > VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf)
> > VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf) diff --git
> > a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index
> >
> 848166311b5f82c5facb66e97c2260a5aba5d302..1707d8e625079b83497a3db44
> > db5e33405bb5fa1 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -2977,9 +2977,33 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>"
> > DOTPROD_I8MM)
> > (match_operand:VCVTI 1 "register_operand" "0")))]
> > "TARGET_I8MM"
> > + "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"
> > + [(set_attr "type" "neon_dot<q>")]
> > +)
> > +
> > +;; These instructions map to the __builtins for the Dot Product ;;
> > +indexed operations in the v8.6 I8MM extension.
> > +(define_insn "neon_<sup>dot_laneq<vsi2qi>"
> > + [(set (match_operand:VCVTI 0 "register_operand" "=w")
> > + (plus:VCVTI
> > + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand"
> > "w")
> > + (match_operand:V16QI 3 "register_operand" "t")
> > + (match_operand:SI 4 "immediate_operand" "i")]
> > + DOTPROD_I8MM)
> > + (match_operand:VCVTI 1 "register_operand" "0")))]
> > + "TARGET_I8MM"
> > {
> > - operands[4] = GEN_INT (INTVAL (operands[4]));
> > - return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
> > + int lane = INTVAL (operands[4]);
> > + if (lane > GET_MODE_NUNITS (V2SImode) - 1)
> > + {
> > + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode));
> > + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
> > + }
> > + else
> > + {
> > + operands[4] = GEN_INT (lane);
> > + return
> > "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
> > + }
> > }
> > [(set_attr "type" "neon_dot<q>")]
> > )
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
> > b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
> > index
> >
> 88b80cff2329d9c502f40a31bbef70d26251c909..35d713f6a60d3d5880ddc8b43e
> > 238b7403b4f135 100644
> > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
> > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
> > @@ -2,7 +2,7 @@
> > /* { dg-require-effective-target arm_hard_ok } */
> > /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
> > /* { dg-add-options arm_v8_2a_i8mm } */
> > -/* { dg-additional-options "-O -save-temps -mfloat-abi=hard" } */
> > +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mfpu=auto"
> > +} */
> > /* { dg-final { check-function-bodies "**" "" } } */
> >
> > #include <arm_neon.h>
> > @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y)
> > return vusdot_s32 (r, x, y);
> > }
> >
> > +/*
> > +**usfooq:
> > +** ...
> > +** vusdot\.s8 q0, q1, q2
> > +** bx lr
> > +*/
> > +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) {
> > + return vusdotq_s32 (r, x, y);
> > +}
> > +
> > /*
> > **usfoo_lane:
> > ** ...
> > @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x,
> > uint8x8_t y)
> > return vsudotq_lane_s32 (r, x, y, 1); }
> >
> > +/*
> > +**usfoo_laneq:
> > +** ...
> > +** vusdot\.s8 d0, d1, d3\[0\]
> > +** bx lr
> > +*/
> > +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) {
> > + return vusdot_laneq_s32 (r, x, y, 2); }
> > +
> > +/*
> > +**usfooq_laneq:
> > +** ...
> > +** vusdot\.s8 q0, q1, d5\[1\]
> > +** bx lr
> > +*/
> > +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) {
> > + return vusdotq_laneq_s32 (r, x, y, 3); }
> > +
> > +/* Signed-Unsigned Dot Product instructions. */
> > +
> > +/*
> > +**sfoo_laneq:
> > +** ...
> > +** vsudot\.u8 d0, d1, d3\[0\]
> > +** bx lr
> > +*/
> > +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) {
> > + return vsudot_laneq_s32 (r, x, y, 2); }
> > +
> > +/*
> > +**sfooq_laneq:
> > +** ...
> > +** vsudot\.u8 q0, q1, d5\[1\]
> > +** bx lr
> > +*/
> > +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) {
> > + return vsudotq_laneq_s32 (r, x, y, 3); }
> > +
> > /*
> > **usfoo_untied:
> > ** ...
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
> > b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
> > index
> >
> 1c74718ca5644be05b4d4839c3a7ea40bff11e40..c57dd423dbc45b2f9f7890ada0
> > f081f80381b05c 100644
> > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
> > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
> > @@ -2,7 +2,7 @@
> > /* { dg-require-effective-target arm_hard_ok } */
> > /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
> > /* { dg-add-options arm_v8_2a_i8mm } */
> > -/* { dg-additional-options "-O -save-temps -mbig-endian
> > -mfloat-abi=hard" } */
> > +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard
> > +-mbig-endian -mfpu=auto" } */
> > /* { dg-final { check-function-bodies "**" "" } } */
> >
> > #include <arm_neon.h>
> > @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y)
> > return vusdot_s32 (r, x, y);
> > }
> >
> > +/*
> > +**usfooq:
> > +** ...
> > +** vusdot\.s8 q0, q1, q2
> > +** bx lr
> > +*/
> > +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) {
> > + return vusdotq_s32 (r, x, y);
> > +}
> > +
> > /*
> > **usfoo_lane:
> > ** ...
> > @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x,
> > uint8x8_t y)
> > return vsudotq_lane_s32 (r, x, y, 1); }
> >
> > +/*
> > +**usfoo_laneq:
> > +** ...
> > +** vusdot\.s8 d0, d1, d3\[0\]
> > +** bx lr
> > +*/
> > +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) {
> > + return vusdot_laneq_s32 (r, x, y, 2); }
> > +
> > +/*
> > +**usfooq_laneq:
> > +** ...
> > +** vusdot\.s8 q0, q1, d5\[1\]
> > +** bx lr
> > +*/
> > +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) {
> > + return vusdotq_laneq_s32 (r, x, y, 3); }
> > +
> > +/* Signed-Unsigned Dot Product instructions. */
> > +
> > +/*
> > +**sfoo_laneq:
> > +** ...
> > +** vsudot\.u8 d0, d1, d3\[0\]
> > +** bx lr
> > +*/
> > +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) {
> > + return vsudot_laneq_s32 (r, x, y, 2); }
> > +
> > +/*
> > +**sfooq_laneq:
> > +** ...
> > +** vsudot\.u8 q0, q1, d5\[1\]
> > +** bx lr
> > +*/
> > +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) {
> > + return vsudotq_laneq_s32 (r, x, y, 3); }
> > +
> > /*
> > **usfoo_untied:
> > ** ...
> > @@ -89,3 +146,4 @@ int32x2_t usfoo_lane_untied (int32x2_t unused,
> > int32x2_t r, uint8x8_t x, int8x8_ {
> > return vusdot_lane_s32 (r, x, y, 0); }
> > +
> >
> >
> > --
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Tuesday, December 21, 2021 12:32 PM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> Subject: [PATCH][AArch32]: correct usdot-product RTL patterns.
>
> Hi All,
>
> There was a bug in the ACLE specication for dot product which has now
> been fixed[1]. This means some intrinsics were missing and are added by
> this
> patch.
>
> Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues.
>
> Ok for master?
Ok.
Thanks,
Kyrill
>
> [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/arm/arm_neon.h (vusdotq_s32, vusdot_laneq_s32,
> vusdotq_laneq_s32, vsudot_laneq_s32, vsudotq_laneq_s32): New
> * config/arm/arm_neon_builtins.def (usdot): Add V16QI.
> (usdot_laneq, sudot_laneq): New.
> * config/arm/neon.md (neon_<sup>dot_laneq<vsi2qi>): New.
> (neon_<sup>dot_lane<vsi2qi>): Remote unneeded code.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/arm/simd/vdot-2-1.c: Add new tests.
> * gcc.target/arm/simd/vdot-2-2.c: Likewise and fix output.
>
> --- inline copy of patch --
> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> index
> af6ac63dc3b47830d92f199d93153ff510f658e9..2255d600549a2a1e5dbcebc0
> 3f7d6a63bab9f5aa 100644
> --- a/gcc/config/arm/arm_neon.h
> +++ b/gcc/config/arm/arm_neon.h
> @@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a,
> int8x8_t __b)
> return __builtin_neon_usdotv8qi_ssus (__r, __a, __b);
> }
>
> +__extension__ extern __inline int32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
> +{
> + return __builtin_neon_usdotv16qi_ssus (__r, __a, __b);
> +}
> +
> __extension__ extern __inline int32x2_t
> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a,
> @@ -18962,6 +18969,38 @@ vsudotq_lane_s32 (int32x4_t __r, int8x16_t
> __a,
> return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b, __index);
> }
>
> +__extension__ extern __inline int32x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a,
> + int8x16_t __b, const int __index)
> +{
> + return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline int32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a,
> + int8x16_t __b, const int __index)
> +{
> + return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline int32x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a,
> + uint8x16_t __b, const int __index)
> +{
> + return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline int32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a,
> + uint8x16_t __b, const int __index)
> +{
> + return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b, __index);
> +}
> +
> #pragma GCC pop_options
>
> #pragma GCC pop_options
> diff --git a/gcc/config/arm/arm_neon_builtins.def
> b/gcc/config/arm/arm_neon_builtins.def
> index
> f83dd4327c16c0af68f72eb6d9ca8cf21e2e56b5..1c150ed3b650a003b44901b4
> d160a7d6f595f057 100644
> --- a/gcc/config/arm/arm_neon_builtins.def
> +++ b/gcc/config/arm/arm_neon_builtins.def
> @@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
> VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi)
> VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi)
>
> -VAR1 (USTERNOP, usdot, v8qi)
> +VAR2 (USTERNOP, usdot, v8qi, v16qi)
> VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi)
> VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi)
> +VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi)
> +VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi)
>
> VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf)
> VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf)
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index
> 848166311b5f82c5facb66e97c2260a5aba5d302..1707d8e625079b83497a3db
> 44db5e33405bb5fa1 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -2977,9 +2977,33 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>"
> DOTPROD_I8MM)
> (match_operand:VCVTI 1 "register_operand" "0")))]
> "TARGET_I8MM"
> + "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"
> + [(set_attr "type" "neon_dot<q>")]
> +)
> +
> +;; These instructions map to the __builtins for the Dot Product
> +;; indexed operations in the v8.6 I8MM extension.
> +(define_insn "neon_<sup>dot_laneq<vsi2qi>"
> + [(set (match_operand:VCVTI 0 "register_operand" "=w")
> + (plus:VCVTI
> + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand"
> "w")
> + (match_operand:V16QI 3 "register_operand" "t")
> + (match_operand:SI 4 "immediate_operand" "i")]
> + DOTPROD_I8MM)
> + (match_operand:VCVTI 1 "register_operand" "0")))]
> + "TARGET_I8MM"
> {
> - operands[4] = GEN_INT (INTVAL (operands[4]));
> - return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
> + int lane = INTVAL (operands[4]);
> + if (lane > GET_MODE_NUNITS (V2SImode) - 1)
> + {
> + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode));
> + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
> + }
> + else
> + {
> + operands[4] = GEN_INT (lane);
> + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
> + }
> }
> [(set_attr "type" "neon_dot<q>")]
> )
> diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
> b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
> index
> 88b80cff2329d9c502f40a31bbef70d26251c909..35d713f6a60d3d5880ddc8b4
> 3e238b7403b4f135 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c
> @@ -2,7 +2,7 @@
> /* { dg-require-effective-target arm_hard_ok } */
> /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
> /* { dg-add-options arm_v8_2a_i8mm } */
> -/* { dg-additional-options "-O -save-temps -mfloat-abi=hard" } */
> +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mfpu=auto" }
> */
> /* { dg-final { check-function-bodies "**" "" } } */
>
> #include <arm_neon.h>
> @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y)
> return vusdot_s32 (r, x, y);
> }
>
> +/*
> +**usfooq:
> +** ...
> +** vusdot\.s8 q0, q1, q2
> +** bx lr
> +*/
> +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y)
> +{
> + return vusdotq_s32 (r, x, y);
> +}
> +
> /*
> **usfoo_lane:
> ** ...
> @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x,
> uint8x8_t y)
> return vsudotq_lane_s32 (r, x, y, 1);
> }
>
> +/*
> +**usfoo_laneq:
> +** ...
> +** vusdot\.s8 d0, d1, d3\[0\]
> +** bx lr
> +*/
> +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
> +{
> + return vusdot_laneq_s32 (r, x, y, 2);
> +}
> +
> +/*
> +**usfooq_laneq:
> +** ...
> +** vusdot\.s8 q0, q1, d5\[1\]
> +** bx lr
> +*/
> +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
> +{
> + return vusdotq_laneq_s32 (r, x, y, 3);
> +}
> +
> +/* Signed-Unsigned Dot Product instructions. */
> +
> +/*
> +**sfoo_laneq:
> +** ...
> +** vsudot\.u8 d0, d1, d3\[0\]
> +** bx lr
> +*/
> +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
> +{
> + return vsudot_laneq_s32 (r, x, y, 2);
> +}
> +
> +/*
> +**sfooq_laneq:
> +** ...
> +** vsudot\.u8 q0, q1, d5\[1\]
> +** bx lr
> +*/
> +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
> +{
> + return vsudotq_laneq_s32 (r, x, y, 3);
> +}
> +
> /*
> **usfoo_untied:
> ** ...
> diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
> b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
> index
> 1c74718ca5644be05b4d4839c3a7ea40bff11e40..c57dd423dbc45b2f9f7890ad
> a0f081f80381b05c 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c
> @@ -2,7 +2,7 @@
> /* { dg-require-effective-target arm_hard_ok } */
> /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
> /* { dg-add-options arm_v8_2a_i8mm } */
> -/* { dg-additional-options "-O -save-temps -mbig-endian -mfloat-abi=hard" }
> */
> +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mbig-endian -
> mfpu=auto" } */
> /* { dg-final { check-function-bodies "**" "" } } */
>
> #include <arm_neon.h>
> @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y)
> return vusdot_s32 (r, x, y);
> }
>
> +/*
> +**usfooq:
> +** ...
> +** vusdot\.s8 q0, q1, q2
> +** bx lr
> +*/
> +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y)
> +{
> + return vusdotq_s32 (r, x, y);
> +}
> +
> /*
> **usfoo_lane:
> ** ...
> @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x,
> uint8x8_t y)
> return vsudotq_lane_s32 (r, x, y, 1);
> }
>
> +/*
> +**usfoo_laneq:
> +** ...
> +** vusdot\.s8 d0, d1, d3\[0\]
> +** bx lr
> +*/
> +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
> +{
> + return vusdot_laneq_s32 (r, x, y, 2);
> +}
> +
> +/*
> +**usfooq_laneq:
> +** ...
> +** vusdot\.s8 q0, q1, d5\[1\]
> +** bx lr
> +*/
> +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
> +{
> + return vusdotq_laneq_s32 (r, x, y, 3);
> +}
> +
> +/* Signed-Unsigned Dot Product instructions. */
> +
> +/*
> +**sfoo_laneq:
> +** ...
> +** vsudot\.u8 d0, d1, d3\[0\]
> +** bx lr
> +*/
> +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
> +{
> + return vsudot_laneq_s32 (r, x, y, 2);
> +}
> +
> +/*
> +**sfooq_laneq:
> +** ...
> +** vsudot\.u8 q0, q1, d5\[1\]
> +** bx lr
> +*/
> +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
> +{
> + return vsudotq_laneq_s32 (r, x, y, 3);
> +}
> +
> /*
> **usfoo_untied:
> ** ...
> @@ -89,3 +146,4 @@ int32x2_t usfoo_lane_untied (int32x2_t unused,
> int32x2_t r, uint8x8_t x, int8x8_
> {
> return vusdot_lane_s32 (r, x, y, 0);
> }
> +
>
>
> --
@@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
return __builtin_neon_usdotv8qi_ssus (__r, __a, __b);
}
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+ return __builtin_neon_usdotv16qi_ssus (__r, __a, __b);
+}
+
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a,
@@ -18962,6 +18969,38 @@ vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a,
return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b, __index);
}
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a,
+ int8x16_t __b, const int __index)
+{
+ return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a,
+ int8x16_t __b, const int __index)
+{
+ return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a,
+ uint8x16_t __b, const int __index)
+{
+ return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a,
+ uint8x16_t __b, const int __index)
+{
+ return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b, __index);
+}
+
#pragma GCC pop_options
#pragma GCC pop_options
@@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi)
VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi)
-VAR1 (USTERNOP, usdot, v8qi)
+VAR2 (USTERNOP, usdot, v8qi, v16qi)
VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi)
VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi)
+VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi)
+VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi)
VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf)
VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf)
@@ -2977,9 +2977,33 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>"
DOTPROD_I8MM)
(match_operand:VCVTI 1 "register_operand" "0")))]
"TARGET_I8MM"
+ "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"
+ [(set_attr "type" "neon_dot<q>")]
+)
+
+;; These instructions map to the __builtins for the Dot Product
+;; indexed operations in the v8.6 I8MM extension.
+(define_insn "neon_<sup>dot_laneq<vsi2qi>"
+ [(set (match_operand:VCVTI 0 "register_operand" "=w")
+ (plus:VCVTI
+ (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" "w")
+ (match_operand:V16QI 3 "register_operand" "t")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD_I8MM)
+ (match_operand:VCVTI 1 "register_operand" "0")))]
+ "TARGET_I8MM"
{
- operands[4] = GEN_INT (INTVAL (operands[4]));
- return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
+ int lane = INTVAL (operands[4]);
+ if (lane > GET_MODE_NUNITS (V2SImode) - 1)
+ {
+ operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode));
+ return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
+ }
+ else
+ {
+ operands[4] = GEN_INT (lane);
+ return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
+ }
}
[(set_attr "type" "neon_dot<q>")]
)
@@ -2,7 +2,7 @@
/* { dg-require-effective-target arm_hard_ok } */
/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
/* { dg-add-options arm_v8_2a_i8mm } */
-/* { dg-additional-options "-O -save-temps -mfloat-abi=hard" } */
+/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mfpu=auto" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include <arm_neon.h>
@@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y)
return vusdot_s32 (r, x, y);
}
+/*
+**usfooq:
+** ...
+** vusdot\.s8 q0, q1, q2
+** bx lr
+*/
+int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ return vusdotq_s32 (r, x, y);
+}
+
/*
**usfoo_lane:
** ...
@@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y)
return vsudotq_lane_s32 (r, x, y, 1);
}
+/*
+**usfoo_laneq:
+** ...
+** vusdot\.s8 d0, d1, d3\[0\]
+** bx lr
+*/
+int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
+{
+ return vusdot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**usfooq_laneq:
+** ...
+** vusdot\.s8 q0, q1, d5\[1\]
+** bx lr
+*/
+int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ return vusdotq_laneq_s32 (r, x, y, 3);
+}
+
+/* Signed-Unsigned Dot Product instructions. */
+
+/*
+**sfoo_laneq:
+** ...
+** vsudot\.u8 d0, d1, d3\[0\]
+** bx lr
+*/
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
+{
+ return vsudot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**sfooq_laneq:
+** ...
+** vsudot\.u8 q0, q1, d5\[1\]
+** bx lr
+*/
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
+{
+ return vsudotq_laneq_s32 (r, x, y, 3);
+}
+
/*
**usfoo_untied:
** ...
@@ -2,7 +2,7 @@
/* { dg-require-effective-target arm_hard_ok } */
/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
/* { dg-add-options arm_v8_2a_i8mm } */
-/* { dg-additional-options "-O -save-temps -mbig-endian -mfloat-abi=hard" } */
+/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mbig-endian -mfpu=auto" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include <arm_neon.h>
@@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y)
return vusdot_s32 (r, x, y);
}
+/*
+**usfooq:
+** ...
+** vusdot\.s8 q0, q1, q2
+** bx lr
+*/
+int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ return vusdotq_s32 (r, x, y);
+}
+
/*
**usfoo_lane:
** ...
@@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y)
return vsudotq_lane_s32 (r, x, y, 1);
}
+/*
+**usfoo_laneq:
+** ...
+** vusdot\.s8 d0, d1, d3\[0\]
+** bx lr
+*/
+int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
+{
+ return vusdot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**usfooq_laneq:
+** ...
+** vusdot\.s8 q0, q1, d5\[1\]
+** bx lr
+*/
+int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ return vusdotq_laneq_s32 (r, x, y, 3);
+}
+
+/* Signed-Unsigned Dot Product instructions. */
+
+/*
+**sfoo_laneq:
+** ...
+** vsudot\.u8 d0, d1, d3\[0\]
+** bx lr
+*/
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
+{
+ return vsudot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**sfooq_laneq:
+** ...
+** vsudot\.u8 q0, q1, d5\[1\]
+** bx lr
+*/
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
+{
+ return vsudotq_laneq_s32 (r, x, y, 3);
+}
+
/*
**usfoo_untied:
** ...
@@ -89,3 +146,4 @@ int32x2_t usfoo_lane_untied (int32x2_t unused, int32x2_t r, uint8x8_t x, int8x8_
{
return vusdot_lane_s32 (r, x, y, 0);
}
+