[v3] AArch64: Add LUTI ACLE for SVE2
Checks
Context |
Check |
Description |
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 |
success
|
Build passed
|
Commit Message
This patch introduces support for LUTI2/LUTI4 ACLE for SVE2.
LUTI instructions are used for efficient table lookups with 2-bit
or 4-bit indices. LUTI2 reads indexed 8-bit or 16-bit elements from
the low 128 bits of the table vector using packed 2-bit indices,
while LUTI4 can read from the low 128 or 256 bits of the table
vector or from two table vectors using packed 4-bit indices.
These instructions fill the destination vector by copying elements
indexed by segments of the source vector, selected by the vector
segment index.
The changes include the addition of a new AArch64 option
extension "lut", __ARM_FEATURE_LUT preprocessor macro, definitions
for the new LUTI instruction shapes, and implementations of the
svluti2 and svluti4 builtins.
New tests are added as well.
---
Hey,
This is a respin of
https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658015.html. Rebased
with master. Regression tested on aarch64-unknown-linux-gnu and found no
regressions.
Ok for master?
Thanks,
Saurabh
---
gcc/config/aarch64/aarch64-c.cc | 2 +
.../aarch64/aarch64-sve-builtins-shapes.cc | 41 +++++++++++++++++
.../aarch64/aarch64-sve-builtins-shapes.h | 2 +
.../aarch64/aarch64-sve-builtins-sve2.cc | 17 +++++++
.../aarch64/aarch64-sve-builtins-sve2.def | 4 ++
.../aarch64/aarch64-sve-builtins-sve2.h | 2 +
gcc/config/aarch64/aarch64-sve2.md | 45 +++++++++++++++++++
gcc/config/aarch64/iterators.md | 10 +++++
.../aarch64/sve/acle/asm/test_sve_acle.h | 16 +++++++
.../aarch64/sve2/acle/asm/luti2_bf16.c | 40 +++++++++++++++++
.../aarch64/sve2/acle/asm/luti2_f16.c | 40 +++++++++++++++++
.../aarch64/sve2/acle/asm/luti2_s16.c | 40 +++++++++++++++++
.../aarch64/sve2/acle/asm/luti2_s8.c | 40 +++++++++++++++++
.../aarch64/sve2/acle/asm/luti2_u16.c | 40 +++++++++++++++++
.../aarch64/sve2/acle/asm/luti2_u8.c | 40 +++++++++++++++++
.../aarch64/sve2/acle/asm/luti4_bf16.c | 40 +++++++++++++++++
.../aarch64/sve2/acle/asm/luti4_bf16_x2.c | 20 +++++++++
.../aarch64/sve2/acle/asm/luti4_f16.c | 40 +++++++++++++++++
.../aarch64/sve2/acle/asm/luti4_f16_x2.c | 20 +++++++++
.../aarch64/sve2/acle/asm/luti4_s16.c | 40 +++++++++++++++++
.../aarch64/sve2/acle/asm/luti4_s16_x2.c | 20 +++++++++
.../aarch64/sve2/acle/asm/luti4_s8.c | 30 +++++++++++++
.../aarch64/sve2/acle/asm/luti4_u16.c | 40 +++++++++++++++++
.../aarch64/sve2/acle/asm/luti4_u16_x2.c | 20 +++++++++
.../aarch64/sve2/acle/asm/luti4_u8.c | 30 +++++++++++++
gcc/testsuite/lib/target-supports.exp | 12 +++++
26 files changed, 691 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_f16.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s16.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s8.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u16.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u8.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16_x2.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16_x2.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s8.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16_x2.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u8.c
Comments
<saurabh.jha@arm.com> writes:
> This patch introduces support for LUTI2/LUTI4 ACLE for SVE2.
>
> LUTI instructions are used for efficient table lookups with 2-bit
> or 4-bit indices. LUTI2 reads indexed 8-bit or 16-bit elements from
> the low 128 bits of the table vector using packed 2-bit indices,
> while LUTI4 can read from the low 128 or 256 bits of the table
> vector or from two table vectors using packed 4-bit indices.
> These instructions fill the destination vector by copying elements
> indexed by segments of the source vector, selected by the vector
> segment index.
>
> The changes include the addition of a new AArch64 option
> extension "lut", __ARM_FEATURE_LUT preprocessor macro, definitions
> for the new LUTI instruction shapes, and implementations of the
> svluti2 and svluti4 builtins.
>
> New tests are added as well.
>
> ---
>
> Hey,
>
> This is a respin of
> https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658015.html. Rebased
> with master. Regression tested on aarch64-unknown-linux-gnu and found no
> regressions.
Thanks for picking this up! The issues described below are of course not
your fault :)
> Ok for master?
>
> Thanks,
> Saurabh
> ---
> [...]
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> index ca721dd2c09..0f6d366b2d6 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> @@ -903,6 +903,47 @@ struct load_ext_gather_base : public overloaded_base<1>
> }
> };
>
> +
> +/* sv<v0>_t svlut_<t0>(sv<t0>_t, svuint8_t, uint64_t)
Because of the potential tuple argument, I suppose this should be:
sv<v0>_t svlut[_<t0>_g](sv<t0>x<g>_t, svuint8_t, uint64_t)
Unlike for the ZT version of svluti, the type suffix is optional.
To quote from the ACLE spec:
svint8_t svluti2_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx);
svint16_t svluti2_lane[_s16]( svint16_t table, svuint8_t indices, uint64_t imm_idx);
svint8_t svluti4_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx);
svint16_t svluti4_lane[_s16](svint16_t table, svuint8_t indices, uint64_t imm_idx);
svint16_t svluti4_lane[_s16_x2](svint16x2_t table, svuint8_t indices, uint64_t imm_idx);
Because of that:
> + where the final argument is a constant index, the instruction divides
> + the vector argument in BITS-bit quantities. */
> +template<unsigned int BITS>
> +struct luti_base : public nonoverloaded_base
...this should be an overloaded function and have a resolver.
> +{
> + void
> + build (function_builder &b, const function_group_info &group) const override
> + {
> + /* Format: return type, table vector, indices vector, immediate value. */
> + build_all (b, "v0,t0,vu8,su64", group, MODE_none);
> + }
> +
> + bool
> + check (function_checker &c) const override
> + {
> + int max_range;
> + bool byte_mode = c.type_suffix (0).element_bits == 8;
> +
> + if (BITS == 2)
> + max_range = byte_mode ? 3 : 7;
> + else if (BITS == 4)
> + max_range = byte_mode ? 1 : 7;
It looks like this should be ? 1 : 3, see:
https://developer.arm.com/documentation/ddi0602/2024-12/SVE-Instructions/LUTI4--Lookup-table-read-with-4-bit-indices-?lang=en
Or, programmatically, I think this is:
auto max_range = c.type.suffix (0).element_bits / BITS - 1;
for all cases.
> + else
> + /* Unsupported number of indices bits for LUTI. */
> + gcc_unreachable ();
> +
> + return c.require_immediate_range (2, 0, max_range);
> + }
> +
> +};
> +
> +/* Specializations for 2-bit and 4-bit indices. */
> +using luti2_def = luti_base<2>;
> +SHAPE (luti2)
> +
> +using luti4_def = luti_base<4>;
> +SHAPE (luti4)
> +
> +
> /* sv<t0>x<g>_t svfoo_t0_g(uint64_t, svuint8_t, uint64_t)
>
> where the first argument is the ZT register number (currently always 0)
> [...]
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> index e726fa1fb68..0c4f8251ac0 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> @@ -164,6 +164,10 @@ DEF_SVE_FUNCTION (svwhilegt, compare_scalar, while, none)
> DEF_SVE_FUNCTION (svwhilerw, compare_ptr, all_data, none)
> DEF_SVE_FUNCTION (svwhilewr, compare_ptr, all_data, none)
> DEF_SVE_FUNCTION (svxar, ternary_shift_right_imm, all_integer, none)
> +DEF_SVE_FUNCTION (svluti2_lane, luti2, bhs_data, none)
> +DEF_SVE_FUNCTION (svluti4_lane, luti4, bhs_data, none)
> +DEF_SVE_FUNCTION_GS (svluti4_lane, luti4, bhs_data, x2, none)
bhs_data looks wrong: there should be no .s versions. Similarly...
> +
> #undef REQUIRED_EXTENSIONS
>
> #define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2)
> [...]
> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
> index f8cfe08f4c0..7dcbc0700da 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -133,6 +133,7 @@
> ;; ---- Optional AES extensions
> ;; ---- Optional SHA-3 extensions
> ;; ---- Optional SM4 extensions
> +;; ---- Table lookup
>
> ;; =========================================================================
> ;; == Moves
> @@ -4211,3 +4212,47 @@
> "sm4ekey\t%0.s, %1.s, %2.s"
> [(set_attr "type" "crypto_sm4")]
> )
> +
> +;; -------------------------------------------------------------------------
> +;; ---- Table lookup
> +;; -------------------------------------------------------------------------
> +;; Includes:
> +;; - LUTI2
> +;; - LUTI4
> +;; -------------------------------------------------------------------------
> +
> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> + [(set (match_operand:SVE_FULL_BS 0 "register_operand" "=w")
> + (unspec:SVE_FULL_BS
> + [(match_operand:SVE_FULL_BS 1 "register_operand" "w")
> + (match_operand:VNx16QI 2 "register_operand" "w")
> + (match_operand:DI 3 "const_int_operand")
> + (const_int LUTI_BITS)]
> + UNSPEC_SVE_LUTI))]
> + "TARGET_SVE2"
> + "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
> +)
> +
> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> + [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
> + (unspec:<VSINGLE>
> + [(match_operand:SVE_FULL_H 1 "aligned_register_operand" "w")
> + (match_operand:VNx16QI 2 "register_operand" "w")
> + (match_operand:DI 3 "const_int_operand")
> + (const_int LUTI_BITS)]
> + UNSPEC_SVE_LUTI))]
> + "TARGET_SVE2"
> + "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
> +)
...there should be .S (VNx4) variants here. Also, the .H variants
don't require an aligned register operand. (FWIW, using "w" with
"aligned_register_operand" is wrong in any case, since "w" accepts
unaligned registers.)
It looks like we could merge the patterns above into a single SVE_FULL_BH
pattern.
> +
> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> + [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
> + (unspec:<VSINGLE>
> + [(match_operand:SVE_FULL_Hx2 1 "aligned_register_operand" "Uw2")
This operand also isn't required to be aligned: Zn has a 5-bit encoding.
> + (match_operand:VNx16QI 2 "register_operand" "w")
> + (match_operand:DI 3 "const_int_operand")
> + (const_int LUTI_BITS)]
> + UNSPEC_SVE_LUTI))]
> + "TARGET_SVE2"
> + "luti<LUTI_BITS>\t%0.<Vetype>, %1, %2[%3]"
> +)
Also, formatting nit, but: it's more usual to indent the "[" in an
unspec by 2 or 1 extra columns, rather than a full tab.
> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
> new file mode 100644
> index 00000000000..f423bfae2c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
This would be better as:
/* { dg-do assemble { target aarch64_asm_lut_ok } } */
/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
with lut added to:
foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"
"i8mm" "f32mm" "f64mm" "bf16" "sb" "sve2" "ls64"
"sme" "sme-i16i64" "sme2" "sve-b16b16"
"sme-b16b16" "sme-f16f16" "sme2p1" "fp8" "fp8fma"
"ssve-fp8fma" "fp8dot2" "ssve-fp8dot2" "fp8dot4"
"ssve-fp8dot4"} {
eval [string map [list FUNC $aarch64_ext] {
proc check_effective_target_aarch64_asm_FUNC_ok { } {
if { [istarget aarch64*-*-*] } {
return [check_no_compiler_messages aarch64_FUNC_assembler object {
__asm__ (".arch_extension FUNC");
} "-march=armv8-a+FUNC"]
} else {
return 0
}
}
}]
}
in target-supports.exp.
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +#include "test_sve_acle.h"
> +
> +#pragma GCC target "+sve2+lut"
> +#if STREAMING_COMPATIBLE
> +#pragma GCC target "+sme2"
> +#endif
> +
> +/*
> +** luti2_test_imm0:
> +** luti2 z1\.h, \{ z28\.h \}, z0\[0\]
> +** ret
> +*/
> +
> +TEST_XN_SINGLE (luti2_test_imm0, svbfloat16_t, svuint8_t, z1,
> + svluti2_lane_bf16 (z28, z0, 0),
> + svluti2_lane_bf16 (z28, z0, 0))
Following on from the comment above about these intrinsics being
overloaded: the second call above should not have a type suffix.
Similarly for the other tests (sorry!).
> +
> +/*
> +** luti2_test_imm1:
> +** luti2 z1\.h, \{ z28\.h \}, z0\[1\]
> +** ret
> +*/
> +
> +TEST_XN_SINGLE (luti2_test_imm1, svbfloat16_t, svuint8_t, z1,
> + svluti2_lane_bf16 (z28, z0, 1),
> + svluti2_lane_bf16 (z28, z0, 1))
> +
> +/*
> +** luti2_test_tied:
> +** luti2 z28\.h, \{ z28\.h \}, z0\[2\]
> +** ret
> +*/
> +
> +TEST_XN_SINGLE (luti2_test_tied, svbfloat16_t, svuint8_t, z28,
> + svluti2_lane_bf16 (z28, z0, 2),
> + svluti2_lane_bf16 (z28, z0, 2))
I think we should test the upper bound of each range. A combination of
testing the upper bound + the dg-do change would have caught the ? 1 : 7
thing above.
> [...]
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index 45ba2f47a9d..e0d9867801c 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -4800,6 +4800,18 @@ proc check_effective_target_aarch64_sve2 { } {
> }]
> }
>
> +# Return 1 if this is an AArch64 target supporting LUT (Lookup table)
> +proc check_effective_target_aarch64_lut { } {
> + if { ![istarget aarch64*-*-*] || ![check_effective_target_aarch64_sve2] } {
> + return 0
> + }
> + return [check_no_compiler_messages aarch64_lut assembly {
> + #if !defined (__ARM_FEATURE_LUT)
> + #error FOO
> + #endif
> + }]
> +}
> +
> # Return 1 if this is an AArch64 target only supporting SVE (not SVE2).
> proc check_effective_target_aarch64_sve1_only { } {
> return [expr { [check_effective_target_aarch64_sve]
I'm not sure this is needed.
Thanks,
Richard
On Wed, Jan 08, 2025 at 11:13:41AM +0000, Richard Sandiford wrote:
> <saurabh.jha@arm.com> writes:
> > This patch introduces support for LUTI2/LUTI4 ACLE for SVE2.
> >
> > LUTI instructions are used for efficient table lookups with 2-bit
> > or 4-bit indices. LUTI2 reads indexed 8-bit or 16-bit elements from
> > the low 128 bits of the table vector using packed 2-bit indices,
> > while LUTI4 can read from the low 128 or 256 bits of the table
> > vector or from two table vectors using packed 4-bit indices.
> > These instructions fill the destination vector by copying elements
> > indexed by segments of the source vector, selected by the vector
> > segment index.
> >
> > The changes include the addition of a new AArch64 option
> > extension "lut", __ARM_FEATURE_LUT preprocessor macro, definitions
> > for the new LUTI instruction shapes, and implementations of the
> > svluti2 and svluti4 builtins.
> >
> > New tests are added as well.
> >
> > ---
> >
> > Hey,
> >
> > This is a respin of
> > https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658015.html. Rebased
> > with master. Regression tested on aarch64-unknown-linux-gnu and found no
> > regressions.
>
> Thanks for picking this up! The issues described below are of course not
> your fault :)
I've spotted a couple more issues, noted below.
> > Ok for master?
> >
> > Thanks,
> > Saurabh
> > ---
> > [...]
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> > index ca721dd2c09..0f6d366b2d6 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> > @@ -903,6 +903,47 @@ struct load_ext_gather_base : public overloaded_base<1>
> > }
> > };
> >
> > +
> > +/* sv<v0>_t svlut_<t0>(sv<t0>_t, svuint8_t, uint64_t)
>
> Because of the potential tuple argument, I suppose this should be:
>
> sv<v0>_t svlut[_<t0>_g](sv<t0>x<g>_t, svuint8_t, uint64_t)
>
> Unlike for the ZT version of svluti, the type suffix is optional.
> To quote from the ACLE spec:
>
> svint8_t svluti2_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx);
> svint16_t svluti2_lane[_s16]( svint16_t table, svuint8_t indices, uint64_t imm_idx);
> svint8_t svluti4_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx);
> svint16_t svluti4_lane[_s16](svint16_t table, svuint8_t indices, uint64_t imm_idx);
> svint16_t svluti4_lane[_s16_x2](svint16x2_t table, svuint8_t indices, uint64_t imm_idx);
>
> Because of that:
>
> > + where the final argument is a constant index, the instruction divides
> > + the vector argument in BITS-bit quantities. */
> > +template<unsigned int BITS>
> > +struct luti_base : public nonoverloaded_base
>
> ...this should be an overloaded function and have a resolver.
>
> > +{
> > + void
> > + build (function_builder &b, const function_group_info &group) const override
> > + {
> > + /* Format: return type, table vector, indices vector, immediate value. */
> > + build_all (b, "v0,t0,vu8,su64", group, MODE_none);
> > + }
> > +
> > + bool
> > + check (function_checker &c) const override
> > + {
> > + int max_range;
> > + bool byte_mode = c.type_suffix (0).element_bits == 8;
> > +
> > + if (BITS == 2)
> > + max_range = byte_mode ? 3 : 7;
> > + else if (BITS == 4)
> > + max_range = byte_mode ? 1 : 7;
>
> It looks like this should be ? 1 : 3, see:
>
> https://developer.arm.com/documentation/ddi0602/2024-12/SVE-Instructions/LUTI4--Lookup-table-read-with-4-bit-indices-?lang=en
>
> Or, programmatically, I think this is:
>
> auto max_range = c.type.suffix (0).element_bits / BITS - 1;
>
> for all cases.
>
> > + else
> > + /* Unsupported number of indices bits for LUTI. */
> > + gcc_unreachable ();
> > +
> > + return c.require_immediate_range (2, 0, max_range);
> > + }
> > +
> > +};
> > +
> > +/* Specializations for 2-bit and 4-bit indices. */
> > +using luti2_def = luti_base<2>;
> > +SHAPE (luti2)
> > +
> > +using luti4_def = luti_base<4>;
> > +SHAPE (luti4)
> > +
> > +
> > /* sv<t0>x<g>_t svfoo_t0_g(uint64_t, svuint8_t, uint64_t)
> >
> > where the first argument is the ZT register number (currently always 0)
> > [...]
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> > index e726fa1fb68..0c4f8251ac0 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> > @@ -164,6 +164,10 @@ DEF_SVE_FUNCTION (svwhilegt, compare_scalar, while, none)
> > DEF_SVE_FUNCTION (svwhilerw, compare_ptr, all_data, none)
> > DEF_SVE_FUNCTION (svwhilewr, compare_ptr, all_data, none)
> > DEF_SVE_FUNCTION (svxar, ternary_shift_right_imm, all_integer, none)
> > +DEF_SVE_FUNCTION (svluti2_lane, luti2, bhs_data, none)
> > +DEF_SVE_FUNCTION (svluti4_lane, luti4, bhs_data, none)
> > +DEF_SVE_FUNCTION_GS (svluti4_lane, luti4, bhs_data, x2, none)
>
> bhs_data looks wrong: there should be no .s versions. Similarly...
This also needs gating; I think the correct condition is to prefix the new intrinsics with
#undef REQUIRED_EXTENSIONS
#define REQUIRED_EXTENSIONS \
sve_and_sme (AARCH64_FL_SVE2 | AARCH64_FL_LUT, \
AARCH64_FL_SME2 | AARCH64_FL_LUT)
> > +
> > #undef REQUIRED_EXTENSIONS
> >
> > #define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2)
> > [...]
> > diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
> > index f8cfe08f4c0..7dcbc0700da 100644
> > --- a/gcc/config/aarch64/aarch64-sve2.md
> > +++ b/gcc/config/aarch64/aarch64-sve2.md
> > @@ -133,6 +133,7 @@
> > ;; ---- Optional AES extensions
> > ;; ---- Optional SHA-3 extensions
> > ;; ---- Optional SM4 extensions
> > +;; ---- Table lookup
> >
> > ;; =========================================================================
> > ;; == Moves
> > @@ -4211,3 +4212,47 @@
> > "sm4ekey\t%0.s, %1.s, %2.s"
> > [(set_attr "type" "crypto_sm4")]
> > )
> > +
> > +;; -------------------------------------------------------------------------
> > +;; ---- Table lookup
> > +;; -------------------------------------------------------------------------
> > +;; Includes:
> > +;; - LUTI2
> > +;; - LUTI4
> > +;; -------------------------------------------------------------------------
> > +
> > +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> > + [(set (match_operand:SVE_FULL_BS 0 "register_operand" "=w")
> > + (unspec:SVE_FULL_BS
> > + [(match_operand:SVE_FULL_BS 1 "register_operand" "w")
> > + (match_operand:VNx16QI 2 "register_operand" "w")
> > + (match_operand:DI 3 "const_int_operand")
> > + (const_int LUTI_BITS)]
> > + UNSPEC_SVE_LUTI))]
> > + "TARGET_SVE2"
> > + "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
> > +)
Similarly, these need to be gated on:
"TARGET_LUT && TARGET_SVE2_OR_SME2"
I've just realised that there's a similar issue with the FAMINMAX instructions
in the same file; would you be able to send a separate patch fixing those?
Thanks!
> > +
> > +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> > + [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
> > + (unspec:<VSINGLE>
> > + [(match_operand:SVE_FULL_H 1 "aligned_register_operand" "w")
> > + (match_operand:VNx16QI 2 "register_operand" "w")
> > + (match_operand:DI 3 "const_int_operand")
> > + (const_int LUTI_BITS)]
> > + UNSPEC_SVE_LUTI))]
> > + "TARGET_SVE2"
> > + "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
> > +)
>
> ...there should be .S (VNx4) variants here. Also, the .H variants
> don't require an aligned register operand. (FWIW, using "w" with
> "aligned_register_operand" is wrong in any case, since "w" accepts
> unaligned registers.)
>
> It looks like we could merge the patterns above into a single SVE_FULL_BH
> pattern.
>
> > +
> > +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> > + [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
> > + (unspec:<VSINGLE>
> > + [(match_operand:SVE_FULL_Hx2 1 "aligned_register_operand" "Uw2")
>
> This operand also isn't required to be aligned: Zn has a 5-bit encoding.
>
> > + (match_operand:VNx16QI 2 "register_operand" "w")
> > + (match_operand:DI 3 "const_int_operand")
> > + (const_int LUTI_BITS)]
> > + UNSPEC_SVE_LUTI))]
> > + "TARGET_SVE2"
> > + "luti<LUTI_BITS>\t%0.<Vetype>, %1, %2[%3]"
> > +)
>
> Also, formatting nit, but: it's more usual to indent the "[" in an
> unspec by 2 or 1 extra columns, rather than a full tab.
>
> > [...]
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
> > new file mode 100644
> > index 00000000000..f423bfae2c6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
> > @@ -0,0 +1,40 @@
> > +/* { dg-do compile } */
>
> This would be better as:
>
> /* { dg-do assemble { target aarch64_asm_lut_ok } } */
> /* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
>
> with lut added to:
>
> foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"
> "i8mm" "f32mm" "f64mm" "bf16" "sb" "sve2" "ls64"
> "sme" "sme-i16i64" "sme2" "sve-b16b16"
> "sme-b16b16" "sme-f16f16" "sme2p1" "fp8" "fp8fma"
> "ssve-fp8fma" "fp8dot2" "ssve-fp8dot2" "fp8dot4"
> "ssve-fp8dot4"} {
> eval [string map [list FUNC $aarch64_ext] {
> proc check_effective_target_aarch64_asm_FUNC_ok { } {
> if { [istarget aarch64*-*-*] } {
> return [check_no_compiler_messages aarch64_FUNC_assembler object {
> __asm__ (".arch_extension FUNC");
> } "-march=armv8-a+FUNC"]
> } else {
> return 0
> }
> }
> }]
> }
>
> in target-supports.exp.
>
> > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> > +
> > +#include "test_sve_acle.h"
> > +
> > +#pragma GCC target "+sve2+lut"
> > +#if STREAMING_COMPATIBLE
> > +#pragma GCC target "+sme2"
> > +#endif
> > +
> > +/*
> > +** luti2_test_imm0:
> > +** luti2 z1\.h, \{ z28\.h \}, z0\[0\]
> > +** ret
> > +*/
> > +
> > +TEST_XN_SINGLE (luti2_test_imm0, svbfloat16_t, svuint8_t, z1,
> > + svluti2_lane_bf16 (z28, z0, 0),
> > + svluti2_lane_bf16 (z28, z0, 0))
>
> Following on from the comment above about these intrinsics being
> overloaded: the second call above should not have a type suffix.
> Similarly for the other tests (sorry!).
>
> > +
> > +/*
> > +** luti2_test_imm1:
> > +** luti2 z1\.h, \{ z28\.h \}, z0\[1\]
> > +** ret
> > +*/
> > +
> > +TEST_XN_SINGLE (luti2_test_imm1, svbfloat16_t, svuint8_t, z1,
> > + svluti2_lane_bf16 (z28, z0, 1),
> > + svluti2_lane_bf16 (z28, z0, 1))
> > +
> > +/*
> > +** luti2_test_tied:
> > +** luti2 z28\.h, \{ z28\.h \}, z0\[2\]
> > +** ret
> > +*/
> > +
> > +TEST_XN_SINGLE (luti2_test_tied, svbfloat16_t, svuint8_t, z28,
> > + svluti2_lane_bf16 (z28, z0, 2),
> > + svluti2_lane_bf16 (z28, z0, 2))
>
> I think we should test the upper bound of each range. A combination of
> testing the upper bound + the dg-do change would have caught the ? 1 : 7
> thing above.
>
> > [...]
> > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> > index 45ba2f47a9d..e0d9867801c 100644
> > --- a/gcc/testsuite/lib/target-supports.exp
> > +++ b/gcc/testsuite/lib/target-supports.exp
> > @@ -4800,6 +4800,18 @@ proc check_effective_target_aarch64_sve2 { } {
> > }]
> > }
> >
> > +# Return 1 if this is an AArch64 target supporting LUT (Lookup table)
> > +proc check_effective_target_aarch64_lut { } {
> > + if { ![istarget aarch64*-*-*] || ![check_effective_target_aarch64_sve2] } {
> > + return 0
> > + }
> > + return [check_no_compiler_messages aarch64_lut assembly {
> > + #if !defined (__ARM_FEATURE_LUT)
> > + #error FOO
> > + #endif
> > + }]
> > +}
> > +
> > # Return 1 if this is an AArch64 target only supporting SVE (not SVE2).
> > proc check_effective_target_aarch64_sve1_only { } {
> > return [expr { [check_effective_target_aarch64_sve]
>
> I'm not sure this is needed.
>
> Thanks,
> Richard
On 1/8/2025 11:13 AM, Richard Sandiford wrote:
> <saurabh.jha@arm.com> writes:
>> This patch introduces support for LUTI2/LUTI4 ACLE for SVE2.
>>
>> LUTI instructions are used for efficient table lookups with 2-bit
>> or 4-bit indices. LUTI2 reads indexed 8-bit or 16-bit elements from
>> the low 128 bits of the table vector using packed 2-bit indices,
>> while LUTI4 can read from the low 128 or 256 bits of the table
>> vector or from two table vectors using packed 4-bit indices.
>> These instructions fill the destination vector by copying elements
>> indexed by segments of the source vector, selected by the vector
>> segment index.
>>
>> The changes include the addition of a new AArch64 option
>> extension "lut", __ARM_FEATURE_LUT preprocessor macro, definitions
>> for the new LUTI instruction shapes, and implementations of the
>> svluti2 and svluti4 builtins.
>>
>> New tests are added as well.
>>
>> ---
>>
>> Hey,
>>
>> This is a respin of
>> https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658015.html. Rebased
>> with master. Regression tested on aarch64-unknown-linux-gnu and found no
>> regressions.
>
> Thanks for picking this up! The issues described below are of course not
> your fault :)
>
>> Ok for master?
>>
>> Thanks,
>> Saurabh
>> ---
>> [...]
>> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
>> index ca721dd2c09..0f6d366b2d6 100644
>> --- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
>> +++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
>> @@ -903,6 +903,47 @@ struct load_ext_gather_base : public overloaded_base<1>
>> }
>> };
>>
>> +
>> +/* sv<v0>_t svlut_<t0>(sv<t0>_t, svuint8_t, uint64_t)
>
> Because of the potential tuple argument, I suppose this should be:
>
> sv<v0>_t svlut[_<t0>_g](sv<t0>x<g>_t, svuint8_t, uint64_t)
>
> Unlike for the ZT version of svluti, the type suffix is optional.
> To quote from the ACLE spec:
>
> svint8_t svluti2_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx);
> svint16_t svluti2_lane[_s16]( svint16_t table, svuint8_t indices, uint64_t imm_idx);
> svint8_t svluti4_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx);
> svint16_t svluti4_lane[_s16](svint16_t table, svuint8_t indices, uint64_t imm_idx);
> svint16_t svluti4_lane[_s16_x2](svint16x2_t table, svuint8_t indices, uint64_t imm_idx);
>
> Because of that:
>
>> + where the final argument is a constant index, the instruction divides
>> + the vector argument in BITS-bit quantities. */
>> +template<unsigned int BITS>
>> +struct luti_base : public nonoverloaded_base
>
> ...this should be an overloaded function and have a resolver.
>
>> +{
>> + void
>> + build (function_builder &b, const function_group_info &group) const override
>> + {
>> + /* Format: return type, table vector, indices vector, immediate value. */
>> + build_all (b, "v0,t0,vu8,su64", group, MODE_none);
>> + }
>> +
>> + bool
>> + check (function_checker &c) const override
>> + {
>> + int max_range;
>> + bool byte_mode = c.type_suffix (0).element_bits == 8;
>> +
>> + if (BITS == 2)
>> + max_range = byte_mode ? 3 : 7;
>> + else if (BITS == 4)
>> + max_range = byte_mode ? 1 : 7;
>
> It looks like this should be ? 1 : 3, see:
>
> https://developer.arm.com/documentation/ddi0602/2024-12/SVE-Instructions/LUTI4--Lookup-table-read-with-4-bit-indices-?lang=en
>
> Or, programmatically, I think this is:
>
> auto max_range = c.type.suffix (0).element_bits / BITS - 1;
>
> for all cases.
>
>> + else
>> + /* Unsupported number of indices bits for LUTI. */
>> + gcc_unreachable ();
>> +
>> + return c.require_immediate_range (2, 0, max_range);
>> + }
>> +
>> +};
>> +
>> +/* Specializations for 2-bit and 4-bit indices. */
>> +using luti2_def = luti_base<2>;
>> +SHAPE (luti2)
>> +
>> +using luti4_def = luti_base<4>;
>> +SHAPE (luti4)
>> +
>> +
>> /* sv<t0>x<g>_t svfoo_t0_g(uint64_t, svuint8_t, uint64_t)
>>
>> where the first argument is the ZT register number (currently always 0)
>> [...]
>> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
>> index e726fa1fb68..0c4f8251ac0 100644
>> --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
>> +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
>> @@ -164,6 +164,10 @@ DEF_SVE_FUNCTION (svwhilegt, compare_scalar, while, none)
>> DEF_SVE_FUNCTION (svwhilerw, compare_ptr, all_data, none)
>> DEF_SVE_FUNCTION (svwhilewr, compare_ptr, all_data, none)
>> DEF_SVE_FUNCTION (svxar, ternary_shift_right_imm, all_integer, none)
>> +DEF_SVE_FUNCTION (svluti2_lane, luti2, bhs_data, none)
>> +DEF_SVE_FUNCTION (svluti4_lane, luti4, bhs_data, none)
>> +DEF_SVE_FUNCTION_GS (svluti4_lane, luti4, bhs_data, x2, none)
>
> bhs_data looks wrong: there should be no .s versions. Similarly...
>
>> +
>> #undef REQUIRED_EXTENSIONS
>>
>> #define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2)
>> [...]
>> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
>> index f8cfe08f4c0..7dcbc0700da 100644
>> --- a/gcc/config/aarch64/aarch64-sve2.md
>> +++ b/gcc/config/aarch64/aarch64-sve2.md
>> @@ -133,6 +133,7 @@
>> ;; ---- Optional AES extensions
>> ;; ---- Optional SHA-3 extensions
>> ;; ---- Optional SM4 extensions
>> +;; ---- Table lookup
>>
>> ;; =========================================================================
>> ;; == Moves
>> @@ -4211,3 +4212,47 @@
>> "sm4ekey\t%0.s, %1.s, %2.s"
>> [(set_attr "type" "crypto_sm4")]
>> )
>> +
>> +;; -------------------------------------------------------------------------
>> +;; ---- Table lookup
>> +;; -------------------------------------------------------------------------
>> +;; Includes:
>> +;; - LUTI2
>> +;; - LUTI4
>> +;; -------------------------------------------------------------------------
>> +
>> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
>> + [(set (match_operand:SVE_FULL_BS 0 "register_operand" "=w")
>> + (unspec:SVE_FULL_BS
>> + [(match_operand:SVE_FULL_BS 1 "register_operand" "w")
>> + (match_operand:VNx16QI 2 "register_operand" "w")
>> + (match_operand:DI 3 "const_int_operand")
>> + (const_int LUTI_BITS)]
>> + UNSPEC_SVE_LUTI))]
>> + "TARGET_SVE2"
>> + "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
>> +)
>> +
>> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
>> + [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
>> + (unspec:<VSINGLE>
>> + [(match_operand:SVE_FULL_H 1 "aligned_register_operand" "w")
>> + (match_operand:VNx16QI 2 "register_operand" "w")
>> + (match_operand:DI 3 "const_int_operand")
>> + (const_int LUTI_BITS)]
>> + UNSPEC_SVE_LUTI))]
>> + "TARGET_SVE2"
>> + "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
>> +)
>
> ...there should be .S (VNx4) variants here. Also, the .H variants
> don't require an aligned register operand. (FWIW, using "w" with
> "aligned_register_operand" is wrong in any case, since "w" accepts
> unaligned registers.)
You meant "..x2" here, right?
>
> It looks like we could merge the patterns above into a single SVE_FULL_BH
> pattern.
>
>> +
>> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
>> + [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
>> + (unspec:<VSINGLE>
>> + [(match_operand:SVE_FULL_Hx2 1 "aligned_register_operand" "Uw2")
>
> This operand also isn't required to be aligned: Zn has a 5-bit encoding.
>
>> + (match_operand:VNx16QI 2 "register_operand" "w")
>> + (match_operand:DI 3 "const_int_operand")
>> + (const_int LUTI_BITS)]
>> + UNSPEC_SVE_LUTI))]
>> + "TARGET_SVE2"
>> + "luti<LUTI_BITS>\t%0.<Vetype>, %1, %2[%3]"
>> +)
>
> Also, formatting nit, but: it's more usual to indent the "[" in an
> unspec by 2 or 1 extra columns, rather than a full tab.
>
>> [...]
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
>> new file mode 100644
>> index 00000000000..f423bfae2c6
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
>> @@ -0,0 +1,40 @@
>> +/* { dg-do compile } */
>
> This would be better as:
>
> /* { dg-do assemble { target aarch64_asm_lut_ok } } */
> /* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
>
> with lut added to:
>
> foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"
> "i8mm" "f32mm" "f64mm" "bf16" "sb" "sve2" "ls64"
> "sme" "sme-i16i64" "sme2" "sve-b16b16"
> "sme-b16b16" "sme-f16f16" "sme2p1" "fp8" "fp8fma"
> "ssve-fp8fma" "fp8dot2" "ssve-fp8dot2" "fp8dot4"
> "ssve-fp8dot4"} {
> eval [string map [list FUNC $aarch64_ext] {
> proc check_effective_target_aarch64_asm_FUNC_ok { } {
> if { [istarget aarch64*-*-*] } {
> return [check_no_compiler_messages aarch64_FUNC_assembler object {
> __asm__ (".arch_extension FUNC");
> } "-march=armv8-a+FUNC"]
> } else {
> return 0
> }
> }
> }]
> }
>
> in target-supports.exp.
>
>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>> +
>> +#include "test_sve_acle.h"
>> +
>> +#pragma GCC target "+sve2+lut"
>> +#if STREAMING_COMPATIBLE
>> +#pragma GCC target "+sme2"
>> +#endif
>> +
>> +/*
>> +** luti2_test_imm0:
>> +** luti2 z1\.h, \{ z28\.h \}, z0\[0\]
>> +** ret
>> +*/
>> +
>> +TEST_XN_SINGLE (luti2_test_imm0, svbfloat16_t, svuint8_t, z1,
>> + svluti2_lane_bf16 (z28, z0, 0),
>> + svluti2_lane_bf16 (z28, z0, 0))
>
> Following on from the comment above about these intrinsics being
> overloaded: the second call above should not have a type suffix.
> Similarly for the other tests (sorry!).
>
>> +
>> +/*
>> +** luti2_test_imm1:
>> +** luti2 z1\.h, \{ z28\.h \}, z0\[1\]
>> +** ret
>> +*/
>> +
>> +TEST_XN_SINGLE (luti2_test_imm1, svbfloat16_t, svuint8_t, z1,
>> + svluti2_lane_bf16 (z28, z0, 1),
>> + svluti2_lane_bf16 (z28, z0, 1))
>> +
>> +/*
>> +** luti2_test_tied:
>> +** luti2 z28\.h, \{ z28\.h \}, z0\[2\]
>> +** ret
>> +*/
>> +
>> +TEST_XN_SINGLE (luti2_test_tied, svbfloat16_t, svuint8_t, z28,
>> + svluti2_lane_bf16 (z28, z0, 2),
>> + svluti2_lane_bf16 (z28, z0, 2))
>
> I think we should test the upper bound of each range. A combination of
> testing the upper bound + the dg-do change would have caught the ? 1 : 7
> thing above.
>
>> [...]
>> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
>> index 45ba2f47a9d..e0d9867801c 100644
>> --- a/gcc/testsuite/lib/target-supports.exp
>> +++ b/gcc/testsuite/lib/target-supports.exp
>> @@ -4800,6 +4800,18 @@ proc check_effective_target_aarch64_sve2 { } {
>> }]
>> }
>>
>> +# Return 1 if this is an AArch64 target supporting LUT (Lookup table)
>> +proc check_effective_target_aarch64_lut { } {
>> + if { ![istarget aarch64*-*-*] || ![check_effective_target_aarch64_sve2] } {
>> + return 0
>> + }
>> + return [check_no_compiler_messages aarch64_lut assembly {
>> + #if !defined (__ARM_FEATURE_LUT)
>> + #error FOO
>> + #endif
>> + }]
>> +}
>> +
>> # Return 1 if this is an AArch64 target only supporting SVE (not SVE2).
>> proc check_effective_target_aarch64_sve1_only { } {
>> return [expr { [check_effective_target_aarch64_sve]
>
> I'm not sure this is needed.
>
> Thanks,
> Richard
Saurabh Jha <saurabh.jha@arm.com> writes:
> On 1/8/2025 11:13 AM, Richard Sandiford wrote:
>> <saurabh.jha@arm.com> writes:
>>> [...]
>>> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
>>> index e726fa1fb68..0c4f8251ac0 100644
>>> --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
>>> +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
>>> @@ -164,6 +164,10 @@ DEF_SVE_FUNCTION (svwhilegt, compare_scalar, while, none)
>>> DEF_SVE_FUNCTION (svwhilerw, compare_ptr, all_data, none)
>>> DEF_SVE_FUNCTION (svwhilewr, compare_ptr, all_data, none)
>>> DEF_SVE_FUNCTION (svxar, ternary_shift_right_imm, all_integer, none)
>>> +DEF_SVE_FUNCTION (svluti2_lane, luti2, bhs_data, none)
>>> +DEF_SVE_FUNCTION (svluti4_lane, luti4, bhs_data, none)
>>> +DEF_SVE_FUNCTION_GS (svluti4_lane, luti4, bhs_data, x2, none)
>>
>> bhs_data looks wrong: there should be no .s versions. Similarly...
>>
>>> +
>>> #undef REQUIRED_EXTENSIONS
>>>
>>> #define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2)
>>> [...]
>>> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
>>> index f8cfe08f4c0..7dcbc0700da 100644
>>> --- a/gcc/config/aarch64/aarch64-sve2.md
>>> +++ b/gcc/config/aarch64/aarch64-sve2.md
>>> @@ -133,6 +133,7 @@
>>> ;; ---- Optional AES extensions
>>> ;; ---- Optional SHA-3 extensions
>>> ;; ---- Optional SM4 extensions
>>> +;; ---- Table lookup
>>>
>>> ;; =========================================================================
>>> ;; == Moves
>>> @@ -4211,3 +4212,47 @@
>>> "sm4ekey\t%0.s, %1.s, %2.s"
>>> [(set_attr "type" "crypto_sm4")]
>>> )
>>> +
>>> +;; -------------------------------------------------------------------------
>>> +;; ---- Table lookup
>>> +;; -------------------------------------------------------------------------
>>> +;; Includes:
>>> +;; - LUTI2
>>> +;; - LUTI4
>>> +;; -------------------------------------------------------------------------
>>> +
>>> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
>>> + [(set (match_operand:SVE_FULL_BS 0 "register_operand" "=w")
>>> + (unspec:SVE_FULL_BS
>>> + [(match_operand:SVE_FULL_BS 1 "register_operand" "w")
>>> + (match_operand:VNx16QI 2 "register_operand" "w")
>>> + (match_operand:DI 3 "const_int_operand")
>>> + (const_int LUTI_BITS)]
>>> + UNSPEC_SVE_LUTI))]
>>> + "TARGET_SVE2"
>>> + "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
>>> +)
>>> +
>>> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
>>> + [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
>>> + (unspec:<VSINGLE>
>>> + [(match_operand:SVE_FULL_H 1 "aligned_register_operand" "w")
>>> + (match_operand:VNx16QI 2 "register_operand" "w")
>>> + (match_operand:DI 3 "const_int_operand")
>>> + (const_int LUTI_BITS)]
>>> + UNSPEC_SVE_LUTI))]
>>> + "TARGET_SVE2"
>>> + "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
>>> +)
>>
>> ...there should be .S (VNx4) variants here. Also, the .H variants
>> don't require an aligned register operand. (FWIW, using "w" with
>> "aligned_register_operand" is wrong in any case, since "w" accepts
>> unaligned registers.)
>
> You meant "..x2" here, right?
I did mean VNx4 (the mode prefix for full .S vectors, as in VNx4SI).
But I also meant "there should be *no* .S" -- sorry about that.
Richard
Hey Andrew,
On 1/8/2025 12:15 PM, Andrew Carlotti wrote:
> On Wed, Jan 08, 2025 at 11:13:41AM +0000, Richard Sandiford wrote:
>> <saurabh.jha@arm.com> writes:
>>> This patch introduces support for LUTI2/LUTI4 ACLE for SVE2.
>>>
>>> LUTI instructions are used for efficient table lookups with 2-bit
>>> or 4-bit indices. LUTI2 reads indexed 8-bit or 16-bit elements from
>>> the low 128 bits of the table vector using packed 2-bit indices,
>>> while LUTI4 can read from the low 128 or 256 bits of the table
>>> vector or from two table vectors using packed 4-bit indices.
>>> These instructions fill the destination vector by copying elements
>>> indexed by segments of the source vector, selected by the vector
>>> segment index.
>>>
>>> The changes include the addition of a new AArch64 option
>>> extension "lut", __ARM_FEATURE_LUT preprocessor macro, definitions
>>> for the new LUTI instruction shapes, and implementations of the
>>> svluti2 and svluti4 builtins.
>>>
>>> New tests are added as well.
>>>
>>> ---
>>>
>>> Hey,
>>>
>>> This is a respin of
>>> https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658015.html. Rebased
>>> with master. Regression tested on aarch64-unknown-linux-gnu and found no
>>> regressions.
>>
>> Thanks for picking this up! The issues described below are of course not
>> your fault :)
>
> I've spotted a couple more issues, noted below.
>
>>> Ok for master?
>>>
>>> Thanks,
>>> Saurabh
>>> ---
>>> [...]
>>> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
>>> index ca721dd2c09..0f6d366b2d6 100644
>>> --- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
>>> +++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
>>> @@ -903,6 +903,47 @@ struct load_ext_gather_base : public overloaded_base<1>
>>> }
>>> };
>>>
>>> +
>>> +/* sv<v0>_t svlut_<t0>(sv<t0>_t, svuint8_t, uint64_t)
>>
>> Because of the potential tuple argument, I suppose this should be:
>>
>> sv<v0>_t svlut[_<t0>_g](sv<t0>x<g>_t, svuint8_t, uint64_t)
>>
>> Unlike for the ZT version of svluti, the type suffix is optional.
>> To quote from the ACLE spec:
>>
>> svint8_t svluti2_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx);
>> svint16_t svluti2_lane[_s16]( svint16_t table, svuint8_t indices, uint64_t imm_idx);
>> svint8_t svluti4_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx);
>> svint16_t svluti4_lane[_s16](svint16_t table, svuint8_t indices, uint64_t imm_idx);
>> svint16_t svluti4_lane[_s16_x2](svint16x2_t table, svuint8_t indices, uint64_t imm_idx);
>>
>> Because of that:
>>
>>> + where the final argument is a constant index, the instruction divides
>>> + the vector argument in BITS-bit quantities. */
>>> +template<unsigned int BITS>
>>> +struct luti_base : public nonoverloaded_base
>>
>> ...this should be an overloaded function and have a resolver.
>>
>>> +{
>>> + void
>>> + build (function_builder &b, const function_group_info &group) const override
>>> + {
>>> + /* Format: return type, table vector, indices vector, immediate value. */
>>> + build_all (b, "v0,t0,vu8,su64", group, MODE_none);
>>> + }
>>> +
>>> + bool
>>> + check (function_checker &c) const override
>>> + {
>>> + int max_range;
>>> + bool byte_mode = c.type_suffix (0).element_bits == 8;
>>> +
>>> + if (BITS == 2)
>>> + max_range = byte_mode ? 3 : 7;
>>> + else if (BITS == 4)
>>> + max_range = byte_mode ? 1 : 7;
>>
>> It looks like this should be ? 1 : 3, see:
>>
>> https://developer.arm.com/documentation/ddi0602/2024-12/SVE-Instructions/LUTI4--Lookup-table-read-with-4-bit-indices-?lang=en
>>
>> Or, programmatically, I think this is:
>>
>> auto max_range = c.type.suffix (0).element_bits / BITS - 1;
>>
>> for all cases.
>>
>>> + else
>>> + /* Unsupported number of indices bits for LUTI. */
>>> + gcc_unreachable ();
>>> +
>>> + return c.require_immediate_range (2, 0, max_range);
>>> + }
>>> +
>>> +};
>>> +
>>> +/* Specializations for 2-bit and 4-bit indices. */
>>> +using luti2_def = luti_base<2>;
>>> +SHAPE (luti2)
>>> +
>>> +using luti4_def = luti_base<4>;
>>> +SHAPE (luti4)
>>> +
>>> +
>>> /* sv<t0>x<g>_t svfoo_t0_g(uint64_t, svuint8_t, uint64_t)
>>>
>>> where the first argument is the ZT register number (currently always 0)
>>> [...]
>>> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
>>> index e726fa1fb68..0c4f8251ac0 100644
>>> --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
>>> +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
>>> @@ -164,6 +164,10 @@ DEF_SVE_FUNCTION (svwhilegt, compare_scalar, while, none)
>>> DEF_SVE_FUNCTION (svwhilerw, compare_ptr, all_data, none)
>>> DEF_SVE_FUNCTION (svwhilewr, compare_ptr, all_data, none)
>>> DEF_SVE_FUNCTION (svxar, ternary_shift_right_imm, all_integer, none)
>>> +DEF_SVE_FUNCTION (svluti2_lane, luti2, bhs_data, none)
>>> +DEF_SVE_FUNCTION (svluti4_lane, luti4, bhs_data, none)
>>> +DEF_SVE_FUNCTION_GS (svluti4_lane, luti4, bhs_data, x2, none)
>>
>> bhs_data looks wrong: there should be no .s versions. Similarly...
>
> This also needs gating; I think the correct condition is to prefix the new intrinsics with
>
> #undef REQUIRED_EXTENSIONS
> #define REQUIRED_EXTENSIONS \
> sve_and_sme (AARCH64_FL_SVE2 | AARCH64_FL_LUT, \
> AARCH64_FL_SME2 | AARCH64_FL_LUT)
>
>>> +
>>> #undef REQUIRED_EXTENSIONS
>>>
>>> #define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2)
>>> [...]
>>> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
>>> index f8cfe08f4c0..7dcbc0700da 100644
>>> --- a/gcc/config/aarch64/aarch64-sve2.md
>>> +++ b/gcc/config/aarch64/aarch64-sve2.md
>>> @@ -133,6 +133,7 @@
>>> ;; ---- Optional AES extensions
>>> ;; ---- Optional SHA-3 extensions
>>> ;; ---- Optional SM4 extensions
>>> +;; ---- Table lookup
>>>
>>> ;; =========================================================================
>>> ;; == Moves
>>> @@ -4211,3 +4212,47 @@
>>> "sm4ekey\t%0.s, %1.s, %2.s"
>>> [(set_attr "type" "crypto_sm4")]
>>> )
>>> +
>>> +;; -------------------------------------------------------------------------
>>> +;; ---- Table lookup
>>> +;; -------------------------------------------------------------------------
>>> +;; Includes:
>>> +;; - LUTI2
>>> +;; - LUTI4
>>> +;; -------------------------------------------------------------------------
>>> +
>>> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
>>> + [(set (match_operand:SVE_FULL_BS 0 "register_operand" "=w")
>>> + (unspec:SVE_FULL_BS
>>> + [(match_operand:SVE_FULL_BS 1 "register_operand" "w")
>>> + (match_operand:VNx16QI 2 "register_operand" "w")
>>> + (match_operand:DI 3 "const_int_operand")
>>> + (const_int LUTI_BITS)]
>>> + UNSPEC_SVE_LUTI))]
>>> + "TARGET_SVE2"
>>> + "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
>>> +)
>
> Similarly, these need to be gated on:
> "TARGET_LUT && TARGET_SVE2_OR_SME2"
>
> I've just realised that there's a similar issue with the FAMINMAX instructions
> in the same file; would you be able to send a separate patch fixing those?
We're doing it differently for faminmax. We're using the instruction
pattern "@cond_<optab><mode>" in aarch64-sve.md and in the iterator
SVE_COND_FP_BINARY used by that instruction pattern, the relevant
unspecs, UNSPEC_COND_FAMAX and UNSPEC_COND_FAMIN are guarded behind
TARGET_SVE_FAMINMAX which equals "TARGET_SVE && TARGET_FAMINMNAX". So
we're gating the SVE intrinsics I think.
The other relevant instruction pattern "aarch64_pred_faminmax_fused"
is already gated behind "TARGET_SVE_FAMINMAX".
Do we need to do anything differently here?
>
> Thanks!
>
>>> +
>>> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
>>> + [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
>>> + (unspec:<VSINGLE>
>>> + [(match_operand:SVE_FULL_H 1 "aligned_register_operand" "w")
>>> + (match_operand:VNx16QI 2 "register_operand" "w")
>>> + (match_operand:DI 3 "const_int_operand")
>>> + (const_int LUTI_BITS)]
>>> + UNSPEC_SVE_LUTI))]
>>> + "TARGET_SVE2"
>>> + "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
>>> +)
>>
>> ...there should be .S (VNx4) variants here. Also, the .H variants
>> don't require an aligned register operand. (FWIW, using "w" with
>> "aligned_register_operand" is wrong in any case, since "w" accepts
>> unaligned registers.)
>>
>> It looks like we could merge the patterns above into a single SVE_FULL_BH
>> pattern.
>>
>>> +
>>> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
>>> + [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
>>> + (unspec:<VSINGLE>
>>> + [(match_operand:SVE_FULL_Hx2 1 "aligned_register_operand" "Uw2")
>>
>> This operand also isn't required to be aligned: Zn has a 5-bit encoding.
>>
>>> + (match_operand:VNx16QI 2 "register_operand" "w")
>>> + (match_operand:DI 3 "const_int_operand")
>>> + (const_int LUTI_BITS)]
>>> + UNSPEC_SVE_LUTI))]
>>> + "TARGET_SVE2"
>>> + "luti<LUTI_BITS>\t%0.<Vetype>, %1, %2[%3]"
>>> +)
>>
>> Also, formatting nit, but: it's more usual to indent the "[" in an
>> unspec by 2 or 1 extra columns, rather than a full tab.
>>
>>> [...]
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
>>> new file mode 100644
>>> index 00000000000..f423bfae2c6
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
>>> @@ -0,0 +1,40 @@
>>> +/* { dg-do compile } */
>>
>> This would be better as:
>>
>> /* { dg-do assemble { target aarch64_asm_lut_ok } } */
>> /* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
>>
>> with lut added to:
>>
>> foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"
>> "i8mm" "f32mm" "f64mm" "bf16" "sb" "sve2" "ls64"
>> "sme" "sme-i16i64" "sme2" "sve-b16b16"
>> "sme-b16b16" "sme-f16f16" "sme2p1" "fp8" "fp8fma"
>> "ssve-fp8fma" "fp8dot2" "ssve-fp8dot2" "fp8dot4"
>> "ssve-fp8dot4"} {
>> eval [string map [list FUNC $aarch64_ext] {
>> proc check_effective_target_aarch64_asm_FUNC_ok { } {
>> if { [istarget aarch64*-*-*] } {
>> return [check_no_compiler_messages aarch64_FUNC_assembler object {
>> __asm__ (".arch_extension FUNC");
>> } "-march=armv8-a+FUNC"]
>> } else {
>> return 0
>> }
>> }
>> }]
>> }
>>
>> in target-supports.exp.
>>
>>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>>> +
>>> +#include "test_sve_acle.h"
>>> +
>>> +#pragma GCC target "+sve2+lut"
>>> +#if STREAMING_COMPATIBLE
>>> +#pragma GCC target "+sme2"
>>> +#endif
>>> +
>>> +/*
>>> +** luti2_test_imm0:
>>> +** luti2 z1\.h, \{ z28\.h \}, z0\[0\]
>>> +** ret
>>> +*/
>>> +
>>> +TEST_XN_SINGLE (luti2_test_imm0, svbfloat16_t, svuint8_t, z1,
>>> + svluti2_lane_bf16 (z28, z0, 0),
>>> + svluti2_lane_bf16 (z28, z0, 0))
>>
>> Following on from the comment above about these intrinsics being
>> overloaded: the second call above should not have a type suffix.
>> Similarly for the other tests (sorry!).
>>
>>> +
>>> +/*
>>> +** luti2_test_imm1:
>>> +** luti2 z1\.h, \{ z28\.h \}, z0\[1\]
>>> +** ret
>>> +*/
>>> +
>>> +TEST_XN_SINGLE (luti2_test_imm1, svbfloat16_t, svuint8_t, z1,
>>> + svluti2_lane_bf16 (z28, z0, 1),
>>> + svluti2_lane_bf16 (z28, z0, 1))
>>> +
>>> +/*
>>> +** luti2_test_tied:
>>> +** luti2 z28\.h, \{ z28\.h \}, z0\[2\]
>>> +** ret
>>> +*/
>>> +
>>> +TEST_XN_SINGLE (luti2_test_tied, svbfloat16_t, svuint8_t, z28,
>>> + svluti2_lane_bf16 (z28, z0, 2),
>>> + svluti2_lane_bf16 (z28, z0, 2))
>>
>> I think we should test the upper bound of each range. A combination of
>> testing the upper bound + the dg-do change would have caught the ? 1 : 7
>> thing above.
>>
>>> [...]
>>> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
>>> index 45ba2f47a9d..e0d9867801c 100644
>>> --- a/gcc/testsuite/lib/target-supports.exp
>>> +++ b/gcc/testsuite/lib/target-supports.exp
>>> @@ -4800,6 +4800,18 @@ proc check_effective_target_aarch64_sve2 { } {
>>> }]
>>> }
>>>
>>> +# Return 1 if this is an AArch64 target supporting LUT (Lookup table)
>>> +proc check_effective_target_aarch64_lut { } {
>>> + if { ![istarget aarch64*-*-*] || ![check_effective_target_aarch64_sve2] } {
>>> + return 0
>>> + }
>>> + return [check_no_compiler_messages aarch64_lut assembly {
>>> + #if !defined (__ARM_FEATURE_LUT)
>>> + #error FOO
>>> + #endif
>>> + }]
>>> +}
>>> +
>>> # Return 1 if this is an AArch64 target only supporting SVE (not SVE2).
>>> proc check_effective_target_aarch64_sve1_only { } {
>>> return [expr { [check_effective_target_aarch64_sve]
>>
>> I'm not sure this is needed.
>>
>> Thanks,
>> Richard
@@ -268,6 +268,8 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
aarch64_def_or_undef (TARGET_SVE_BF16,
"__ARM_FEATURE_SVE_BF16", pfile);
+ aarch64_def_or_undef (TARGET_LUT, "__ARM_FEATURE_LUT", pfile);
+
aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile);
aarch64_def_or_undef (TARGET_FP8DOT2, "__ARM_FEATURE_FP8DOT2", pfile);
@@ -903,6 +903,47 @@ struct load_ext_gather_base : public overloaded_base<1>
}
};
+
+/* sv<v0>_t svlut_<t0>(sv<t0>_t, svuint8_t, uint64_t)
+ where the final argument is a constant index, the instruction divides
+ the vector argument in BITS-bit quantities. */
+template<unsigned int BITS>
+struct luti_base : public nonoverloaded_base
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ /* Format: return type, table vector, indices vector, immediate value. */
+ build_all (b, "v0,t0,vu8,su64", group, MODE_none);
+ }
+
+ bool
+ check (function_checker &c) const override
+ {
+ int max_range;
+ bool byte_mode = c.type_suffix (0).element_bits == 8;
+
+ if (BITS == 2)
+ max_range = byte_mode ? 3 : 7;
+ else if (BITS == 4)
+ max_range = byte_mode ? 1 : 7;
+ else
+ /* Unsupported number of indices bits for LUTI. */
+ gcc_unreachable ();
+
+ return c.require_immediate_range (2, 0, max_range);
+ }
+
+};
+
+/* Specializations for 2-bit and 4-bit indices. */
+using luti2_def = luti_base<2>;
+SHAPE (luti2)
+
+using luti4_def = luti_base<4>;
+SHAPE (luti4)
+
+
/* sv<t0>x<g>_t svfoo_t0_g(uint64_t, svuint8_t, uint64_t)
where the first argument is the ZT register number (currently always 0)
@@ -164,6 +164,8 @@ namespace aarch64_sve
extern const function_shape *const load_gather64_vs_offset;
extern const function_shape *const load_replicate;
extern const function_shape *const load_za;
+ extern const function_shape *const luti2;
+ extern const function_shape *const luti4;
extern const function_shape *const luti2_lane_zt;
extern const function_shape *const luti4_lane_zt;
extern const function_shape *const mmla;
@@ -914,6 +914,21 @@ public:
unsigned int m_base;
};
+class svluti_lane_impl : public function_base
+{
+public:
+ CONSTEXPR svluti_lane_impl (unsigned int bits) : m_bits (bits)
+ {}
+
+ rtx expand (function_expander &e) const override
+ {
+ auto mode = e.tuple_mode (0);
+ return e.use_exact_insn (code_for_aarch64_sve_luti (m_bits, mode));
+ }
+
+ unsigned int m_bits;
+};
+
} /* end anonymous namespace */
namespace aarch64_sve {
@@ -1205,5 +1220,7 @@ FUNCTION (svzip, multireg_permute, (UNSPEC_ZIP))
FUNCTION (svzipq, multireg_permute, (UNSPEC_ZIPQ))
FUNCTION (svzipq1, svzipq_impl, (0))
FUNCTION (svzipq2, svzipq_impl, (1))
+FUNCTION (svluti2_lane, svluti_lane_impl, (2))
+FUNCTION (svluti4_lane, svluti_lane_impl, (4))
} /* end namespace aarch64_sve */
@@ -164,6 +164,10 @@ DEF_SVE_FUNCTION (svwhilegt, compare_scalar, while, none)
DEF_SVE_FUNCTION (svwhilerw, compare_ptr, all_data, none)
DEF_SVE_FUNCTION (svwhilewr, compare_ptr, all_data, none)
DEF_SVE_FUNCTION (svxar, ternary_shift_right_imm, all_integer, none)
+DEF_SVE_FUNCTION (svluti2_lane, luti2, bhs_data, none)
+DEF_SVE_FUNCTION (svluti4_lane, luti4, bhs_data, none)
+DEF_SVE_FUNCTION_GS (svluti4_lane, luti4, bhs_data, x2, none)
+
#undef REQUIRED_EXTENSIONS
#define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2)
@@ -250,6 +250,8 @@ namespace aarch64_sve
extern const function_base *const svzipq;
extern const function_base *const svzipq1;
extern const function_base *const svzipq2;
+ extern const function_base *const svluti2_lane;
+ extern const function_base *const svluti4_lane;
}
}
@@ -133,6 +133,7 @@
;; ---- Optional AES extensions
;; ---- Optional SHA-3 extensions
;; ---- Optional SM4 extensions
+;; ---- Table lookup
;; =========================================================================
;; == Moves
@@ -4211,3 +4212,47 @@
"sm4ekey\t%0.s, %1.s, %2.s"
[(set_attr "type" "crypto_sm4")]
)
+
+;; -------------------------------------------------------------------------
+;; ---- Table lookup
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - LUTI2
+;; - LUTI4
+;; -------------------------------------------------------------------------
+
+(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
+ [(set (match_operand:SVE_FULL_BS 0 "register_operand" "=w")
+ (unspec:SVE_FULL_BS
+ [(match_operand:SVE_FULL_BS 1 "register_operand" "w")
+ (match_operand:VNx16QI 2 "register_operand" "w")
+ (match_operand:DI 3 "const_int_operand")
+ (const_int LUTI_BITS)]
+ UNSPEC_SVE_LUTI))]
+ "TARGET_SVE2"
+ "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
+)
+
+(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
+ [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
+ (unspec:<VSINGLE>
+ [(match_operand:SVE_FULL_H 1 "aligned_register_operand" "w")
+ (match_operand:VNx16QI 2 "register_operand" "w")
+ (match_operand:DI 3 "const_int_operand")
+ (const_int LUTI_BITS)]
+ UNSPEC_SVE_LUTI))]
+ "TARGET_SVE2"
+ "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
+)
+
+(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
+ [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
+ (unspec:<VSINGLE>
+ [(match_operand:SVE_FULL_Hx2 1 "aligned_register_operand" "Uw2")
+ (match_operand:VNx16QI 2 "register_operand" "w")
+ (match_operand:DI 3 "const_int_operand")
+ (const_int LUTI_BITS)]
+ UNSPEC_SVE_LUTI))]
+ "TARGET_SVE2"
+ "luti<LUTI_BITS>\t%0.<Vetype>, %1, %2[%3]"
+)
@@ -553,6 +553,15 @@
(define_mode_iterator SVE_FULL_BHS [VNx16QI VNx8HI VNx4SI
VNx8BF VNx8HF VNx4SF])
+;; Fully-packed SVE vector byte modes that have 32-bit or smaller elements.
+(define_mode_iterator SVE_FULL_BS [VNx16QI VNx4SI VNx4SF])
+
+;; Fully-packed half word SVE vector modes
+(define_mode_iterator SVE_FULL_H [VNx8HI VNx8HF VNx8BF])
+
+;; Pairs of fully-packed SVE vector modes (half word only)
+(define_mode_iterator SVE_FULL_Hx2 [VNx16HI VNx16HF VNx16BF])
+
;; Fully-packed SVE vector modes that have 32-bit elements.
(define_mode_iterator SVE_FULL_S [VNx4SI VNx4SF])
@@ -1186,6 +1195,7 @@
UNSPEC_UZPQ2
UNSPEC_ZIPQ1
UNSPEC_ZIPQ2
+ UNSPEC_SVE_LUTI
;; All used in aarch64-sme.md
UNSPEC_SME_ADD
@@ -780,4 +780,20 @@
"w" (z16), "w" (z22), "w" (z29)); \
}
+#define TEST_1X2_NARROW(NAME, RTYPE, TTYPE, ZTYPE, CODE1, CODE2) \
+ PROTO(NAME, void, ()) \
+ { \
+ register RTYPE z0 __asm ("z0"); \
+ register ZTYPE z5 __asm ("z5"); \
+ register TTYPE z6 __asm ("z6"); \
+ register RTYPE z16 __asm ("z16"); \
+ register ZTYPE z22 __asm ("z22"); \
+ register TTYPE z29 __asm ("z29"); \
+ register RTYPE z0_res __asm ("z0"); \
+ __asm volatile ("" : "=w" (z0), "=w" (z5), "=w" (z6), \
+ "=w" (z16), "=w" (z22), "=w" (z29)); \
+ INVOKE (CODE1, CODE2); \
+ __asm volatile ("" :: "w" (z0_res), "w" (z5), "w" (z22)); \
+ }
+
#endif
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti2_test_imm0:
+** luti2 z1\.h, \{ z28\.h \}, z0\[0\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_imm0, svbfloat16_t, svuint8_t, z1,
+ svluti2_lane_bf16 (z28, z0, 0),
+ svluti2_lane_bf16 (z28, z0, 0))
+
+/*
+** luti2_test_imm1:
+** luti2 z1\.h, \{ z28\.h \}, z0\[1\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_imm1, svbfloat16_t, svuint8_t, z1,
+ svluti2_lane_bf16 (z28, z0, 1),
+ svluti2_lane_bf16 (z28, z0, 1))
+
+/*
+** luti2_test_tied:
+** luti2 z28\.h, \{ z28\.h \}, z0\[2\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_tied, svbfloat16_t, svuint8_t, z28,
+ svluti2_lane_bf16 (z28, z0, 2),
+ svluti2_lane_bf16 (z28, z0, 2))
+
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti2_test_imm0:
+** luti2 z1\.h, \{ z28\.h \}, z0\[0\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_imm0, svfloat16_t, svuint8_t, z1,
+ svluti2_lane_f16 (z28, z0, 0),
+ svluti2_lane_f16 (z28, z0, 0))
+
+/*
+** luti2_test_imm1:
+** luti2 z1\.h, \{ z28\.h \}, z0\[1\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_imm1, svfloat16_t, svuint8_t, z1,
+ svluti2_lane_f16 (z28, z0, 1),
+ svluti2_lane_f16 (z28, z0, 1))
+
+/*
+** luti2_test_tied:
+** luti2 z28\.h, \{ z28\.h \}, z0\[2\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_tied, svfloat16_t, svuint8_t, z28,
+ svluti2_lane_f16 (z28, z0, 2),
+ svluti2_lane_f16 (z28, z0, 2))
+
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti2_test_imm0:
+** luti2 z1\.h, \{ z28\.h \}, z0\[0\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_imm0, svint16_t, svuint8_t, z1,
+ svluti2_lane_s16 (z28, z0, 0),
+ svluti2_lane_s16 (z28, z0, 0))
+
+/*
+** luti2_test_imm1:
+** luti2 z1\.h, \{ z28\.h \}, z0\[1\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_imm1, svint16_t, svuint8_t, z1,
+ svluti2_lane_s16 (z28, z0, 1),
+ svluti2_lane_s16 (z28, z0, 1))
+
+/*
+** luti2_test_tied:
+** luti2 z28\.h, \{ z28\.h \}, z0\[2\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_tied, svint16_t, svuint8_t, z28,
+ svluti2_lane_s16 (z28, z0, 2),
+ svluti2_lane_s16 (z28, z0, 2))
+
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti2_test_imm0:
+** luti2 z1\.b, \{ z28\.b \}, z0\[0\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_imm0, svint8_t, svuint8_t, z1,
+ svluti2_lane_s8 (z28, z0, 0),
+ svluti2_lane_s8 (z28, z0, 0))
+
+/*
+** luti2_test_imm1:
+** luti2 z1\.b, \{ z28\.b \}, z0\[1\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_imm1, svint8_t, svuint8_t, z1,
+ svluti2_lane_s8 (z28, z0, 1),
+ svluti2_lane_s8 (z28, z0, 1))
+
+/*
+** luti2_test_tied:
+** luti2 z28\.b, \{ z28\.b \}, z0\[2\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_tied, svint8_t, svuint8_t, z28,
+ svluti2_lane_s8 (z28, z0, 2),
+ svluti2_lane_s8 (z28, z0, 2))
+
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti2_test_imm0:
+** luti2 z1\.h, \{ z28\.h \}, z0\[0\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_imm0, svuint16_t, svuint8_t, z1,
+ svluti2_lane_u16 (z28, z0, 0),
+ svluti2_lane_u16 (z28, z0, 0))
+
+/*
+** luti2_test_imm1:
+** luti2 z1\.h, \{ z28\.h \}, z0\[1\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_imm1, svuint16_t, svuint8_t, z1,
+ svluti2_lane_u16 (z28, z0, 1),
+ svluti2_lane_u16 (z28, z0, 1))
+
+/*
+** luti2_test_tied:
+** luti2 z28\.h, \{ z28\.h \}, z0\[2\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_tied, svuint16_t, svuint8_t, z28,
+ svluti2_lane_u16 (z28, z0, 2),
+ svluti2_lane_u16 (z28, z0, 2))
+
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti2_test_imm0:
+** luti2 z1\.b, \{ z28\.b \}, z0\[0\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_imm0, svuint8_t, svuint8_t, z1,
+ svluti2_lane_u8 (z28, z0, 0),
+ svluti2_lane_u8 (z28, z0, 0))
+
+/*
+** luti2_test_imm1:
+** luti2 z1\.b, \{ z28\.b \}, z0\[1\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_imm1, svuint8_t, svuint8_t, z1,
+ svluti2_lane_u8 (z28, z0, 1),
+ svluti2_lane_u8 (z28, z0, 1))
+
+/*
+** luti2_test_tied:
+** luti2 z28\.b, \{ z28\.b \}, z0\[2\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti2_test_tied, svuint8_t, svuint8_t, z28,
+ svluti2_lane_u8 (z28, z0, 2),
+ svluti2_lane_u8 (z28, z0, 2))
+
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_test_imm0:
+** luti4 z1\.h, \{ z28\.h \}, z0\[0\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_imm0, svbfloat16_t, svuint8_t, z1,
+ svluti4_lane_bf16 (z28, z0, 0),
+ svluti4_lane_bf16 (z28, z0, 0))
+
+/*
+** luti4_test_imm1:
+** luti4 z1\.h, \{ z28\.h \}, z0\[1\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_imm1, svbfloat16_t, svuint8_t, z1,
+ svluti4_lane_bf16 (z28, z0, 1),
+ svluti4_lane_bf16 (z28, z0, 1))
+
+/*
+** luti4_test_tied:
+** luti4 z28\.h, \{ z28\.h \}, z0\[2\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_tied, svbfloat16_t, svuint8_t, z28,
+ svluti4_lane_bf16 (z28, z0, 2),
+ svluti4_lane_bf16 (z28, z0, 2))
+
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_test_bf16_x2:
+** luti4 z0\.h, \{z6\.h \- z7\.h\}, z5\[1\]
+** ret
+*/
+
+TEST_1X2_NARROW(luti4_test_bf16_x2, svbfloat16_t, svbfloat16x2_t, svuint8_t,
+ z0_res = svluti4_lane_bf16_x2(z6, z5, 1),
+ z0_res = svluti4_lane_bf16_x2(z6, z5, 1))
+
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_test_imm0:
+** luti4 z1\.h, \{ z28\.h \}, z0\[0\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_imm0, svfloat16_t, svuint8_t, z1,
+ svluti4_lane_f16 (z28, z0, 0),
+ svluti4_lane_f16 (z28, z0, 0))
+
+/*
+** luti4_test_imm1:
+** luti4 z1\.h, \{ z28\.h \}, z0\[1\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_imm1, svfloat16_t, svuint8_t, z1,
+ svluti4_lane_f16 (z28, z0, 1),
+ svluti4_lane_f16 (z28, z0, 1))
+
+/*
+** luti4_test_tied:
+** luti4 z28\.h, \{ z28\.h \}, z0\[2\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_tied, svfloat16_t, svuint8_t, z28,
+ svluti4_lane_f16 (z28, z0, 2),
+ svluti4_lane_f16 (z28, z0, 2))
+
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_test_f16_x2:
+** luti4 z0\.h, \{z6\.h \- z7\.h\}, z5\[1\]
+** ret
+*/
+
+TEST_1X2_NARROW(luti4_test_f16_x2, svfloat16_t, svfloat16x2_t, svuint8_t,
+ z0_res = svluti4_lane_f16_x2(z6, z5, 1),
+ z0_res = svluti4_lane_f16_x2(z6, z5, 1))
+
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_test_imm0:
+** luti4 z1\.h, \{ z28\.h \}, z0\[0\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_imm0, svint16_t, svuint8_t, z1,
+ svluti4_lane_s16 (z28, z0, 0),
+ svluti4_lane_s16 (z28, z0, 0))
+
+/*
+** luti4_test_imm1:
+** luti4 z1\.h, \{ z28\.h \}, z0\[1\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_imm1, svint16_t, svuint8_t, z1,
+ svluti4_lane_s16 (z28, z0, 1),
+ svluti4_lane_s16 (z28, z0, 1))
+
+/*
+** luti4_test_tied:
+** luti4 z28\.h, \{ z28\.h \}, z0\[2\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_tied, svint16_t, svuint8_t, z28,
+ svluti4_lane_s16 (z28, z0, 2),
+ svluti4_lane_s16 (z28, z0, 2))
+
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_test_s16_x2:
+** luti4 z0\.h, \{z6\.h \- z7\.h\}, z5\[1\]
+** ret
+*/
+
+TEST_1X2_NARROW(luti4_test_s16_x2, svint16_t, svint16x2_t, svuint8_t,
+ z0_res = svluti4_lane_s16_x2(z6, z5, 1),
+ z0_res = svluti4_lane_s16_x2(z6, z5, 1))
+
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_test_imm0:
+** luti4 z1\.b, \{ z28\.b \}, z0\[0\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_imm0, svint8_t, svuint8_t, z1,
+ svluti4_lane_s8 (z28, z0, 0),
+ svluti4_lane_s8 (z28, z0, 0))
+
+/*
+** luti4_test_imm1:
+** luti4 z1\.b, \{ z28\.b \}, z0\[1\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_imm1, svint8_t, svuint8_t, z1,
+ svluti4_lane_s8 (z28, z0, 1),
+ svluti4_lane_s8 (z28, z0, 1))
+
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_test_imm0:
+** luti4 z1\.h, \{ z28\.h \}, z0\[0\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_imm0, svuint16_t, svuint8_t, z1,
+ svluti4_lane_u16 (z28, z0, 0),
+ svluti4_lane_u16 (z28, z0, 0))
+
+/*
+** luti4_test_imm1:
+** luti4 z1\.h, \{ z28\.h \}, z0\[1\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_imm1, svuint16_t, svuint8_t, z1,
+ svluti4_lane_u16 (z28, z0, 1),
+ svluti4_lane_u16 (z28, z0, 1))
+
+/*
+** luti4_test_tied:
+** luti4 z28\.h, \{ z28\.h \}, z0\[2\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_tied, svuint16_t, svuint8_t, z28,
+ svluti4_lane_u16 (z28, z0, 2),
+ svluti4_lane_u16 (z28, z0, 2))
+
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_test_u16_x2:
+** luti4 z0\.h, \{z6\.h \- z7\.h\}, z5\[1\]
+** ret
+*/
+
+TEST_1X2_NARROW(luti4_test_u16_x2, svuint16_t, svuint16x2_t, svuint8_t,
+ z0_res = svluti4_lane_u16_x2(z6, z5, 1),
+ z0_res = svluti4_lane_u16_x2(z6, z5, 1))
+
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_test_imm0:
+** luti4 z1\.b, \{ z28\.b \}, z0\[0\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_imm0, svuint8_t, svuint8_t, z1,
+ svluti4_lane_u8 (z28, z0, 0),
+ svluti4_lane_u8 (z28, z0, 0))
+
+/*
+** luti4_test_imm1:
+** luti4 z1\.b, \{ z28\.b \}, z0\[1\]
+** ret
+*/
+
+TEST_XN_SINGLE (luti4_test_imm1, svuint8_t, svuint8_t, z1,
+ svluti4_lane_u8 (z28, z0, 1),
+ svluti4_lane_u8 (z28, z0, 1))
+
@@ -4800,6 +4800,18 @@ proc check_effective_target_aarch64_sve2 { } {
}]
}
+# Return 1 if this is an AArch64 target supporting LUT (Lookup table)
+proc check_effective_target_aarch64_lut { } {
+ if { ![istarget aarch64*-*-*] || ![check_effective_target_aarch64_sve2] } {
+ return 0
+ }
+ return [check_no_compiler_messages aarch64_lut assembly {
+ #if !defined (__ARM_FEATURE_LUT)
+ #error FOO
+ #endif
+ }]
+}
+
# Return 1 if this is an AArch64 target only supporting SVE (not SVE2).
proc check_effective_target_aarch64_sve1_only { } {
return [expr { [check_effective_target_aarch64_sve]