[v4] AArch64: Add LUTI ACLE for SVE2

Message ID 20250115200321.3758938-1-saurabh.jha@arm.com
State Superseded
Headers
Series [v4] AArch64: Add LUTI ACLE for SVE2 |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Test passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Build passed

Commit Message

Saurabh Jha Jan. 15, 2025, 8:03 p.m. UTC
  This patch introduces support for LUTI2/LUTI4 ACLE for SVE2.

LUTI instructions are used for efficient table lookups with 2-bit
or 4-bit indices. LUTI2 reads indexed 8-bit or 16-bit elements from
the low 128 bits of the table vector using packed 2-bit indices,
while LUTI4 can read from the low 128 or 256 bits of the table
vector or from two table vectors using packed 4-bit indices.
These instructions fill the destination vector by copying elements
indexed by segments of the source vector, selected by the vector
segment index.

The changes include the addition of a new AArch64 option
extension "lut", __ARM_FEATURE_LUT preprocessor macro, definitions
for the new LUTI instruction shapes, and implementations of the
svluti2 and svluti4 builtins.

New tests are added as well.

---

This is a respin of
https://gcc.gnu.org/pipermail/gcc-patches/2025-January/672910.html.
Addressed comments on lut. The faminmax comments will be addressed in a
separate patch.

Regression tested on aarch64-unknown-linux-gnu and found no regressions.

Ok for master?

Thanks,
Saurabh
---
 gcc/config/aarch64/aarch64-c.cc               |  2 +
 .../aarch64/aarch64-sve-builtins-shapes.cc    | 44 +++++++++++++
 .../aarch64/aarch64-sve-builtins-shapes.h     |  2 +
 .../aarch64/aarch64-sve-builtins-sve2.cc      | 17 +++++
 .../aarch64/aarch64-sve-builtins-sve2.def     |  8 +++
 .../aarch64/aarch64-sve-builtins-sve2.h       |  2 +
 gcc/config/aarch64/aarch64-sve-builtins.cc    |  7 +-
 gcc/config/aarch64/aarch64-sve2.md            | 33 ++++++++++
 gcc/config/aarch64/iterators.md               | 13 ++++
 .../aarch64/sve/acle/asm/test_sve_acle.h      | 16 +++++
 .../aarch64/sve/acle/general-c/lut_1.c        | 64 +++++++++++++++++++
 .../aarch64/sve/acle/general-c/lut_2.c        | 11 ++++
 .../aarch64/sve/acle/general-c/lut_3.c        | 56 ++++++++++++++++
 .../aarch64/sve2/acle/asm/luti2_bf16.c        | 50 +++++++++++++++
 .../aarch64/sve2/acle/asm/luti2_f16.c         | 50 +++++++++++++++
 .../aarch64/sve2/acle/asm/luti2_s16.c         | 50 +++++++++++++++
 .../aarch64/sve2/acle/asm/luti2_s8.c          | 50 +++++++++++++++
 .../aarch64/sve2/acle/asm/luti2_u16.c         | 50 +++++++++++++++
 .../aarch64/sve2/acle/asm/luti2_u8.c          | 50 +++++++++++++++
 .../aarch64/sve2/acle/asm/luti4_bf16.c        | 50 +++++++++++++++
 .../aarch64/sve2/acle/asm/luti4_bf16_x2.c     | 30 +++++++++
 .../aarch64/sve2/acle/asm/luti4_f16.c         | 50 +++++++++++++++
 .../aarch64/sve2/acle/asm/luti4_f16_x2.c      | 30 +++++++++
 .../aarch64/sve2/acle/asm/luti4_s16.c         | 50 +++++++++++++++
 .../aarch64/sve2/acle/asm/luti4_s16_x2.c      | 30 +++++++++
 .../aarch64/sve2/acle/asm/luti4_s8.c          | 50 +++++++++++++++
 .../aarch64/sve2/acle/asm/luti4_u16.c         | 50 +++++++++++++++
 .../aarch64/sve2/acle/asm/luti4_u16_x2.c      | 30 +++++++++
 .../aarch64/sve2/acle/asm/luti4_u8.c          | 50 +++++++++++++++
 gcc/testsuite/lib/target-supports.exp         |  2 +-
 30 files changed, 995 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_f16.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s16.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s8.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u16.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u8.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16_x2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16_x2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s8.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16_x2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u8.c
  

Comments

Richard Sandiford Jan. 16, 2025, 8:44 a.m. UTC | #1
Thanks for the update.  Mostly LGTM, but some comments below:

<saurabh.jha@arm.com> writes:
> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
> index f8cfe08f4c0..0a1dc314f94 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -133,6 +133,7 @@
>  ;; ---- Optional AES extensions
>  ;; ---- Optional SHA-3 extensions
>  ;; ---- Optional SM4 extensions
> +;; ---- Table lookup

This puts it under:

;; == Cryptographic extensions

but it's not a crytographic extension.  Probably better to put it under:

;; == General

instead.

>  ;; =========================================================================
>  ;; == Moves
> @@ -4211,3 +4212,35 @@
>    "sm4ekey\t%0.s, %1.s, %2.s"
>    [(set_attr "type" "crypto_sm4")]
>  )
> +
> +;; -------------------------------------------------------------------------
> +;; ---- Table lookup
> +;; -------------------------------------------------------------------------
> +;; Includes:
> +;; - LUTI2
> +;; - LUTI4
> +;; -------------------------------------------------------------------------
> +
> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> +  [(set (match_operand:SVE_FULL_BH 0 "register_operand" "=w")
> +	(unspec:SVE_FULL_BH
> +	 [(match_operand:SVE_FULL_BH 1 "register_operand" "w")
> +	  (match_operand:VNx16QI 2 "register_operand" "w")

This is correct

> +	  (match_operand:DI 3 "const_int_operand")
> +	  (const_int LUTI_BITS)]
> +	 UNSPEC_SVE_LUTI))]
> +  "TARGET_LUT && TARGET_SVE2_OR_SME2"
> +  "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
> +)
> +
> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> +  [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
> +	(unspec:<VSINGLE>
> +	 [(match_operand:SVE_FULL_Hx2 1 "register_operand" "Uw2")

...but this should use aligned_register_operand instead of
register_operand.

> +	  (match_operand:VNx16QI 2 "register_operand" "w")
> +	  (match_operand:DI 3 "const_int_operand")
> +	  (const_int LUTI_BITS)]
> +	  UNSPEC_SVE_LUTI))]
> +  "TARGET_LUT && TARGET_SVE2_OR_SME2"
> +  "luti<LUTI_BITS>\t%0.<Vetype>, %1, %2[%3]"
> +)
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index ff0f34dd043..0fbf96f1ab9 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -553,6 +553,18 @@
>  (define_mode_iterator SVE_FULL_BHS [VNx16QI VNx8HI VNx4SI
>  				    VNx8BF VNx8HF VNx4SF])
>  
> +;; Fully-packed SVE vector byte modes that have 32-bit or smaller elements.
> +(define_mode_iterator SVE_FULL_BS [VNx16QI VNx4SI VNx4SF])

This is no longer needed.

> +
> +;; Fully-packed SVE vector byte modes that have 16-bit or smaller elements.
> +(define_mode_iterator SVE_FULL_BH [VNx16QI VNx8HI VNx8HF VNx8BF])
> +
> +;; Fully-packed half word SVE vector modes
> +(define_mode_iterator SVE_FULL_H [VNx8HI VNx8HF VNx8BF])

Similarly, SVE_FULL_H is no longer needed.

> +
> +;; Pairs of fully-packed SVE vector modes (half word only)
> +(define_mode_iterator SVE_FULL_Hx2 [VNx16HI VNx16HF VNx16BF])
> +
>  ;; Fully-packed SVE vector modes that have 32-bit elements.
>  (define_mode_iterator SVE_FULL_S [VNx4SI VNx4SF])
>  
> @@ -1186,6 +1198,7 @@
>      UNSPEC_UZPQ2
>      UNSPEC_ZIPQ1
>      UNSPEC_ZIPQ2
> +    UNSPEC_SVE_LUTI
>  
>      ;; All used in aarch64-sme.md
>      UNSPEC_SME_ADD
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
> index d3ae707ac49..c0dd89fa924 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
> @@ -780,4 +780,20 @@
>  		    "w" (z16), "w" (z22), "w" (z29));		\
>    }
>  
> +#define TEST_1X2_NARROW(NAME, RTYPE, TTYPE, ZTYPE, CODE1, CODE2)	\
> +  PROTO(NAME, void, ())							\
> +  {									\
> +    register RTYPE z0 __asm ("z0");					\
> +    register ZTYPE z5 __asm ("z5");					\
> +    register TTYPE z6 __asm ("z6");					\
> +    register RTYPE z16 __asm ("z16");					\
> +    register ZTYPE z22 __asm ("z22");					\
> +    register TTYPE z29 __asm ("z29");					\
> +    register RTYPE z0_res __asm ("z0");					\
> +    __asm volatile ("" : "=w" (z0), "=w" (z5), "=w" (z6),		\
> +		    "=w" (z16), "=w" (z22), "=w" (z29));		\
> +    INVOKE (CODE1, CODE2);						\
> +    __asm volatile ("" :: "w" (z0_res), "w" (z5), "w" (z22));		\
> +  }
> +
>  #endif
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c
> new file mode 100644
> index 00000000000..142de490267
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c
> @@ -0,0 +1,64 @@
> +/* { dg-do compile } */
> +
> +#include <arm_sve.h>
> +
> +#pragma GCC target ("arch=armv9.2-a+sve2+lut")
> +
> +void
> +test (svfloat16_t f16, svfloat32_t f32, svfloat64_t f64,
> +      svfloat16x2_t f16x2, svfloat32x2_t f32x2, svfloat64x2_t f64x2,
> +      svuint8_t u8, svuint16_t u16, svuint32_t u32, svuint64_t u64,
> +      svuint8x2_t u8x2, svuint16x2_t u16x2,
> +      svuint32x2_t u32x2, svuint64x2_t u64x2,
> +      svint8_t s8, svint16_t s16, svint32_t s32, svint64_t s64,
> +      svint8x2_t s8x2, svint16x2_t s16x2, svint32x2_t s32x2, svint64x2_t s64x2,
> +      svbfloat16_t bf16, svbfloat16x2_t bf16x2)
> +{

It would be good to have a test here for "too few" and "too many" arguments.

It would also be good to test cases in which non-vector arguments are passed,
such as:

  svluti2_lane (0, u8, 0);

The tests below concentrate on cases where the first argument has an
invalid type, but they don't cover any cases where the second and
third arguments have invalid types, or where the third argument is
nonconstant.  How about adding:

  svluti2_lane (f16, 0, 0);
  svluti2_lane (u16, u16, 0);
  svluti2_lane (f16, u8, u8);
  svluti2_lane (f16, u8, x);

where "x" is a new parameter of type "int".

> +  svluti2_lane (f16, u8, 0);
> +  svluti2_lane (bf16, u8, 0);
> +
> +  svluti2_lane (f32, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svfloat32_t' arguments} } */
> +  svluti2_lane (f64, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svfloat64_t' arguments} } */
> +
> +  svluti2_lane (u8, u8, 0);
> +  svluti2_lane (u16, u8, 0);
> +
> +  svluti2_lane (u32, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svuint32_t' arguments} } */
> +  svluti2_lane (u64, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svuint64_t' arguments} } */
> +
> +  svluti2_lane (s8, u8, 0);
> +  svluti2_lane (s16, u8, 0);
> +
> +  svluti2_lane (s32, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svint32_t' arguments} } */
> +  svluti2_lane (s64, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svint64_t' arguments} } */
> +
> +  svluti4_lane (f16, u8, 0);
> +  svluti4_lane (bf16, u8, 0);
> +  svluti4_lane_x2 (f16x2, u8, 0);
> +  svluti4_lane_x2 (bf16x2, u8, 0);

The _x2 shouldn't be present in the overloaded name.  The fix for that
is to add:

  bool explicit_group_suffix_p () const override { return false; }

to the shape class (canonically as the first thing in the class).

It would be good to test something like f16x3 as well, for the case
in which the element size is ok but the tuple size is wrong.

> +
> +  svluti4_lane (f32, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svfloat32_t' arguments} } */
> +  svluti4_lane (f64, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svfloat64_t' arguments} } */
> +  svluti4_lane_x2 (f32x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svfloat32x2_t' arguments} } */
> +  svluti4_lane_x2 (f64x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svfloat64x2_t' arguments} } */
> +
> +  svluti4_lane (u8, u8, 0);
> +  svluti4_lane (u16, u8, 0);
> +  svluti4_lane_x2 (u8x2, u8, 0);
> +  svluti4_lane_x2 (u16x2, u8, 0);
> +
> +  svluti4_lane (u32, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svuint32_t' arguments} } */
> +  svluti4_lane (u64, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svuint64_t' arguments} } */
> +  svluti4_lane_x2 (u32x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svuint32x2_t' arguments} } */
> +  svluti4_lane_x2 (u64x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svuint64x2_t' arguments} } */
> +
> +  svluti4_lane (s8, u8, 0);
> +  svluti4_lane (s16, u8, 0);
> +  svluti4_lane_x2 (s8x2, u8, 0);
> +  svluti4_lane_x2 (s16x2, u8, 0);
> +
> +  svluti4_lane (s32, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svint32_t' arguments} } */
> +  svluti4_lane (s64, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svint64_t' arguments} } */
> +  svluti4_lane_x2 (s32x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svint32x2_t' arguments} } */
> +  svluti4_lane_x2 (s64x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svint64x2_t' arguments} } */
> +}
> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c
> new file mode 100644
> index 00000000000..1f3f8aab5ef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c
> @@ -0,0 +1,30 @@
> +/* { dg-do assemble { target aarch64_asm_lut_ok } } */
> +/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +#include "test_sve_acle.h"
> +
> +#pragma GCC target "+sve2+lut"
> +#if STREAMING_COMPATIBLE
> +#pragma GCC target "+sme2"
> +#endif
> +
> +/*
> +** luti4_min_idx_test:
> +**	luti4	z0\.h, \{z6\.h \- z7\.h\}, z5\[0\]
> +**	ret
> +*/
> +
> +TEST_1X2_NARROW(luti4_min_idx_test, svbfloat16_t, svbfloat16x2_t, svuint8_t,
> +		z0_res = svluti4_lane_bf16_x2 (z6, z5, 0),
> +		z0_res = svluti4_lane_x2 (z6, z5, 0))
> +
> +/*
> +** luti4_max_idx_test:
> +**	luti4	z0\.h, \{z6\.h \- z7\.h\}, z5\[3\]
> +**	ret
> +*/
> +
> +TEST_1X2_NARROW(luti4_max_idx_test, svbfloat16_t, svbfloat16x2_t, svuint8_t,
> +		z0_res = svluti4_lane_bf16_x2 (z6, z5, 3),
> +		z0_res = svluti4_lane_x2 (z6, z5, 3))

The macro has a few variations:

+    register RTYPE z0 __asm ("z0");					\
+    register ZTYPE z5 __asm ("z5");					\
+    register TTYPE z6 __asm ("z6");					\
+    register RTYPE z16 __asm ("z16");					\
+    register ZTYPE z22 __asm ("z22");					\
+    register TTYPE z29 __asm ("z29");					\
+    register RTYPE z0_res __asm ("z0");					\

which is good. :)  I think we should try more of them here.  In particular,
I think we should try z29, for the case in which the register is naturally
unaligned.  I think that might have caught the predicate issue mentioned
above.

Richard
  
Saurabh Jha Jan. 17, 2025, 12:11 p.m. UTC | #2
On 1/16/2025 8:44 AM, Richard Sandiford wrote:
> Thanks for the update.  Mostly LGTM, but some comments below:
> 
> <saurabh.jha@arm.com> writes:
>> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
>> index f8cfe08f4c0..0a1dc314f94 100644
>> --- a/gcc/config/aarch64/aarch64-sve2.md
>> +++ b/gcc/config/aarch64/aarch64-sve2.md
>> @@ -133,6 +133,7 @@
>>   ;; ---- Optional AES extensions
>>   ;; ---- Optional SHA-3 extensions
>>   ;; ---- Optional SM4 extensions
>> +;; ---- Table lookup
> 
> This puts it under:
> 
> ;; == Cryptographic extensions
> 
> but it's not a crytographic extension.  Probably better to put it under:
> 
> ;; == General
> 
> instead.
> 
>>   ;; =========================================================================
>>   ;; == Moves
>> @@ -4211,3 +4212,35 @@
>>     "sm4ekey\t%0.s, %1.s, %2.s"
>>     [(set_attr "type" "crypto_sm4")]
>>   )
>> +
>> +;; -------------------------------------------------------------------------
>> +;; ---- Table lookup
>> +;; -------------------------------------------------------------------------
>> +;; Includes:
>> +;; - LUTI2
>> +;; - LUTI4
>> +;; -------------------------------------------------------------------------
>> +
>> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
>> +  [(set (match_operand:SVE_FULL_BH 0 "register_operand" "=w")
>> +	(unspec:SVE_FULL_BH
>> +	 [(match_operand:SVE_FULL_BH 1 "register_operand" "w")
>> +	  (match_operand:VNx16QI 2 "register_operand" "w")
> 
> This is correct
> 
>> +	  (match_operand:DI 3 "const_int_operand")
>> +	  (const_int LUTI_BITS)]
>> +	 UNSPEC_SVE_LUTI))]
>> +  "TARGET_LUT && TARGET_SVE2_OR_SME2"
>> +  "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
>> +)
>> +
>> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
>> +  [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
>> +	(unspec:<VSINGLE>
>> +	 [(match_operand:SVE_FULL_Hx2 1 "register_operand" "Uw2")
> 
> ...but this should use aligned_register_operand instead of
> register_operand.
Are you sure we want it to be aligned? That requirement is not there on 
the { <Zn1>.H, <Zn2>.H } operand here: 
https://developer.arm.com/documentation/ddi0602/2024-12/SVE-Instructions/LUTI4--Lookup-table-read-with-4-bit-indices-?lang=en, 
as in, nothing like it has to be "Zn times 2...."

Agree with the rest of the review.

> 
>> +	  (match_operand:VNx16QI 2 "register_operand" "w")
>> +	  (match_operand:DI 3 "const_int_operand")
>> +	  (const_int LUTI_BITS)]
>> +	  UNSPEC_SVE_LUTI))]
>> +  "TARGET_LUT && TARGET_SVE2_OR_SME2"
>> +  "luti<LUTI_BITS>\t%0.<Vetype>, %1, %2[%3]"
>> +)
>> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>> index ff0f34dd043..0fbf96f1ab9 100644
>> --- a/gcc/config/aarch64/iterators.md
>> +++ b/gcc/config/aarch64/iterators.md
>> @@ -553,6 +553,18 @@
>>   (define_mode_iterator SVE_FULL_BHS [VNx16QI VNx8HI VNx4SI
>>   				    VNx8BF VNx8HF VNx4SF])
>>   
>> +;; Fully-packed SVE vector byte modes that have 32-bit or smaller elements.
>> +(define_mode_iterator SVE_FULL_BS [VNx16QI VNx4SI VNx4SF])
> 
> This is no longer needed.
> 
>> +
>> +;; Fully-packed SVE vector byte modes that have 16-bit or smaller elements.
>> +(define_mode_iterator SVE_FULL_BH [VNx16QI VNx8HI VNx8HF VNx8BF])
>> +
>> +;; Fully-packed half word SVE vector modes
>> +(define_mode_iterator SVE_FULL_H [VNx8HI VNx8HF VNx8BF])
> 
> Similarly, SVE_FULL_H is no longer needed.
> 
>> +
>> +;; Pairs of fully-packed SVE vector modes (half word only)
>> +(define_mode_iterator SVE_FULL_Hx2 [VNx16HI VNx16HF VNx16BF])
>> +
>>   ;; Fully-packed SVE vector modes that have 32-bit elements.
>>   (define_mode_iterator SVE_FULL_S [VNx4SI VNx4SF])
>>   
>> @@ -1186,6 +1198,7 @@
>>       UNSPEC_UZPQ2
>>       UNSPEC_ZIPQ1
>>       UNSPEC_ZIPQ2
>> +    UNSPEC_SVE_LUTI
>>   
>>       ;; All used in aarch64-sme.md
>>       UNSPEC_SME_ADD
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
>> index d3ae707ac49..c0dd89fa924 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
>> @@ -780,4 +780,20 @@
>>   		    "w" (z16), "w" (z22), "w" (z29));		\
>>     }
>>   
>> +#define TEST_1X2_NARROW(NAME, RTYPE, TTYPE, ZTYPE, CODE1, CODE2)	\
>> +  PROTO(NAME, void, ())							\
>> +  {									\
>> +    register RTYPE z0 __asm ("z0");					\
>> +    register ZTYPE z5 __asm ("z5");					\
>> +    register TTYPE z6 __asm ("z6");					\
>> +    register RTYPE z16 __asm ("z16");					\
>> +    register ZTYPE z22 __asm ("z22");					\
>> +    register TTYPE z29 __asm ("z29");					\
>> +    register RTYPE z0_res __asm ("z0");					\
>> +    __asm volatile ("" : "=w" (z0), "=w" (z5), "=w" (z6),		\
>> +		    "=w" (z16), "=w" (z22), "=w" (z29));		\
>> +    INVOKE (CODE1, CODE2);						\
>> +    __asm volatile ("" :: "w" (z0_res), "w" (z5), "w" (z22));		\
>> +  }
>> +
>>   #endif
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c
>> new file mode 100644
>> index 00000000000..142de490267
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c
>> @@ -0,0 +1,64 @@
>> +/* { dg-do compile } */
>> +
>> +#include <arm_sve.h>
>> +
>> +#pragma GCC target ("arch=armv9.2-a+sve2+lut")
>> +
>> +void
>> +test (svfloat16_t f16, svfloat32_t f32, svfloat64_t f64,
>> +      svfloat16x2_t f16x2, svfloat32x2_t f32x2, svfloat64x2_t f64x2,
>> +      svuint8_t u8, svuint16_t u16, svuint32_t u32, svuint64_t u64,
>> +      svuint8x2_t u8x2, svuint16x2_t u16x2,
>> +      svuint32x2_t u32x2, svuint64x2_t u64x2,
>> +      svint8_t s8, svint16_t s16, svint32_t s32, svint64_t s64,
>> +      svint8x2_t s8x2, svint16x2_t s16x2, svint32x2_t s32x2, svint64x2_t s64x2,
>> +      svbfloat16_t bf16, svbfloat16x2_t bf16x2)
>> +{
> 
> It would be good to have a test here for "too few" and "too many" arguments.
> 
> It would also be good to test cases in which non-vector arguments are passed,
> such as:
> 
>    svluti2_lane (0, u8, 0);
> 
> The tests below concentrate on cases where the first argument has an
> invalid type, but they don't cover any cases where the second and
> third arguments have invalid types, or where the third argument is
> nonconstant.  How about adding:
> 
>    svluti2_lane (f16, 0, 0);
>    svluti2_lane (u16, u16, 0);
>    svluti2_lane (f16, u8, u8);
>    svluti2_lane (f16, u8, x);
> 
> where "x" is a new parameter of type "int".
> 
>> +  svluti2_lane (f16, u8, 0);
>> +  svluti2_lane (bf16, u8, 0);
>> +
>> +  svluti2_lane (f32, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svfloat32_t' arguments} } */
>> +  svluti2_lane (f64, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svfloat64_t' arguments} } */
>> +
>> +  svluti2_lane (u8, u8, 0);
>> +  svluti2_lane (u16, u8, 0);
>> +
>> +  svluti2_lane (u32, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svuint32_t' arguments} } */
>> +  svluti2_lane (u64, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svuint64_t' arguments} } */
>> +
>> +  svluti2_lane (s8, u8, 0);
>> +  svluti2_lane (s16, u8, 0);
>> +
>> +  svluti2_lane (s32, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svint32_t' arguments} } */
>> +  svluti2_lane (s64, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svint64_t' arguments} } */
>> +
>> +  svluti4_lane (f16, u8, 0);
>> +  svluti4_lane (bf16, u8, 0);
>> +  svluti4_lane_x2 (f16x2, u8, 0);
>> +  svluti4_lane_x2 (bf16x2, u8, 0);
> 
> The _x2 shouldn't be present in the overloaded name.  The fix for that
> is to add:
> 
>    bool explicit_group_suffix_p () const override { return false; }
> 
> to the shape class (canonically as the first thing in the class).
> 
> It would be good to test something like f16x3 as well, for the case
> in which the element size is ok but the tuple size is wrong.
> 
>> +
>> +  svluti4_lane (f32, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svfloat32_t' arguments} } */
>> +  svluti4_lane (f64, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svfloat64_t' arguments} } */
>> +  svluti4_lane_x2 (f32x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svfloat32x2_t' arguments} } */
>> +  svluti4_lane_x2 (f64x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svfloat64x2_t' arguments} } */
>> +
>> +  svluti4_lane (u8, u8, 0);
>> +  svluti4_lane (u16, u8, 0);
>> +  svluti4_lane_x2 (u8x2, u8, 0);
>> +  svluti4_lane_x2 (u16x2, u8, 0);
>> +
>> +  svluti4_lane (u32, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svuint32_t' arguments} } */
>> +  svluti4_lane (u64, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svuint64_t' arguments} } */
>> +  svluti4_lane_x2 (u32x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svuint32x2_t' arguments} } */
>> +  svluti4_lane_x2 (u64x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svuint64x2_t' arguments} } */
>> +
>> +  svluti4_lane (s8, u8, 0);
>> +  svluti4_lane (s16, u8, 0);
>> +  svluti4_lane_x2 (s8x2, u8, 0);
>> +  svluti4_lane_x2 (s16x2, u8, 0);
>> +
>> +  svluti4_lane (s32, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svint32_t' arguments} } */
>> +  svluti4_lane (s64, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svint64_t' arguments} } */
>> +  svluti4_lane_x2 (s32x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svint32x2_t' arguments} } */
>> +  svluti4_lane_x2 (s64x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svint64x2_t' arguments} } */
>> +}
>> [...]
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c
>> new file mode 100644
>> index 00000000000..1f3f8aab5ef
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c
>> @@ -0,0 +1,30 @@
>> +/* { dg-do assemble { target aarch64_asm_lut_ok } } */
>> +/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>> +
>> +#include "test_sve_acle.h"
>> +
>> +#pragma GCC target "+sve2+lut"
>> +#if STREAMING_COMPATIBLE
>> +#pragma GCC target "+sme2"
>> +#endif
>> +
>> +/*
>> +** luti4_min_idx_test:
>> +**	luti4	z0\.h, \{z6\.h \- z7\.h\}, z5\[0\]
>> +**	ret
>> +*/
>> +
>> +TEST_1X2_NARROW(luti4_min_idx_test, svbfloat16_t, svbfloat16x2_t, svuint8_t,
>> +		z0_res = svluti4_lane_bf16_x2 (z6, z5, 0),
>> +		z0_res = svluti4_lane_x2 (z6, z5, 0))
>> +
>> +/*
>> +** luti4_max_idx_test:
>> +**	luti4	z0\.h, \{z6\.h \- z7\.h\}, z5\[3\]
>> +**	ret
>> +*/
>> +
>> +TEST_1X2_NARROW(luti4_max_idx_test, svbfloat16_t, svbfloat16x2_t, svuint8_t,
>> +		z0_res = svluti4_lane_bf16_x2 (z6, z5, 3),
>> +		z0_res = svluti4_lane_x2 (z6, z5, 3))
> 
> The macro has a few variations:
> 
> +    register RTYPE z0 __asm ("z0");					\
> +    register ZTYPE z5 __asm ("z5");					\
> +    register TTYPE z6 __asm ("z6");					\
> +    register RTYPE z16 __asm ("z16");					\
> +    register ZTYPE z22 __asm ("z22");					\
> +    register TTYPE z29 __asm ("z29");					\
> +    register RTYPE z0_res __asm ("z0");					\
> 
> which is good. :)  I think we should try more of them here.  In particular,
> I think we should try z29, for the case in which the register is naturally
> unaligned.  I think that might have caught the predicate issue mentioned
> above.
> 
> Richard
  
Richard Sandiford Jan. 17, 2025, 12:39 p.m. UTC | #3
Saurabh Jha <saurabh.jha@arm.com> writes:
> On 1/16/2025 8:44 AM, Richard Sandiford wrote:
>> Thanks for the update.  Mostly LGTM, but some comments below:
>> 
>> <saurabh.jha@arm.com> writes:
>>> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
>>> index f8cfe08f4c0..0a1dc314f94 100644
>>> --- a/gcc/config/aarch64/aarch64-sve2.md
>>> +++ b/gcc/config/aarch64/aarch64-sve2.md
>>> @@ -133,6 +133,7 @@
>>>   ;; ---- Optional AES extensions
>>>   ;; ---- Optional SHA-3 extensions
>>>   ;; ---- Optional SM4 extensions
>>> +;; ---- Table lookup
>> 
>> This puts it under:
>> 
>> ;; == Cryptographic extensions
>> 
>> but it's not a crytographic extension.  Probably better to put it under:
>> 
>> ;; == General
>> 
>> instead.
>> 
>>>   ;; =========================================================================
>>>   ;; == Moves
>>> @@ -4211,3 +4212,35 @@
>>>     "sm4ekey\t%0.s, %1.s, %2.s"
>>>     [(set_attr "type" "crypto_sm4")]
>>>   )
>>> +
>>> +;; -------------------------------------------------------------------------
>>> +;; ---- Table lookup
>>> +;; -------------------------------------------------------------------------
>>> +;; Includes:
>>> +;; - LUTI2
>>> +;; - LUTI4
>>> +;; -------------------------------------------------------------------------
>>> +
>>> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
>>> +  [(set (match_operand:SVE_FULL_BH 0 "register_operand" "=w")
>>> +	(unspec:SVE_FULL_BH
>>> +	 [(match_operand:SVE_FULL_BH 1 "register_operand" "w")
>>> +	  (match_operand:VNx16QI 2 "register_operand" "w")
>> 
>> This is correct
>> 
>>> +	  (match_operand:DI 3 "const_int_operand")
>>> +	  (const_int LUTI_BITS)]
>>> +	 UNSPEC_SVE_LUTI))]
>>> +  "TARGET_LUT && TARGET_SVE2_OR_SME2"
>>> +  "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
>>> +)
>>> +
>>> +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
>>> +  [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
>>> +	(unspec:<VSINGLE>
>>> +	 [(match_operand:SVE_FULL_Hx2 1 "register_operand" "Uw2")
>> 
>> ...but this should use aligned_register_operand instead of
>> register_operand.
> Are you sure we want it to be aligned? That requirement is not there on 
> the { <Zn1>.H, <Zn2>.H } operand here: 
> https://developer.arm.com/documentation/ddi0602/2024-12/SVE-Instructions/LUTI4--Lookup-table-read-with-4-bit-indices-?lang=en, 
> as in, nothing like it has to be "Zn times 2....

Ah, right, sorry.  I'd even said that last time:

> This operand also isn't required to be aligned: Zn has a 5-bit encoding.

but got it the wrong way round when doing this review. :(

But in that case the constraint should be "w" rather than "Uw2".
"Uw2" is for aligned registers only:

  (define_register_constraint "Uw2" "FP_REGS"
    "Even floating point and SIMD vector registers."
    "regno % 2 == 0")

Richard
  

Patch

diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index 92fcf5389a3..d1e2ab9831d 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -268,6 +268,8 @@  aarch64_update_cpp_builtins (cpp_reader *pfile)
   aarch64_def_or_undef (TARGET_SVE_BF16,
 			"__ARM_FEATURE_SVE_BF16", pfile);
 
+  aarch64_def_or_undef (TARGET_LUT, "__ARM_FEATURE_LUT", pfile);
+
   aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile);
 
   aarch64_def_or_undef (TARGET_FP8DOT2, "__ARM_FEATURE_FP8DOT2", pfile);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
index ca721dd2c09..6fbbfdd05b7 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
@@ -903,6 +903,50 @@  struct load_ext_gather_base : public overloaded_base<1>
   }
 };
 
+
+/* sv<v0>_t svlut[_<t0>_g](sv<t0>x<g>_t, svuint8_t, uint64_t)
+   where the final argument is a constant index, the instruction divides
+   the vector argument in BITS-bit quantities.  */
+template<unsigned int BITS>
+struct luti_base : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group) const override
+  {
+    /* Format: return type, table vector, indices vector, immediate value.  */
+    b.add_overloaded_functions (group, MODE_none);
+    build_all (b, "v0,t0,vu8,su64", group, MODE_none);
+  }
+
+  bool
+  check (function_checker &c) const override
+  {
+    auto max_range = c.type_suffix (0).element_bits / BITS - 1;
+    return c.require_immediate_range (2, 0, max_range);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+    sve_type type;
+    if (!r.check_num_arguments (3)
+	|| !(type = r.infer_sve_type (0))
+	|| !r.require_vector_type (1, VECTOR_TYPE_svuint8_t)
+	|| !r.require_scalar_type (2, "uint64_t"))
+      return error_mark_node;
+
+    return r.resolve_to (r.mode_suffix_id, type);
+  }
+};
+
+/* Specializations for 2-bit and 4-bit indices.  */
+using luti2_def = luti_base<2>;
+SHAPE (luti2)
+
+using luti4_def = luti_base<4>;
+SHAPE (luti4)
+
+
 /* sv<t0>x<g>_t svfoo_t0_g(uint64_t, svuint8_t, uint64_t)
 
    where the first argument is the ZT register number (currently always 0)
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.h b/gcc/config/aarch64/aarch64-sve-builtins-shapes.h
index 56e2f57b036..349eae6e34a 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.h
@@ -164,6 +164,8 @@  namespace aarch64_sve
     extern const function_shape *const load_gather64_vs_offset;
     extern const function_shape *const load_replicate;
     extern const function_shape *const load_za;
+    extern const function_shape *const luti2;
+    extern const function_shape *const luti4;
     extern const function_shape *const luti2_lane_zt;
     extern const function_shape *const luti4_lane_zt;
     extern const function_shape *const mmla;
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
index 0c5290411bb..d9922de7ca5 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
@@ -914,6 +914,21 @@  public:
   unsigned int m_base;
 };
 
+class svluti_lane_impl : public function_base
+{
+public:
+  CONSTEXPR svluti_lane_impl (unsigned int bits) : m_bits (bits)
+  {}
+
+  rtx expand (function_expander &e) const override
+  {
+    auto mode = e.tuple_mode (0);
+    return e.use_exact_insn (code_for_aarch64_sve_luti (m_bits, mode));
+  }
+
+  unsigned int m_bits;
+};
+
 } /* end anonymous namespace */
 
 namespace aarch64_sve {
@@ -1205,5 +1220,7 @@  FUNCTION (svzip, multireg_permute, (UNSPEC_ZIP))
 FUNCTION (svzipq, multireg_permute, (UNSPEC_ZIPQ))
 FUNCTION (svzipq1, svzipq_impl, (0))
 FUNCTION (svzipq2, svzipq_impl, (1))
+FUNCTION (svluti2_lane, svluti_lane_impl, (2))
+FUNCTION (svluti4_lane, svluti_lane_impl, (4))
 
 } /* end namespace aarch64_sve */
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
index e726fa1fb68..0cd187cc2bd 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
@@ -336,6 +336,14 @@  DEF_SVE_FUNCTION (svamax, binary_opt_single_n, all_float, mxz)
 DEF_SVE_FUNCTION (svamin, binary_opt_single_n, all_float, mxz)
 #undef REQUIRED_EXTENSIONS
 
+#define REQUIRED_EXTENSIONS \
+  sve_and_sme (AARCH64_FL_SVE2 | AARCH64_FL_LUT, \
+	       AARCH64_FL_SME2 | AARCH64_FL_LUT)
+DEF_SVE_FUNCTION (svluti2_lane, luti2, bh_data, none)
+DEF_SVE_FUNCTION (svluti4_lane, luti4, bh_data, none)
+DEF_SVE_FUNCTION_GS (svluti4_lane, luti4, bh_data, x2, none)
+#undef REQUIRED_EXTENSIONS
+
 #define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME_F16F16)
 DEF_SVE_FUNCTION_GS (svcvt, unary_convertxn, cvt_f32_f16, x2, none)
 DEF_SVE_FUNCTION_GS (svcvtl, unary_convertxn, cvt_f32_f16, x2, none)
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.h b/gcc/config/aarch64/aarch64-sve-builtins-sve2.h
index 14d668d2649..6d7d0af2641 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.h
@@ -250,6 +250,8 @@  namespace aarch64_sve
     extern const function_base *const svzipq;
     extern const function_base *const svzipq1;
     extern const function_base *const svzipq2;
+    extern const function_base *const svluti2_lane;
+    extern const function_base *const svluti4_lane;
   }
 }
 
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index cf8ca89aefa..2c55c9481eb 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -283,7 +283,11 @@  CONSTEXPR const group_suffix_info group_suffixes[] = {
 #define TYPES_bhs_integer(S, D) \
   TYPES_bhs_signed (S, D), TYPES_bhs_unsigned (S, D)
 
-#define TYPES_bhs_data(S, D) \
+#define TYPES_bh_data(S, D)			\
+  TYPES_b_data (S, D), \
+  TYPES_h_data (S, D)
+
+#define TYPES_bhs_data(S, D)			\
   TYPES_b_data (S, D), \
   TYPES_h_data (S, D), \
   TYPES_s_data (S, D)
@@ -782,6 +786,7 @@  DEF_SVE_TYPES_ARRAY (bs_unsigned);
 DEF_SVE_TYPES_ARRAY (bhs_signed);
 DEF_SVE_TYPES_ARRAY (bhs_unsigned);
 DEF_SVE_TYPES_ARRAY (bhs_integer);
+DEF_SVE_TYPES_ARRAY (bh_data);
 DEF_SVE_TYPES_ARRAY (bhs_data);
 DEF_SVE_TYPES_ARRAY (bhs_widen);
 DEF_SVE_TYPES_ARRAY (c);
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index f8cfe08f4c0..0a1dc314f94 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -133,6 +133,7 @@ 
 ;; ---- Optional AES extensions
 ;; ---- Optional SHA-3 extensions
 ;; ---- Optional SM4 extensions
+;; ---- Table lookup
 
 ;; =========================================================================
 ;; == Moves
@@ -4211,3 +4212,35 @@ 
   "sm4ekey\t%0.s, %1.s, %2.s"
   [(set_attr "type" "crypto_sm4")]
 )
+
+;; -------------------------------------------------------------------------
+;; ---- Table lookup
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - LUTI2
+;; - LUTI4
+;; -------------------------------------------------------------------------
+
+(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
+  [(set (match_operand:SVE_FULL_BH 0 "register_operand" "=w")
+	(unspec:SVE_FULL_BH
+	 [(match_operand:SVE_FULL_BH 1 "register_operand" "w")
+	  (match_operand:VNx16QI 2 "register_operand" "w")
+	  (match_operand:DI 3 "const_int_operand")
+	  (const_int LUTI_BITS)]
+	 UNSPEC_SVE_LUTI))]
+  "TARGET_LUT && TARGET_SVE2_OR_SME2"
+  "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
+)
+
+(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
+  [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
+	(unspec:<VSINGLE>
+	 [(match_operand:SVE_FULL_Hx2 1 "register_operand" "Uw2")
+	  (match_operand:VNx16QI 2 "register_operand" "w")
+	  (match_operand:DI 3 "const_int_operand")
+	  (const_int LUTI_BITS)]
+	  UNSPEC_SVE_LUTI))]
+  "TARGET_LUT && TARGET_SVE2_OR_SME2"
+  "luti<LUTI_BITS>\t%0.<Vetype>, %1, %2[%3]"
+)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index ff0f34dd043..0fbf96f1ab9 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -553,6 +553,18 @@ 
 (define_mode_iterator SVE_FULL_BHS [VNx16QI VNx8HI VNx4SI
 				    VNx8BF VNx8HF VNx4SF])
 
+;; Fully-packed SVE vector byte modes that have 32-bit or smaller elements.
+(define_mode_iterator SVE_FULL_BS [VNx16QI VNx4SI VNx4SF])
+
+;; Fully-packed SVE vector byte modes that have 16-bit or smaller elements.
+(define_mode_iterator SVE_FULL_BH [VNx16QI VNx8HI VNx8HF VNx8BF])
+
+;; Fully-packed half word SVE vector modes
+(define_mode_iterator SVE_FULL_H [VNx8HI VNx8HF VNx8BF])
+
+;; Pairs of fully-packed SVE vector modes (half word only)
+(define_mode_iterator SVE_FULL_Hx2 [VNx16HI VNx16HF VNx16BF])
+
 ;; Fully-packed SVE vector modes that have 32-bit elements.
 (define_mode_iterator SVE_FULL_S [VNx4SI VNx4SF])
 
@@ -1186,6 +1198,7 @@ 
     UNSPEC_UZPQ2
     UNSPEC_ZIPQ1
     UNSPEC_ZIPQ2
+    UNSPEC_SVE_LUTI
 
     ;; All used in aarch64-sme.md
     UNSPEC_SME_ADD
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
index d3ae707ac49..c0dd89fa924 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
@@ -780,4 +780,20 @@ 
 		    "w" (z16), "w" (z22), "w" (z29));		\
   }
 
+#define TEST_1X2_NARROW(NAME, RTYPE, TTYPE, ZTYPE, CODE1, CODE2)	\
+  PROTO(NAME, void, ())							\
+  {									\
+    register RTYPE z0 __asm ("z0");					\
+    register ZTYPE z5 __asm ("z5");					\
+    register TTYPE z6 __asm ("z6");					\
+    register RTYPE z16 __asm ("z16");					\
+    register ZTYPE z22 __asm ("z22");					\
+    register TTYPE z29 __asm ("z29");					\
+    register RTYPE z0_res __asm ("z0");					\
+    __asm volatile ("" : "=w" (z0), "=w" (z5), "=w" (z6),		\
+		    "=w" (z16), "=w" (z22), "=w" (z29));		\
+    INVOKE (CODE1, CODE2);						\
+    __asm volatile ("" :: "w" (z0_res), "w" (z5), "w" (z22));		\
+  }
+
 #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c
new file mode 100644
index 00000000000..142de490267
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c
@@ -0,0 +1,64 @@ 
+/* { dg-do compile } */
+
+#include <arm_sve.h>
+
+#pragma GCC target ("arch=armv9.2-a+sve2+lut")
+
+void
+test (svfloat16_t f16, svfloat32_t f32, svfloat64_t f64,
+      svfloat16x2_t f16x2, svfloat32x2_t f32x2, svfloat64x2_t f64x2,
+      svuint8_t u8, svuint16_t u16, svuint32_t u32, svuint64_t u64,
+      svuint8x2_t u8x2, svuint16x2_t u16x2,
+      svuint32x2_t u32x2, svuint64x2_t u64x2,
+      svint8_t s8, svint16_t s16, svint32_t s32, svint64_t s64,
+      svint8x2_t s8x2, svint16x2_t s16x2, svint32x2_t s32x2, svint64x2_t s64x2,
+      svbfloat16_t bf16, svbfloat16x2_t bf16x2)
+{
+  svluti2_lane (f16, u8, 0);
+  svluti2_lane (bf16, u8, 0);
+
+  svluti2_lane (f32, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svfloat32_t' arguments} } */
+  svluti2_lane (f64, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svfloat64_t' arguments} } */
+
+  svluti2_lane (u8, u8, 0);
+  svluti2_lane (u16, u8, 0);
+
+  svluti2_lane (u32, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svuint32_t' arguments} } */
+  svluti2_lane (u64, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svuint64_t' arguments} } */
+
+  svluti2_lane (s8, u8, 0);
+  svluti2_lane (s16, u8, 0);
+
+  svluti2_lane (s32, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svint32_t' arguments} } */
+  svluti2_lane (s64, u8, 0); /* { dg-error {'svluti2_lane' has no form that takes 'svint64_t' arguments} } */
+
+  svluti4_lane (f16, u8, 0);
+  svluti4_lane (bf16, u8, 0);
+  svluti4_lane_x2 (f16x2, u8, 0);
+  svluti4_lane_x2 (bf16x2, u8, 0);
+
+  svluti4_lane (f32, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svfloat32_t' arguments} } */
+  svluti4_lane (f64, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svfloat64_t' arguments} } */
+  svluti4_lane_x2 (f32x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svfloat32x2_t' arguments} } */
+  svluti4_lane_x2 (f64x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svfloat64x2_t' arguments} } */
+
+  svluti4_lane (u8, u8, 0);
+  svluti4_lane (u16, u8, 0);
+  svluti4_lane_x2 (u8x2, u8, 0);
+  svluti4_lane_x2 (u16x2, u8, 0);
+
+  svluti4_lane (u32, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svuint32_t' arguments} } */
+  svluti4_lane (u64, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svuint64_t' arguments} } */
+  svluti4_lane_x2 (u32x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svuint32x2_t' arguments} } */
+  svluti4_lane_x2 (u64x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svuint64x2_t' arguments} } */
+
+  svluti4_lane (s8, u8, 0);
+  svluti4_lane (s16, u8, 0);
+  svluti4_lane_x2 (s8x2, u8, 0);
+  svluti4_lane_x2 (s16x2, u8, 0);
+
+  svluti4_lane (s32, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svint32_t' arguments} } */
+  svluti4_lane (s64, u8, 0); /* { dg-error {'svluti4_lane' has no form that takes 'svint64_t' arguments} } */
+  svluti4_lane_x2 (s32x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svint32x2_t' arguments} } */
+  svluti4_lane_x2 (s64x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form that takes 'svint64x2_t' arguments} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_2.c
new file mode 100644
index 00000000000..3c0664df943
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_2.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+
+#include <arm_sve.h>
+
+#pragma GCC target ("arch=armv9.2-a+sve2")
+
+void
+test (svfloat16_t f16, svuint8_t u8)
+{
+  svluti2_lane (f16, u8, 0); /* { dg-error {ACLE function 'svluti2_lane_f16' requires ISA extension 'lut'} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_3.c
new file mode 100644
index 00000000000..d35fda9e5b0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_3.c
@@ -0,0 +1,56 @@ 
+/* { dg-do compile } */
+
+#include <arm_sve.h>
+
+#pragma GCC target ("arch=armv9.2-a+sve2+lut")
+
+void
+test (svfloat16_t f16, svfloat16x2_t f16x2,
+      svuint8_t u8, svuint16_t u16, svuint8x2_t u8x2, svuint16x2_t u16x2,
+      svint8_t s8, svint16_t s16, svint8x2_t s8x2, svint16x2_t s16x2,
+      svbfloat16_t bf16, svbfloat16x2_t bf16x2)
+{
+  svluti2_lane (f16, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti2_lane', which expects a value in the range \[0, 7\]} } */
+  svluti2_lane (f16, u8, 8); /* { dg-error {passing 8 to argument 3 of 'svluti2_lane', which expects a value in the range \[0, 7\]} } */
+
+  svluti2_lane (bf16, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti2_lane', which expects a value in the range \[0, 7\]} } */
+  svluti2_lane (bf16, u8, 8); /* { dg-error {passing 8 to argument 3 of 'svluti2_lane', which expects a value in the range \[0, 7\]} } */
+
+  svluti2_lane (u8, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti2_lane', which expects a value in the range \[0, 3\]} } */
+  svluti2_lane (u8, u8, 4); /* { dg-error {passing 4 to argument 3 of 'svluti2_lane', which expects a value in the range \[0, 3\]} } */
+  svluti2_lane (u16, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti2_lane', which expects a value in the range \[0, 7\]} } */
+  svluti2_lane (u16, u8, 8); /* { dg-error {passing 8 to argument 3 of 'svluti2_lane', which expects a value in the range \[0, 7\]} } */
+
+  svluti2_lane (s8, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti2_lane', which expects a value in the range \[0, 3\]} } */
+  svluti2_lane (s8, u8, 4); /* { dg-error {passing 4 to argument 3 of 'svluti2_lane', which expects a value in the range \[0, 3\]} } */
+  svluti2_lane (s16, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti2_lane', which expects a value in the range \[0, 7\]} } */
+  svluti2_lane (s16, u8, 8); /* { dg-error {passing 8 to argument 3 of 'svluti2_lane', which expects a value in the range \[0, 7\]} } */
+
+  svluti4_lane (f16, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti4_lane', which expects a value in the range \[0, 3\]} } */
+  svluti4_lane (f16, u8, 4); /* { dg-error {passing 4 to argument 3 of 'svluti4_lane', which expects a value in the range \[0, 3\]} } */
+  svluti4_lane_x2 (f16x2, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti4_lane_x2', which expects a value in the range \[0, 3\]} } */
+  svluti4_lane_x2 (f16x2, u8, 4); /* { dg-error {passing 4 to argument 3 of 'svluti4_lane_x2', which expects a value in the range \[0, 3\]} } */
+
+  svluti4_lane (bf16, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti4_lane', which expects a value in the range \[0, 3\]} } */
+  svluti4_lane (bf16, u8, 4); /* { dg-error {passing 4 to argument 3 of 'svluti4_lane', which expects a value in the range \[0, 3\]} } */
+  svluti4_lane_x2 (bf16x2, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti4_lane_x2', which expects a value in the range \[0, 3\]} } */
+  svluti4_lane_x2 (bf16x2, u8, 4); /* { dg-error {passing 4 to argument 3 of 'svluti4_lane_x2', which expects a value in the range \[0, 3\]} } */
+
+  svluti4_lane (u8, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti4_lane', which expects a value in the range \[0, 1\]} } */
+  svluti4_lane (u8, u8, 2); /* { dg-error {passing 2 to argument 3 of 'svluti4_lane', which expects a value in the range \[0, 1\]} } */
+  svluti4_lane (u16, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti4_lane', which expects a value in the range \[0, 3\]} } */
+  svluti4_lane (u16, u8, 4); /* { dg-error {passing 4 to argument 3 of 'svluti4_lane', which expects a value in the range \[0, 3\]} } */
+  svluti4_lane_x2 (u8x2, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti4_lane_x2', which expects a value in the range \[0, 1\]} } */
+  svluti4_lane_x2 (u8x2, u8, 2); /* { dg-error {passing 2 to argument 3 of 'svluti4_lane_x2', which expects a value in the range \[0, 1\]} } */
+  svluti4_lane_x2 (u16x2, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti4_lane_x2', which expects a value in the range \[0, 3\]} } */
+  svluti4_lane_x2 (u16x2, u8, 4); /* { dg-error {passing 4 to argument 3 of 'svluti4_lane_x2', which expects a value in the range \[0, 3\]} } */
+
+  svluti4_lane (s8, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti4_lane', which expects a value in the range \[0, 1\]} } */
+  svluti4_lane (s8, u8, 2); /* { dg-error {passing 2 to argument 3 of 'svluti4_lane', which expects a value in the range \[0, 1\]} } */
+  svluti4_lane (s16, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti4_lane', which expects a value in the range \[0, 3\]} } */
+  svluti4_lane (s16, u8, 4); /* { dg-error {passing 4 to argument 3 of 'svluti4_lane', which expects a value in the range \[0, 3\]} } */
+  svluti4_lane_x2 (s8x2, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti4_lane_x2', which expects a value in the range \[0, 1\]} } */
+  svluti4_lane_x2 (s8x2, u8, 2); /* { dg-error {passing 2 to argument 3 of 'svluti4_lane_x2', which expects a value in the range \[0, 1\]} } */
+  svluti4_lane_x2 (s16x2, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svluti4_lane_x2', which expects a value in the range \[0, 3\]} } */
+  svluti4_lane_x2 (s16x2, u8, 4); /* { dg-error {passing 4 to argument 3 of 'svluti4_lane_x2', which expects a value in the range \[0, 3\]} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
new file mode 100644
index 00000000000..f25c2f8bb45
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
@@ -0,0 +1,50 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti2_min_idx_test:
+**	 luti2	z1\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_min_idx_test, svbfloat16_t, svuint8_t, z1,
+                svluti2_lane_bf16 (z28, z0, 0),
+                svluti2_lane (z28, z0, 0))
+
+/*
+** luti2_max_idx_test:
+**	 luti2	z1\.h, \{ z28\.h \}, z0\[7\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_max_idx_test, svbfloat16_t, svuint8_t, z1,
+                svluti2_lane_bf16 (z28, z0, 7),
+                svluti2_lane (z28, z0, 7))
+
+/*
+** luti2_tied_min_idx_test:
+**	 luti2	z28\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_tied_min_idx_test, svbfloat16_t, svuint8_t, z28,
+                svluti2_lane_bf16 (z28, z0, 0),
+                svluti2_lane (z28, z0, 0))
+
+/*
+** luti2_tied_max_idx_test:
+**	 luti2	z28\.h, \{ z28\.h \}, z0\[7\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_tied_max_idx_test, svbfloat16_t, svuint8_t, z28,
+                svluti2_lane_bf16 (z28, z0, 7),
+                svluti2_lane (z28, z0, 7))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_f16.c
new file mode 100644
index 00000000000..39c814c08f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_f16.c
@@ -0,0 +1,50 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti2_min_idx_test:
+**	 luti2	z1\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_min_idx_test, svfloat16_t, svuint8_t, z1,
+                svluti2_lane_f16 (z28, z0, 0),
+                svluti2_lane (z28, z0, 0))
+
+/*
+** luti2_max_idx_test:
+**	 luti2	z1\.h, \{ z28\.h \}, z0\[7\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_max_idx_test, svfloat16_t, svuint8_t, z1,
+                svluti2_lane_f16 (z28, z0, 7),
+                svluti2_lane (z28, z0, 7))
+
+/*
+** luti2_tied_min_idx_test:
+**	 luti2	z28\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_tied_min_idx_test, svfloat16_t, svuint8_t, z28,
+                svluti2_lane_f16 (z28, z0, 0),
+                svluti2_lane (z28, z0, 0))
+
+/*
+** luti2_tied_max_idx_test:
+**	 luti2	z28\.h, \{ z28\.h \}, z0\[7\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_tied_max_idx_test, svfloat16_t, svuint8_t, z28,
+                svluti2_lane_f16 (z28, z0, 7),
+                svluti2_lane (z28, z0, 7))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s16.c
new file mode 100644
index 00000000000..5f4bc0d488b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s16.c
@@ -0,0 +1,50 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti2_min_idx_test:
+**	 luti2	z1\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_min_idx_test, svint16_t, svuint8_t, z1,
+                svluti2_lane_s16 (z28, z0, 0),
+                svluti2_lane (z28, z0, 0))
+
+/*
+** luti2_max_idx_test:
+**	 luti2	z1\.h, \{ z28\.h \}, z0\[7\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_max_idx_test, svint16_t, svuint8_t, z1,
+                svluti2_lane_s16 (z28, z0, 7),
+                svluti2_lane (z28, z0, 7))
+
+/*
+** luti2_tied_min_idx_test:
+**	 luti2	z28\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_tied_min_idx_test, svint16_t, svuint8_t, z28,
+                svluti2_lane_s16 (z28, z0, 0),
+                svluti2_lane (z28, z0, 0))
+
+/*
+** luti2_tied_max_idx_test:
+**	 luti2	z28\.h, \{ z28\.h \}, z0\[7\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_tied_max_idx_test, svint16_t, svuint8_t, z28,
+                svluti2_lane_s16 (z28, z0, 7),
+                svluti2_lane (z28, z0, 7))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s8.c
new file mode 100644
index 00000000000..f923296f44b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s8.c
@@ -0,0 +1,50 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti2_min_idx_test:
+**	 luti2	z1\.b, \{ z28\.b \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_min_idx_test, svint8_t, svuint8_t, z1,
+                svluti2_lane_s8 (z28, z0, 0),
+                svluti2_lane (z28, z0, 0))
+
+/*
+** luti2_max_idx_test:
+**	 luti2	z1\.b, \{ z28\.b \}, z0\[3\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_max_idx_test, svint8_t, svuint8_t, z1,
+                svluti2_lane_s8 (z28, z0, 3),
+                svluti2_lane (z28, z0, 3))
+
+/*
+** luti2_tied_min_idx_test:
+**	 luti2	z28\.b, \{ z28\.b \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_tied_min_idx_test, svint8_t, svuint8_t, z28,
+                svluti2_lane_s8 (z28, z0, 0),
+                svluti2_lane (z28, z0, 0))
+
+/*
+** luti2_tied_max_idx_test:
+**	 luti2	z28\.b, \{ z28\.b \}, z0\[3\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_tied_max_idx_test, svint8_t, svuint8_t, z28,
+                svluti2_lane_s8 (z28, z0, 3),
+                svluti2_lane (z28, z0, 3))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u16.c
new file mode 100644
index 00000000000..bb9c3e098f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u16.c
@@ -0,0 +1,50 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti2_min_idx_test:
+**	 luti2	z1\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_min_idx_test, svuint16_t, svuint8_t, z1,
+                svluti2_lane_u16 (z28, z0, 0),
+                svluti2_lane (z28, z0, 0))
+
+/*
+** luti2_max_idx_test:
+**	 luti2	z1\.h, \{ z28\.h \}, z0\[7\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_max_idx_test, svuint16_t, svuint8_t, z1,
+                svluti2_lane_u16 (z28, z0, 7),
+                svluti2_lane (z28, z0, 7))
+
+/*
+** luti2_tied_min_idx_test:
+**	 luti2	z28\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_tied_min_idx_test, svuint16_t, svuint8_t, z28,
+                svluti2_lane_u16 (z28, z0, 0),
+                svluti2_lane (z28, z0, 0))
+
+/*
+** luti2_tied_max_idx_test:
+**	 luti2	z28\.h, \{ z28\.h \}, z0\[7\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_tied_max_idx_test, svuint16_t, svuint8_t, z28,
+                svluti2_lane_u16 (z28, z0, 7),
+                svluti2_lane (z28, z0, 7))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u8.c
new file mode 100644
index 00000000000..895d850aaa6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u8.c
@@ -0,0 +1,50 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti2_min_idx_test:
+**	 luti2	z1\.b, \{ z28\.b \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_min_idx_test, svuint8_t, svuint8_t, z1,
+                svluti2_lane_u8 (z28, z0, 0),
+                svluti2_lane (z28, z0, 0))
+
+/*
+** luti2_max_idx_test:
+**	 luti2	z1\.b, \{ z28\.b \}, z0\[3\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_max_idx_test, svuint8_t, svuint8_t, z1,
+                svluti2_lane_u8 (z28, z0, 3),
+                svluti2_lane (z28, z0, 3))
+
+/*
+** luti2_tied_min_idx_test:
+**	 luti2	z28\.b, \{ z28\.b \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_tied_min_idx_test, svuint8_t, svuint8_t, z28,
+                svluti2_lane_u8 (z28, z0, 0),
+                svluti2_lane (z28, z0, 0))
+
+/*
+** luti2_tied_max_idx_test:
+**	 luti2	z28\.b, \{ z28\.b \}, z0\[3\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti2_tied_max_idx_test, svuint8_t, svuint8_t, z28,
+                svluti2_lane_u8 (z28, z0, 3),
+                svluti2_lane (z28, z0, 3))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16.c
new file mode 100644
index 00000000000..b005ba6ca47
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16.c
@@ -0,0 +1,50 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_min_idx_test:
+**	 luti4	z1\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_min_idx_test, svbfloat16_t, svuint8_t, z1,
+                svluti4_lane_bf16 (z28, z0, 0),
+                svluti4_lane (z28, z0, 0))
+
+/*
+** luti4_max_idx_test:
+**	 luti4	z1\.h, \{ z28\.h \}, z0\[3\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_max_idx_test, svbfloat16_t, svuint8_t, z1,
+                svluti4_lane_bf16 (z28, z0, 3),
+                svluti4_lane (z28, z0, 3))
+
+/*
+** luti4_tied_min_idx_test:
+**	 luti4	z28\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_tied_min_idx_test, svbfloat16_t, svuint8_t, z28,
+                svluti4_lane_bf16 (z28, z0, 0),
+                svluti4_lane (z28, z0, 0))
+
+/*
+** luti4_tied_max_idx_test:
+**	 luti4	z28\.h, \{ z28\.h \}, z0\[3\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_tied_max_idx_test, svbfloat16_t, svuint8_t, z28,
+                svluti4_lane_bf16 (z28, z0, 3),
+                svluti4_lane (z28, z0, 3))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c
new file mode 100644
index 00000000000..1f3f8aab5ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c
@@ -0,0 +1,30 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_min_idx_test:
+**	luti4	z0\.h, \{z6\.h \- z7\.h\}, z5\[0\]
+**	ret
+*/
+
+TEST_1X2_NARROW(luti4_min_idx_test, svbfloat16_t, svbfloat16x2_t, svuint8_t,
+		z0_res = svluti4_lane_bf16_x2 (z6, z5, 0),
+		z0_res = svluti4_lane_x2 (z6, z5, 0))
+
+/*
+** luti4_max_idx_test:
+**	luti4	z0\.h, \{z6\.h \- z7\.h\}, z5\[3\]
+**	ret
+*/
+
+TEST_1X2_NARROW(luti4_max_idx_test, svbfloat16_t, svbfloat16x2_t, svuint8_t,
+		z0_res = svluti4_lane_bf16_x2 (z6, z5, 3),
+		z0_res = svluti4_lane_x2 (z6, z5, 3))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16.c
new file mode 100644
index 00000000000..644c0b425c5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16.c
@@ -0,0 +1,50 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_min_idx_test:
+**	 luti4	z1\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_min_idx_test, svfloat16_t, svuint8_t, z1,
+                svluti4_lane_f16 (z28, z0, 0),
+                svluti4_lane (z28, z0, 0))
+
+/*
+** luti4_max_idx_test:
+**	 luti4	z1\.h, \{ z28\.h \}, z0\[3\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_max_idx_test, svfloat16_t, svuint8_t, z1,
+                svluti4_lane_f16 (z28, z0, 3),
+                svluti4_lane (z28, z0, 3))
+
+/*
+** luti4_tied_min_idx_test:
+**	 luti4	z28\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_tied_min_idx_test, svfloat16_t, svuint8_t, z28,
+                svluti4_lane_f16 (z28, z0, 0),
+                svluti4_lane (z28, z0, 0))
+
+/*
+** luti4_tied_max_idx_test:
+**	 luti4	z28\.h, \{ z28\.h \}, z0\[3\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_tied_max_idx_test, svfloat16_t, svuint8_t, z28,
+                svluti4_lane_f16 (z28, z0, 3),
+                svluti4_lane (z28, z0, 3))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16_x2.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16_x2.c
new file mode 100644
index 00000000000..041f7fff1e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16_x2.c
@@ -0,0 +1,30 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_min_idx_test:
+**	luti4	z0\.h, \{z6\.h \- z7\.h\}, z5\[0\]
+**	ret
+*/
+
+TEST_1X2_NARROW(luti4_min_idx_test, svfloat16_t, svfloat16x2_t, svuint8_t,
+		z0_res = svluti4_lane_f16_x2 (z6, z5, 0),
+		z0_res = svluti4_lane_x2 (z6, z5, 0))
+
+/*
+** luti4_max_idx_test:
+**	luti4	z0\.h, \{z6\.h \- z7\.h\}, z5\[3\]
+**	ret
+*/
+
+TEST_1X2_NARROW(luti4_max_idx_test, svfloat16_t, svfloat16x2_t, svuint8_t,
+		z0_res = svluti4_lane_f16_x2 (z6, z5, 3),
+		z0_res = svluti4_lane_x2 (z6, z5, 3))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16.c
new file mode 100644
index 00000000000..ad35a214549
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16.c
@@ -0,0 +1,50 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_min_idx_test:
+**	 luti4	z1\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_min_idx_test, svint16_t, svuint8_t, z1,
+                svluti4_lane_s16 (z28, z0, 0),
+                svluti4_lane (z28, z0, 0))
+
+/*
+** luti4_max_idx_test:
+**	 luti4	z1\.h, \{ z28\.h \}, z0\[3\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_max_idx_test, svint16_t, svuint8_t, z1,
+                svluti4_lane_s16 (z28, z0, 3),
+                svluti4_lane (z28, z0, 3))
+
+/*
+** luti4_tied_min_idx_test:
+**	 luti4	z28\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_tied_min_idx_test, svint16_t, svuint8_t, z28,
+                svluti4_lane_s16 (z28, z0, 0),
+                svluti4_lane (z28, z0, 0))
+
+/*
+** luti4_tied_max_idx_test:
+**	 luti4	z28\.h, \{ z28\.h \}, z0\[3\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_tied_max_idx_test, svint16_t, svuint8_t, z28,
+                svluti4_lane_s16 (z28, z0, 3),
+                svluti4_lane (z28, z0, 3))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16_x2.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16_x2.c
new file mode 100644
index 00000000000..96c9a0d48de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16_x2.c
@@ -0,0 +1,30 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_min_idx_test:
+**	luti4	z0\.h, \{z6\.h \- z7\.h\}, z5\[0\]
+**	ret
+*/
+
+TEST_1X2_NARROW(luti4_min_idx_test, svint16_t, svint16x2_t, svuint8_t,
+		z0_res = svluti4_lane_s16_x2 (z6, z5, 0),
+		z0_res = svluti4_lane_x2 (z6, z5, 0))
+
+/*
+** luti4_max_idx_test:
+**	luti4	z0\.h, \{z6\.h \- z7\.h\}, z5\[3\]
+**	ret
+*/
+
+TEST_1X2_NARROW(luti4_max_idx_test, svint16_t, svint16x2_t, svuint8_t,
+		z0_res = svluti4_lane_s16_x2 (z6, z5, 3),
+		z0_res = svluti4_lane_x2 (z6, z5, 3))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s8.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s8.c
new file mode 100644
index 00000000000..b6f8a5c7c4a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s8.c
@@ -0,0 +1,50 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_min_idx_test:
+**	 luti4	z1\.b, \{ z28\.b \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_min_idx_test, svint8_t, svuint8_t, z1,
+                svluti4_lane_s8 (z28, z0, 0),
+                svluti4_lane (z28, z0, 0))
+
+/*
+** luti4_max_idx_test:
+**	 luti4	z1\.b, \{ z28\.b \}, z0\[1\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_max_idx_test, svint8_t, svuint8_t, z1,
+                svluti4_lane_s8 (z28, z0, 1),
+                svluti4_lane (z28, z0, 1))
+
+/*
+** luti4_tied_min_idx_test:
+**	 luti4	z28\.b, \{ z28\.b \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_tied_min_idx_test, svint8_t, svuint8_t, z28,
+                svluti4_lane_s8 (z28, z0, 0),
+                svluti4_lane (z28, z0, 0))
+
+/*
+** luti4_tied_max_idx_test:
+**	 luti4	z28\.b, \{ z28\.b \}, z0\[1\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_tied_max_idx_test, svint8_t, svuint8_t, z28,
+                svluti4_lane_s8 (z28, z0, 1),
+                svluti4_lane (z28, z0, 1))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16.c
new file mode 100644
index 00000000000..92138bd7b41
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16.c
@@ -0,0 +1,50 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_min_idx_test:
+**	 luti4	z1\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_min_idx_test, svuint16_t, svuint8_t, z1,
+                svluti4_lane_u16 (z28, z0, 0),
+                svluti4_lane (z28, z0, 0))
+
+/*
+** luti4_max_idx_test:
+**	 luti4	z1\.h, \{ z28\.h \}, z0\[3\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_max_idx_test, svuint16_t, svuint8_t, z1,
+                svluti4_lane_u16 (z28, z0, 3),
+                svluti4_lane (z28, z0, 3))
+
+/*
+** luti4_tied_min_idx_test:
+**	 luti4	z28\.h, \{ z28\.h \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_tied_min_idx_test, svuint16_t, svuint8_t, z28,
+                svluti4_lane_u16 (z28, z0, 0),
+                svluti4_lane (z28, z0, 0))
+
+/*
+** luti4_tied_max_idx_test:
+**	 luti4	z28\.h, \{ z28\.h \}, z0\[3\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_tied_max_idx_test, svuint16_t, svuint8_t, z28,
+                svluti4_lane_u16 (z28, z0, 3),
+                svluti4_lane (z28, z0, 3))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16_x2.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16_x2.c
new file mode 100644
index 00000000000..4de2234d9af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16_x2.c
@@ -0,0 +1,30 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_min_idx_test:
+**	luti4	z0\.h, \{z6\.h \- z7\.h\}, z5\[0\]
+**	ret
+*/
+
+TEST_1X2_NARROW(luti4_min_idx_test, svuint16_t, svuint16x2_t, svuint8_t,
+		z0_res = svluti4_lane_u16_x2 (z6, z5, 0),
+		z0_res = svluti4_lane_x2 (z6, z5, 0))
+
+/*
+** luti4_max_idx_test:
+**	luti4	z0\.h, \{z6\.h \- z7\.h\}, z5\[3\]
+**	ret
+*/
+
+TEST_1X2_NARROW(luti4_max_idx_test, svuint16_t, svuint16x2_t, svuint8_t,
+		z0_res = svluti4_lane_u16_x2 (z6, z5, 3),
+		z0_res = svluti4_lane_x2 (z6, z5, 3))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u8.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u8.c
new file mode 100644
index 00000000000..e1e9523e375
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u8.c
@@ -0,0 +1,50 @@ 
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_min_idx_test:
+**	 luti4	z1\.b, \{ z28\.b \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_min_idx_test, svuint8_t, svuint8_t, z1,
+                svluti4_lane_u8 (z28, z0, 0),
+                svluti4_lane (z28, z0, 0))
+
+/*
+** luti4_max_idx_test:
+**	 luti4	z1\.b, \{ z28\.b \}, z0\[1\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_max_idx_test, svuint8_t, svuint8_t, z1,
+                svluti4_lane_u8 (z28, z0, 1),
+                svluti4_lane (z28, z0, 1))
+
+/*
+** luti4_tied_min_idx_test:
+**	 luti4	z28\.b, \{ z28\.b \}, z0\[0\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_tied_min_idx_test, svuint8_t, svuint8_t, z28,
+                svluti4_lane_u8 (z28, z0, 0),
+                svluti4_lane (z28, z0, 0))
+
+/*
+** luti4_tied_max_idx_test:
+**	 luti4	z28\.b, \{ z28\.b \}, z0\[1\]
+**	 ret
+*/
+
+TEST_XN_SINGLE (luti4_tied_max_idx_test, svuint8_t, svuint8_t, z28,
+                svluti4_lane_u8 (z28, z0, 1),
+                svluti4_lane (z28, z0, 1))
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 939ef3a4119..a8f87b66cd7 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -12304,7 +12304,7 @@  proc check_effective_target_aarch64_tiny { } {
 
 foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"
 			  "i8mm" "f32mm" "f64mm" "bf16" "sb" "sve2" "ls64"
-			  "sme" "sme-i16i64" "sme2" "sve-b16b16"
+			  "lut" "sme" "sme-i16i64" "sme2" "sve-b16b16"
 			  "sme-b16b16" "sme-f16f16" "sme2p1" "fp8" "fp8fma"
 			  "ssve-fp8fma" "fp8dot2" "ssve-fp8dot2" "fp8dot4"
 			  "ssve-fp8dot4"} {