x86: Properly implement AMX-TILE load/store intrinsics
Checks
Commit Message
ldtilecfg and sttilecfg take a 512-byte memory block. With
_tile_loadconfig implemented as
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tile_loadconfig (const void *__config)
{
__asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
}
GCC sees:
(parallel [
(asm_operands/v ("ldtilecfg %X0") ("") 0
[(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars)
(const_int -64 [0xffffffffffffffc0])) [1 MEM[(const void * *)&tile_data]+0 S8 A128])]
[(asm_input:DI ("m"))]
(clobber (reg:CC 17 flags))])
and the memory operand size is 1 byte. As a result, the remaining 511
bytes are ignored by GCC. Implement the ldtilecfg and sttilecfg intrinsics
with a pointer to BLKmode to honor the 512-byte memory block.
gcc/ChangeLog:
PR target/114098
* config/i386/amxtileintrin.h (_tile_loadconfig): Use
__builtin_ia32_ldtilecfg.
(_tile_storeconfig): Use __builtin_ia32_sttilecfg.
* config/i386/i386-builtin.def (BDESC): Add
__builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg.
* config/i386/i386-expand.cc (ix86_expand_builtin): Handle
IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG.
* config/i386/i386.md (ldtilecfg): New pattern.
(sttilecfg): Likewise.
gcc/testsuite/ChangeLog:
PR target/114098
* gcc.target/i386/amxtile-4.c: New test.
---
gcc/config/i386/amxtileintrin.h | 4 +-
gcc/config/i386/i386-builtin.def | 4 ++
gcc/config/i386/i386-expand.cc | 19 ++++++++
gcc/config/i386/i386.md | 24 ++++++++++
gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++++++++++++++++++++++
5 files changed, 104 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c
Comments
Thanks for fixing this! I didn't notice that the pointer conversion could
cause this issue...
Would it be possible to use a local array instead, like
char a[64] = (char *)p
__asm__ volatile ("ldtilecfg\t%X0" :: "m" (a)));
If not, the two patterns can use "m" instead of "jm", since APX
supports the EGPR extension for AMX.
On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> ldtilecfg and sttilecfg take a 512-byte memory block. With
> _tile_loadconfig implemented as
>
> extern __inline void
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _tile_loadconfig (const void *__config)
> {
> __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> }
>
> GCC sees:
>
> (parallel [
> (asm_operands/v ("ldtilecfg %X0") ("") 0
> [(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars)
> (const_int -64 [0xffffffffffffffc0])) [1 MEM[(const void * *)&tile_data]+0 S8 A128])]
> [(asm_input:DI ("m"))]
> (clobber (reg:CC 17 flags))])
>
> and the memory operand size is 1 byte. As the result, the rest of 511
> bytes is ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics
> with a pointer to BLKmode to honor the 512-byte memory block.
>
> gcc/ChangeLog:
>
> PR target/114098
> * config/i386/amxtileintrin.h (_tile_loadconfig): Use
> __builtin_ia32_ldtilecfg.
> (_tile_storeconfig): Use __builtin_ia32_sttilecfg.
> * config/i386/i386-builtin.def (BDESC): Add
> __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg.
> * config/i386/i386-expand.cc (ix86_expand_builtin): Handle
> IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG.
> * config/i386/i386.md (ldtilecfg): New pattern.
> (sttilecfg): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> PR target/114098
> * gcc.target/i386/amxtile-4.c: New test.
> ---
> gcc/config/i386/amxtileintrin.h | 4 +-
> gcc/config/i386/i386-builtin.def | 4 ++
> gcc/config/i386/i386-expand.cc | 19 ++++++++
> gcc/config/i386/i386.md | 24 ++++++++++
> gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++++++++++++++++++++++
> 5 files changed, 104 insertions(+), 2 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c
>
> diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h
> index d1a26e0fea5..5081b326498 100644
> --- a/gcc/config/i386/amxtileintrin.h
> +++ b/gcc/config/i386/amxtileintrin.h
> @@ -39,14 +39,14 @@ extern __inline void
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _tile_loadconfig (const void *__config)
> {
> - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> + __builtin_ia32_ldtilecfg (__config);
> }
>
> extern __inline void
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _tile_storeconfig (void *__config)
> {
> - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
> + __builtin_ia32_sttilecfg (__config);
> }
>
> extern __inline void
> diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> index 729355230b8..88dd7f8857f 100644
> --- a/gcc/config/i386/i386-builtin.def
> +++ b/gcc/config/i386/i386-builtin.def
> @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b
> BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
>
> +/* LDTILECFG and STTILECFG. */
> +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, UNKNOWN, (int) VOID_FTYPE_PCVOID)
> +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, UNKNOWN, (int) VOID_FTYPE_PVOID)
This should be CODE_FOR_sttilecfg.
> +
> /* SSE */
> BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
> BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index a4d3369f01b..17993eb837f 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -14152,6 +14152,25 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
> emit_insn (pat);
> return 0;
>
> + case IX86_BUILTIN_LDTILECFG:
> + case IX86_BUILTIN_STTILECFG:
> + arg0 = CALL_EXPR_ARG (exp, 0);
> + op0 = expand_normal (arg0);
> +
> + if (!address_operand (op0, VOIDmode))
> + {
> + op0 = convert_memory_address (Pmode, op0);
> + op0 = copy_addr_to_reg (op0);
> + }
> + op0 = gen_rtx_MEM (BLKmode, op0);
Maybe we can just use XImode here and adjust the patterns to use XI.
> + if (fcode == IX86_BUILTIN_LDTILECFG)
> + icode = CODE_FOR_ldtilecfg;
> + else
> + icode = CODE_FOR_sttilecfg;
> + pat = GEN_FCN (icode) (op0);
> + emit_insn (pat);
> + return 0;
> +
> case IX86_BUILTIN_LLWPCB:
> arg0 = CALL_EXPR_ARG (exp, 0);
> op0 = expand_normal (arg0);
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 6a26d966a0e..0ede6adac2f 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -353,6 +353,10 @@ (define_c_enum "unspecv" [
> ;; For USER_MSR support
> UNSPECV_URDMSR
> UNSPECV_UWRMSR
> +
> + ;; For AMX-TILE
> + UNSPECV_LDTILECFG
> + UNSPECV_STTILECFG
> ])
>
> ;; Constants to represent rounding modes in the ROUND instruction
> @@ -28152,6 +28156,26 @@ (define_insn "uwrmsr"
> [(set_attr "prefix" "vex")
> (set_attr "type" "other")])
>
> +
> +(define_insn "ldtilecfg"
> + [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "jm")]
ldtilecfg/sttilecfg can be extended to use gpr32, so just use "m" instead of "jm".
> + UNSPECV_LDTILECFG)]
> + "TARGET_AMX_TILE"
> + "ldtilecfg\t%0"
> + [(set_attr "type" "other")
> + (set_attr "addr" "gpr16")
Remove this "addr" attribute.
> + (set_attr "prefix" "vex")
Possibly better with maybe_evex.
> + (set_attr "memory" "load")])
> +
> +(define_insn "sttilecfg"
> + [(set (match_operand:BLK 0 "memory_operand" "=jm")
> + (unspec_volatile:BLK [(const_int 0)] UNSPECV_STTILECFG))]
> + "TARGET_AMX_TILE"
> + "sttilecfg\t%0"
> + [(set_attr "type" "other")
> + (set_attr "addr" "gpr16")
> + (set_attr "prefix" "vex")
> + (set_attr "memory" "store")])
> (include "mmx.md")
> (include "sse.md")
> (include "sync.md")
> diff --git a/gcc/testsuite/gcc.target/i386/amxtile-4.c b/gcc/testsuite/gcc.target/i386/amxtile-4.c
> new file mode 100644
> index 00000000000..1255af2594e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/amxtile-4.c
> @@ -0,0 +1,55 @@
> +/* PR target/114098 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mamx-tile" } */
> +
> +#include <stdint.h>
> +#include <x86intrin.h>
> +
> +#define MAX_ROWS 16
> +#define MAX_COLS 64
> +#define MAX 1024
> +#define STRIDE 64
> +
> +typedef struct __tile_config
> +{
> + uint8_t palette_id;
> + uint8_t start_row;
> + uint8_t reserved_0[14];
> + uint16_t colsb[16];
> + uint8_t rows[16];
> +} __tilecfg;
> +
> +
> +extern void bar (__tilecfg *tileinfo);
> +
> +/* Initialize tile config */
> +static void
> +init_tile_config (__tilecfg *tileinfo)
> +{
> + int i;
> + tileinfo->palette_id = 1;
> + tileinfo->start_row = 0;
> +
> + for (i = 0; i < 1; ++i)
> + {
> + tileinfo->colsb[i] = MAX_ROWS;
> + tileinfo->rows[i] = MAX_ROWS;
> + }
> +
> + for (i = 1; i < 4; ++i)
> + {
> + tileinfo->colsb[i] = MAX_COLS;
> + tileinfo->rows[i] = MAX_ROWS;
> + }
> +
> + _tile_loadconfig (tileinfo);
> +}
> +
> +void
> +enable_amx (void)
> +{
> + __tilecfg tile_data = {0};
> + init_tile_config (&tile_data);
> +}
> +
> +/* { dg-final { scan-assembler-times "pxor\[^\n\]*%xmm" 1 } } */
> --
> 2.43.2
>
On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > ldtilecfg and sttilecfg take a 512-byte memory block. With
> > _tile_loadconfig implemented as
> >
> > extern __inline void
> > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > _tile_loadconfig (const void *__config)
> > {
> > __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> > }
> >
> > GCC sees:
> >
> > (parallel [
> > (asm_operands/v ("ldtilecfg %X0") ("") 0
> > [(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars)
> > (const_int -64 [0xffffffffffffffc0])) [1 MEM[(const void * *)&tile_data]+0 S8 A128])]
> > [(asm_input:DI ("m"))]
> > (clobber (reg:CC 17 flags))])
> >
> > and the memory operand size is 1 byte. As the result, the rest of 511
> > bytes is ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics
> > with a pointer to BLKmode to honor the 512-byte memory block.
> >
> > gcc/ChangeLog:
> >
> > PR target/114098
> > * config/i386/amxtileintrin.h (_tile_loadconfig): Use
> > __builtin_ia32_ldtilecfg.
> > (_tile_storeconfig): Use __builtin_ia32_sttilecfg.
> > * config/i386/i386-builtin.def (BDESC): Add
> > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg.
> > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle
> > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG.
> > * config/i386/i386.md (ldtilecfg): New pattern.
> > (sttilecfg): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/114098
> > * gcc.target/i386/amxtile-4.c: New test.
> > ---
> > gcc/config/i386/amxtileintrin.h | 4 +-
> > gcc/config/i386/i386-builtin.def | 4 ++
> > gcc/config/i386/i386-expand.cc | 19 ++++++++
> > gcc/config/i386/i386.md | 24 ++++++++++
> > gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++++++++++++++++++++++
> > 5 files changed, 104 insertions(+), 2 deletions(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c
> >
> > diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h
> > index d1a26e0fea5..5081b326498 100644
> > --- a/gcc/config/i386/amxtileintrin.h
> > +++ b/gcc/config/i386/amxtileintrin.h
> > @@ -39,14 +39,14 @@ extern __inline void
> > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > _tile_loadconfig (const void *__config)
> > {
> > - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> > + __builtin_ia32_ldtilecfg (__config);
> > }
> >
> > extern __inline void
> > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > _tile_storeconfig (void *__config)
> > {
> > - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
> > + __builtin_ia32_sttilecfg (__config);
> > }
> >
> > extern __inline void
> > diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> > index 729355230b8..88dd7f8857f 100644
> > --- a/gcc/config/i386/i386-builtin.def
> > +++ b/gcc/config/i386/i386-builtin.def
> > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b
> > BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> > BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> >
> > +/* LDTILECFG and STTILECFG. */
> > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, UNKNOWN, (int) VOID_FTYPE_PCVOID)
> > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, UNKNOWN, (int) VOID_FTYPE_PVOID)
> CODE_FOR_sttilecfg.
It is unused. I changed both to CODE_FOR_nothing.
> > +
> > /* SSE */
> > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
> > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
> > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > index a4d3369f01b..17993eb837f 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -14152,6 +14152,25 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
> > emit_insn (pat);
> > return 0;
> >
> > + case IX86_BUILTIN_LDTILECFG:
> > + case IX86_BUILTIN_STTILECFG:
> > + arg0 = CALL_EXPR_ARG (exp, 0);
> > + op0 = expand_normal (arg0);
> > +
> > + if (!address_operand (op0, VOIDmode))
> > + {
> > + op0 = convert_memory_address (Pmode, op0);
> > + op0 = copy_addr_to_reg (op0);
> > + }
> > + op0 = gen_rtx_MEM (BLKmode, op0);
> maybe we can just use XImode, and adjust the patterns with XI.
Changed.
> > + if (fcode == IX86_BUILTIN_LDTILECFG)
> > + icode = CODE_FOR_ldtilecfg;
> > + else
> > + icode = CODE_FOR_sttilecfg;
> > + pat = GEN_FCN (icode) (op0);
> > + emit_insn (pat);
> > + return 0;
> > +
> > case IX86_BUILTIN_LLWPCB:
> > arg0 = CALL_EXPR_ARG (exp, 0);
> > op0 = expand_normal (arg0);
> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > index 6a26d966a0e..0ede6adac2f 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -353,6 +353,10 @@ (define_c_enum "unspecv" [
> > ;; For USER_MSR support
> > UNSPECV_URDMSR
> > UNSPECV_UWRMSR
> > +
> > + ;; For AMX-TILE
> > + UNSPECV_LDTILECFG
> > + UNSPECV_STTILECFG
> > ])
> >
> > ;; Constants to represent rounding modes in the ROUND instruction
> > @@ -28152,6 +28156,26 @@ (define_insn "uwrmsr"
> > [(set_attr "prefix" "vex")
> > (set_attr "type" "other")])
> >
> > +
> > +(define_insn "ldtilecfg"
> > + [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "jm")]
> ldtilecfg/sttilecfg can be extended with gpr32, so just use m instead of jm.
> > + UNSPECV_LDTILECFG)]
> > + "TARGET_AMX_TILE"
> > + "ldtilecfg\t%0"
>
> > + [(set_attr "type" "other")
> > + (set_attr "addr" "gpr16")
> Remove this.
Done.
> > + (set_attr "prefix" "vex")
> Possible better with maybe_evex.
Done.
> > + (set_attr "memory" "load")])
> > +
> > +(define_insn "sttilecfg"
> > + [(set (match_operand:BLK 0 "memory_operand" "=jm")
> > + (unspec_volatile:BLK [(const_int 0)] UNSPECV_STTILECFG))]
> > + "TARGET_AMX_TILE"
> > + "sttilecfg\t%0"
> > + [(set_attr "type" "other")
> > + (set_attr "addr" "gpr16")
> > + (set_attr "prefix" "vex")
> > + (set_attr "memory" "store")])
> > (include "mmx.md")
> > (include "sse.md")
> > (include "sync.md")
> > diff --git a/gcc/testsuite/gcc.target/i386/amxtile-4.c b/gcc/testsuite/gcc.target/i386/amxtile-4.c
> > new file mode 100644
> > index 00000000000..1255af2594e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/amxtile-4.c
> > @@ -0,0 +1,55 @@
> > +/* PR target/114098 */
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -mamx-tile" } */
> > +
> > +#include <stdint.h>
> > +#include <x86intrin.h>
> > +
> > +#define MAX_ROWS 16
> > +#define MAX_COLS 64
> > +#define MAX 1024
> > +#define STRIDE 64
> > +
> > +typedef struct __tile_config
> > +{
> > + uint8_t palette_id;
> > + uint8_t start_row;
> > + uint8_t reserved_0[14];
> > + uint16_t colsb[16];
> > + uint8_t rows[16];
> > +} __tilecfg;
> > +
> > +
> > +extern void bar (__tilecfg *tileinfo);
> > +
> > +/* Initialize tile config */
> > +static void
> > +init_tile_config (__tilecfg *tileinfo)
> > +{
> > + int i;
> > + tileinfo->palette_id = 1;
> > + tileinfo->start_row = 0;
> > +
> > + for (i = 0; i < 1; ++i)
> > + {
> > + tileinfo->colsb[i] = MAX_ROWS;
> > + tileinfo->rows[i] = MAX_ROWS;
> > + }
> > +
> > + for (i = 1; i < 4; ++i)
> > + {
> > + tileinfo->colsb[i] = MAX_COLS;
> > + tileinfo->rows[i] = MAX_ROWS;
> > + }
> > +
> > + _tile_loadconfig (tileinfo);
> > +}
> > +
> > +void
> > +enable_amx (void)
> > +{
> > + __tilecfg tile_data = {0};
> > + init_tile_config (&tile_data);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "pxor\[^\n\]*%xmm" 1 } } */
> > --
> > 2.43.2
> >
>
I am testing this patch now.
Thanks.
On Mon, Feb 26, 2024 at 10:37 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > ldtilecfg and sttilecfg take a 512-byte memory block. With
> > > _tile_loadconfig implemented as
> > >
> > > extern __inline void
> > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > _tile_loadconfig (const void *__config)
> > > {
> > > __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> > > }
> > >
> > > GCC sees:
> > >
> > > (parallel [
> > > (asm_operands/v ("ldtilecfg %X0") ("") 0
> > > [(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars)
> > > (const_int -64 [0xffffffffffffffc0])) [1 MEM[(const void * *)&tile_data]+0 S8 A128])]
> > > [(asm_input:DI ("m"))]
> > > (clobber (reg:CC 17 flags))])
> > >
> > > and the memory operand size is 1 byte. As the result, the rest of 511
> > > bytes is ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics
> > > with a pointer to BLKmode to honor the 512-byte memory block.
> > >
> > > gcc/ChangeLog:
> > >
> > > PR target/114098
> > > * config/i386/amxtileintrin.h (_tile_loadconfig): Use
> > > __builtin_ia32_ldtilecfg.
> > > (_tile_storeconfig): Use __builtin_ia32_sttilecfg.
> > > * config/i386/i386-builtin.def (BDESC): Add
> > > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg.
> > > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle
> > > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG.
> > > * config/i386/i386.md (ldtilecfg): New pattern.
> > > (sttilecfg): Likewise.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > PR target/114098
> > > * gcc.target/i386/amxtile-4.c: New test.
> > > ---
> > > gcc/config/i386/amxtileintrin.h | 4 +-
> > > gcc/config/i386/i386-builtin.def | 4 ++
> > > gcc/config/i386/i386-expand.cc | 19 ++++++++
> > > gcc/config/i386/i386.md | 24 ++++++++++
> > > gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++++++++++++++++++++++
> > > 5 files changed, 104 insertions(+), 2 deletions(-)
> > > create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c
> > >
> > > diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h
> > > index d1a26e0fea5..5081b326498 100644
> > > --- a/gcc/config/i386/amxtileintrin.h
> > > +++ b/gcc/config/i386/amxtileintrin.h
> > > @@ -39,14 +39,14 @@ extern __inline void
> > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > _tile_loadconfig (const void *__config)
> > > {
> > > - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> > > + __builtin_ia32_ldtilecfg (__config);
> > > }
> > >
> > > extern __inline void
> > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > _tile_storeconfig (void *__config)
> > > {
> > > - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
> > > + __builtin_ia32_sttilecfg (__config);
> > > }
> > >
> > > extern __inline void
> > > diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> > > index 729355230b8..88dd7f8857f 100644
> > > --- a/gcc/config/i386/i386-builtin.def
> > > +++ b/gcc/config/i386/i386-builtin.def
> > > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b
> > > BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> > > BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> > >
> > > +/* LDTILECFG and STTILECFG. */
> > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, UNKNOWN, (int) VOID_FTYPE_PCVOID)
> > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, UNKNOWN, (int) VOID_FTYPE_PVOID)
> > CODE_FOR_sttilecfg.
>
> It is unused. I changed both to CODE_FOR_nothing.
>
> > > +
> > > /* SSE */
> > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
> > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
> > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > > index a4d3369f01b..17993eb837f 100644
> > > --- a/gcc/config/i386/i386-expand.cc
> > > +++ b/gcc/config/i386/i386-expand.cc
> > > @@ -14152,6 +14152,25 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
> > > emit_insn (pat);
> > > return 0;
> > >
> > > + case IX86_BUILTIN_LDTILECFG:
> > > + case IX86_BUILTIN_STTILECFG:
> > > + arg0 = CALL_EXPR_ARG (exp, 0);
> > > + op0 = expand_normal (arg0);
> > > +
> > > + if (!address_operand (op0, VOIDmode))
> > > + {
> > > + op0 = convert_memory_address (Pmode, op0);
> > > + op0 = copy_addr_to_reg (op0);
> > > + }
> > > + op0 = gen_rtx_MEM (BLKmode, op0);
> > maybe we can just use XImode, and adjust the patterns with XI.
>
> Changed.
>
> > > + if (fcode == IX86_BUILTIN_LDTILECFG)
> > > + icode = CODE_FOR_ldtilecfg;
> > > + else
> > > + icode = CODE_FOR_sttilecfg;
> > > + pat = GEN_FCN (icode) (op0);
> > > + emit_insn (pat);
> > > + return 0;
> > > +
> > > case IX86_BUILTIN_LLWPCB:
> > > arg0 = CALL_EXPR_ARG (exp, 0);
> > > op0 = expand_normal (arg0);
> > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > > index 6a26d966a0e..0ede6adac2f 100644
> > > --- a/gcc/config/i386/i386.md
> > > +++ b/gcc/config/i386/i386.md
> > > @@ -353,6 +353,10 @@ (define_c_enum "unspecv" [
> > > ;; For USER_MSR support
> > > UNSPECV_URDMSR
> > > UNSPECV_UWRMSR
> > > +
> > > + ;; For AMX-TILE
> > > + UNSPECV_LDTILECFG
> > > + UNSPECV_STTILECFG
> > > ])
> > >
> > > ;; Constants to represent rounding modes in the ROUND instruction
> > > @@ -28152,6 +28156,26 @@ (define_insn "uwrmsr"
> > > [(set_attr "prefix" "vex")
> > > (set_attr "type" "other")])
> > >
> > > +
> > > +(define_insn "ldtilecfg"
> > > + [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "jm")]
> > ldtilecfg/sttilecfg can be extended with gpr32, so just use m instead of jm.
> > > + UNSPECV_LDTILECFG)]
> > > + "TARGET_AMX_TILE"
> > > + "ldtilecfg\t%0"
> >
> > > + [(set_attr "type" "other")
> > > + (set_attr "addr" "gpr16")
> > Remove this.
>
> Done.
>
> > > + (set_attr "prefix" "vex")
> > Possible better with maybe_evex.
>
> Done.
>
> > > + (set_attr "memory" "load")])
> > > +
> > > +(define_insn "sttilecfg"
> > > + [(set (match_operand:BLK 0 "memory_operand" "=jm")
> > > + (unspec_volatile:BLK [(const_int 0)] UNSPECV_STTILECFG))]
> > > + "TARGET_AMX_TILE"
> > > + "sttilecfg\t%0"
> > > + [(set_attr "type" "other")
> > > + (set_attr "addr" "gpr16")
> > > + (set_attr "prefix" "vex")
> > > + (set_attr "memory" "store")])
> > > (include "mmx.md")
> > > (include "sse.md")
> > > (include "sync.md")
> > > diff --git a/gcc/testsuite/gcc.target/i386/amxtile-4.c b/gcc/testsuite/gcc.target/i386/amxtile-4.c
> > > new file mode 100644
> > > index 00000000000..1255af2594e
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/amxtile-4.c
> > > @@ -0,0 +1,55 @@
> > > +/* PR target/114098 */
> > > +/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-options "-O2 -mamx-tile" } */
> > > +
> > > +#include <stdint.h>
> > > +#include <x86intrin.h>
> > > +
> > > +#define MAX_ROWS 16
> > > +#define MAX_COLS 64
> > > +#define MAX 1024
> > > +#define STRIDE 64
> > > +
> > > +typedef struct __tile_config
> > > +{
> > > + uint8_t palette_id;
> > > + uint8_t start_row;
> > > + uint8_t reserved_0[14];
> > > + uint16_t colsb[16];
> > > + uint8_t rows[16];
> > > +} __tilecfg;
> > > +
> > > +
> > > +extern void bar (__tilecfg *tileinfo);
> > > +
> > > +/* Initialize tile config */
> > > +static void
> > > +init_tile_config (__tilecfg *tileinfo)
> > > +{
> > > + int i;
> > > + tileinfo->palette_id = 1;
> > > + tileinfo->start_row = 0;
> > > +
> > > + for (i = 0; i < 1; ++i)
> > > + {
> > > + tileinfo->colsb[i] = MAX_ROWS;
> > > + tileinfo->rows[i] = MAX_ROWS;
> > > + }
> > > +
> > > + for (i = 1; i < 4; ++i)
> > > + {
> > > + tileinfo->colsb[i] = MAX_COLS;
> > > + tileinfo->rows[i] = MAX_ROWS;
> > > + }
> > > +
> > > + _tile_loadconfig (tileinfo);
> > > +}
> > > +
> > > +void
> > > +enable_amx (void)
> > > +{
> > > + __tilecfg tile_data = {0};
> > > + init_tile_config (&tile_data);
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-times "pxor\[^\n\]*%xmm" 1 } } */
> > > --
> > > 2.43.2
> > >
> >
>
> I am testing this patch now.
Ok if it passes the regression test.
>
> Thanks.
>
> --
> H.J.
On Sun, Feb 25, 2024 at 7:03 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, Feb 26, 2024 at 10:37 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > ldtilecfg and sttilecfg take a 512-byte memory block. With
> > > > _tile_loadconfig implemented as
> > > >
> > > > extern __inline void
> > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > > _tile_loadconfig (const void *__config)
> > > > {
> > > > __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> > > > }
> > > >
> > > > GCC sees:
> > > >
> > > > (parallel [
> > > > (asm_operands/v ("ldtilecfg %X0") ("") 0
> > > > [(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars)
> > > > (const_int -64 [0xffffffffffffffc0])) [1 MEM[(const void * *)&tile_data]+0 S8 A128])]
> > > > [(asm_input:DI ("m"))]
> > > > (clobber (reg:CC 17 flags))])
> > > >
> > > > and the memory operand size is 1 byte. As the result, the rest of 511
> > > > bytes is ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics
> > > > with a pointer to BLKmode to honor the 512-byte memory block.
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > PR target/114098
> > > > * config/i386/amxtileintrin.h (_tile_loadconfig): Use
> > > > __builtin_ia32_ldtilecfg.
> > > > (_tile_storeconfig): Use __builtin_ia32_sttilecfg.
> > > > * config/i386/i386-builtin.def (BDESC): Add
> > > > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg.
> > > > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle
> > > > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG.
> > > > * config/i386/i386.md (ldtilecfg): New pattern.
> > > > (sttilecfg): Likewise.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > PR target/114098
> > > > * gcc.target/i386/amxtile-4.c: New test.
> > > > ---
> > > > gcc/config/i386/amxtileintrin.h | 4 +-
> > > > gcc/config/i386/i386-builtin.def | 4 ++
> > > > gcc/config/i386/i386-expand.cc | 19 ++++++++
> > > > gcc/config/i386/i386.md | 24 ++++++++++
> > > > gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++++++++++++++++++++++
> > > > 5 files changed, 104 insertions(+), 2 deletions(-)
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c
> > > >
> > > > diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h
> > > > index d1a26e0fea5..5081b326498 100644
> > > > --- a/gcc/config/i386/amxtileintrin.h
> > > > +++ b/gcc/config/i386/amxtileintrin.h
> > > > @@ -39,14 +39,14 @@ extern __inline void
> > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > > _tile_loadconfig (const void *__config)
> > > > {
> > > > - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> > > > + __builtin_ia32_ldtilecfg (__config);
> > > > }
> > > >
> > > > extern __inline void
> > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > > _tile_storeconfig (void *__config)
> > > > {
> > > > - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
> > > > + __builtin_ia32_sttilecfg (__config);
> > > > }
> > > >
> > > > extern __inline void
> > > > diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> > > > index 729355230b8..88dd7f8857f 100644
> > > > --- a/gcc/config/i386/i386-builtin.def
> > > > +++ b/gcc/config/i386/i386-builtin.def
> > > > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b
> > > > BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> > > > BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> > > >
> > > > +/* LDTILECFG and STTILECFG. */
> > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, UNKNOWN, (int) VOID_FTYPE_PCVOID)
> > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, UNKNOWN, (int) VOID_FTYPE_PVOID)
> > > CODE_FOR_sttilecfg.
> >
> > It is unused. I changed both to CODE_FOR_nothing.
> >
> > > > +
> > > > /* SSE */
> > > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
> > > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
> > > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > > > index a4d3369f01b..17993eb837f 100644
> > > > --- a/gcc/config/i386/i386-expand.cc
> > > > +++ b/gcc/config/i386/i386-expand.cc
> > > > @@ -14152,6 +14152,25 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
> > > > emit_insn (pat);
> > > > return 0;
> > > >
> > > > + case IX86_BUILTIN_LDTILECFG:
> > > > + case IX86_BUILTIN_STTILECFG:
> > > > + arg0 = CALL_EXPR_ARG (exp, 0);
> > > > + op0 = expand_normal (arg0);
> > > > +
> > > > + if (!address_operand (op0, VOIDmode))
> > > > + {
> > > > + op0 = convert_memory_address (Pmode, op0);
> > > > + op0 = copy_addr_to_reg (op0);
> > > > + }
> > > > + op0 = gen_rtx_MEM (BLKmode, op0);
> > > maybe we can just use XImode, and adjust the patterns with XI.
> >
> > Changed.
> >
> > > > + if (fcode == IX86_BUILTIN_LDTILECFG)
> > > > + icode = CODE_FOR_ldtilecfg;
> > > > + else
> > > > + icode = CODE_FOR_sttilecfg;
> > > > + pat = GEN_FCN (icode) (op0);
> > > > + emit_insn (pat);
> > > > + return 0;
> > > > +
> > > > case IX86_BUILTIN_LLWPCB:
> > > > arg0 = CALL_EXPR_ARG (exp, 0);
> > > > op0 = expand_normal (arg0);
> > > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > > > index 6a26d966a0e..0ede6adac2f 100644
> > > > --- a/gcc/config/i386/i386.md
> > > > +++ b/gcc/config/i386/i386.md
> > > > @@ -353,6 +353,10 @@ (define_c_enum "unspecv" [
> > > > ;; For USER_MSR support
> > > > UNSPECV_URDMSR
> > > > UNSPECV_UWRMSR
> > > > +
> > > > + ;; For AMX-TILE
> > > > + UNSPECV_LDTILECFG
> > > > + UNSPECV_STTILECFG
> > > > ])
> > > >
> > > > ;; Constants to represent rounding modes in the ROUND instruction
> > > > @@ -28152,6 +28156,26 @@ (define_insn "uwrmsr"
> > > > [(set_attr "prefix" "vex")
> > > > (set_attr "type" "other")])
> > > >
> > > > +
> > > > +(define_insn "ldtilecfg"
> > > > + [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "jm")]
> > > ldtilecfg/sttilecfg can be extended with gpr32, so just use m instead of jm.
> > > > + UNSPECV_LDTILECFG)]
> > > > + "TARGET_AMX_TILE"
> > > > + "ldtilecfg\t%0"
> > >
> > > > + [(set_attr "type" "other")
> > > > + (set_attr "addr" "gpr16")
> > > Remove this.
> >
> > Done.
> >
> > > > + (set_attr "prefix" "vex")
> > > Possibly better with maybe_evex.
> >
> > Done.
> >
> > > > + (set_attr "memory" "load")])
> > > > +
> > > > +(define_insn "sttilecfg"
> > > > + [(set (match_operand:BLK 0 "memory_operand" "=jm")
> > > > + (unspec_volatile:BLK [(const_int 0)] UNSPECV_STTILECFG))]
> > > > + "TARGET_AMX_TILE"
> > > > + "sttilecfg\t%0"
> > > > + [(set_attr "type" "other")
> > > > + (set_attr "addr" "gpr16")
> > > > + (set_attr "prefix" "vex")
> > > > + (set_attr "memory" "store")])
> > > > (include "mmx.md")
> > > > (include "sse.md")
> > > > (include "sync.md")
> > > > diff --git a/gcc/testsuite/gcc.target/i386/amxtile-4.c b/gcc/testsuite/gcc.target/i386/amxtile-4.c
> > > > new file mode 100644
> > > > index 00000000000..1255af2594e
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/amxtile-4.c
> > > > @@ -0,0 +1,55 @@
> > > > +/* PR target/114098 */
> > > > +/* { dg-do compile { target { ! ia32 } } } */
> > > > +/* { dg-options "-O2 -mamx-tile" } */
> > > > +
> > > > +#include <stdint.h>
> > > > +#include <x86intrin.h>
> > > > +
> > > > +#define MAX_ROWS 16
> > > > +#define MAX_COLS 64
> > > > +#define MAX 1024
> > > > +#define STRIDE 64
> > > > +
> > > > +typedef struct __tile_config
> > > > +{
> > > > + uint8_t palette_id;
> > > > + uint8_t start_row;
> > > > + uint8_t reserved_0[14];
> > > > + uint16_t colsb[16];
> > > > + uint8_t rows[16];
> > > > +} __tilecfg;
> > > > +
> > > > +
> > > > +extern void bar (__tilecfg *tileinfo);
> > > > +
> > > > +/* Initialize tile config */
> > > > +static void
> > > > +init_tile_config (__tilecfg *tileinfo)
> > > > +{
> > > > + int i;
> > > > + tileinfo->palette_id = 1;
> > > > + tileinfo->start_row = 0;
> > > > +
> > > > + for (i = 0; i < 1; ++i)
> > > > + {
> > > > + tileinfo->colsb[i] = MAX_ROWS;
> > > > + tileinfo->rows[i] = MAX_ROWS;
> > > > + }
> > > > +
> > > > + for (i = 1; i < 4; ++i)
> > > > + {
> > > > + tileinfo->colsb[i] = MAX_COLS;
> > > > + tileinfo->rows[i] = MAX_ROWS;
> > > > + }
> > > > +
> > > > + _tile_loadconfig (tileinfo);
> > > > +}
> > > > +
> > > > +void
> > > > +enable_amx (void)
> > > > +{
> > > > + __tilecfg tile_data = {0};
> > > > + init_tile_config (&tile_data);
> > > > +}
> > > > +
> > > > +/* { dg-final { scan-assembler-times "pxor\[^\n\]*%xmm" 1 } } */
> > > > --
> > > > 2.43.2
> > > >
> > >
> >
> > I am testing this patch now.
> Ok if it passes the regression test.
Test passed. I am checking it in.
Thanks.
On Sun, Feb 25, 2024 at 8:25 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Feb 25, 2024 at 7:03 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Mon, Feb 26, 2024 at 10:37 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > ldtilecfg and sttilecfg take a 512-byte memory block. With
> > > > > _tile_loadconfig implemented as
> > > > >
> > > > > extern __inline void
> > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > > > _tile_loadconfig (const void *__config)
> > > > > {
> > > > > __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> > > > > }
> > > > >
> > > > > GCC sees:
> > > > >
> > > > > (parallel [
> > > > > (asm_operands/v ("ldtilecfg %X0") ("") 0
> > > > > [(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars)
> > > > > (const_int -64 [0xffffffffffffffc0])) [1 MEM[(const void * *)&tile_data]+0 S8 A128])]
> > > > > [(asm_input:DI ("m"))]
> > > > > (clobber (reg:CC 17 flags))])
> > > > >
> > > > > and the memory operand size is 1 byte. As a result, the remaining 511
> > > > > bytes are ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics
> > > > > with a pointer to BLKmode to honor the 512-byte memory block.
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > > PR target/114098
> > > > > * config/i386/amxtileintrin.h (_tile_loadconfig): Use
> > > > > __builtin_ia32_ldtilecfg.
> > > > > (_tile_storeconfig): Use __builtin_ia32_sttilecfg.
> > > > > * config/i386/i386-builtin.def (BDESC): Add
> > > > > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg.
> > > > > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle
> > > > > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG.
> > > > > * config/i386/i386.md (ldtilecfg): New pattern.
> > > > > (sttilecfg): Likewise.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > > PR target/114098
> > > > > * gcc.target/i386/amxtile-4.c: New test.
> > > > > ---
> > > > > gcc/config/i386/amxtileintrin.h | 4 +-
> > > > > gcc/config/i386/i386-builtin.def | 4 ++
> > > > > gcc/config/i386/i386-expand.cc | 19 ++++++++
> > > > > gcc/config/i386/i386.md | 24 ++++++++++
> > > > > gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++++++++++++++++++++++
> > > > > 5 files changed, 104 insertions(+), 2 deletions(-)
> > > > > create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c
> > > > >
> > > > > diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h
> > > > > index d1a26e0fea5..5081b326498 100644
> > > > > --- a/gcc/config/i386/amxtileintrin.h
> > > > > +++ b/gcc/config/i386/amxtileintrin.h
> > > > > @@ -39,14 +39,14 @@ extern __inline void
> > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > > > _tile_loadconfig (const void *__config)
> > > > > {
> > > > > - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> > > > > + __builtin_ia32_ldtilecfg (__config);
> > > > > }
> > > > >
> > > > > extern __inline void
> > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > > > _tile_storeconfig (void *__config)
> > > > > {
> > > > > - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
> > > > > + __builtin_ia32_sttilecfg (__config);
> > > > > }
> > > > >
> > > > > extern __inline void
> > > > > diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> > > > > index 729355230b8..88dd7f8857f 100644
> > > > > --- a/gcc/config/i386/i386-builtin.def
> > > > > +++ b/gcc/config/i386/i386-builtin.def
> > > > > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b
> > > > > BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> > > > > BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> > > > >
> > > > > +/* LDTILECFG and STTILECFG. */
> > > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, UNKNOWN, (int) VOID_FTYPE_PCVOID)
> > > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, UNKNOWN, (int) VOID_FTYPE_PVOID)
> > > > CODE_FOR_sttilecfg.
> > >
> > > It is unused. I changed both to CODE_FOR_nothing.
> > >
> > > > > +
> > > > > /* SSE */
> > > > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
> > > > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
> > > > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > > > > index a4d3369f01b..17993eb837f 100644
> > > > > --- a/gcc/config/i386/i386-expand.cc
> > > > > +++ b/gcc/config/i386/i386-expand.cc
> > > > > @@ -14152,6 +14152,25 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
> > > > > emit_insn (pat);
> > > > > return 0;
> > > > >
> > > > > + case IX86_BUILTIN_LDTILECFG:
> > > > > + case IX86_BUILTIN_STTILECFG:
> > > > > + arg0 = CALL_EXPR_ARG (exp, 0);
> > > > > + op0 = expand_normal (arg0);
> > > > > +
> > > > > + if (!address_operand (op0, VOIDmode))
> > > > > + {
> > > > > + op0 = convert_memory_address (Pmode, op0);
> > > > > + op0 = copy_addr_to_reg (op0);
> > > > > + }
> > > > > + op0 = gen_rtx_MEM (BLKmode, op0);
> > > > maybe we can just use XImode, and adjust the patterns with XI.
> > >
> > > Changed.
> > >
> > > > > + if (fcode == IX86_BUILTIN_LDTILECFG)
> > > > > + icode = CODE_FOR_ldtilecfg;
> > > > > + else
> > > > > + icode = CODE_FOR_sttilecfg;
> > > > > + pat = GEN_FCN (icode) (op0);
> > > > > + emit_insn (pat);
> > > > > + return 0;
> > > > > +
> > > > > case IX86_BUILTIN_LLWPCB:
> > > > > arg0 = CALL_EXPR_ARG (exp, 0);
> > > > > op0 = expand_normal (arg0);
> > > > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > > > > index 6a26d966a0e..0ede6adac2f 100644
> > > > > --- a/gcc/config/i386/i386.md
> > > > > +++ b/gcc/config/i386/i386.md
> > > > > @@ -353,6 +353,10 @@ (define_c_enum "unspecv" [
> > > > > ;; For USER_MSR support
> > > > > UNSPECV_URDMSR
> > > > > UNSPECV_UWRMSR
> > > > > +
> > > > > + ;; For AMX-TILE
> > > > > + UNSPECV_LDTILECFG
> > > > > + UNSPECV_STTILECFG
> > > > > ])
> > > > >
> > > > > ;; Constants to represent rounding modes in the ROUND instruction
> > > > > @@ -28152,6 +28156,26 @@ (define_insn "uwrmsr"
> > > > > [(set_attr "prefix" "vex")
> > > > > (set_attr "type" "other")])
> > > > >
> > > > > +
> > > > > +(define_insn "ldtilecfg"
> > > > > + [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "jm")]
> > > > ldtilecfg/sttilecfg can be extended with gpr32, so just use m instead of jm.
> > > > > + UNSPECV_LDTILECFG)]
> > > > > + "TARGET_AMX_TILE"
> > > > > + "ldtilecfg\t%0"
> > > >
> > > > > + [(set_attr "type" "other")
> > > > > + (set_attr "addr" "gpr16")
> > > > Remove this.
> > >
> > > Done.
> > >
> > > > > + (set_attr "prefix" "vex")
> > > > Possibly better with maybe_evex.
> > >
> > > Done.
> > >
> > > > > + (set_attr "memory" "load")])
> > > > > +
> > > > > +(define_insn "sttilecfg"
> > > > > + [(set (match_operand:BLK 0 "memory_operand" "=jm")
> > > > > + (unspec_volatile:BLK [(const_int 0)] UNSPECV_STTILECFG))]
> > > > > + "TARGET_AMX_TILE"
> > > > > + "sttilecfg\t%0"
> > > > > + [(set_attr "type" "other")
> > > > > + (set_attr "addr" "gpr16")
> > > > > + (set_attr "prefix" "vex")
> > > > > + (set_attr "memory" "store")])
> > > > > (include "mmx.md")
> > > > > (include "sse.md")
> > > > > (include "sync.md")
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/amxtile-4.c b/gcc/testsuite/gcc.target/i386/amxtile-4.c
> > > > > new file mode 100644
> > > > > index 00000000000..1255af2594e
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/amxtile-4.c
> > > > > @@ -0,0 +1,55 @@
> > > > > +/* PR target/114098 */
> > > > > +/* { dg-do compile { target { ! ia32 } } } */
> > > > > +/* { dg-options "-O2 -mamx-tile" } */
> > > > > +
> > > > > +#include <stdint.h>
> > > > > +#include <x86intrin.h>
> > > > > +
> > > > > +#define MAX_ROWS 16
> > > > > +#define MAX_COLS 64
> > > > > +#define MAX 1024
> > > > > +#define STRIDE 64
> > > > > +
> > > > > +typedef struct __tile_config
> > > > > +{
> > > > > + uint8_t palette_id;
> > > > > + uint8_t start_row;
> > > > > + uint8_t reserved_0[14];
> > > > > + uint16_t colsb[16];
> > > > > + uint8_t rows[16];
> > > > > +} __tilecfg;
> > > > > +
> > > > > +
> > > > > +extern void bar (__tilecfg *tileinfo);
> > > > > +
> > > > > +/* Initialize tile config */
> > > > > +static void
> > > > > +init_tile_config (__tilecfg *tileinfo)
> > > > > +{
> > > > > + int i;
> > > > > + tileinfo->palette_id = 1;
> > > > > + tileinfo->start_row = 0;
> > > > > +
> > > > > + for (i = 0; i < 1; ++i)
> > > > > + {
> > > > > + tileinfo->colsb[i] = MAX_ROWS;
> > > > > + tileinfo->rows[i] = MAX_ROWS;
> > > > > + }
> > > > > +
> > > > > + for (i = 1; i < 4; ++i)
> > > > > + {
> > > > > + tileinfo->colsb[i] = MAX_COLS;
> > > > > + tileinfo->rows[i] = MAX_ROWS;
> > > > > + }
> > > > > +
> > > > > + _tile_loadconfig (tileinfo);
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +enable_amx (void)
> > > > > +{
> > > > > + __tilecfg tile_data = {0};
> > > > > + init_tile_config (&tile_data);
> > > > > +}
> > > > > +
> > > > > +/* { dg-final { scan-assembler-times "pxor\[^\n\]*%xmm" 1 } } */
> > > > > --
> > > > > 2.43.2
> > > > >
> > > >
> > >
> > > I am testing this patch now.
> > Ok if it passes the regression test.
>
> Test passed. I am checking it in.
>
> Thanks.
>
OK to backport to release branches?
On Mon, Feb 26, 2024 at 6:30 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Feb 25, 2024 at 8:25 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sun, Feb 25, 2024 at 7:03 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Mon, Feb 26, 2024 at 10:37 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > ldtilecfg and sttilecfg take a 512-byte memory block. With
> > > > > > _tile_loadconfig implemented as
> > > > > >
> > > > > > extern __inline void
> > > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > > > > _tile_loadconfig (const void *__config)
> > > > > > {
> > > > > > __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> > > > > > }
> > > > > >
> > > > > > GCC sees:
> > > > > >
> > > > > > (parallel [
> > > > > > (asm_operands/v ("ldtilecfg %X0") ("") 0
> > > > > > [(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars)
> > > > > > (const_int -64 [0xffffffffffffffc0])) [1 MEM[(const void * *)&tile_data]+0 S8 A128])]
> > > > > > [(asm_input:DI ("m"))]
> > > > > > (clobber (reg:CC 17 flags))])
> > > > > >
> > > > > > and the memory operand size is 1 byte. As a result, the remaining 511
> > > > > > bytes are ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics
> > > > > > with a pointer to BLKmode to honor the 512-byte memory block.
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > PR target/114098
> > > > > > * config/i386/amxtileintrin.h (_tile_loadconfig): Use
> > > > > > __builtin_ia32_ldtilecfg.
> > > > > > (_tile_storeconfig): Use __builtin_ia32_sttilecfg.
> > > > > > * config/i386/i386-builtin.def (BDESC): Add
> > > > > > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg.
> > > > > > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle
> > > > > > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG.
> > > > > > * config/i386/i386.md (ldtilecfg): New pattern.
> > > > > > (sttilecfg): Likewise.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > > PR target/114098
> > > > > > * gcc.target/i386/amxtile-4.c: New test.
> > > > > > ---
> > > > > > gcc/config/i386/amxtileintrin.h | 4 +-
> > > > > > gcc/config/i386/i386-builtin.def | 4 ++
> > > > > > gcc/config/i386/i386-expand.cc | 19 ++++++++
> > > > > > gcc/config/i386/i386.md | 24 ++++++++++
> > > > > > gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++++++++++++++++++++++
> > > > > > 5 files changed, 104 insertions(+), 2 deletions(-)
> > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c
> > > > > >
> > > > > > diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h
> > > > > > index d1a26e0fea5..5081b326498 100644
> > > > > > --- a/gcc/config/i386/amxtileintrin.h
> > > > > > +++ b/gcc/config/i386/amxtileintrin.h
> > > > > > @@ -39,14 +39,14 @@ extern __inline void
> > > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > > > > _tile_loadconfig (const void *__config)
> > > > > > {
> > > > > > - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> > > > > > + __builtin_ia32_ldtilecfg (__config);
> > > > > > }
> > > > > >
> > > > > > extern __inline void
> > > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > > > > _tile_storeconfig (void *__config)
> > > > > > {
> > > > > > - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
> > > > > > + __builtin_ia32_sttilecfg (__config);
> > > > > > }
> > > > > >
> > > > > > extern __inline void
> > > > > > diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> > > > > > index 729355230b8..88dd7f8857f 100644
> > > > > > --- a/gcc/config/i386/i386-builtin.def
> > > > > > +++ b/gcc/config/i386/i386-builtin.def
> > > > > > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b
> > > > > > BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> > > > > > BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> > > > > >
> > > > > > +/* LDTILECFG and STTILECFG. */
> > > > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, UNKNOWN, (int) VOID_FTYPE_PCVOID)
> > > > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, UNKNOWN, (int) VOID_FTYPE_PVOID)
> > > > > CODE_FOR_sttilecfg.
> > > >
> > > > It is unused. I changed both to CODE_FOR_nothing.
> > > >
> > > > > > +
> > > > > > /* SSE */
> > > > > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
> > > > > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
> > > > > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > > > > > index a4d3369f01b..17993eb837f 100644
> > > > > > --- a/gcc/config/i386/i386-expand.cc
> > > > > > +++ b/gcc/config/i386/i386-expand.cc
> > > > > > @@ -14152,6 +14152,25 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
> > > > > > emit_insn (pat);
> > > > > > return 0;
> > > > > >
> > > > > > + case IX86_BUILTIN_LDTILECFG:
> > > > > > + case IX86_BUILTIN_STTILECFG:
> > > > > > + arg0 = CALL_EXPR_ARG (exp, 0);
> > > > > > + op0 = expand_normal (arg0);
> > > > > > +
> > > > > > + if (!address_operand (op0, VOIDmode))
> > > > > > + {
> > > > > > + op0 = convert_memory_address (Pmode, op0);
> > > > > > + op0 = copy_addr_to_reg (op0);
> > > > > > + }
> > > > > > + op0 = gen_rtx_MEM (BLKmode, op0);
> > > > > maybe we can just use XImode, and adjust the patterns with XI.
> > > >
> > > > Changed.
> > > >
> > > > > > + if (fcode == IX86_BUILTIN_LDTILECFG)
> > > > > > + icode = CODE_FOR_ldtilecfg;
> > > > > > + else
> > > > > > + icode = CODE_FOR_sttilecfg;
> > > > > > + pat = GEN_FCN (icode) (op0);
> > > > > > + emit_insn (pat);
> > > > > > + return 0;
> > > > > > +
> > > > > > case IX86_BUILTIN_LLWPCB:
> > > > > > arg0 = CALL_EXPR_ARG (exp, 0);
> > > > > > op0 = expand_normal (arg0);
> > > > > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > > > > > index 6a26d966a0e..0ede6adac2f 100644
> > > > > > --- a/gcc/config/i386/i386.md
> > > > > > +++ b/gcc/config/i386/i386.md
> > > > > > @@ -353,6 +353,10 @@ (define_c_enum "unspecv" [
> > > > > > ;; For USER_MSR support
> > > > > > UNSPECV_URDMSR
> > > > > > UNSPECV_UWRMSR
> > > > > > +
> > > > > > + ;; For AMX-TILE
> > > > > > + UNSPECV_LDTILECFG
> > > > > > + UNSPECV_STTILECFG
> > > > > > ])
> > > > > >
> > > > > > ;; Constants to represent rounding modes in the ROUND instruction
> > > > > > @@ -28152,6 +28156,26 @@ (define_insn "uwrmsr"
> > > > > > [(set_attr "prefix" "vex")
> > > > > > (set_attr "type" "other")])
> > > > > >
> > > > > > +
> > > > > > +(define_insn "ldtilecfg"
> > > > > > + [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "jm")]
> > > > > ldtilecfg/sttilecfg can be extended with gpr32, so just use m instead of jm.
> > > > > > + UNSPECV_LDTILECFG)]
> > > > > > + "TARGET_AMX_TILE"
> > > > > > + "ldtilecfg\t%0"
> > > > >
> > > > > > + [(set_attr "type" "other")
> > > > > > + (set_attr "addr" "gpr16")
> > > > > Remove this.
> > > >
> > > > Done.
> > > >
> > > > > > + (set_attr "prefix" "vex")
> > > > > Possibly better with maybe_evex.
> > > >
> > > > Done.
> > > >
> > > > > > + (set_attr "memory" "load")])
> > > > > > +
> > > > > > +(define_insn "sttilecfg"
> > > > > > + [(set (match_operand:BLK 0 "memory_operand" "=jm")
> > > > > > + (unspec_volatile:BLK [(const_int 0)] UNSPECV_STTILECFG))]
> > > > > > + "TARGET_AMX_TILE"
> > > > > > + "sttilecfg\t%0"
> > > > > > + [(set_attr "type" "other")
> > > > > > + (set_attr "addr" "gpr16")
> > > > > > + (set_attr "prefix" "vex")
> > > > > > + (set_attr "memory" "store")])
> > > > > > (include "mmx.md")
> > > > > > (include "sse.md")
> > > > > > (include "sync.md")
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/amxtile-4.c b/gcc/testsuite/gcc.target/i386/amxtile-4.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..1255af2594e
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/amxtile-4.c
> > > > > > @@ -0,0 +1,55 @@
> > > > > > +/* PR target/114098 */
> > > > > > +/* { dg-do compile { target { ! ia32 } } } */
> > > > > > +/* { dg-options "-O2 -mamx-tile" } */
> > > > > > +
> > > > > > +#include <stdint.h>
> > > > > > +#include <x86intrin.h>
> > > > > > +
> > > > > > +#define MAX_ROWS 16
> > > > > > +#define MAX_COLS 64
> > > > > > +#define MAX 1024
> > > > > > +#define STRIDE 64
> > > > > > +
> > > > > > +typedef struct __tile_config
> > > > > > +{
> > > > > > + uint8_t palette_id;
> > > > > > + uint8_t start_row;
> > > > > > + uint8_t reserved_0[14];
> > > > > > + uint16_t colsb[16];
> > > > > > + uint8_t rows[16];
> > > > > > +} __tilecfg;
> > > > > > +
> > > > > > +
> > > > > > +extern void bar (__tilecfg *tileinfo);
> > > > > > +
> > > > > > +/* Initialize tile config */
> > > > > > +static void
> > > > > > +init_tile_config (__tilecfg *tileinfo)
> > > > > > +{
> > > > > > + int i;
> > > > > > + tileinfo->palette_id = 1;
> > > > > > + tileinfo->start_row = 0;
> > > > > > +
> > > > > > + for (i = 0; i < 1; ++i)
> > > > > > + {
> > > > > > + tileinfo->colsb[i] = MAX_ROWS;
> > > > > > + tileinfo->rows[i] = MAX_ROWS;
> > > > > > + }
> > > > > > +
> > > > > > + for (i = 1; i < 4; ++i)
> > > > > > + {
> > > > > > + tileinfo->colsb[i] = MAX_COLS;
> > > > > > + tileinfo->rows[i] = MAX_ROWS;
> > > > > > + }
> > > > > > +
> > > > > > + _tile_loadconfig (tileinfo);
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +enable_amx (void)
> > > > > > +{
> > > > > > + __tilecfg tile_data = {0};
> > > > > > + init_tile_config (&tile_data);
> > > > > > +}
> > > > > > +
> > > > > > +/* { dg-final { scan-assembler-times "pxor\[^\n\]*%xmm" 1 } } */
> > > > > > --
> > > > > > 2.43.2
> > > > > >
> > > > >
> > > >
> > > > I am testing this patch now.
> > > Ok if it passes the regression test.
> >
> > Test passed. I am checking it in.
> >
> > Thanks.
> >
>
> OK to backport to release branches?
Ok.
>
>
> --
> H.J.
@@ -39,14 +39,14 @@ extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tile_loadconfig (const void *__config)
{
- __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
+ __builtin_ia32_ldtilecfg (__config);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tile_storeconfig (void *__config)
{
- __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
+ __builtin_ia32_sttilecfg (__config);
}
extern __inline void
@@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b
BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
+/* LDTILECFG and STTILECFG. */
+BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, UNKNOWN, (int) VOID_FTYPE_PCVOID)
+BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, UNKNOWN, (int) VOID_FTYPE_PVOID)
+
/* SSE */
BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF)
@@ -14152,6 +14152,25 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
emit_insn (pat);
return 0;
+ case IX86_BUILTIN_LDTILECFG:
+ case IX86_BUILTIN_STTILECFG:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+
+ if (!address_operand (op0, VOIDmode))
+ {
+ op0 = convert_memory_address (Pmode, op0);
+ op0 = copy_addr_to_reg (op0);
+ }
+ op0 = gen_rtx_MEM (BLKmode, op0);
+ if (fcode == IX86_BUILTIN_LDTILECFG)
+ icode = CODE_FOR_ldtilecfg;
+ else
+ icode = CODE_FOR_sttilecfg;
+ pat = GEN_FCN (icode) (op0);
+ emit_insn (pat);
+ return 0;
+
case IX86_BUILTIN_LLWPCB:
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_normal (arg0);
@@ -353,6 +353,10 @@ (define_c_enum "unspecv" [
;; For USER_MSR support
UNSPECV_URDMSR
UNSPECV_UWRMSR
+
+ ;; For AMX-TILE
+ UNSPECV_LDTILECFG
+ UNSPECV_STTILECFG
])
;; Constants to represent rounding modes in the ROUND instruction
@@ -28152,6 +28156,26 @@ (define_insn "uwrmsr"
[(set_attr "prefix" "vex")
(set_attr "type" "other")])
+
+(define_insn "ldtilecfg"
+ [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "jm")]
+ UNSPECV_LDTILECFG)]
+ "TARGET_AMX_TILE"
+ "ldtilecfg\t%0"
+ [(set_attr "type" "other")
+ (set_attr "addr" "gpr16")
+ (set_attr "prefix" "vex")
+ (set_attr "memory" "load")])
+
+(define_insn "sttilecfg"
+ [(set (match_operand:BLK 0 "memory_operand" "=jm")
+ (unspec_volatile:BLK [(const_int 0)] UNSPECV_STTILECFG))]
+ "TARGET_AMX_TILE"
+ "sttilecfg\t%0"
+ [(set_attr "type" "other")
+ (set_attr "addr" "gpr16")
+ (set_attr "prefix" "vex")
+ (set_attr "memory" "store")])
(include "mmx.md")
(include "sse.md")
(include "sync.md")
new file mode 100644
@@ -0,0 +1,55 @@
+/* PR target/114098 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mamx-tile" } */
+
+#include <stdint.h>
+#include <x86intrin.h>
+
+#define MAX_ROWS 16
+#define MAX_COLS 64
+#define MAX 1024
+#define STRIDE 64
+
+typedef struct __tile_config
+{
+ uint8_t palette_id;
+ uint8_t start_row;
+ uint8_t reserved_0[14];
+ uint16_t colsb[16];
+ uint8_t rows[16];
+} __tilecfg;
+
+
+extern void bar (__tilecfg *tileinfo);
+
+/* Initialize tile config */
+static void
+init_tile_config (__tilecfg *tileinfo)
+{
+ int i;
+ tileinfo->palette_id = 1;
+ tileinfo->start_row = 0;
+
+ for (i = 0; i < 1; ++i)
+ {
+ tileinfo->colsb[i] = MAX_ROWS;
+ tileinfo->rows[i] = MAX_ROWS;
+ }
+
+ for (i = 1; i < 4; ++i)
+ {
+ tileinfo->colsb[i] = MAX_COLS;
+ tileinfo->rows[i] = MAX_ROWS;
+ }
+
+ _tile_loadconfig (tileinfo);
+}
+
+void
+enable_amx (void)
+{
+ __tilecfg tile_data = {0};
+ init_tile_config (&tile_data);
+}
+
+/* { dg-final { scan-assembler-times "pxor\[^\n\]*%xmm" 1 } } */