@@ -1182,6 +1182,7 @@ static const arch_entry cpu_arch[] =
SUBARCH (amx_bf16, AMX_BF16, ANY_AMX_BF16, false),
SUBARCH (amx_fp16, AMX_FP16, ANY_AMX_FP16, false),
SUBARCH (amx_complex, AMX_COMPLEX, ANY_AMX_COMPLEX, false),
+ SUBARCH (amx_transpose, AMX_TRANSPOSE, ANY_AMX_TRANSPOSE, false),
SUBARCH (amx_tile, AMX_TILE, ANY_AMX_TILE, false),
SUBARCH (movdiri, MOVDIRI, MOVDIRI, false),
SUBARCH (movdir64b, MOVDIR64B, MOVDIR64B, false),
@@ -1868,6 +1869,7 @@ _is_cpu (const i386_cpu_attr *a, enum i386_cpu cpu)
case CpuAVX512VL: return a->bitfield.cpuavx512vl;
case CpuAPX_F: return a->bitfield.cpuapx_f;
case CpuAVX10_2: return a->bitfield.cpuavx10_2;
+ case CpuAMX_TRANSPOSE: return a->bitfield.cpuamx_transpose;
case Cpu64: return a->bitfield.cpu64;
case CpuNo64: return a->bitfield.cpuno64;
default:
@@ -2245,7 +2247,7 @@ cpu_flags_match (const insn_template *t)
|| any.bitfield.cpuavx512f || any.bitfield.cpuavx512bw
|| any.bitfield.cpuavx512dq || any.bitfield.cpuamx_tile
|| any.bitfield.cpucmpccxadd || any.bitfield.cpuuser_msr
- || any.bitfield.cpumsr_imm))
+ || any.bitfield.cpumsr_imm || any.bitfield.cpuamx_transpose))
{
/* These checks (verifying that APX_F() was properly used in the
opcode table entry) make sure there's no need for an "else" to
@@ -4050,7 +4052,7 @@ install_template (const insn_template *t)
|| maybe_cpu (t, CpuAVX512F) || maybe_cpu (t, CpuAVX512DQ)
|| maybe_cpu (t, CpuAVX512BW) || maybe_cpu (t, CpuBMI)
|| maybe_cpu (t, CpuBMI2) || maybe_cpu (t, CpuUSER_MSR)
- || maybe_cpu (t, CpuMSR_IMM))
+ || maybe_cpu (t, CpuMSR_IMM) || maybe_cpu (t, CpuAMX_TRANSPOSE))
&& maybe_cpu (t, CpuAPX_F))
{
if (need_evex_encoding (t))
@@ -10750,15 +10752,24 @@ process_operands (void)
unsigned int op, extra;
const reg_entry *first;
- /* The second operand must be {x,y,z}mmN. */
- gas_assert (i.operands == 3 && i.types[1].bitfield.class == RegSIMD);
+ /* The second operand must be {x,y,z,t}mmN.  */
+ gas_assert ((i.operands == 2 || i.operands == 3)
+ && i.types[1].bitfield.class == RegSIMD);
- switch (i.types[2].bitfield.class)
+ switch (i.types[i.operands - 1].bitfield.class)
{
case RegSIMD:
- /* AVX512-{4FMAPS,4VNNIW} operand 2: N must be a multiple of 4. */
op = 1;
- extra = 3;
+ if (i.types[i.operands - 1].bitfield.tmmword)
+ {
+ /* AMX-TRANSPOSE operand 2: N must be a multiple of 2. */
+ extra = 1;
+ }
+ else
+ {
+ /* AVX512-{4FMAPS,4VNNIW} operand 2: N must be a multiple of 4. */
+ extra = 3;
+ }
break;
case RegMask:
@@ -228,6 +228,7 @@ accept various extension mnemonics. For example,
@code{amx_bf16},
@code{amx_fp16},
@code{amx_complex},
+@code{amx_transpose},
@code{amx_tile},
@code{vmx},
@code{vmfunc},
@@ -1700,7 +1701,7 @@ supported on the CPU specified. The choices for @var{cpu_type} are:
@item @samp{.shstk} @tab @samp{.gfni} @tab @samp{.vaes} @tab @samp{.vpclmulqdq}
@item @samp{.movdiri} @tab @samp{.movdir64b} @tab @samp{.enqcmd} @tab @samp{.tsxldtrk}
@item @samp{.amx_int8} @tab @samp{.amx_bf16} @tab @samp{.amx_fp16}
-@item @samp{.amx_complex} @tab @samp{.amx_tile}
+@item @samp{.amx_complex} @tab @samp{.amx_transpose} @tab @samp{.amx_tile}
@item @samp{.kl} @tab @samp{.widekl} @tab @samp{.uintr} @tab @samp{.hreset}
@item @samp{.3dnow} @tab @samp{.3dnowa} @tab @samp{.sse4a} @tab @samp{.sse5}
@item @samp{.syscall} @tab @samp{.rdtscp} @tab @samp{.svme}
new file mode 100644
@@ -0,0 +1,17 @@
+#objdump: -drw
+#name: x86_64 AMX_TRANSPOSE bad insns
+
+.*: +file format .*
+
+
+Disassembly of section \.text:
+
+0+ <\.text>:
+\s*[a-f0-9]+:\s*c4 e2 72 6c d1\s+ttdpbf16ps %tmm1/\(bad\),%tmm1/\(bad\),%tmm2
+\s*[a-f0-9]+:\s*c4 e2 6a 6c c9\s+ttdpbf16ps %tmm2,%tmm1/\(bad\),%tmm1/\(bad\)
+\s*[a-f0-9]+:\s*c4 e2 72 6c ca\s+ttdpbf16ps %tmm1/\(bad\),%tmm2,%tmm1/\(bad\)
+\s*[a-f0-9]+:\s*c4 e2 73 6c d1\s+ttdpfp16ps %tmm1/\(bad\),%tmm1/\(bad\),%tmm2
+\s*[a-f0-9]+:\s*c4 e2 6b 6c c9\s+ttdpfp16ps %tmm2,%tmm1/\(bad\),%tmm1/\(bad\)
+\s*[a-f0-9]+:\s*c4 e2 73 6c ca\s+ttdpfp16ps %tmm1/\(bad\),%tmm2,%tmm1/\(bad\)
+#pass
+
new file mode 100644
@@ -0,0 +1,18 @@
+ .text
+ # ttdpbf16ps %tmm1, %tmm1, %tmm2 all tmm registers should be distinct
+ .insn VEX.128.f3.0F38.W0 0x6c, %tmm1, %tmm1, %tmm2
+
+ # ttdpbf16ps %tmm2, %tmm1, %tmm1 all tmm registers should be distinct
+ .insn VEX.128.f3.0F38.W0 0x6c, %tmm1, %tmm2, %tmm1
+
+ # ttdpbf16ps %tmm1, %tmm2, %tmm1 all tmm registers should be distinct
+ .insn VEX.128.f3.0F38.W0 0x6c, %tmm2, %tmm1, %tmm1
+
+ # ttdpfp16ps %tmm1, %tmm1, %tmm2 all tmm registers should be distinct
+ .insn VEX.128.f2.0F38.W0 0x6c, %tmm1, %tmm1, %tmm2
+
+ # ttdpfp16ps %tmm2, %tmm1, %tmm1 all tmm registers should be distinct
+ .insn VEX.128.f2.0F38.W0 0x6c, %tmm1, %tmm2, %tmm1
+
+ # ttdpfp16ps %tmm1, %tmm2, %tmm1 all tmm registers should be distinct
+ .insn VEX.128.f2.0F38.W0 0x6c, %tmm2, %tmm1, %tmm1
new file mode 100644
@@ -0,0 +1,33 @@
+#objdump: -dw -Mintel
+#name: x86_64 AMX-TRANSPOSE insns (Intel disassembly)
+#source: x86-64-amx-transpose.s
+
+.*: +file format .*
+
+Disassembly of section \.text:
+
+#...
+[a-f0-9]+ <_intel>:
+\s*[a-f0-9]+:\s*c4 e2 5a 6c f5\s+ttdpbf16ps tmm6,tmm5,tmm4
+\s*[a-f0-9]+:\s*c4 e2 72 6c da\s+ttdpbf16ps tmm3,tmm2,tmm1
+\s*[a-f0-9]+:\s*c4 e2 5b 6c f5\s+ttdpfp16ps tmm6,tmm5,tmm4
+\s*[a-f0-9]+:\s*c4 e2 73 6c da\s+ttdpfp16ps tmm3,tmm2,tmm1
+\s*[a-f0-9]+:\s*c4 e2 7a 5f f5\s+ttransposed tmm6,tmm5
+\s*[a-f0-9]+:\s*c4 e2 7a 5f da\s+ttransposed tmm3,tmm2
+\s*[a-f0-9]+:\s*c4 a2 78 6e b4 f5 00 00 00 10\s+t2rpntlvwz0 tmm6,\[rbp\+r14\*8\+0x10000000\]
+\s*[a-f0-9]+:\s*c4 c2 78 6e 14 21\s+t2rpntlvwz0 tmm2,\[r9\+riz\*1\]
+\s*[a-f0-9]+:\s*c4 a2 78 6f b4 f5 00 00 00 10\s+t2rpntlvwz0t1 tmm6,\[rbp\+r14\*8\+0x10000000\]
+\s*[a-f0-9]+:\s*c4 c2 78 6f 14 21\s+t2rpntlvwz0t1 tmm2,\[r9\+riz\*1\]
+\s*[a-f0-9]+:\s*c4 a2 79 6e b4 f5 00 00 00 10\s+t2rpntlvwz1 tmm6,\[rbp\+r14\*8\+0x10000000\]
+\s*[a-f0-9]+:\s*c4 c2 79 6e 14 21\s+t2rpntlvwz1 tmm2,\[r9\+riz\*1\]
+\s*[a-f0-9]+:\s*c4 a2 79 6f b4 f5 00 00 00 10\s+t2rpntlvwz1t1 tmm6,\[rbp\+r14\*8\+0x10000000\]
+\s*[a-f0-9]+:\s*c4 c2 79 6f 14 21\s+t2rpntlvwz1t1 tmm2,\[r9\+riz\*1\]
+\s*[a-f0-9]+:\s*c4 e2 58 6b f5\s+tconjtcmmimfp16ps tmm6,tmm5,tmm4
+\s*[a-f0-9]+:\s*c4 e2 70 6b da\s+tconjtcmmimfp16ps tmm3,tmm2,tmm1
+\s*[a-f0-9]+:\s*c4 e2 79 6b f5\s+tconjtfp16 tmm6,tmm5
+\s*[a-f0-9]+:\s*c4 e2 79 6b da\s+tconjtfp16 tmm3,tmm2
+\s*[a-f0-9]+:\s*c4 e2 5b 6b f5\s+ttcmmimfp16ps tmm6,tmm5,tmm4
+\s*[a-f0-9]+:\s*c4 e2 73 6b da\s+ttcmmimfp16ps tmm3,tmm2,tmm1
+\s*[a-f0-9]+:\s*c4 e2 5a 6b f5\s+ttcmmrlfp16ps tmm6,tmm5,tmm4
+\s*[a-f0-9]+:\s*c4 e2 72 6b da\s+ttcmmrlfp16ps tmm3,tmm2,tmm1
+#pass
new file mode 100644
@@ -0,0 +1,15 @@
+.* Assembler messages:
+.*:5: Error: all tmm registers must be distinct for `ttdpbf16ps'
+.*:6: Error: all tmm registers must be distinct for `ttdpbf16ps'
+.*:7: Error: all tmm registers must be distinct for `ttdpbf16ps'
+.*:8: Error: all tmm registers must be distinct for `ttdpfp16ps'
+.*:9: Error: all tmm registers must be distinct for `ttdpfp16ps'
+.*:10: Error: all tmm registers must be distinct for `ttdpfp16ps'
+.*:11: Error: `\(%rip\)' cannot be used here
+.*:12: Error: `\(%rip\)' cannot be used here
+.*:13: Error: `\(%rip\)' cannot be used here
+.*:14: Error: `\(%rip\)' cannot be used here
+.*:15: Warning: operand 2 `%tmm1' implicitly denotes `%tmm0' to `%tmm1' group in `t2rpntlvwz0'
+.*:16: Warning: operand 2 `%tmm3' implicitly denotes `%tmm2' to `%tmm3' group in `t2rpntlvwz0t1'
+.*:17: Warning: operand 2 `%tmm5' implicitly denotes `%tmm4' to `%tmm5' group in `t2rpntlvwz1'
+.*:18: Warning: operand 2 `%tmm7' implicitly denotes `%tmm6' to `%tmm7' group in `t2rpntlvwz1t1'
new file mode 100644
@@ -0,0 +1,18 @@
+# Check Illegal AMX-TRANSPOSE instructions
+
+ .text
+_start:
+ ttdpbf16ps %tmm1, %tmm1, %tmm2
+ ttdpbf16ps %tmm1, %tmm2, %tmm1
+ ttdpbf16ps %tmm2, %tmm1, %tmm1
+ ttdpfp16ps %tmm1, %tmm1, %tmm2
+ ttdpfp16ps %tmm1, %tmm2, %tmm1
+ ttdpfp16ps %tmm2, %tmm1, %tmm1
+ t2rpntlvwz0 (%rip), %tmm2
+ t2rpntlvwz0t1 (%rip), %tmm2
+ t2rpntlvwz1 (%rip), %tmm2
+ t2rpntlvwz1t1 (%rip), %tmm2
+ t2rpntlvwz0 (%r9), %tmm1
+ t2rpntlvwz0t1 (%r9), %tmm3
+ t2rpntlvwz1 (%r9), %tmm5
+ t2rpntlvwz1t1 (%r9), %tmm7
new file mode 100644
@@ -0,0 +1,31 @@
+#objdump: -dw
+#name: x86_64 AMX-TRANSPOSE insns
+
+.*: +file format .*
+
+Disassembly of section \.text:
+
+0+ <_start>:
+\s*[a-f0-9]+:\s*c4 e2 5a 6c f5\s+ttdpbf16ps %tmm4,%tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 72 6c da\s+ttdpbf16ps %tmm1,%tmm2,%tmm3
+\s*[a-f0-9]+:\s*c4 e2 5b 6c f5\s+ttdpfp16ps %tmm4,%tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 73 6c da\s+ttdpfp16ps %tmm1,%tmm2,%tmm3
+\s*[a-f0-9]+:\s*c4 e2 7a 5f f5\s+ttransposed %tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 7a 5f da\s+ttransposed %tmm2,%tmm3
+\s*[a-f0-9]+:\s*c4 a2 78 6e b4 f5 00 00 00 10\s+t2rpntlvwz0 0x10000000\(%rbp,%r14,8\),%tmm6
+\s*[a-f0-9]+:\s*c4 c2 78 6e 14 21\s+t2rpntlvwz0 \(%r9,%riz,1\),%tmm2
+\s*[a-f0-9]+:\s*c4 a2 78 6f b4 f5 00 00 00 10\s+t2rpntlvwz0t1 0x10000000\(%rbp,%r14,8\),%tmm6
+\s*[a-f0-9]+:\s*c4 c2 78 6f 14 21\s+t2rpntlvwz0t1 \(%r9,%riz,1\),%tmm2
+\s*[a-f0-9]+:\s*c4 a2 79 6e b4 f5 00 00 00 10\s+t2rpntlvwz1 0x10000000\(%rbp,%r14,8\),%tmm6
+\s*[a-f0-9]+:\s*c4 c2 79 6e 14 21\s+t2rpntlvwz1 \(%r9,%riz,1\),%tmm2
+\s*[a-f0-9]+:\s*c4 a2 79 6f b4 f5 00 00 00 10\s+t2rpntlvwz1t1 0x10000000\(%rbp,%r14,8\),%tmm6
+\s*[a-f0-9]+:\s*c4 c2 79 6f 14 21\s+t2rpntlvwz1t1 \(%r9,%riz,1\),%tmm2
+\s*[a-f0-9]+:\s*c4 e2 58 6b f5\s+tconjtcmmimfp16ps %tmm4,%tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 70 6b da\s+tconjtcmmimfp16ps %tmm1,%tmm2,%tmm3
+\s*[a-f0-9]+:\s*c4 e2 79 6b f5\s+tconjtfp16 %tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 79 6b da\s+tconjtfp16 %tmm2,%tmm3
+\s*[a-f0-9]+:\s*c4 e2 5b 6b f5\s+ttcmmimfp16ps %tmm4,%tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 73 6b da\s+ttcmmimfp16ps %tmm1,%tmm2,%tmm3
+\s*[a-f0-9]+:\s*c4 e2 5a 6b f5\s+ttcmmrlfp16ps %tmm4,%tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 72 6b da\s+ttcmmrlfp16ps %tmm1,%tmm2,%tmm3
+#pass
new file mode 100644
@@ -0,0 +1,51 @@
+# Check 64bit AMX-TRANSPOSE instructions
+
+ .text
+_start:
+ ttdpbf16ps %tmm4, %tmm5, %tmm6
+ ttdpbf16ps %tmm1, %tmm2, %tmm3
+ ttdpfp16ps %tmm4, %tmm5, %tmm6
+ ttdpfp16ps %tmm1, %tmm2, %tmm3
+ ttransposed %tmm5, %tmm6
+ ttransposed %tmm2, %tmm3
+ t2rpntlvwz0 0x10000000(%rbp, %r14, 8), %tmm6
+ t2rpntlvwz0 (%r9), %tmm2
+ t2rpntlvwz0t1 0x10000000(%rbp, %r14, 8), %tmm6
+ t2rpntlvwz0t1 (%r9), %tmm2
+ t2rpntlvwz1 0x10000000(%rbp, %r14, 8), %tmm6
+ t2rpntlvwz1 (%r9), %tmm2
+ t2rpntlvwz1t1 0x10000000(%rbp, %r14, 8), %tmm6
+ t2rpntlvwz1t1 (%r9), %tmm2
+ tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6
+ tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3
+ tconjtfp16 %tmm5, %tmm6
+ tconjtfp16 %tmm2, %tmm3
+ ttcmmimfp16ps %tmm4, %tmm5, %tmm6
+ ttcmmimfp16ps %tmm1, %tmm2, %tmm3
+ ttcmmrlfp16ps %tmm4, %tmm5, %tmm6
+ ttcmmrlfp16ps %tmm1, %tmm2, %tmm3
+
+_intel:
+ .intel_syntax noprefix
+ ttdpbf16ps tmm6, tmm5, tmm4
+ ttdpbf16ps tmm3, tmm2, tmm1
+ ttdpfp16ps tmm6, tmm5, tmm4
+ ttdpfp16ps tmm3, tmm2, tmm1
+ ttransposed tmm6, tmm5
+ ttransposed tmm3, tmm2
+ t2rpntlvwz0 tmm6, [rbp+r14*8+0x10000000]
+ t2rpntlvwz0 tmm2, [r9]
+ t2rpntlvwz0t1 tmm6, [rbp+r14*8+0x10000000]
+ t2rpntlvwz0t1 tmm2, [r9]
+ t2rpntlvwz1 tmm6, [rbp+r14*8+0x10000000]
+ t2rpntlvwz1 tmm2, [r9]
+ t2rpntlvwz1t1 tmm6, [rbp+r14*8+0x10000000]
+ t2rpntlvwz1t1 tmm2, [r9]
+ tconjtcmmimfp16ps tmm6, tmm5, tmm4
+ tconjtcmmimfp16ps tmm3, tmm2, tmm1
+ tconjtfp16 tmm6, tmm5
+ tconjtfp16 tmm3, tmm2
+ ttcmmimfp16ps tmm6, tmm5, tmm4
+ ttcmmimfp16ps tmm3, tmm2, tmm1
+ ttcmmrlfp16ps tmm6, tmm5, tmm4
+ ttcmmrlfp16ps tmm3, tmm2, tmm1
@@ -138,6 +138,10 @@ Disassembly of section \.text:
[ ]*[a-f0-9]+:[ ]*62 da 7f 08 4b b4 87 23 01 00 00[ ]+tileloadd tmm6,\[r31\+rax\*4\+0x123\]
[ ]*[a-f0-9]+:[ ]*62 da 7d 08 4b b4 87 23 01 00 00[ ]+tileloaddt1 tmm6,\[r31\+rax\*4\+0x123\]
[ ]*[a-f0-9]+:[ ]*62 da 7e 08 4b b4 87 23 01 00 00[ ]+tilestored[ ]+\[r31\+rax\*4\+0x123\],tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7c 08 6e b4 c7 23 01 00 00[ ]+t2rpntlvwz0 tmm6,\[r31\+rax\*8\+0x123\]
+[ ]*[a-f0-9]+:[ ]*62 da 7c 08 6f b4 c7 23 01 00 00[ ]+t2rpntlvwz0t1 tmm6,\[r31\+rax\*8\+0x123\]
+[ ]*[a-f0-9]+:[ ]*62 da 7d 08 6e b4 c7 23 01 00 00[ ]+t2rpntlvwz1 tmm6,\[r31\+rax\*8\+0x123\]
+[ ]*[a-f0-9]+:[ ]*62 da 7d 08 6f b4 c7 23 01 00 00[ ]+t2rpntlvwz1t1 tmm6,\[r31\+rax\*8\+0x123\]
[ ]*[a-f0-9]+:[ ]*62 4c 7c 08 66 8c 87 23 01 00 00[ ]+wrssd[ ]+\[r31\+rax\*4\+0x123\],r25d
[ ]*[a-f0-9]+:[ ]*62 4c fc 08 66 bc 87 23 01 00 00[ ]+wrssq[ ]+\[r31\+rax\*4\+0x123\],r31
[ ]*[a-f0-9]+:[ ]*62 4c 7d 08 65 8c 87 23 01 00 00[ ]+wrussd[ ]+\[r31\+rax\*4\+0x123\],r25d
@@ -269,6 +273,10 @@ Disassembly of section \.text:
[ ]*[a-f0-9]+:[ ]*62 da 7f 08 4b b4 87 23 01 00 00[ ]+tileloadd tmm6,\[r31\+rax\*4\+0x123\]
[ ]*[a-f0-9]+:[ ]*62 da 7d 08 4b b4 87 23 01 00 00[ ]+tileloaddt1 tmm6,\[r31\+rax\*4\+0x123\]
[ ]*[a-f0-9]+:[ ]*62 da 7e 08 4b b4 87 23 01 00 00[ ]+tilestored[ ]+\[r31\+rax\*4\+0x123\],tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7c 08 6e b4 c7 23 01 00 00[ ]+t2rpntlvwz0 tmm6,\[r31\+rax\*8\+0x123\]
+[ ]*[a-f0-9]+:[ ]*62 da 7c 08 6f b4 c7 23 01 00 00[ ]+t2rpntlvwz0t1 tmm6,\[r31\+rax\*8\+0x123\]
+[ ]*[a-f0-9]+:[ ]*62 da 7d 08 6e b4 c7 23 01 00 00[ ]+t2rpntlvwz1 tmm6,\[r31\+rax\*8\+0x123\]
+[ ]*[a-f0-9]+:[ ]*62 da 7d 08 6f b4 c7 23 01 00 00[ ]+t2rpntlvwz1t1 tmm6,\[r31\+rax\*8\+0x123\]
[ ]*[a-f0-9]+:[ ]*62 4c 7c 08 66 8c 87 23 01 00 00[ ]+wrssd[ ]+\[r31\+rax\*4\+0x123\],r25d
[ ]*[a-f0-9]+:[ ]*62 4c fc 08 66 bc 87 23 01 00 00[ ]+wrssq[ ]+\[r31\+rax\*4\+0x123\],r31
[ ]*[a-f0-9]+:[ ]*62 4c 7d 08 65 8c 87 23 01 00 00[ ]+wrussd[ ]+\[r31\+rax\*4\+0x123\],r25d
@@ -138,6 +138,10 @@ Disassembly of section \.text:
[ ]*[a-f0-9]+:[ ]*62 da 7f 08 4b b4 87 23 01 00 00[ ]+tileloadd[ ]+0x123\(%r31,%rax,4\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 da 7d 08 4b b4 87 23 01 00 00[ ]+tileloaddt1[ ]+0x123\(%r31,%rax,4\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 da 7e 08 4b b4 87 23 01 00 00[ ]+tilestored[ ]+%tmm6,0x123\(%r31,%rax,4\)
+[ ]*[a-f0-9]+:[ ]*62 da 7c 08 6e b4 c7 23 01 00 00[ ]+t2rpntlvwz0[ ]+0x123\(%r31,%rax,8\),%tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7c 08 6f b4 c7 23 01 00 00[ ]+t2rpntlvwz0t1[ ]+0x123\(%r31,%rax,8\),%tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7d 08 6e b4 c7 23 01 00 00[ ]+t2rpntlvwz1[ ]+0x123\(%r31,%rax,8\),%tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7d 08 6f b4 c7 23 01 00 00[ ]+t2rpntlvwz1t1[ ]+0x123\(%r31,%rax,8\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 4c 7c 08 66 8c 87 23 01 00 00[ ]+wrssd[ ]+%r25d,0x123\(%r31,%rax,4\)
[ ]*[a-f0-9]+:[ ]*62 4c fc 08 66 bc 87 23 01 00 00[ ]+wrssq[ ]+%r31,0x123\(%r31,%rax,4\)
[ ]*[a-f0-9]+:[ ]*62 4c 7d 08 65 8c 87 23 01 00 00[ ]+wrussd[ ]+%r25d,0x123\(%r31,%rax,4\)
@@ -269,6 +273,10 @@ Disassembly of section \.text:
[ ]*[a-f0-9]+:[ ]*62 da 7f 08 4b b4 87 23 01 00 00[ ]+tileloadd[ ]+0x123\(%r31,%rax,4\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 da 7d 08 4b b4 87 23 01 00 00[ ]+tileloaddt1[ ]+0x123\(%r31,%rax,4\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 da 7e 08 4b b4 87 23 01 00 00[ ]+tilestored[ ]+%tmm6,0x123\(%r31,%rax,4\)
+[ ]*[a-f0-9]+:[ ]*62 da 7c 08 6e b4 c7 23 01 00 00[ ]+t2rpntlvwz0[ ]+0x123\(%r31,%rax,8\),%tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7c 08 6f b4 c7 23 01 00 00[ ]+t2rpntlvwz0t1[ ]+0x123\(%r31,%rax,8\),%tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7d 08 6e b4 c7 23 01 00 00[ ]+t2rpntlvwz1[ ]+0x123\(%r31,%rax,8\),%tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7d 08 6f b4 c7 23 01 00 00[ ]+t2rpntlvwz1t1[ ]+0x123\(%r31,%rax,8\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 4c 7c 08 66 8c 87 23 01 00 00[ ]+wrssd[ ]+%r25d,0x123\(%r31,%rax,4\)
[ ]*[a-f0-9]+:[ ]*62 4c fc 08 66 bc 87 23 01 00 00[ ]+wrssq[ ]+%r31,0x123\(%r31,%rax,4\)
[ ]*[a-f0-9]+:[ ]*62 4c 7d 08 65 8c 87 23 01 00 00[ ]+wrussd[ ]+%r25d,0x123\(%r31,%rax,4\)
@@ -138,6 +138,10 @@ Disassembly of section \.text:
[ ]*[a-f0-9]+:[ ]*62 da 7f 08 4b b4 87 23 01 00 00[ ]+tileloadd[ ]+0x123\(%r31,%rax,4\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 da 7d 08 4b b4 87 23 01 00 00[ ]+tileloaddt1[ ]+0x123\(%r31,%rax,4\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 da 7e 08 4b b4 87 23 01 00 00[ ]+tilestored[ ]+%tmm6,0x123\(%r31,%rax,4\)
+[ ]*[a-f0-9]+:[ ]*62 da 7c 08 6e b4 c7 23 01 00 00[ ]+t2rpntlvwz0[ ]+0x123\(%r31,%rax,8\),%tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7c 08 6f b4 c7 23 01 00 00[ ]+t2rpntlvwz0t1[ ]+0x123\(%r31,%rax,8\),%tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7d 08 6e b4 c7 23 01 00 00[ ]+t2rpntlvwz1[ ]+0x123\(%r31,%rax,8\),%tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7d 08 6f b4 c7 23 01 00 00[ ]+t2rpntlvwz1t1[ ]+0x123\(%r31,%rax,8\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 4c 7c 08 66 8c 87 23 01 00 00[ ]+wrssd[ ]+%r25d,0x123\(%r31,%rax,4\)
[ ]*[a-f0-9]+:[ ]*62 4c fc 08 66 bc 87 23 01 00 00[ ]+wrssq[ ]+%r31,0x123\(%r31,%rax,4\)
[ ]*[a-f0-9]+:[ ]*62 4c 7d 08 65 8c 87 23 01 00 00[ ]+wrussd[ ]+%r25d,0x123\(%r31,%rax,4\)
@@ -269,6 +273,10 @@ Disassembly of section \.text:
[ ]*[a-f0-9]+:[ ]*62 da 7f 08 4b b4 87 23 01 00 00[ ]+tileloadd[ ]+0x123\(%r31,%rax,4\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 da 7d 08 4b b4 87 23 01 00 00[ ]+tileloaddt1[ ]+0x123\(%r31,%rax,4\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 da 7e 08 4b b4 87 23 01 00 00[ ]+tilestored[ ]+%tmm6,0x123\(%r31,%rax,4\)
+[ ]*[a-f0-9]+:[ ]*62 da 7c 08 6e b4 c7 23 01 00 00[ ]+t2rpntlvwz0[ ]+0x123\(%r31,%rax,8\),%tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7c 08 6f b4 c7 23 01 00 00[ ]+t2rpntlvwz0t1[ ]+0x123\(%r31,%rax,8\),%tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7d 08 6e b4 c7 23 01 00 00[ ]+t2rpntlvwz1[ ]+0x123\(%r31,%rax,8\),%tmm6
+[ ]*[a-f0-9]+:[ ]*62 da 7d 08 6f b4 c7 23 01 00 00[ ]+t2rpntlvwz1t1[ ]+0x123\(%r31,%rax,8\),%tmm6
[ ]*[a-f0-9]+:[ ]*62 4c 7c 08 66 8c 87 23 01 00 00[ ]+wrssd[ ]+%r25d,0x123\(%r31,%rax,4\)
[ ]*[a-f0-9]+:[ ]*62 4c fc 08 66 bc 87 23 01 00 00[ ]+wrssq[ ]+%r31,0x123\(%r31,%rax,4\)
[ ]*[a-f0-9]+:[ ]*62 4c 7d 08 65 8c 87 23 01 00 00[ ]+wrussd[ ]+%r25d,0x123\(%r31,%rax,4\)
@@ -132,6 +132,10 @@ _start:
tileloadd 0x123(%r31,%rax,4),%tmm6
tileloaddt1 0x123(%r31,%rax,4),%tmm6
tilestored %tmm6,0x123(%r31,%rax,4)
+ t2rpntlvwz0 0x123(%r31,%rax,8),%tmm6
+ t2rpntlvwz0t1 0x123(%r31,%rax,8),%tmm6
+ t2rpntlvwz1 0x123(%r31,%rax,8),%tmm6
+ t2rpntlvwz1t1 0x123(%r31,%rax,8),%tmm6
wrssd %r25d,0x123(%r31,%rax,4)
wrssq %r31,0x123(%r31,%rax,4)
wrussd %r25d,0x123(%r31,%rax,4)
@@ -265,6 +269,10 @@ _start:
tileloadd tmm6,[r31+rax*4+0x123]
tileloaddt1 tmm6,[r31+rax*4+0x123]
tilestored [r31+rax*4+0x123],tmm6
+ t2rpntlvwz0 tmm6,[r31+rax*8+0x123]
+ t2rpntlvwz0t1 tmm6,[r31+rax*8+0x123]
+ t2rpntlvwz1 tmm6,[r31+rax*8+0x123]
+ t2rpntlvwz1t1 tmm6,[r31+rax*8+0x123]
wrssd DWORD PTR [r31+rax*4+0x123],r25d
wrssq QWORD PTR [r31+rax*4+0x123],r31
wrussd DWORD PTR [r31+rax*4+0x123],r25d
@@ -523,6 +523,10 @@ run_dump_test "x86-64-avx10_2-256-sm4-intel"
run_dump_test "x86-64-avx10_2-512-sm4"
run_dump_test "x86-64-avx10_2-512-sm4-intel"
run_list_test "x86-64-avx10_2-sm4-inval"
+run_dump_test "x86-64-amx-transpose"
+run_dump_test "x86-64-amx-transpose-intel"
+run_list_test "x86-64-amx-transpose-inval"
+run_dump_test "x86-64-amx-transpose-bad"
run_dump_test "x86-64-clzero"
run_dump_test "x86-64-mwaitx-bdver4"
run_list_test "x86-64-mwaitx-reg"
@@ -416,8 +416,8 @@ static const struct dis386 evex_table[][256] = {
{ Bad_Opcode },
{ Bad_Opcode },
{ Bad_Opcode },
- { Bad_Opcode },
- { Bad_Opcode },
+ { X86_64_EVEX_FROM_VEX_TABLE (X86_64_VEX_0F386E) },
+ { X86_64_EVEX_FROM_VEX_TABLE (X86_64_VEX_0F386F) },
/* 70 */
{ VEX_W_TABLE (EVEX_W_0F3870) },
{ "vpshldv%DQ", { XM, Vex, EXx }, PREFIX_DATA },
@@ -963,6 +963,8 @@ enum
MOD_0F38F8,
MOD_VEX_0F3849_X86_64_L_0_W_0,
+ MOD_VEX_0F386E_X86_64,
+ MOD_VEX_0F386F_X86_64,
MOD_EVEX_MAP4_60,
MOD_EVEX_MAP4_61,
@@ -1136,7 +1138,11 @@ enum
PREFIX_VEX_0F3851_W_0,
PREFIX_VEX_0F385C_X86_64_L_0_W_0,
PREFIX_VEX_0F385E_X86_64_L_0_W_0,
+ PREFIX_VEX_0F385F_X86_64_L_0_W_0,
+ PREFIX_VEX_0F386B_X86_64_L_0_W_0,
PREFIX_VEX_0F386C_X86_64_L_0_W_0,
+ PREFIX_VEX_0F386E_X86_64_M_0_L_0_W_0,
+ PREFIX_VEX_0F386F_X86_64_M_0_L_0_W_0,
PREFIX_VEX_0F3872,
PREFIX_VEX_0F38B0_W_0,
PREFIX_VEX_0F38B1_W_0,
@@ -1345,7 +1351,11 @@ enum
X86_64_VEX_0F384B,
X86_64_VEX_0F385C,
X86_64_VEX_0F385E,
+ X86_64_VEX_0F385F,
+ X86_64_VEX_0F386B,
X86_64_VEX_0F386C,
+ X86_64_VEX_0F386E,
+ X86_64_VEX_0F386F,
X86_64_VEX_0F38Ex,
X86_64_VEX_MAP7_F6_L_0_W_0_R_0,
@@ -1424,7 +1434,11 @@ enum
VEX_LEN_0F385A,
VEX_LEN_0F385C_X86_64,
VEX_LEN_0F385E_X86_64,
+ VEX_LEN_0F385F_X86_64,
+ VEX_LEN_0F386B_X86_64,
VEX_LEN_0F386C_X86_64,
+ VEX_LEN_0F386E_X86_64_M_0,
+ VEX_LEN_0F386F_X86_64_M_0,
VEX_LEN_0F38CB_P_3_W_0,
VEX_LEN_0F38CC_P_3_W_0,
VEX_LEN_0F38CD_P_3_W_0,
@@ -1597,7 +1611,11 @@ enum
VEX_W_0F385A_L_0,
VEX_W_0F385C_X86_64_L_0,
VEX_W_0F385E_X86_64_L_0,
+ VEX_W_0F385F_X86_64_L_0,
+ VEX_W_0F386B_X86_64_L_0,
VEX_W_0F386C_X86_64_L_0,
+ VEX_W_0F386E_X86_64_M_0_L_0,
+ VEX_W_0F386F_X86_64_M_0_L_0,
VEX_W_0F3872_P_1,
VEX_W_0F3878,
VEX_W_0F3879,
@@ -4104,11 +4122,40 @@ static const struct dis386 prefix_table[][4] = {
{ "tdpbssd", {TMM, Rtmm, VexTmm }, 0 },
},
+ /* PREFIX_VEX_0F385F_X86_64_L_0_W_0 */
+ {
+ { Bad_Opcode },
+ { "ttransposed", { TMM, Rtmm }, 0 },
+ },
+
+ /* PREFIX_VEX_0F386B_X86_64_L_0_W_0 */
+ {
+ { "tconjtcmmimfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "ttcmmrlfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "tconjtfp16", { TMM, Rtmm }, 0 },
+ { "ttcmmimfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ },
+
/* PREFIX_VEX_0F386C_X86_64_L_0_W_0 */
{
- { "tcmmrlfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "tcmmrlfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "ttdpbf16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "tcmmimfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "ttdpfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ },
+
+ /* PREFIX_VEX_0F386E_X86_64_M_0_L_0_W_0 */
+ {
+ { "t2rpntlvwz0", { TMM, MVexSIBMEM }, 0 },
{ Bad_Opcode },
- { "tcmmimfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "t2rpntlvwz1", { TMM, MVexSIBMEM }, 0 },
+ },
+
+ /* PREFIX_VEX_0F386F_X86_64_M_0_L_0_W_0 */
+ {
+ { "t2rpntlvwz0t1", { TMM, MVexSIBMEM }, 0 },
+ { Bad_Opcode },
+ { "t2rpntlvwz1t1", { TMM, MVexSIBMEM }, 0 },
},
/* PREFIX_VEX_0F3872 */
@@ -4580,12 +4627,36 @@ static const struct dis386 x86_64_table[][2] = {
{ VEX_LEN_TABLE (VEX_LEN_0F385E_X86_64) },
},
+ /* X86_64_VEX_0F385F */
+ {
+ { Bad_Opcode },
+ { VEX_LEN_TABLE (VEX_LEN_0F385F_X86_64) },
+ },
+
+ /* X86_64_VEX_0F386B */
+ {
+ { Bad_Opcode },
+ { VEX_LEN_TABLE (VEX_LEN_0F386B_X86_64) },
+ },
+
/* X86_64_VEX_0F386C */
{
{ Bad_Opcode },
{ VEX_LEN_TABLE (VEX_LEN_0F386C_X86_64) },
},
+ /* X86_64_VEX_0F386E */
+ {
+ { Bad_Opcode },
+ { MOD_TABLE (MOD_VEX_0F386E_X86_64) },
+ },
+
+ /* X86_64_VEX_0F386F */
+ {
+ { Bad_Opcode },
+ { MOD_TABLE (MOD_VEX_0F386F_X86_64) },
+ },
+
/* X86_64_VEX_0F38Ex */
{
{ Bad_Opcode },
@@ -6468,7 +6539,7 @@ static const struct dis386 vex_table[][256] = {
{ X86_64_TABLE (X86_64_VEX_0F385C) },
{ Bad_Opcode },
{ X86_64_TABLE (X86_64_VEX_0F385E) },
- { Bad_Opcode },
+ { X86_64_TABLE (X86_64_VEX_0F385F) },
/* 60 */
{ Bad_Opcode },
{ Bad_Opcode },
@@ -6482,11 +6553,11 @@ static const struct dis386 vex_table[][256] = {
{ Bad_Opcode },
{ Bad_Opcode },
{ Bad_Opcode },
- { Bad_Opcode },
+ { X86_64_TABLE (X86_64_VEX_0F386B) },
{ X86_64_TABLE (X86_64_VEX_0F386C) },
{ Bad_Opcode },
- { Bad_Opcode },
- { Bad_Opcode },
+ { X86_64_TABLE (X86_64_VEX_0F386E) },
+ { X86_64_TABLE (X86_64_VEX_0F386F) },
/* 70 */
{ Bad_Opcode },
{ Bad_Opcode },
@@ -7149,11 +7220,31 @@ static const struct dis386 vex_len_table[][2] = {
{ VEX_W_TABLE (VEX_W_0F385E_X86_64_L_0) },
},
+ /* VEX_LEN_0F385F_X86_64 */
+ {
+ { VEX_W_TABLE (VEX_W_0F385F_X86_64_L_0) },
+ },
+
+ /* VEX_LEN_0F386B_X86_64 */
+ {
+ { VEX_W_TABLE (VEX_W_0F386B_X86_64_L_0) },
+ },
+
/* VEX_LEN_0F386C_X86_64 */
{
{ VEX_W_TABLE (VEX_W_0F386C_X86_64_L_0) },
},
+ /* VEX_LEN_0F386E_X86_64_M_0 */
+ {
+ { VEX_W_TABLE (VEX_W_0F386E_X86_64_M_0_L_0) },
+ },
+
+ /* VEX_LEN_0F386F_X86_64_M_0 */
+ {
+ { VEX_W_TABLE (VEX_W_0F386F_X86_64_M_0_L_0) },
+ },
+
/* VEX_LEN_0F38CB_P_3_W_0 */
{
{ Bad_Opcode },
@@ -7833,10 +7924,26 @@ static const struct dis386 vex_w_table[][2] = {
/* VEX_W_0F385E_X86_64_L_0 */
{ PREFIX_TABLE (PREFIX_VEX_0F385E_X86_64_L_0_W_0) },
},
+ {
+ /* VEX_W_0F385F_X86_64_L_0 */
+ { PREFIX_TABLE (PREFIX_VEX_0F385F_X86_64_L_0_W_0) },
+ },
+ {
+ /* VEX_W_0F386B_X86_64_L_0 */
+ { PREFIX_TABLE (PREFIX_VEX_0F386B_X86_64_L_0_W_0) },
+ },
{
/* VEX_W_0F386C_X86_64_L_0 */
{ PREFIX_TABLE (PREFIX_VEX_0F386C_X86_64_L_0_W_0) },
},
+ {
+ /* VEX_W_0F386E_X86_64_M_0_L_0 */
+ { PREFIX_TABLE (PREFIX_VEX_0F386E_X86_64_M_0_L_0_W_0) },
+ },
+ {
+ /* VEX_W_0F386F_X86_64_M_0_L_0 */
+ { PREFIX_TABLE (PREFIX_VEX_0F386F_X86_64_M_0_L_0_W_0) },
+ },
{
/* VEX_W_0F3872_P_1 */
{ "%XVvcvtneps2bf16%XY", { XMM, EXx }, 0 },
@@ -8331,6 +8438,14 @@ static const struct dis386 mod_table[][2] = {
{ PREFIX_TABLE (PREFIX_VEX_0F3849_X86_64_L_0_W_0_M_0) },
{ PREFIX_TABLE (PREFIX_VEX_0F3849_X86_64_L_0_W_0_M_1) },
},
+ {
+ /* MOD_VEX_0F386E_X86_64 */
+ { VEX_LEN_TABLE (VEX_LEN_0F386E_X86_64_M_0) },
+ },
+ {
+ /* MOD_VEX_0F386F_X86_64 */
+ { VEX_LEN_TABLE (VEX_LEN_0F386F_X86_64_M_0) },
+ },
#include "i386-dis-evex-mod.h"
};
@@ -265,6 +265,8 @@ static const dependency isa_dependencies[] =
"AMX_TILE" },
{ "AMX_COMPLEX",
"AMX_TILE" },
+ { "AMX_TRANSPOSE",
+ "AMX_TILE" },
{ "KL",
"SSE2" },
{ "WIDEKL",
@@ -431,6 +433,7 @@ static bitfield cpu_flags[] =
BITFIELD (AMX_BF16),
BITFIELD (AMX_FP16),
BITFIELD (AMX_COMPLEX),
+ BITFIELD (AMX_TRANSPOSE),
BITFIELD (AMX_TILE),
BITFIELD (MOVDIRI),
BITFIELD (MOVDIR64B),
@@ -327,6 +327,8 @@ enum i386_cpu
CpuAPX_F,
/* Intel AVX10.2 Instructions support required. */
CpuAVX10_2,
+ /* Intel AMX-TRANSPOSE Instructions support required. */
+ CpuAMX_TRANSPOSE,
/* Not supported in the 64bit mode */
CpuNo64,
@@ -364,6 +366,7 @@ enum i386_cpu
cpuavx512vl:1, \
cpuapx_f:1, \
cpuavx10_2:1, \
+ cpuamx_transpose:1, \
/* NOTE: This field needs to remain last. */ \
cpuno64:1
@@ -3210,6 +3210,26 @@ tilerelease, 0x49c0, AMX_TILE, Vex128|Space0F38|VexW0|NoSuf, {}
tilezero, 0xf249, AMX_TILE, Modrm|Vex128|Space0F38|VexW0|NoSuf, { RegTMM }
+<z:opc, z0:, z1:66>
+
+<loc:opc, $t:0x0, t1:0x1>
+
+t2rpntlvw<z><loc>, 0x<z:opc>6e | <loc:opc>, APX_F(AMX_TRANSPOSE), Sibmem|Vex128|EVex128|Space0F38|VexW0|NoSuf|ImplicitGroup, { Unspecified|BaseIndex, RegTMM }
+
+<z>
+<loc>
+
+tconjtcmmimfp16ps, 0x6b, AMX_COMPLEX&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
+tconjtfp16, 0x666b, AMX_COMPLEX&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|VexW0|NoSuf, { RegTMM, RegTMM }
+
+ttcmmimfp16ps, 0xf26b, AMX_COMPLEX&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
+ttcmmrlfp16ps, 0xf36b, AMX_COMPLEX&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
+
+ttdpbf16ps, 0xf36c, AMX_BF16&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
+ttdpfp16ps, 0xf26c, AMX_FP16&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
+
+ttransposed, 0xf35f, AMX_TRANSPOSE, Modrm|Vex128|Space0F38|VexW0|NoSuf, { RegTMM, RegTMM }
+
// AMX instructions end.
// KEYLOCKER instructions.