On 13.11.2024 09:44, Haochen Jiang wrote:
> --- a/gas/NEWS
> +++ b/gas/NEWS
> @@ -1,5 +1,7 @@
> -*- text -*-
>
> +* Add support for Intel AMX-TRANSPOSE instructions.
As before, preferably with "x86" added, please.
> --- a/gas/config/tc-i386.c
> +++ b/gas/config/tc-i386.c
> @@ -1182,6 +1182,7 @@ static const arch_entry cpu_arch[] =
> SUBARCH (amx_bf16, AMX_BF16, ANY_AMX_BF16, false),
> SUBARCH (amx_fp16, AMX_FP16, ANY_AMX_FP16, false),
> SUBARCH (amx_complex, AMX_COMPLEX, ANY_AMX_COMPLEX, false),
> + SUBARCH (amx_transpose, AMX_TRANSPOSE, ANY_AMX_TRANSPOSE, false),
> SUBARCH (amx_tile, AMX_TILE, ANY_AMX_TILE, false),
> SUBARCH (movdiri, MOVDIRI, MOVDIRI, false),
> SUBARCH (movdir64b, MOVDIR64B, MOVDIR64B, false),
> @@ -1858,6 +1859,7 @@ _is_cpu (const i386_cpu_attr *a, enum i386_cpu cpu)
> case CpuAVX512F: return a->bitfield.cpuavx512f;
> case CpuAVX512VL: return a->bitfield.cpuavx512vl;
> case CpuAPX_F: return a->bitfield.cpuapx_f;
> + case CpuAMX_TRANSPOSE: return a->bitfield.cpuamx_transpose;
Nit: One too many padding blanks.
> @@ -10977,7 +10979,7 @@ build_modrm_byte (void)
> {
> if (i.mem_operands)
> {
> - unsigned int fake_zero_displacement = 0;
> + unsigned int fake_zero_displacement = 0, tmmpair = 0, pos = 0;
>
> gas_assert (i.flags[op] & Operand_Mem);
>
> @@ -11009,6 +11011,20 @@ build_modrm_byte (void)
> i.sib.index = i.index_reg->reg_num;
> set_rex_vrex (i.index_reg, REX_X, false);
> }
> +
> + /* Since some amx instructions uses tmm pairs, which will
> + automatically change tmm with odd number to even number.
> + So we will handle this here. */
> + tmmpair = i.tm.opcode_modifier.tmmpairoperand & 7;
> + while (tmmpair)
> + {
> + if (tmmpair % 2 == 1
> + && i.op[pos].regs->reg_num % 2 == 1)
> + i.op[pos].regs--;
> + tmmpair >>= 1;
> + pos++;
> + }
> +
> }
Besides you wanting to re-use what we already have, we also want to be
consistent in how we handle this: For the other cases we don't adjust the
register; we merely warn about the anomaly. The same should then happen
in this case.
> --- /dev/null
> +++ b/gas/testsuite/gas/i386/amx-transpose-inval.l
> @@ -0,0 +1,12 @@
> +.* Assembler messages:
> +.*:6: Error: `ttdpbf16ps' is only supported in 64-bit mode
> +.*:7: Error: `ttdpfp16ps' is only supported in 64-bit mode
> +.*:8: Error: `ttransposed' is only supported in 64-bit mode
> +.*:9: Error: `t2rpntlvwz0' is only supported in 64-bit mode
> +.*:10: Error: `t2rpntlvwz0t1' is only supported in 64-bit mode
> +.*:11: Error: `t2rpntlvwz1' is only supported in 64-bit mode
> +.*:12: Error: `t2rpntlvwz1t1' is only supported in 64-bit mode
> +.*:13: Error: `tconjtcmmimfp16ps' is only supported in 64-bit mode
> +.*:14: Error: `tconjtfp16' is only supported in 64-bit mode
> +.*:15: Error: `ttcmmimfp16ps' is only supported in 64-bit mode
> +.*:16: Error: `ttcmmrlfp16ps' is only supported in 64-bit mode
I question the value of this test (and similar ones, especially when the
base feature already isn't permitted outside of 64-bit mode).
> --- /dev/null
> +++ b/gas/testsuite/gas/i386/x86-64-amx-transpose.d
> @@ -0,0 +1,31 @@
> +#objdump: -dw
> +#name: x86_64 AMX-TRANSPOSE insns
> +
> +.*: +file format .*
> +
> +Disassembly of section \.text:
> +
> +0+ <_start>:
> +\s*[a-f0-9]+:\s*c4 e2 5a 6c f5\s+ttdpbf16ps %tmm4,%tmm5,%tmm6
> +\s*[a-f0-9]+:\s*c4 e2 72 6c da\s+ttdpbf16ps %tmm1,%tmm2,%tmm3
> +\s*[a-f0-9]+:\s*c4 e2 5b 6c f5\s+ttdpfp16ps %tmm4,%tmm5,%tmm6
> +\s*[a-f0-9]+:\s*c4 e2 73 6c da\s+ttdpfp16ps %tmm1,%tmm2,%tmm3
> +\s*[a-f0-9]+:\s*c4 e2 7a 5f f5\s+ttransposed %tmm5,%tmm6
> +\s*[a-f0-9]+:\s*c4 e2 7a 5f da\s+ttransposed %tmm2,%tmm3
> +\s*[a-f0-9]+:\s*c4 a2 78 6e b4 f5 00 00 00 10\s+t2rpntlvwz0 0x10000000\(%rbp,%r14,8\),%tmm6
> +\s*[a-f0-9]+:\s*c4 c2 78 6e 14 21\s+t2rpntlvwz0 \(%r9,%riz,1\),%tmm2
> +\s*[a-f0-9]+:\s*c4 a2 78 6f b4 f5 00 00 00 10\s+t2rpntlvwz0t1 0x10000000\(%rbp,%r14,8\),%tmm6
> +\s*[a-f0-9]+:\s*c4 c2 78 6f 14 21\s+t2rpntlvwz0t1 \(%r9,%riz,1\),%tmm2
> +\s*[a-f0-9]+:\s*c4 a2 79 6e b4 f5 00 00 00 10\s+t2rpntlvwz1 0x10000000\(%rbp,%r14,8\),%tmm6
> +\s*[a-f0-9]+:\s*c4 c2 79 6e 14 21\s+t2rpntlvwz1 \(%r9,%riz,1\),%tmm2
> +\s*[a-f0-9]+:\s*c4 a2 79 6f b4 f5 00 00 00 10\s+t2rpntlvwz1t1 0x10000000\(%rbp,%r14,8\),%tmm6
> +\s*[a-f0-9]+:\s*c4 c2 79 6f 14 21\s+t2rpntlvwz1t1 \(%r9,%riz,1\),%tmm2
Given what I said above, the use of %tmm3 in the source file should
result in %tmm3 being displayed here. As mentioned in the series extending
the group handling, we ought to think about how to express odd registers in
disassembly. Ideally that would happen before the issue is widened by this
patch introducing further instances.
> --- a/opcodes/i386-opc.tbl
> +++ b/opcodes/i386-opc.tbl
> @@ -3183,14 +3183,27 @@ xresldtrk, 0xf20f01e9, TSXLDTRK, NoSuf, {}
>
> // TSXLDTRK instructions end.
>
> +#define TMMPairOperand0 TMMPairOperand=1
> +#define TMMPairOperand1 TMMPairOperand=2
> +#define TMMPairOperand2 TMMPairOperand=4
> +
> // AMX instructions.
>
> ldtilecfg, 0x49/0, APX_F(AMX_TILE), Modrm|Vex128|EVex128|Space0F38|VexW0|NoSuf, { Unspecified|BaseIndex }
> sttilecfg, 0x6649/0, APX_F(AMX_TILE), Modrm|Vex128|EVex128|Space0F38|VexW0|NoSuf, { Unspecified|BaseIndex }
>
> +t2rpntlvwz0, 0x6e, AMX_TRANSPOSE, TMMPairOperand1|Sibmem|Vex128|Space0F38|VexW0|NoSuf, { Unspecified|BaseIndex, RegTMM }
> +t2rpntlvwz0t1, 0x6f, AMX_TRANSPOSE, TMMPairOperand1|Sibmem|Vex128|Space0F38|VexW0|NoSuf, { Unspecified|BaseIndex, RegTMM }
> +t2rpntlvwz1, 0x666e, AMX_TRANSPOSE, TMMPairOperand1|Sibmem|Vex128|Space0F38|VexW0|NoSuf, { Unspecified|BaseIndex, RegTMM }
> +t2rpntlvwz1t1, 0x666f, AMX_TRANSPOSE, TMMPairOperand1|Sibmem|Vex128|Space0F38|VexW0|NoSuf, { Unspecified|BaseIndex, RegTMM }
> +
> tcmmimfp16ps, 0x666c, AMX_COMPLEX, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
> tcmmrlfp16ps, 0x6c, AMX_COMPLEX, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
>
> +tconjtcmmimfp16ps, 0x6b, AMX_COMPLEX&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
> +
> +tconjtfp16, 0x666b, AMX_COMPLEX&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|VexW0|NoSuf, { RegTMM, RegTMM }
> +
> tdpbf16ps, 0xf35c, AMX_BF16, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
> tdpfp16ps, 0xf25c, AMX_FP16, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
> tdpbssd, 0xf25e, AMX_INT8, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
> @@ -3206,6 +3219,14 @@ tilerelease, 0x49c0, AMX_TILE, Vex128|Space0F38|VexW0|NoSuf, {}
>
> tilezero, 0xf249, AMX_TILE, Modrm|Vex128|Space0F38|VexW0|NoSuf, { RegTMM }
>
> +ttcmmimfp16ps, 0xf26b, AMX_COMPLEX&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
> +ttcmmrlfp16ps, 0xf36b, AMX_COMPLEX&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
> +
> +ttdpbf16ps, 0xf36c, AMX_BF16&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
> +ttdpfp16ps, 0xf26c, AMX_FP16&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
> +
> +ttransposed, 0xf35f, AMX_TRANSPOSE, Modrm|Vex128|Space0F38|VexW0|NoSuf, { RegTMM, RegTMM }
> +
> // AMX instructions end.
I'm struggling somewhat to determine on what basis you've decided where
to add the new insns. It would imo be nice if all AMX-COMPLEX ones ended
up together, likewise all AMX-BF16 ones, etc. Or alternatively if all
AMX-TRANSPOSE ones ended up together (and not at the very top of the section).
Jan
@@ -1,5 +1,7 @@
-*- text -*-
+* Add support for Intel AMX-TRANSPOSE instructions.
+
* Add support for Intel MSR_IMM instructions.
* Add support for Intel AVX10.2 instructions.
@@ -1182,6 +1182,7 @@ static const arch_entry cpu_arch[] =
SUBARCH (amx_bf16, AMX_BF16, ANY_AMX_BF16, false),
SUBARCH (amx_fp16, AMX_FP16, ANY_AMX_FP16, false),
SUBARCH (amx_complex, AMX_COMPLEX, ANY_AMX_COMPLEX, false),
+ SUBARCH (amx_transpose, AMX_TRANSPOSE, ANY_AMX_TRANSPOSE, false),
SUBARCH (amx_tile, AMX_TILE, ANY_AMX_TILE, false),
SUBARCH (movdiri, MOVDIRI, MOVDIRI, false),
SUBARCH (movdir64b, MOVDIR64B, MOVDIR64B, false),
@@ -1858,6 +1859,7 @@ _is_cpu (const i386_cpu_attr *a, enum i386_cpu cpu)
case CpuAVX512F: return a->bitfield.cpuavx512f;
case CpuAVX512VL: return a->bitfield.cpuavx512vl;
case CpuAPX_F: return a->bitfield.cpuapx_f;
+ case CpuAMX_TRANSPOSE: return a->bitfield.cpuamx_transpose;
case Cpu64: return a->bitfield.cpu64;
case CpuNo64: return a->bitfield.cpuno64;
default:
@@ -10977,7 +10979,7 @@ build_modrm_byte (void)
{
if (i.mem_operands)
{
- unsigned int fake_zero_displacement = 0;
+ unsigned int fake_zero_displacement = 0, tmmpair = 0, pos = 0;
gas_assert (i.flags[op] & Operand_Mem);
@@ -11009,6 +11011,20 @@ build_modrm_byte (void)
i.sib.index = i.index_reg->reg_num;
set_rex_vrex (i.index_reg, REX_X, false);
}
+
+ /* Since some amx instructions uses tmm pairs, which will
+ automatically change tmm with odd number to even number.
+ So we will handle this here. */
+ tmmpair = i.tm.opcode_modifier.tmmpairoperand & 7;
+ while (tmmpair)
+ {
+ if (tmmpair % 2 == 1
+ && i.op[pos].regs->reg_num % 2 == 1)
+ i.op[pos].regs--;
+ tmmpair >>= 1;
+ pos++;
+ }
+
}
default_seg = reg_ds;
@@ -228,6 +228,7 @@ accept various extension mnemonics. For example,
@code{amx_bf16},
@code{amx_fp16},
@code{amx_complex},
+@code{amx_transpose},
@code{amx_tile},
@code{vmx},
@code{vmfunc},
@@ -1700,7 +1701,7 @@ supported on the CPU specified. The choices for @var{cpu_type} are:
@item @samp{.shstk} @tab @samp{.gfni} @tab @samp{.vaes} @tab @samp{.vpclmulqdq}
@item @samp{.movdiri} @tab @samp{.movdir64b} @tab @samp{.enqcmd} @tab @samp{.tsxldtrk}
@item @samp{.amx_int8} @tab @samp{.amx_bf16} @tab @samp{.amx_fp16}
-@item @samp{.amx_complex} @tab @samp{.amx_tile}
+@item @samp{.amx_complex} @tab @samp{.amx_transpose} @tab @samp{.amx_tile}
@item @samp{.kl} @tab @samp{.widekl} @tab @samp{.uintr} @tab @samp{.hreset}
@item @samp{.3dnow} @tab @samp{.3dnowa} @tab @samp{.sse4a} @tab @samp{.sse5}
@item @samp{.syscall} @tab @samp{.rdtscp} @tab @samp{.svme}
new file mode 100644
@@ -0,0 +1,12 @@
+.* Assembler messages:
+.*:6: Error: `ttdpbf16ps' is only supported in 64-bit mode
+.*:7: Error: `ttdpfp16ps' is only supported in 64-bit mode
+.*:8: Error: `ttransposed' is only supported in 64-bit mode
+.*:9: Error: `t2rpntlvwz0' is only supported in 64-bit mode
+.*:10: Error: `t2rpntlvwz0t1' is only supported in 64-bit mode
+.*:11: Error: `t2rpntlvwz1' is only supported in 64-bit mode
+.*:12: Error: `t2rpntlvwz1t1' is only supported in 64-bit mode
+.*:13: Error: `tconjtcmmimfp16ps' is only supported in 64-bit mode
+.*:14: Error: `tconjtfp16' is only supported in 64-bit mode
+.*:15: Error: `ttcmmimfp16ps' is only supported in 64-bit mode
+.*:16: Error: `ttcmmrlfp16ps' is only supported in 64-bit mode
new file mode 100644
@@ -0,0 +1,16 @@
+# Check Illegal AMX-TRANSPOSE instructions
+
+ .allow_index_reg
+ .text
+_start:
+ ttdpbf16ps %tmm1, %tmm2, %tmm3
+ ttdpfp16ps %tmm1, %tmm2, %tmm3
+ ttransposed %tmm2, %tmm3
+ t2rpntlvwz0 (%r9), %tmm3
+ t2rpntlvwz0t1 (%r9), %tmm3
+ t2rpntlvwz1 (%r9), %tmm3
+ t2rpntlvwz1t1 (%r9), %tmm3
+ tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3
+ tconjtfp16 %tmm5, %tmm6
+ ttcmmimfp16ps %tmm4, %tmm5, %tmm6
+ ttcmmrlfp16ps %tmm4, %tmm5, %tmm6
@@ -546,6 +546,7 @@ if [gas_32_check] then {
run_dump_test "avx10_2-256-miscs"
run_dump_test "avx10_2-256-miscs-intel"
run_list_test "msr_imm-inval"
+ run_list_test "amx-transpose-inval"
run_list_test "sg"
run_dump_test "clzero"
run_dump_test "invlpgb"
new file mode 100644
@@ -0,0 +1,33 @@
+#objdump: -dw -Mintel
+#name: x86_64 AMX-TRANSPOSE insns (Intel disassembly)
+#source: x86-64-amx-transpose.s
+
+.*: +file format .*
+
+Disassembly of section \.text:
+
+#...
+[a-f0-9]+ <_intel>:
+\s*[a-f0-9]+:\s*c4 e2 5a 6c f5\s+ttdpbf16ps tmm6,tmm5,tmm4
+\s*[a-f0-9]+:\s*c4 e2 72 6c da\s+ttdpbf16ps tmm3,tmm2,tmm1
+\s*[a-f0-9]+:\s*c4 e2 5b 6c f5\s+ttdpfp16ps tmm6,tmm5,tmm4
+\s*[a-f0-9]+:\s*c4 e2 73 6c da\s+ttdpfp16ps tmm3,tmm2,tmm1
+\s*[a-f0-9]+:\s*c4 e2 7a 5f f5\s+ttransposed tmm6,tmm5
+\s*[a-f0-9]+:\s*c4 e2 7a 5f da\s+ttransposed tmm3,tmm2
+\s*[a-f0-9]+:\s*c4 a2 78 6e b4 f5 00 00 00 10\s+t2rpntlvwz0 tmm6,\[rbp\+r14\*8\+0x10000000\]
+\s*[a-f0-9]+:\s*c4 c2 78 6e 14 21\s+t2rpntlvwz0 tmm2,\[r9\+riz\*1\]
+\s*[a-f0-9]+:\s*c4 a2 78 6f b4 f5 00 00 00 10\s+t2rpntlvwz0t1 tmm6,\[rbp\+r14\*8\+0x10000000\]
+\s*[a-f0-9]+:\s*c4 c2 78 6f 14 21\s+t2rpntlvwz0t1 tmm2,\[r9\+riz\*1\]
+\s*[a-f0-9]+:\s*c4 a2 79 6e b4 f5 00 00 00 10\s+t2rpntlvwz1 tmm6,\[rbp\+r14\*8\+0x10000000\]
+\s*[a-f0-9]+:\s*c4 c2 79 6e 14 21\s+t2rpntlvwz1 tmm2,\[r9\+riz\*1\]
+\s*[a-f0-9]+:\s*c4 a2 79 6f b4 f5 00 00 00 10\s+t2rpntlvwz1t1 tmm6,\[rbp\+r14\*8\+0x10000000\]
+\s*[a-f0-9]+:\s*c4 c2 79 6f 14 21\s+t2rpntlvwz1t1 tmm2,\[r9\+riz\*1\]
+\s*[a-f0-9]+:\s*c4 e2 58 6b f5\s+tconjtcmmimfp16ps tmm6,tmm5,tmm4
+\s*[a-f0-9]+:\s*c4 e2 70 6b da\s+tconjtcmmimfp16ps tmm3,tmm2,tmm1
+\s*[a-f0-9]+:\s*c4 e2 79 6b f5\s+tconjtfp16 tmm6,tmm5
+\s*[a-f0-9]+:\s*c4 e2 79 6b da\s+tconjtfp16 tmm3,tmm2
+\s*[a-f0-9]+:\s*c4 e2 5b 6b f5\s+ttcmmimfp16ps tmm6,tmm5,tmm4
+\s*[a-f0-9]+:\s*c4 e2 73 6b da\s+ttcmmimfp16ps tmm3,tmm2,tmm1
+\s*[a-f0-9]+:\s*c4 e2 5a 6b f5\s+ttcmmrlfp16ps tmm6,tmm5,tmm4
+\s*[a-f0-9]+:\s*c4 e2 72 6b da\s+ttcmmrlfp16ps tmm3,tmm2,tmm1
+#pass
new file mode 100644
@@ -0,0 +1,11 @@
+.* Assembler messages:
+.*:6: Error: all tmm registers must be distinct for `ttdpbf16ps'
+.*:7: Error: all tmm registers must be distinct for `ttdpbf16ps'
+.*:8: Error: all tmm registers must be distinct for `ttdpbf16ps'
+.*:9: Error: all tmm registers must be distinct for `ttdpfp16ps'
+.*:10: Error: all tmm registers must be distinct for `ttdpfp16ps'
+.*:11: Error: all tmm registers must be distinct for `ttdpfp16ps'
+.*:12: Error: `\(%rip\)' cannot be used here
+.*:13: Error: `\(%rip\)' cannot be used here
+.*:14: Error: `\(%rip\)' cannot be used here
+.*:15: Error: `\(%rip\)' cannot be used here
new file mode 100644
@@ -0,0 +1,15 @@
+# Check Illegal AMX-TRANSPOSE instructions
+
+ .allow_index_reg
+ .text
+_start:
+ ttdpbf16ps %tmm1, %tmm1, %tmm2
+ ttdpbf16ps %tmm1, %tmm2, %tmm1
+ ttdpbf16ps %tmm2, %tmm1, %tmm1
+ ttdpfp16ps %tmm1, %tmm1, %tmm2
+ ttdpfp16ps %tmm1, %tmm2, %tmm1
+ ttdpfp16ps %tmm2, %tmm1, %tmm1
+ t2rpntlvwz0 (%rip), %tmm1
+ t2rpntlvwz0t1 (%rip), %tmm1
+ t2rpntlvwz1 (%rip), %tmm1
+ t2rpntlvwz1t1 (%rip), %tmm1
new file mode 100644
@@ -0,0 +1,31 @@
+#objdump: -dw
+#name: x86_64 AMX-TRANSPOSE insns
+
+.*: +file format .*
+
+Disassembly of section \.text:
+
+0+ <_start>:
+\s*[a-f0-9]+:\s*c4 e2 5a 6c f5\s+ttdpbf16ps %tmm4,%tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 72 6c da\s+ttdpbf16ps %tmm1,%tmm2,%tmm3
+\s*[a-f0-9]+:\s*c4 e2 5b 6c f5\s+ttdpfp16ps %tmm4,%tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 73 6c da\s+ttdpfp16ps %tmm1,%tmm2,%tmm3
+\s*[a-f0-9]+:\s*c4 e2 7a 5f f5\s+ttransposed %tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 7a 5f da\s+ttransposed %tmm2,%tmm3
+\s*[a-f0-9]+:\s*c4 a2 78 6e b4 f5 00 00 00 10\s+t2rpntlvwz0 0x10000000\(%rbp,%r14,8\),%tmm6
+\s*[a-f0-9]+:\s*c4 c2 78 6e 14 21\s+t2rpntlvwz0 \(%r9,%riz,1\),%tmm2
+\s*[a-f0-9]+:\s*c4 a2 78 6f b4 f5 00 00 00 10\s+t2rpntlvwz0t1 0x10000000\(%rbp,%r14,8\),%tmm6
+\s*[a-f0-9]+:\s*c4 c2 78 6f 14 21\s+t2rpntlvwz0t1 \(%r9,%riz,1\),%tmm2
+\s*[a-f0-9]+:\s*c4 a2 79 6e b4 f5 00 00 00 10\s+t2rpntlvwz1 0x10000000\(%rbp,%r14,8\),%tmm6
+\s*[a-f0-9]+:\s*c4 c2 79 6e 14 21\s+t2rpntlvwz1 \(%r9,%riz,1\),%tmm2
+\s*[a-f0-9]+:\s*c4 a2 79 6f b4 f5 00 00 00 10\s+t2rpntlvwz1t1 0x10000000\(%rbp,%r14,8\),%tmm6
+\s*[a-f0-9]+:\s*c4 c2 79 6f 14 21\s+t2rpntlvwz1t1 \(%r9,%riz,1\),%tmm2
+\s*[a-f0-9]+:\s*c4 e2 58 6b f5\s+tconjtcmmimfp16ps %tmm4,%tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 70 6b da\s+tconjtcmmimfp16ps %tmm1,%tmm2,%tmm3
+\s*[a-f0-9]+:\s*c4 e2 79 6b f5\s+tconjtfp16 %tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 79 6b da\s+tconjtfp16 %tmm2,%tmm3
+\s*[a-f0-9]+:\s*c4 e2 5b 6b f5\s+ttcmmimfp16ps %tmm4,%tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 73 6b da\s+ttcmmimfp16ps %tmm1,%tmm2,%tmm3
+\s*[a-f0-9]+:\s*c4 e2 5a 6b f5\s+ttcmmrlfp16ps %tmm4,%tmm5,%tmm6
+\s*[a-f0-9]+:\s*c4 e2 72 6b da\s+ttcmmrlfp16ps %tmm1,%tmm2,%tmm3
+#pass
new file mode 100644
@@ -0,0 +1,51 @@
+# Check 64bit AMX-TRANSPOSE instructions
+
+ .text
+_start:
+ ttdpbf16ps %tmm4, %tmm5, %tmm6
+ ttdpbf16ps %tmm1, %tmm2, %tmm3
+ ttdpfp16ps %tmm4, %tmm5, %tmm6
+ ttdpfp16ps %tmm1, %tmm2, %tmm3
+ ttransposed %tmm5, %tmm6
+ ttransposed %tmm2, %tmm3
+ t2rpntlvwz0 0x10000000(%rbp, %r14, 8), %tmm6
+ t2rpntlvwz0 (%r9), %tmm3
+ t2rpntlvwz0t1 0x10000000(%rbp, %r14, 8), %tmm6
+ t2rpntlvwz0t1 (%r9), %tmm3
+ t2rpntlvwz1 0x10000000(%rbp, %r14, 8), %tmm6
+ t2rpntlvwz1 (%r9), %tmm3
+ t2rpntlvwz1t1 0x10000000(%rbp, %r14, 8), %tmm6
+ t2rpntlvwz1t1 (%r9), %tmm3
+ tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6
+ tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3
+ tconjtfp16 %tmm5, %tmm6
+ tconjtfp16 %tmm2, %tmm3
+ ttcmmimfp16ps %tmm4, %tmm5, %tmm6
+ ttcmmimfp16ps %tmm1, %tmm2, %tmm3
+ ttcmmrlfp16ps %tmm4, %tmm5, %tmm6
+ ttcmmrlfp16ps %tmm1, %tmm2, %tmm3
+
+_intel:
+ .intel_syntax noprefix
+ ttdpbf16ps tmm6, tmm5, tmm4
+ ttdpbf16ps tmm3, tmm2, tmm1
+ ttdpfp16ps tmm6, tmm5, tmm4
+ ttdpfp16ps tmm3, tmm2, tmm1
+ ttransposed tmm6, tmm5
+ ttransposed tmm3, tmm2
+ t2rpntlvwz0 tmm6, [rbp+r14*8+0x10000000]
+ t2rpntlvwz0 tmm3, [r9]
+ t2rpntlvwz0t1 tmm6, [rbp+r14*8+0x10000000]
+ t2rpntlvwz0t1 tmm3, [r9]
+ t2rpntlvwz1 tmm6, [rbp+r14*8+0x10000000]
+ t2rpntlvwz1 tmm3, [r9]
+ t2rpntlvwz1t1 tmm6, [rbp+r14*8+0x10000000]
+ t2rpntlvwz1t1 tmm3, [r9]
+ tconjtcmmimfp16ps tmm6, tmm5, tmm4
+ tconjtcmmimfp16ps tmm3, tmm2, tmm1
+ tconjtfp16 tmm6, tmm5
+ tconjtfp16 tmm3, tmm2
+ ttcmmimfp16ps tmm6, tmm5, tmm4
+ ttcmmimfp16ps tmm3, tmm2, tmm1
+ ttcmmrlfp16ps tmm6, tmm5, tmm4
+ ttcmmrlfp16ps tmm3, tmm2, tmm1
@@ -524,6 +524,9 @@ run_dump_test "x86-64-avx10_2-256-miscs-intel"
run_dump_test "x86-64-msr_imm"
run_dump_test "x86-64-msr_imm-intel"
run_list_test "x86-64-msr_imm-inval"
+run_dump_test "x86-64-amx-transpose"
+run_dump_test "x86-64-amx-transpose-intel"
+run_list_test "x86-64-amx-transpose-inval"
run_dump_test "x86-64-clzero"
run_dump_test "x86-64-mwaitx-bdver4"
run_list_test "x86-64-mwaitx-reg"
@@ -963,6 +963,8 @@ enum
MOD_0F38F8,
MOD_VEX_0F3849_X86_64_L_0_W_0,
+ MOD_VEX_0F386E_X86_64,
+ MOD_VEX_0F386F_X86_64,
MOD_EVEX_MAP4_60,
MOD_EVEX_MAP4_61,
@@ -1136,7 +1138,11 @@ enum
PREFIX_VEX_0F3851_W_0,
PREFIX_VEX_0F385C_X86_64_L_0_W_0,
PREFIX_VEX_0F385E_X86_64_L_0_W_0,
+ PREFIX_VEX_0F385F_X86_64_L_0_W_0,
+ PREFIX_VEX_0F386B_X86_64_L_0_W_0,
PREFIX_VEX_0F386C_X86_64_L_0_W_0,
+ PREFIX_VEX_0F386E_X86_64_M_0_L_0_W_0,
+ PREFIX_VEX_0F386F_X86_64_M_0_L_0_W_0,
PREFIX_VEX_0F3872,
PREFIX_VEX_0F38B0_W_0,
PREFIX_VEX_0F38B1_W_0,
@@ -1347,7 +1353,11 @@ enum
X86_64_VEX_0F384B,
X86_64_VEX_0F385C,
X86_64_VEX_0F385E,
+ X86_64_VEX_0F385F,
+ X86_64_VEX_0F386B,
X86_64_VEX_0F386C,
+ X86_64_VEX_0F386E,
+ X86_64_VEX_0F386F,
X86_64_VEX_0F38Ex,
X86_64_VEX_MAP7_F6_L_0_W_0_R_0,
@@ -1431,7 +1441,11 @@ enum
VEX_LEN_0F385A,
VEX_LEN_0F385C_X86_64,
VEX_LEN_0F385E_X86_64,
+ VEX_LEN_0F385F_X86_64,
+ VEX_LEN_0F386B_X86_64,
VEX_LEN_0F386C_X86_64,
+ VEX_LEN_0F386E_X86_64_M_0,
+ VEX_LEN_0F386F_X86_64_M_0,
VEX_LEN_0F38CB_P_3_W_0,
VEX_LEN_0F38CC_P_3_W_0,
VEX_LEN_0F38CD_P_3_W_0,
@@ -1604,7 +1618,11 @@ enum
VEX_W_0F385A_L_0,
VEX_W_0F385C_X86_64_L_0,
VEX_W_0F385E_X86_64_L_0,
+ VEX_W_0F385F_X86_64_L_0,
+ VEX_W_0F386B_X86_64_L_0,
VEX_W_0F386C_X86_64_L_0,
+ VEX_W_0F386E_X86_64_M_0_L_0,
+ VEX_W_0F386F_X86_64_M_0_L_0,
VEX_W_0F3872_P_1,
VEX_W_0F3878,
VEX_W_0F3879,
@@ -4105,11 +4123,40 @@ static const struct dis386 prefix_table[][4] = {
{ "tdpbssd", {TMM, Rtmm, VexTmm }, 0 },
},
+ /* PREFIX_VEX_0F385F_X86_64_L_0_W_0 */
+ {
+ { Bad_Opcode },
+ { "ttransposed", { TMM, Rtmm }, 0 },
+ },
+
+ /* PREFIX_VEX_0F386B_X86_64_L_0_W_0 */
+ {
+ { "tconjtcmmimfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "ttcmmrlfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "tconjtfp16", { TMM, Rtmm }, 0 },
+ { "ttcmmimfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ },
+
/* PREFIX_VEX_0F386C_X86_64_L_0_W_0 */
{
- { "tcmmrlfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "tcmmrlfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "ttdpbf16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "tcmmimfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "ttdpfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ },
+
+ /* PREFIX_VEX_0F386E_X86_64_M_0_L_0_W_0 */
+ {
+ { "t2rpntlvwz0", { TMM, MVexSIBMEM }, 0 },
{ Bad_Opcode },
- { "tcmmimfp16ps", { TMM, Rtmm, VexTmm }, 0 },
+ { "t2rpntlvwz1", { TMM, MVexSIBMEM }, 0 },
+ },
+
+ /* PREFIX_VEX_0F386F_X86_64_M_0_L_0_W_0 */
+ {
+ { "t2rpntlvwz0t1", { TMM, MVexSIBMEM }, 0 },
+ { Bad_Opcode },
+ { "t2rpntlvwz1t1", { TMM, MVexSIBMEM }, 0 },
},
/* PREFIX_VEX_0F3872 */
@@ -4581,12 +4628,36 @@ static const struct dis386 x86_64_table[][2] = {
{ VEX_LEN_TABLE (VEX_LEN_0F385E_X86_64) },
},
+ /* X86_64_VEX_0F385F */
+ {
+ { Bad_Opcode },
+ { VEX_LEN_TABLE (VEX_LEN_0F385F_X86_64) },
+ },
+
+ /* X86_64_VEX_0F386B */
+ {
+ { Bad_Opcode },
+ { VEX_LEN_TABLE (VEX_LEN_0F386B_X86_64) },
+ },
+
/* X86_64_VEX_0F386C */
{
{ Bad_Opcode },
{ VEX_LEN_TABLE (VEX_LEN_0F386C_X86_64) },
},
+ /* X86_64_VEX_0F386E */
+ {
+ { Bad_Opcode },
+ { MOD_TABLE (MOD_VEX_0F386E_X86_64) },
+ },
+
+ /* X86_64_VEX_0F386F */
+ {
+ { Bad_Opcode },
+ { MOD_TABLE (MOD_VEX_0F386F_X86_64) },
+ },
+
/* X86_64_VEX_0F38Ex */
{
{ Bad_Opcode },
@@ -6471,7 +6542,7 @@ static const struct dis386 vex_table[][256] = {
{ X86_64_TABLE (X86_64_VEX_0F385C) },
{ Bad_Opcode },
{ X86_64_TABLE (X86_64_VEX_0F385E) },
- { Bad_Opcode },
+ { X86_64_TABLE (X86_64_VEX_0F385F) },
/* 60 */
{ Bad_Opcode },
{ Bad_Opcode },
@@ -6485,11 +6556,11 @@ static const struct dis386 vex_table[][256] = {
{ Bad_Opcode },
{ Bad_Opcode },
{ Bad_Opcode },
- { Bad_Opcode },
+ { X86_64_TABLE (X86_64_VEX_0F386B) },
{ X86_64_TABLE (X86_64_VEX_0F386C) },
{ Bad_Opcode },
- { Bad_Opcode },
- { Bad_Opcode },
+ { X86_64_TABLE (X86_64_VEX_0F386E) },
+ { X86_64_TABLE (X86_64_VEX_0F386F) },
/* 70 */
{ Bad_Opcode },
{ Bad_Opcode },
@@ -7152,11 +7223,31 @@ static const struct dis386 vex_len_table[][2] = {
{ VEX_W_TABLE (VEX_W_0F385E_X86_64_L_0) },
},
+ /* VEX_LEN_0F385F_X86_64 */
+ {
+ { VEX_W_TABLE (VEX_W_0F385F_X86_64_L_0) },
+ },
+
+ /* VEX_LEN_0F386B_X86_64 */
+ {
+ { VEX_W_TABLE (VEX_W_0F386B_X86_64_L_0) },
+ },
+
/* VEX_LEN_0F386C_X86_64 */
{
{ VEX_W_TABLE (VEX_W_0F386C_X86_64_L_0) },
},
+ /* VEX_LEN_0F386E_X86_64_M_0 */
+ {
+ { VEX_W_TABLE (VEX_W_0F386E_X86_64_M_0_L_0) },
+ },
+
+ /* VEX_LEN_0F386F_X86_64_M_0 */
+ {
+ { VEX_W_TABLE (VEX_W_0F386F_X86_64_M_0_L_0) },
+ },
+
/* VEX_LEN_0F38CB_P_3_W_0 */
{
{ Bad_Opcode },
@@ -7836,10 +7927,26 @@ static const struct dis386 vex_w_table[][2] = {
/* VEX_W_0F385E_X86_64_L_0 */
{ PREFIX_TABLE (PREFIX_VEX_0F385E_X86_64_L_0_W_0) },
},
+ {
+ /* VEX_W_0F385F_X86_64_L_0 */
+ { PREFIX_TABLE (PREFIX_VEX_0F385F_X86_64_L_0_W_0) },
+ },
+ {
+ /* VEX_W_0F386B_X86_64_L_0 */
+ { PREFIX_TABLE (PREFIX_VEX_0F386B_X86_64_L_0_W_0) },
+ },
{
/* VEX_W_0F386C_X86_64_L_0 */
{ PREFIX_TABLE (PREFIX_VEX_0F386C_X86_64_L_0_W_0) },
},
+ {
+ /* VEX_W_0F386E_X86_64_M_0_L_0 */
+ { PREFIX_TABLE (PREFIX_VEX_0F386E_X86_64_M_0_L_0_W_0) },
+ },
+ {
+ /* VEX_W_0F386F_X86_64_M_0_L_0 */
+ { PREFIX_TABLE (PREFIX_VEX_0F386F_X86_64_M_0_L_0_W_0) },
+ },
{
/* VEX_W_0F3872_P_1 */
{ "%XVvcvtneps2bf16%XY", { XMM, EXx }, 0 },
@@ -8334,6 +8441,14 @@ static const struct dis386 mod_table[][2] = {
{ PREFIX_TABLE (PREFIX_VEX_0F3849_X86_64_L_0_W_0_M_0) },
{ PREFIX_TABLE (PREFIX_VEX_0F3849_X86_64_L_0_W_0_M_1) },
},
+ {
+ /* MOD_VEX_0F386E_X86_64 */
+ { VEX_LEN_TABLE (VEX_LEN_0F386E_X86_64_M_0) },
+ },
+ {
+ /* MOD_VEX_0F386F_X86_64 */
+ { VEX_LEN_TABLE (VEX_LEN_0F386F_X86_64_M_0) },
+ },
#include "i386-dis-evex-mod.h"
};
@@ -263,6 +263,8 @@ static const dependency isa_dependencies[] =
"AMX_TILE" },
{ "AMX_COMPLEX",
"AMX_TILE" },
+ { "AMX_TRANSPOSE",
+ "AMX_TILE" },
{ "KL",
"SSE2" },
{ "WIDEKL",
@@ -429,6 +431,7 @@ static bitfield cpu_flags[] =
BITFIELD (AMX_BF16),
BITFIELD (AMX_FP16),
BITFIELD (AMX_COMPLEX),
+ BITFIELD (AMX_TRANSPOSE),
BITFIELD (AMX_TILE),
BITFIELD (MOVDIRI),
BITFIELD (MOVDIR64B),
@@ -499,6 +502,7 @@ static bitfield opcode_modifiers[] =
BITFIELD (NoEgpr),
BITFIELD (NF),
BITFIELD (Rex2),
+ BITFIELD (TMMPairOperand),
};
#define CLASS(n) #n, n
@@ -327,6 +327,8 @@ enum i386_cpu
CpuAVX512VL,
/* Intel APX_F Instructions support required. */
CpuAPX_F,
+ /* Intel AMX-TRANSPOSE Instructions support required. */
+ CpuAMX_TRANSPOSE,
/* Not supported in the 64bit mode */
CpuNo64,
@@ -363,6 +365,7 @@ enum i386_cpu
cpuavx512f:1, \
cpuavx512vl:1, \
cpuapx_f:1, \
+ cpuamx_transpose:1, \
/* NOTE: This field needs to remain last. */ \
cpuno64:1
@@ -772,6 +775,9 @@ enum
/* Instrucion requires REX2 prefix. */
Rex2,
+ /* Check whether or which tmm register is pair register. */
+ TMMPairOperand,
+
/* The last bitfield in i386_opcode_modifier. */
Opcode_Modifier_Num
};
@@ -820,6 +826,7 @@ typedef struct i386_opcode_modifier
unsigned int noegpr:1;
unsigned int nf:1;
unsigned int rex2:1;
+ unsigned int tmmpairoperand:3;
} i386_opcode_modifier;
/* Operand classes. */
@@ -3183,14 +3183,27 @@ xresldtrk, 0xf20f01e9, TSXLDTRK, NoSuf, {}
// TSXLDTRK instructions end.
+#define TMMPairOperand0 TMMPairOperand=1
+#define TMMPairOperand1 TMMPairOperand=2
+#define TMMPairOperand2 TMMPairOperand=4
+
// AMX instructions.
ldtilecfg, 0x49/0, APX_F(AMX_TILE), Modrm|Vex128|EVex128|Space0F38|VexW0|NoSuf, { Unspecified|BaseIndex }
sttilecfg, 0x6649/0, APX_F(AMX_TILE), Modrm|Vex128|EVex128|Space0F38|VexW0|NoSuf, { Unspecified|BaseIndex }
+t2rpntlvwz0, 0x6e, AMX_TRANSPOSE, TMMPairOperand1|Sibmem|Vex128|Space0F38|VexW0|NoSuf, { Unspecified|BaseIndex, RegTMM }
+t2rpntlvwz0t1, 0x6f, AMX_TRANSPOSE, TMMPairOperand1|Sibmem|Vex128|Space0F38|VexW0|NoSuf, { Unspecified|BaseIndex, RegTMM }
+t2rpntlvwz1, 0x666e, AMX_TRANSPOSE, TMMPairOperand1|Sibmem|Vex128|Space0F38|VexW0|NoSuf, { Unspecified|BaseIndex, RegTMM }
+t2rpntlvwz1t1, 0x666f, AMX_TRANSPOSE, TMMPairOperand1|Sibmem|Vex128|Space0F38|VexW0|NoSuf, { Unspecified|BaseIndex, RegTMM }
+
tcmmimfp16ps, 0x666c, AMX_COMPLEX, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
tcmmrlfp16ps, 0x6c, AMX_COMPLEX, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
+tconjtcmmimfp16ps, 0x6b, AMX_COMPLEX&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
+
+tconjtfp16, 0x666b, AMX_COMPLEX&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|VexW0|NoSuf, { RegTMM, RegTMM }
+
tdpbf16ps, 0xf35c, AMX_BF16, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
tdpfp16ps, 0xf25c, AMX_FP16, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
tdpbssd, 0xf25e, AMX_INT8, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
@@ -3206,6 +3219,14 @@ tilerelease, 0x49c0, AMX_TILE, Vex128|Space0F38|VexW0|NoSuf, {}
tilezero, 0xf249, AMX_TILE, Modrm|Vex128|Space0F38|VexW0|NoSuf, { RegTMM }
+ttcmmimfp16ps, 0xf26b, AMX_COMPLEX&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
+ttcmmrlfp16ps, 0xf36b, AMX_COMPLEX&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
+
+ttdpbf16ps, 0xf36c, AMX_BF16&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
+ttdpfp16ps, 0xf26c, AMX_FP16&AMX_TRANSPOSE, Modrm|Vex128|Space0F38|Src2VVVV|VexW0|NoSuf, { RegTMM, RegTMM, RegTMM }
+
+ttransposed, 0xf35f, AMX_TRANSPOSE, Modrm|Vex128|Space0F38|VexW0|NoSuf, { RegTMM, RegTMM }
+
// AMX instructions end.
// KEYLOCKER instructions.