@@ -5619,6 +5619,109 @@ optimize_encoding (void)
i.operands = 2;
i.imm_operands = 0;
}
+ else if (i.tm.base_opcode == 0x21
+ && i.tm.opcode_space == SPACE_0F3A
+ && i.op[0].imms->X_op == O_constant
+ && (i.operands == i.reg_operands + 1
+ ? i.op[0].imms->X_add_number == 0
+ || (i.op[0].imms->X_add_number & 0xf) == 0xf
+ : (i.op[0].imms->X_add_number & 0x3f) == 0x0e
+ && (i.reg_operands == 1 || i.op[2].regs == i.op[3].regs)))
+ {
+ /* Optimize: -O:
+ insertps $0b....1111, %xmmN, %xmmM -> xorps %xmmM, %xmmM
+ insertps $0b00000000, %xmmN, %xmmM -> movss %xmmN, %xmmM
+ insertps $0b..001110, mem, %xmmN -> movss mem, %xmmN
+ vinsertps $0b....1111, %xmmN, %xmmM, %xmmK -> vxorps %xmm?, %xmm?, %xmmK
+ vinsertps $0b00000000, %xmmN, %xmmM, %xmmK -> vmovss %xmmN, %xmmM, %xmmK
+ vinsertps $0b..001110, mem, %xmmN, %xmmN -> vmovss mem, %xmmN
+ */
+ i.tm.opcode_space = SPACE_0F;
+ if ((i.op[0].imms->X_add_number & 0xf) == 0xf)
+ {
+ i.tm.base_opcode = 0x57;
+ i.tm.opcode_modifier.opcodeprefix = PREFIX_NONE;
+
+ --i.operands;
+
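+ /* Replicate the (former) destination register into all remaining
+ operand slots: XORing a register with itself yields zero.  */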
+ i.op[i.operands - 1].regs = i.op[i.operands].regs;
+ i.types[i.operands - 1] = i.types[i.operands];
+ i.flags[i.operands - 1] = i.flags[i.operands];
+ i.tm.operand_types[i.operands - 1] = i.tm.operand_types[i.operands];
+
+ i.op[1].regs = i.op[i.operands - 1].regs;
+ i.types[1] = i.types[i.operands - 1];
+ i.flags[1] = i.flags[i.operands - 1];
+ i.tm.operand_types[1] = i.tm.operand_types[i.operands - 1];
+
+ i.op[0].regs = i.op[1].regs;
+ i.types[0] = i.types[1];
+ i.flags[0] = i.flags[1];
+ i.tm.operand_types[0] = i.tm.operand_types[1];
+
+ /* Switch from EVEX to VEX encoding if possible. Sadly we can't
+ (always) tell use of the {evex} pseudo-prefix (which otherwise
+ we'd like to respect) from use of %xmm16-%xmm31. */
+ if (pp.encoding == encoding_evex)
+ pp.encoding = encoding_default;
+ if (i.tm.opcode_modifier.evex
+ && pp.encoding <= encoding_vex3
+ && !(i.op[0].regs->reg_flags & RegVRex))
+ {
+ i.tm.opcode_modifier.evex = 0;
+ i.tm.opcode_modifier.vex = VEX128;
+ }
+
+ /* Switch from VEX3 to VEX2 encoding if possible. */
+ if (i.tm.opcode_modifier.vex
+ && pp.encoding <= encoding_vex
+ && (i.op[0].regs->reg_flags & RegRex))
+ {
+ i.op[0].regs -= 8;
+ i.op[1].regs = i.op[0].regs;
+ }
+ }
+ else
+ {
+ i.tm.base_opcode = 0x10;
+ i.tm.opcode_modifier.opcodeprefix = PREFIX_0XF3;
+
+ if (i.op[0].imms->X_add_number == 0)
+ {
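+ /* Register source: it merely moves down one slot.  */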
+ i.op[0].regs = i.op[1].regs;
+ --i.operands;
+ }
+ else
+ {
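+ /* Memory source: move displacement and relocation down one slot.
+ The memory form of VMOVSS doesn't use VVVV.  */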
+ i.op[0].disps = i.op[1].disps;
+ i.reloc[0] = i.reloc[1];
+ i.operands = 2;
+ i.tm.opcode_modifier.vexvvvv = 0;
+ }
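+ /* Either way, now squeeze out the immediate by shifting the
+ remaining operand data down one slot.  */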
+ i.types[0] = i.types[1];
+ i.flags[0] = i.flags[1];
+ i.tm.operand_types[0] = i.tm.operand_types[1];
+
+ i.op[1].regs = i.op[2].regs;
+ i.types[1] = i.types[2];
+ i.flags[1] = i.flags[2];
+ i.tm.operand_types[1] = i.tm.operand_types[2];
+
+ i.op[2].regs = i.op[3].regs;
+ i.types[2] = i.types[3];
+ i.flags[2] = i.flags[3];
+ i.tm.operand_types[2] = i.tm.operand_types[3];
+ }
+
+ i.imm_operands = 0;
+ }
}
/* Check whether the promoted (to address size) register is usable as index
@@ -190,6 +190,12 @@ Disassembly of section .text:
+[a-f0-9]+: c5 .* vmovupd %ymm1,\(%edx\)
+[a-f0-9]+: c5 .* vmovdqa %ymm1,%ymm2
+[a-f0-9]+: c5 .* vmovdqu %ymm1,\(%edx\)
+ +[a-f0-9]+: f3 .* movss %xmm1,%xmm2
+ +[a-f0-9]+: f3 .* movss \(%ecx\),%xmm2
+ +[a-f0-9]+: 0f .* xorps %xmm2,%xmm2
+ +[a-f0-9]+: c5 .* vmovss %xmm1,%xmm2,%xmm3
+ +[a-f0-9]+: c5 .* vmovss \(%ecx\),%xmm2
+ +[a-f0-9]+: c5 .* vxorps %xmm3,%xmm3,%xmm3
+[a-f0-9]+: 0f ba e0 0f bt \$0xf,%eax
+[a-f0-9]+: 66 0f ba e0 10 bt \$0x10,%ax
+[a-f0-9]+: 0f ba f8 0f btc \$0xf,%eax
@@ -224,6 +224,14 @@ _start:
vextracti64x4 $0, %zmm1, %ymm2
vextracti64x4 $0, %zmm1, (%edx)
+ insertps $0, %xmm1, %xmm2
+ insertps $0xce, (%ecx), %xmm2
+ insertps $0xff, %xmm1, %xmm2
+
+ vinsertps $0, %xmm1, %xmm2, %xmm3
+ vinsertps $0xce, (%ecx), %xmm2, %xmm2
+ vinsertps $0xff, %xmm1, %xmm2, %xmm3
+
bt $15, %ax
bt $16, %ax
btc $15, %ax
@@ -191,6 +191,12 @@ Disassembly of section .text:
+[a-f0-9]+: c5 .* vmovupd %ymm1,\(%edx\)
+[a-f0-9]+: c5 .* vmovdqa %ymm1,%ymm2
+[a-f0-9]+: c5 .* vmovdqu %ymm1,\(%edx\)
+ +[a-f0-9]+: f3 .* movss %xmm1,%xmm2
+ +[a-f0-9]+: f3 .* movss \(%ecx\),%xmm2
+ +[a-f0-9]+: 0f .* xorps %xmm2,%xmm2
+ +[a-f0-9]+: c5 .* vmovss %xmm1,%xmm2,%xmm3
+ +[a-f0-9]+: c5 .* vmovss \(%ecx\),%xmm2
+ +[a-f0-9]+: c5 .* vxorps %xmm3,%xmm3,%xmm3
+[a-f0-9]+: 0f ba e0 0f bt \$0xf,%eax
+[a-f0-9]+: 66 0f ba e0 10 bt \$0x10,%ax
+[a-f0-9]+: 0f ba f8 0f btc \$0xf,%eax
@@ -190,6 +190,12 @@ Disassembly of section .text:
+[a-f0-9]+: c5 .* vmovupd %ymm1,\(%edx\)
+[a-f0-9]+: c5 .* vmovdqa %ymm1,%ymm2
+[a-f0-9]+: c5 .* vmovdqu %ymm1,\(%edx\)
+ +[a-f0-9]+: f3 .* movss %xmm1,%xmm2
+ +[a-f0-9]+: f3 .* movss \(%ecx\),%xmm2
+ +[a-f0-9]+: 0f .* xorps %xmm2,%xmm2
+ +[a-f0-9]+: c5 .* vmovss %xmm1,%xmm2,%xmm3
+ +[a-f0-9]+: c5 .* vmovss \(%ecx\),%xmm2
+ +[a-f0-9]+: c5 .* vxorps %xmm3,%xmm3,%xmm3
+[a-f0-9]+: 0f ba e0 0f bt \$0xf,%eax
+[a-f0-9]+: 66 0f ba e0 10 bt \$0x10,%ax
+[a-f0-9]+: 0f ba f8 0f btc \$0xf,%eax
@@ -190,6 +190,12 @@ Disassembly of section .text:
+[a-f0-9]+: c5 .* vmovupd %ymm1,\(%edx\)
+[a-f0-9]+: c5 .* vmovdqa %ymm1,%ymm2
+[a-f0-9]+: c5 .* vmovdqu %ymm1,\(%edx\)
+ +[a-f0-9]+: f3 .* movss %xmm1,%xmm2
+ +[a-f0-9]+: f3 .* movss \(%ecx\),%xmm2
+ +[a-f0-9]+: 0f .* xorps %xmm2,%xmm2
+ +[a-f0-9]+: c5 .* vmovss %xmm1,%xmm2,%xmm3
+ +[a-f0-9]+: c5 .* vmovss \(%ecx\),%xmm2
+ +[a-f0-9]+: c5 .* vxorps %xmm3,%xmm3,%xmm3
+[a-f0-9]+: 0f ba e0 0f bt \$0xf,%eax
+[a-f0-9]+: 66 0f ba e0 10 bt \$0x10,%ax
+[a-f0-9]+: 0f ba f8 0f btc \$0xf,%eax
@@ -597,6 +597,8 @@ run_dump_test "x86-64-optimize-pextr"
run_list_test "x86-64-optimize-extractps" "-O -aln"
run_dump_test "x86-64-optimize-extractps"
run_dump_test "x86-64-optimize-vextractNN"
+run_list_test "x86-64-optimize-insertps" "-O -aln"
+run_dump_test "x86-64-optimize-insertps"
run_dump_test "x86-64-apx-ndd-optimize"
run_dump_test "x86-64-align-branch-1a"
run_dump_test "x86-64-align-branch-1b"
@@ -0,0 +1,26 @@
+#as: -O -msse2avx
+#objdump: -drw
+#name: x86-64 INSERTPS optimized encoding with -msse2avx
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+0+ <insertps>:
+ +[a-f0-9]+: c5 ea 10 d1 vmovss %xmm1,%xmm2,%xmm2
+ +[a-f0-9]+: c5 fa 10 11 vmovss \(%rcx\),%xmm2
+ +[a-f0-9]+: 62 f9 7e 08 10 11 vmovss \(%r17\),%xmm2
+ +[a-f0-9]+: c5 e8 57 d2 vxorps %xmm2,%xmm2,%xmm2
+ +[a-f0-9]+: c5 ea 10 d9 vmovss %xmm1,%xmm2,%xmm3
+ +[a-f0-9]+: c5 fa 10 11 vmovss \(%rcx\),%xmm2
+ +[a-f0-9]+: c5 e0 57 db vxorps %xmm3,%xmm3,%xmm3
+ +[a-f0-9]+: c5 e0 57 db vxorps %xmm3,%xmm3,%xmm3
+ +[a-f0-9]+: c5 e0 57 db vxorps %xmm3,%xmm3,%xmm3
+ +[a-f0-9]+: c5 60 57 db vxorps %xmm3,%xmm3,%xmm11
+ +[a-f0-9]+: 62 b1 6e 08 10 d9 vmovss %xmm17,%xmm2,%xmm3
+ +[a-f0-9]+: 62 e1 7e 08 10 11 vmovss \(%rcx\),%xmm18
+ +[a-f0-9]+: c5 e0 57 db vxorps %xmm3,%xmm3,%xmm3
+ +[a-f0-9]+: c5 e0 57 db vxorps %xmm3,%xmm3,%xmm3
+ +[a-f0-9]+: 62 a1 64 00 57 db vxorps %xmm19,%xmm19,%xmm19
+#pass
@@ -0,0 +1,26 @@
+.*: Assembler messages:
+.*:5: Error: .*
+[ ]*[0-9a-f]+[ ]+\.text
+[ ]*[0-9a-f]+[ ]+insertps:
+[ ]*[0-9a-f]+[ ]+\?\?\?\? F30F10D1[ ]+insertps \$0, %xmm1, %xmm2
+[ ]*[0-9a-f]+[ ]+\?\?\?\? F30F1011[ ]+insertps \$0xce, \(%rcx\), %xmm2
+[ ]*[0-9a-f]+[ ]+insertps \$0x0e, \(%r17\), %xmm2
+[ ]*[0-9a-f]+[ ]+\?\?\?\? 0F57D2[ ]+insertps \$0xff, %xmm1, %xmm2
+[ ]*[0-9a-f]+[ ]+
+[ ]*[0-9a-f]+[ ]+\?\?\?\? C5EA10D9[ ]+vinsertps \$0, %xmm1, %xmm2, %xmm3
+[ ]*[0-9a-f]+[ ]+\?\?\?\? C5FA1011[ ]+vinsertps \$0xce, \(%rcx\), %xmm2, %xmm2
+[ ]*[0-9a-f]+[ ]+\?\?\?\? C5E057DB[ ]+vinsertps \$0xff, %xmm1, %xmm2, %xmm3
+[ ]*[0-9a-f]+[ ]+
+[ ]*[0-9a-f]+[ ]+\?\?\?\? C5E057DB[ ]+vinsertps \$0xbf, %xmm9, %xmm2, %xmm3
+[ ]*[0-9a-f]+[ ]+\?\?\?\? C5E057DB[ ]+vinsertps \$0x7f, %xmm1, %xmm10, %xmm3
+[ ]*[0-9a-f]+[ ]+\?\?\?\? C56057DB[ ]+vinsertps \$0x3f, %xmm1, %xmm2, %xmm11
+[ ]*[0-9a-f]+[ ]+
+[ ]*[0-9a-f]+[ ]+\?\?\?\? 62B16E08[ ]+vinsertps \$0, %xmm17, %xmm2, %xmm3
+[ ]*[0-9a-f]+[ ]+10D9
+[ ]*[0-9a-f]+[ ]+\?\?\?\? 62E17E08[ ]+vinsertps \$0xce, \(%rcx\), %xmm18, %xmm18
+[ ]*[0-9a-f]+[ ]+1011
+[ ]*[0-9a-f]+[ ]+\?\?\?\? C5E057DB[ ]+vinsertps \$0xff, %xmm17, %xmm2, %xmm3
+[ ]*[0-9a-f]+[ ]+\?\?\?\? C5E057DB[ ]+vinsertps \$0xff, %xmm1, %xmm18, %xmm3
+[ ]*[0-9a-f]+[ ]+\?\?\?\? 62A16400[ ]+vinsertps \$0xff, %xmm1, %xmm2, %xmm19
+[ ]*[0-9a-f]+[ ]+57DB
+#pass
@@ -0,0 +1,20 @@
+ .text
+insertps:
+ insertps $0, %xmm1, %xmm2
+ insertps $0xce, (%rcx), %xmm2
+ insertps $0x0e, (%r17), %xmm2
+ insertps $0xff, %xmm1, %xmm2
+
+ vinsertps $0, %xmm1, %xmm2, %xmm3
+ vinsertps $0xce, (%rcx), %xmm2, %xmm2
+ vinsertps $0xff, %xmm1, %xmm2, %xmm3
+
+ vinsertps $0xbf, %xmm9, %xmm2, %xmm3
+ vinsertps $0x7f, %xmm1, %xmm10, %xmm3
+ vinsertps $0x3f, %xmm1, %xmm2, %xmm11
+
+ vinsertps $0, %xmm17, %xmm2, %xmm3
+ vinsertps $0xce, (%rcx), %xmm18, %xmm18
+ vinsertps $0xff, %xmm17, %xmm2, %xmm3
+ vinsertps $0xff, %xmm1, %xmm18, %xmm3
+ vinsertps $0xff, %xmm1, %xmm2, %xmm19
@@ -1495,7 +1495,7 @@ extractps, 0x6617, AVX|AVX512F, Modrm|Ve
extractps, 0x6617, x64&(AVX|AVX512F), RegMem|Vex128|EVex128|Space0F3A|VexW1|NoSuf|SSE2AVX|Optimize, { Imm8, RegXMM, Reg64 }
extractps, 0x660f3a17, SSE4_1, Modrm|IgnoreSize|NoSuf|Optimize, { Imm8, RegXMM, Reg32|Unspecified|BaseIndex }
extractps, 0x660f3a17, SSE4_1&x64, RegMem|NoSuf|Optimize|NoRex64, { Imm8, RegXMM, Reg64 }
-insertps<SSE41D>, 0x660f3a21, <SSE41D:cpu>, Modrm|<SSE41D:attr>|<SSE41D:vvvv>|Disp8MemShift|NoSuf, { Imm8, Dword|Unspecified|BaseIndex|RegXMM, RegXMM }
+insertps<SSE41D>, 0x660f3a21, <SSE41D:cpu>, Modrm|<SSE41D:attr>|<SSE41D:vvvv>|Disp8MemShift|NoSuf|Optimize, { Imm8, Dword|Unspecified|BaseIndex|RegXMM, RegXMM }
movntdqa<SSE41D>, 0x660f382a, <SSE41D:cpu>, Modrm|<SSE41D:attr>|NoSuf, { Xmmword|Unspecified|BaseIndex, RegXMM }
mpsadbw<sse41>, 0x660f3a42, <sse41:cpu>, Modrm|<sse41:attr>|<sse41:vvvv>|NoSuf, { Imm8|Imm8S, RegXMM|Unspecified|BaseIndex, RegXMM }
packusdw<SSE41BW>, 0x660f382b, <SSE41BW:cpu>, Modrm|<SSE41BW:attr>|<SSE41BW:vvvv>|NoSuf, { RegXMM|Unspecified|BaseIndex, RegXMM }
@@ -1678,7 +1678,7 @@ vhsubps, 0xf27d, AVX, Modrm|Vex|Space0F|
vinsertf128, 0x6618, AVX, Modrm|Vex256|Space0F3A|Src1VVVV|VexW0|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegYMM, RegYMM }
// vinsertf32x4 in disguise (see vround{p,s}{s,d} comment)
vinsertf128, 0x6618, APX_F&AVX512VL, Modrm|EVex256|Space0F3A|Src1VVVV|VexW0|Disp8MemShift=4|NoSuf, { Imm8, Xmmword|Unspecified|BaseIndex, RegYMM, RegYMM }
-vinsertps, 0x6621, AVX, Modrm|Vex|Space0F3A|Src1VVVV|VexWIG|NoSuf, { Imm8, Dword|Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
+vinsertps, 0x6621, AVX, Modrm|Vex|Space0F3A|Src1VVVV|VexWIG|NoSuf|Optimize, { Imm8, Dword|Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
vlddqu, 0xf2f0, AVX, Modrm|Vex|Space0F|VexWIG|CheckOperandSize|NoSuf, { Xmmword|Ymmword|Unspecified|BaseIndex, RegXMM|RegYMM }
vldmxcsr, 0xae/2, AVX, Modrm|Vex128|Space0F|VexWIG|NoSuf, { Dword|Unspecified|BaseIndex }
vmaskmovdqu, 0x66f7, AVX, Modrm|Vex|Space0F|VexWIG|NoSuf, { RegXMM, RegXMM }
@@ -2413,7 +2413,7 @@ vinserti32x4, 0x6638, AVX512F, Modrm|Mas
vinsertf64x4, 0x661A, AVX512F, Modrm|EVex=1|Masking|Space0F3A|Src1VVVV|VexW1|Disp8MemShift=5|NoSuf, { Imm8, RegYMM|Unspecified|BaseIndex, RegZMM, RegZMM }
vinserti64x4, 0x663A, AVX512F, Modrm|EVex=1|Masking|Space0F3A|Src1VVVV|VexW1|Disp8MemShift=5|NoSuf, { Imm8, RegYMM|Unspecified|BaseIndex, RegZMM, RegZMM }
-vinsertps, 0x6621, AVX512F, Modrm|EVex128|Space0F3A|Src1VVVV|VexW0|Disp8MemShift=2|NoSuf, { Imm8, RegXMM|Dword|Unspecified|BaseIndex, RegXMM, RegXMM }
+vinsertps, 0x6621, AVX512F, Modrm|EVex128|Space0F3A|Src1VVVV|VexW0|Disp8MemShift=2|NoSuf|Optimize, { Imm8, RegXMM|Dword|Unspecified|BaseIndex, RegXMM, RegXMM }
vmovap<sd>, 0x<sd:ppfx>28, AVX512F, D|Modrm|Masking|Space0F|<sd:vexw>|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
vmovntp<sd>, 0x<sd:ppfx>2B, AVX512F, Modrm|Space0F|<sd:vexw>|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM, XMMword|YMMword|ZMMword|Unspecified|BaseIndex }