@@ -5071,6 +5071,55 @@ i386_record_vex (struct i386_record_s *ir, uint8_t vex_w, uint8_t vex_r,
}
break;
+ case 0x6f: /* VMOVDQ (U|A) */
+ case 0x7f: /* VMOVDQ (U|A) */
+ /* vmovdq instructions have information about source/destination
+ spread over many places, so this code ended up messier than
+ I'd like. */
+ /* The VEX.pp bits identify whether the move is aligned or not, but this
+ doesn't influence the recording, so we can ignore it. */
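+ /* Concretely, VMOVDQA is 66-prefixed (VEX.pp = 0b01) and VMOVDQU is
+ F3-prefixed (VEX.pp = 0b10); both reach this code as opcode 0x6f or
+ 0x7f and are recorded identically. */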
+ i386_record_modrm (ir);
+ /* The mod field (the top two bits) of the ModRM byte identifies whether
+ both operands of the instruction are registers (mod == 3) or whether
+ one of the operands is memory. */
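+ /* For example (assuming the 0x6f encoding), "vmovdqu %ymm1, %ymm2"
+ has modrm 0xd1 (mod == 3, both operands registers), while
+ "vmovdqu (%rax), %ymm2" has modrm 0x10 (mod == 0, memory operand). */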
+ if (ir->mod == 3)
+ {
+ if (opcode == 0x6f)
+ {
+ /* vex_r will identify the high bit of the destination
+ register. Source is identified by ir->rex_b, but that
+ doesn't matter for recording. */
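+ /* For example, in "vmovdqu %ymm1, %ymm9" vex_r = 1 and
+ ir->reg = 1, so ymm9 is the register recorded. */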
+ record_full_arch_list_add_reg (ir->regcache,
+ tdep->ymm0_regnum + 8 * vex_r + ir->reg);
+ }
+ else
+ {
+ /* The source operand is > 7 and the destination operand is <= 7.
+ This is special cased because here vex_r identifies the high bit
+ of the SOURCE operand, not the destination, which would break the
+ expression used in the branch above. */
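+ /* For example, "vmovdqu %ymm15, %ymm0" is typically encoded as 0x7f
+ with the 2-byte VEX prefix: vex_r = 1 selects ymm15 (the source)
+ and ir->rm = 0 selects ymm0, the register we record. */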
+ record_full_arch_list_add_reg (ir->regcache,
+ tdep->ymm0_regnum + ir->rm);
+ }
+ }
+ else
+ {
+ /* This is the easy branch. We just need to check the opcode
+ to see if the source or destination is memory. */
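+ /* E.g. "vmovdqu (%rax), %ymm0" (opcode 0x6f) only changes ymm0,
+ while "vmovdqu %ymm0, (%rax)" (opcode 0x7f) only changes the 32
+ bytes at the memory operand. */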
+ if (opcode == 0x6f)
+ {
+ record_full_arch_list_add_reg (ir->regcache,
+ tdep->ymm0_regnum
+ + ir->reg + vex_r * 8);
+ }
+ else
+ {
+ /* The instruction writes 256 bits (32 bytes), and
+ i386_record_lea_modrm records 1 << ot bytes, so we need ot = 5. */
+ ir->ot = 5;
+ i386_record_lea_modrm (ir);
+ }
+ }
+ break;
+
case 0x60: /* VPUNPCKLBW */
case 0x61: /* VPUNPCKLWD */
case 0x62: /* VPUNPCKLDQ */
@@ -20,8 +20,12 @@
#include <stdlib.h>
char global_buf0[] = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
char global_buf1[] = {0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0};
char *dyn_buf0;
char *dyn_buf1;
@@ -30,8 +34,12 @@ int
vmov_test ()
{
char buf0[] = {0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f};
char buf1[] = {0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0};
/*start vmov_test. */
@@ -73,6 +81,32 @@ vmov_test ()
asm volatile ("vmovq %0, %%xmm15": "=m" (buf0));
asm volatile ("vmovq %0, %%xmm15": "=m" (buf1));
+ /* Test vmovdq style instructions. */
+ /* For local and dynamic buffers, we can't guarantee they will be aligned.
+ However, the aligned and unaligned variants differ only in the VEX.pp
+ bits, which the record code ignores, so testing one is enough to
+ validate both. */
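+ /* For reference, with the standard VEX.256 encodings vmovdqa is
+ c5 fd 6f /r and vmovdqu is c5 fe 6f /r; only the VEX.pp bits
+ differ. */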
+
+ /* Operations based on local buffers. */
+ asm volatile ("vmovdqu %0, %%ymm0": : "m"(buf0));
+ asm volatile ("vmovdqu %%ymm0, %0": "=m"(buf1));
+
+ /* Operations based on global buffers. */
+ /* Global buffers seem to always be aligned, let's sanity check vmovdqa. */
+ asm volatile ("vmovdqa %0, %%ymm15": : "m"(global_buf0));
+ asm volatile ("vmovdqa %%ymm15, %0": "=m"(global_buf1));
+ asm volatile ("vmovdqu %0, %%ymm0": : "m"(global_buf0));
+ asm volatile ("vmovdqu %%ymm0, %0": "=m"(global_buf1));
+
+ /* Operations based on dynamic buffers. */
+ /* The dynamic buffers are not aligned, so we skip vmovdqa. */
+ asm volatile ("vmovdqu %0, %%ymm0": : "m"(*dyn_buf0));
+ asm volatile ("vmovdqu %%ymm0, %0": "=m"(*dyn_buf1));
+
+ /* Operations between 2 registers. */
+ asm volatile ("vmovdqu %ymm15, %ymm0");
+ asm volatile ("vmovdqu %ymm2, %ymm15");
+ asm volatile ("vmovdqa %ymm15, %ymm0");
+
/* We have a return statement to deal with
epilogue in different compilers. */
return 0; /* end vmov_test */
@@ -161,11 +195,11 @@ vpbroadcast_test ()
int
main ()
{
- dyn_buf0 = (char *) malloc(sizeof(char) * 16);
- dyn_buf1 = (char *) malloc(sizeof(char) * 16);
- for (int i =0; i < 16; i++)
+ dyn_buf0 = (char *) malloc(sizeof(char) * 32);
+ dyn_buf1 = (char *) malloc(sizeof(char) * 32);
+ for (int i = 0; i < 32; i++)
{
- dyn_buf0[i] = 0x20 + i;
+ dyn_buf0[i] = 0x20 + (i % 16);
dyn_buf1[i] = 0;
}
/* Zero relevant xmm registers, so we know what to look for. */
@@ -134,6 +134,34 @@ global decimal
if {[record_full_function "vmov"] == true} {
# Now execute backwards, checking all instructions.
+ test_one_register "vmovdqa" "ymm0" \
+ "0x1f1e1d1c1b1a19181716151413121110, 0x1f1e1d1c1b1a19181716151413121110" \
+ "from register: "
+ test_one_register "vmovdqu" "ymm15" \
+ "0x1f1e1d1c1b1a19181716151413121110, 0x1f1e1d1c1b1a19181716151413121110" \
+ "from register: "
+ test_one_register "vmovdqu" "ymm0" \
+ "0x2f2e2d2c2b2a29282726252423222120, 0x2f2e2d2c2b2a29282726252423222120" \
+ "from register: "
+
+ test_one_memory "vmovdqu" "dyn_buf1" "0x0 .repeats 32 times" \
+ true "dynamic buffer: "
+ test_one_register "vmovdqu" "ymm0" \
+ "0x1f1e1d1c1b1a19181716151413121110, 0x1f1e1d1c1b1a19181716151413121110" \
+ "dynamic buffer: "
+
+ # Don't check the full buffer because that'd be too long
+ test_one_memory "vmovdqu" "global_buf1" \
+ "0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19" \
+ "global buffer: "
+ test_one_register "vmovdqu" "ymm0" \
+ "0x3f3e3d3c3b3a39383736353433323130, 0x3f3e3d3c3b3a39383736353433323130" \
+ "global buffer: "
+ test_one_memory "vmovdqa" "global_buf1" "0x0 .repeats 32 times"
+ test_one_register "vmovdqa" "ymm15" "0x0, 0x0"
+
+ test_one_memory "vmovdqu" "buf1" "0x0 .repeats 32 times"
+ test_one_register "vmovdqu" "ymm0" "0x2726252423222120, 0x0" "local buffer: "
test_one_register "vmovq" "xmm15" "0x3736353433323130" "reg_reset: "
test_one_register "vmovq" "xmm15" "0x0"