[5/5] x86: optimize {,V}INSERTPS with certain immediates

Message ID 5e6077cd-61bb-4c4c-9100-ddc7b33a6b4b@suse.com
State New
Headers
Series x86: (mainly) insert/extract optimizations |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_binutils_build--master-arm fail Patch failed to apply
linaro-tcwg-bot/tcwg_binutils_build--master-aarch64 fail Patch failed to apply

Commit Message

Jan Beulich Sept. 6, 2024, 11:54 a.m. UTC
  They are equivalent to simple moves or xors, which are up to 3 bytes
shorter to encode (and maybe/likely also cheaper to execute).
  

Patch

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -5619,6 +5619,102 @@  optimize_encoding (void)
       i.operands = 2;
       i.imm_operands = 0;
     }
+  else if (i.tm.base_opcode == 0x21
+	   && i.tm.opcode_space == SPACE_0F3A
+	   && i.op[0].imms->X_op == O_constant
+	   && (i.operands == i.reg_operands + 1
+	       ? i.op[0].imms->X_add_number == 0
+		 || (i.op[0].imms->X_add_number & 0xf) == 0xf
+	       : (i.op[0].imms->X_add_number & 0x3f) == 0x0e
+		  && (i.reg_operands == 1 || i.op[2].regs == i.op[3].regs)))
+    {
+      /* Optimize: -O:
+         insertps $0b....1111, %xmmN, %xmmM          -> xorps %xmmM, %xmmM
+         insertps $0b00000000, %xmmN, %xmmM          -> movss %xmmN, %xmmM
+         insertps $0b..001110, mem, %xmmN            -> movss mem, %xmmN
+         vinsertps $0b....1111, %xmmN, %xmmM, %xmmK  -> vxorps %xmm?, %xmm?, %xmmK
+         vinsertps $0b00000000, %xmmN, %xmmM, %xmmK  -> vmovss %xmmN, %xmmM, %xmmK
+         vinsertps $0b..001110, mem, %xmmN, %xmmN    -> vmovss mem, %xmmN
+       */
+      i.tm.opcode_space = SPACE_0F;
+      if ((i.op[0].imms->X_add_number & 0xf) == 0xf)
+	{
+	  i.tm.base_opcode = 0x57;
+	  i.tm.opcode_modifier.opcodeprefix = PREFIX_NONE;
+
+	  --i.operands;
+
+	  i.op[i.operands - 1].regs = i.op[i.operands].regs;
+	  i.types[i.operands - 1] = i.types[i.operands];
+	  i.flags[i.operands - 1] = i.flags[i.operands];
+	  i.tm.operand_types[i.operands - 1] = i.tm.operand_types[i.operands];
+
+	  i.op[1].regs = i.op[i.operands - 1].regs;
+	  i.types[1] = i.types[i.operands - 1];
+	  i.flags[1] = i.flags[i.operands - 1];
+	  i.tm.operand_types[1] = i.tm.operand_types[i.operands - 1];
+
+	  i.op[0].regs = i.op[1].regs;
+	  i.types[0] = i.types[1];
+	  i.flags[0] = i.flags[1];
+	  i.tm.operand_types[0] = i.tm.operand_types[1];
+
+	  /* Switch from EVEX to VEX encoding if possible.  Sadly we can't
+	     (always) tell use of the {evex} pseudo-prefix (which otherwise
+	     we'd like to respect) from use of %xmm16-%xmm31.  */
+	  if (pp.encoding == encoding_evex)
+	    pp.encoding = encoding_default;
+	  if (i.tm.opcode_modifier.evex
+	      && pp.encoding <= encoding_vex3
+	      && !(i.op[0].regs->reg_flags & RegVRex))
+	    {
+	      i.tm.opcode_modifier.evex = 0;
+	      i.tm.opcode_modifier.vex = VEX128;
+	    }
+
+	  /* Switch from VEX3 to VEX2 encoding if possible.  */
+	  if (i.tm.opcode_modifier.vex
+	      && pp.encoding <= encoding_vex
+	      && (i.op[0].regs->reg_flags & RegRex))
+	    {
+	      i.op[0].regs -= 8;
+	      i.op[1].regs = i.op[0].regs;
+	    }
+	}
+      else
+	{
+	  i.tm.base_opcode = 0x10;
+	  i.tm.opcode_modifier.opcodeprefix = PREFIX_0XF3;
+
+	  if (i.op[0].imms->X_add_number == 0)
+	    {
+	      i.op[0].regs = i.op[1].regs;
+	      --i.operands;
+	    }
+	  else
+	    {
+	      i.op[0].disps = i.op[1].disps;
+	      i.reloc[0] = i.reloc[1];
+	      i.operands = 2;
+	      i.tm.opcode_modifier.vexvvvv = 0;
+	    }
+	  i.types[0] = i.types[1];
+	  i.flags[0] = i.flags[1];
+	  i.tm.operand_types[0] = i.tm.operand_types[1];
+
+	  i.op[1].regs = i.op[2].regs;
+	  i.types[1] = i.types[2];
+	  i.flags[1] = i.flags[2];
+	  i.tm.operand_types[1] = i.tm.operand_types[2];
+
+	  i.op[2].regs = i.op[3].regs;
+	  i.types[2] = i.types[3];
+	  i.flags[2] = i.flags[3];
+	  i.tm.operand_types[2] = i.tm.operand_types[3];
+	}
+
+      i.imm_operands = 0;
+    }
 }
 
 /* Check whether the promoted (to address size) register is usable as index
--- a/gas/testsuite/gas/i386/optimize-1.d
+++ b/gas/testsuite/gas/i386/optimize-1.d
@@ -190,6 +190,12 @@  Disassembly of section .text:
  +[a-f0-9]+:	c5 .*	vmovupd %ymm1,\(%edx\)
  +[a-f0-9]+:	c5 .*	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 .*	vmovdqu %ymm1,\(%edx\)
+ +[a-f0-9]+:	f3 .*	movss  %xmm1,%xmm2
+ +[a-f0-9]+:	f3 .*	movss  \(%ecx\),%xmm2
+ +[a-f0-9]+:	0f .*	xorps  %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovss %xmm1,%xmm2,%xmm3
+ +[a-f0-9]+:	c5 .*	vmovss \(%ecx\),%xmm2
+ +[a-f0-9]+:	c5 .*	vxorps %xmm3,%xmm3,%xmm3
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/optimize-1.s
+++ b/gas/testsuite/gas/i386/optimize-1.s
@@ -224,6 +224,14 @@  _start:
 	vextracti64x4 $0, %zmm1, %ymm2
 	vextracti64x4 $0, %zmm1, (%edx)
 
+	insertps $0, %xmm1, %xmm2
+	insertps $0xce, (%ecx), %xmm2
+	insertps $0xff, %xmm1, %xmm2
+
+	vinsertps $0, %xmm1, %xmm2, %xmm3
+	vinsertps $0xce, (%ecx), %xmm2, %xmm2
+	vinsertps $0xff, %xmm1, %xmm2, %xmm3
+
 	bt	$15, %ax
 	bt	$16, %ax
 	btc	$15, %ax
--- a/gas/testsuite/gas/i386/optimize-1a.d
+++ b/gas/testsuite/gas/i386/optimize-1a.d
@@ -191,6 +191,12 @@  Disassembly of section .text:
  +[a-f0-9]+:	c5 .*	vmovupd %ymm1,\(%edx\)
  +[a-f0-9]+:	c5 .*	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 .*	vmovdqu %ymm1,\(%edx\)
+ +[a-f0-9]+:	f3 .*	movss  %xmm1,%xmm2
+ +[a-f0-9]+:	f3 .*	movss  \(%ecx\),%xmm2
+ +[a-f0-9]+:	0f .*	xorps  %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovss %xmm1,%xmm2,%xmm3
+ +[a-f0-9]+:	c5 .*	vmovss \(%ecx\),%xmm2
+ +[a-f0-9]+:	c5 .*	vxorps %xmm3,%xmm3,%xmm3
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/optimize-4.d
+++ b/gas/testsuite/gas/i386/optimize-4.d
@@ -190,6 +190,12 @@  Disassembly of section .text:
  +[a-f0-9]+:	c5 .*	vmovupd %ymm1,\(%edx\)
  +[a-f0-9]+:	c5 .*	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 .*	vmovdqu %ymm1,\(%edx\)
+ +[a-f0-9]+:	f3 .*	movss  %xmm1,%xmm2
+ +[a-f0-9]+:	f3 .*	movss  \(%ecx\),%xmm2
+ +[a-f0-9]+:	0f .*	xorps  %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovss %xmm1,%xmm2,%xmm3
+ +[a-f0-9]+:	c5 .*	vmovss \(%ecx\),%xmm2
+ +[a-f0-9]+:	c5 .*	vxorps %xmm3,%xmm3,%xmm3
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/optimize-5.d
+++ b/gas/testsuite/gas/i386/optimize-5.d
@@ -190,6 +190,12 @@  Disassembly of section .text:
  +[a-f0-9]+:	c5 .*	vmovupd %ymm1,\(%edx\)
  +[a-f0-9]+:	c5 .*	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 .*	vmovdqu %ymm1,\(%edx\)
+ +[a-f0-9]+:	f3 .*	movss  %xmm1,%xmm2
+ +[a-f0-9]+:	f3 .*	movss  \(%ecx\),%xmm2
+ +[a-f0-9]+:	0f .*	xorps  %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovss %xmm1,%xmm2,%xmm3
+ +[a-f0-9]+:	c5 .*	vmovss \(%ecx\),%xmm2
+ +[a-f0-9]+:	c5 .*	vxorps %xmm3,%xmm3,%xmm3
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/x86-64.exp
+++ b/gas/testsuite/gas/i386/x86-64.exp
@@ -597,6 +597,8 @@  run_dump_test "x86-64-optimize-pextr"
 run_list_test "x86-64-optimize-extractps" "-O -aln"
 run_dump_test "x86-64-optimize-extractps"
 run_dump_test "x86-64-optimize-vextractNN"
+run_list_test "x86-64-optimize-insertps" "-O -aln"
+run_dump_test "x86-64-optimize-insertps"
 run_dump_test "x86-64-apx-ndd-optimize"
 run_dump_test "x86-64-align-branch-1a"
 run_dump_test "x86-64-align-branch-1b"
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-optimize-insertps.d
@@ -0,0 +1,26 @@ 
+#as: -O -msse2avx
+#objdump: -drw
+#name: x86-64 INSERTPS optimized encoding with -msse2avx
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+0+ <insertps>:
+ +[a-f0-9]+:	c5 ea 10 d1          	vmovss %xmm1,%xmm2,%xmm2
+ +[a-f0-9]+:	c5 fa 10 11          	vmovss \(%rcx\),%xmm2
+ +[a-f0-9]+:	62 f9 7e 08 10 11    	vmovss \(%r17\),%xmm2
+ +[a-f0-9]+:	c5 e8 57 d2          	vxorps %xmm2,%xmm2,%xmm2
+ +[a-f0-9]+:	c5 ea 10 d9          	vmovss %xmm1,%xmm2,%xmm3
+ +[a-f0-9]+:	c5 fa 10 11          	vmovss \(%rcx\),%xmm2
+ +[a-f0-9]+:	c5 e0 57 db          	vxorps %xmm3,%xmm3,%xmm3
+ +[a-f0-9]+:	c5 e0 57 db          	vxorps %xmm3,%xmm3,%xmm3
+ +[a-f0-9]+:	c5 e0 57 db          	vxorps %xmm3,%xmm3,%xmm3
+ +[a-f0-9]+:	c5 60 57 db          	vxorps %xmm3,%xmm3,%xmm11
+ +[a-f0-9]+:	62 b1 6e 08 10 d9    	vmovss %xmm17,%xmm2,%xmm3
+ +[a-f0-9]+:	62 e1 7e 08 10 11    	vmovss \(%rcx\),%xmm18
+ +[a-f0-9]+:	c5 e0 57 db          	vxorps %xmm3,%xmm3,%xmm3
+ +[a-f0-9]+:	c5 e0 57 db          	vxorps %xmm3,%xmm3,%xmm3
+ +[a-f0-9]+:	62 a1 64 00 57 db    	vxorps %xmm19,%xmm19,%xmm19
+#pass
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-optimize-insertps.l
@@ -0,0 +1,26 @@ 
+.*: Assembler messages:
+.*:5: Error: .*
+[ 	]*[0-9a-f]+[ 	]+\.text
+[ 	]*[0-9a-f]+[ 	]+insertps:
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? F30F10D1[ 	]+insertps \$0, %xmm1, %xmm2
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? F30F1011[ 	]+insertps \$0xce, \(%rcx\), %xmm2
+[ 	]*[0-9a-f]+[ 	]+insertps \$0x0e, \(%r17\), %xmm2
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? 0F57D2[ 	]+insertps \$0xff, %xmm1, %xmm2
+[ 	]*[0-9a-f]+[ 	]+
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? C5EA10D9[ 	]+vinsertps \$0, %xmm1, %xmm2, %xmm3
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? C5FA1011[ 	]+vinsertps \$0xce, \(%rcx\), %xmm2, %xmm2
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? C5E057DB[ 	]+vinsertps \$0xff, %xmm1, %xmm2, %xmm3
+[ 	]*[0-9a-f]+[ 	]+
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? C5E057DB[ 	]+vinsertps \$0xbf, %xmm9, %xmm2, %xmm3
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? C5E057DB[ 	]+vinsertps \$0x7f, %xmm1, %xmm10, %xmm3
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? C56057DB[ 	]+vinsertps \$0x3f, %xmm1, %xmm2, %xmm11
+[ 	]*[0-9a-f]+[ 	]+
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? 62B16E08[ 	]+vinsertps \$0, %xmm17, %xmm2, %xmm3
+[ 	]*[0-9a-f]+[ 	]+10D9
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? 62E17E08[ 	]+vinsertps \$0xce, \(%rcx\), %xmm18, %xmm18
+[ 	]*[0-9a-f]+[ 	]+1011
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? C5E057DB[ 	]+vinsertps \$0xff, %xmm17, %xmm2, %xmm3
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? C5E057DB[ 	]+vinsertps \$0xff, %xmm1, %xmm18, %xmm3
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? 62A16400[ 	]+vinsertps \$0xff, %xmm1, %xmm2, %xmm19
+[ 	]*[0-9a-f]+[ 	]+57DB
+#pass
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-optimize-insertps.s
@@ -0,0 +1,20 @@ 
+	.text
+insertps:
+	insertps $0, %xmm1, %xmm2
+	insertps $0xce, (%rcx), %xmm2
+	insertps $0x0e, (%r17), %xmm2
+	insertps $0xff, %xmm1, %xmm2
+
+	vinsertps $0, %xmm1, %xmm2, %xmm3
+	vinsertps $0xce, (%rcx), %xmm2, %xmm2
+	vinsertps $0xff, %xmm1, %xmm2, %xmm3
+
+	vinsertps $0xbf, %xmm9, %xmm2, %xmm3
+	vinsertps $0x7f, %xmm1, %xmm10, %xmm3
+	vinsertps $0x3f, %xmm1, %xmm2, %xmm11
+
+	vinsertps $0, %xmm17, %xmm2, %xmm3
+	vinsertps $0xce, (%rcx), %xmm18, %xmm18
+	vinsertps $0xff, %xmm17, %xmm2, %xmm3
+	vinsertps $0xff, %xmm1, %xmm18, %xmm3
+	vinsertps $0xff, %xmm1, %xmm2, %xmm19
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -1495,7 +1495,7 @@  extractps, 0x6617, AVX|AVX512F, Modrm|Ve
 extractps, 0x6617, x64&(AVX|AVX512F), RegMem|Vex128|EVex128|Space0F3A|VexW1|NoSuf|SSE2AVX|Optimize, { Imm8, RegXMM, Reg64 }
 extractps, 0x660f3a17, SSE4_1, Modrm|IgnoreSize|NoSuf|Optimize, { Imm8, RegXMM, Reg32|Unspecified|BaseIndex }
 extractps, 0x660f3a17, SSE4_1&x64, RegMem|NoSuf|Optimize|NoRex64, { Imm8, RegXMM, Reg64 }
-insertps<SSE41D>, 0x660f3a21, <SSE41D:cpu>, Modrm|<SSE41D:attr>|<SSE41D:vvvv>|Disp8MemShift|NoSuf, { Imm8, Dword|Unspecified|BaseIndex|RegXMM, RegXMM }
+insertps<SSE41D>, 0x660f3a21, <SSE41D:cpu>, Modrm|<SSE41D:attr>|<SSE41D:vvvv>|Disp8MemShift|NoSuf|Optimize, { Imm8, Dword|Unspecified|BaseIndex|RegXMM, RegXMM }
 movntdqa<SSE41D>, 0x660f382a, <SSE41D:cpu>, Modrm|<SSE41D:attr>|NoSuf, { Xmmword|Unspecified|BaseIndex, RegXMM }
 mpsadbw<sse41>, 0x660f3a42, <sse41:cpu>, Modrm|<sse41:attr>|<sse41:vvvv>|NoSuf, { Imm8|Imm8S, RegXMM|Unspecified|BaseIndex, RegXMM }
 packusdw<SSE41BW>, 0x660f382b, <SSE41BW:cpu>, Modrm|<SSE41BW:attr>|<SSE41BW:vvvv>|NoSuf, { RegXMM|Unspecified|BaseIndex, RegXMM }
@@ -1678,7 +1678,7 @@  vhsubps, 0xf27d, AVX, Modrm|Vex|Space0F|
 vinsertf128, 0x6618, AVX, Modrm|Vex256|Space0F3A|Src1VVVV|VexW0|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegYMM, RegYMM }
 // vinsertf32x4 in disguise (see vround{p,s}{s,d} comment)
 vinsertf128, 0x6618, APX_F&AVX512VL, Modrm|EVex256|Space0F3A|Src1VVVV|VexW0|Disp8MemShift=4|NoSuf, { Imm8, Xmmword|Unspecified|BaseIndex, RegYMM, RegYMM }
-vinsertps, 0x6621, AVX, Modrm|Vex|Space0F3A|Src1VVVV|VexWIG|NoSuf, { Imm8, Dword|Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
+vinsertps, 0x6621, AVX, Modrm|Vex|Space0F3A|Src1VVVV|VexWIG|NoSuf|Optimize, { Imm8, Dword|Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
 vlddqu, 0xf2f0, AVX, Modrm|Vex|Space0F|VexWIG|CheckOperandSize|NoSuf, { Xmmword|Ymmword|Unspecified|BaseIndex, RegXMM|RegYMM }
 vldmxcsr, 0xae/2, AVX, Modrm|Vex128|Space0F|VexWIG|NoSuf, { Dword|Unspecified|BaseIndex }
 vmaskmovdqu, 0x66f7, AVX, Modrm|Vex|Space0F|VexWIG|NoSuf, { RegXMM, RegXMM }
@@ -2413,7 +2413,7 @@  vinserti32x4, 0x6638, AVX512F, Modrm|Mas
 vinsertf64x4, 0x661A, AVX512F, Modrm|EVex=1|Masking|Space0F3A|Src1VVVV|VexW1|Disp8MemShift=5|NoSuf, { Imm8, RegYMM|Unspecified|BaseIndex, RegZMM, RegZMM }
 vinserti64x4, 0x663A, AVX512F, Modrm|EVex=1|Masking|Space0F3A|Src1VVVV|VexW1|Disp8MemShift=5|NoSuf, { Imm8, RegYMM|Unspecified|BaseIndex, RegZMM, RegZMM }
 
-vinsertps, 0x6621, AVX512F, Modrm|EVex128|Space0F3A|Src1VVVV|VexW0|Disp8MemShift=2|NoSuf, { Imm8, RegXMM|Dword|Unspecified|BaseIndex, RegXMM, RegXMM }
+vinsertps, 0x6621, AVX512F, Modrm|EVex128|Space0F3A|Src1VVVV|VexW0|Disp8MemShift=2|NoSuf|Optimize, { Imm8, RegXMM|Dword|Unspecified|BaseIndex, RegXMM, RegXMM }
 
 vmovap<sd>, 0x<sd:ppfx>28, AVX512F, D|Modrm|Masking|Space0F|<sd:vexw>|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 vmovntp<sd>, 0x<sd:ppfx>2B, AVX512F, Modrm|Space0F|<sd:vexw>|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM, XMMword|YMMword|ZMMword|Unspecified|BaseIndex }