[2/5] x86: optimize {,V}EXTRACTPS with immediate 0

Message ID 4b12e7b1-ace8-4469-aefe-f0db2419fcd7@suse.com
State New
Headers
Series x86: (mainly) insert/extract optimizations |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_binutils_build--master-aarch64 fail Patch failed to apply
linaro-tcwg-bot/tcwg_binutils_build--master-arm fail Patch failed to apply

Commit Message

Jan Beulich Sept. 6, 2024, 11:52 a.m. UTC
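  With an immediate of 0, {,V}EXTRACTPS are equivalent to simple moves
({,V}MOVD for a register destination, {,V}MOVSS for a memory destination),
which are up to 2 bytes shorter to encode (and maybe also cheaper to execute).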
  

Patch

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -5509,6 +5509,42 @@  optimize_encoding (void)
       i.operands = 2;
       i.imm_operands = 0;
     }
+  else if (i.tm.base_opcode == 0x17
+	   && i.tm.opcode_space == SPACE_0F3A
+	   && i.op[0].imms->X_op == O_constant
+	   && i.op[0].imms->X_add_number == 0)
+    {
+      /* Optimize: -O:
+         extractps $0, %xmmN, %rM   -> movd %xmmN, %rM
+         extractps $0, %xmmN, mem   -> movss %xmmN, mem
+         vextractps $0, %xmmN, %rM  -> vmovd %xmmN, %rM
+         vextractps $0, %xmmN, mem  -> vmovss %xmmN, mem
+       */
+      i.tm.opcode_space = SPACE_0F;
+      i.tm.opcode_modifier.vexw = VEXW0;
+
+      if (!i.mem_operands)
+	i.tm.base_opcode = 0x7e;
+      else
+	{
+	  i.tm.base_opcode = 0x11;
+	  i.tm.opcode_modifier.opcodeprefix = PREFIX_0XF3;
+	}
+
+      i.op[0].regs = i.op[1].regs;
+      i.types[0] = i.types[1];
+      i.flags[0] = i.flags[1];
+      i.tm.operand_types[0] = i.tm.operand_types[1];
+
+      i.op[1].regs = i.op[2].regs;
+      i.types[1] = i.types[2];
+      i.flags[1] = i.flags[2];
+      i.reloc[1] = i.reloc[2];
+      i.tm.operand_types[1] = i.tm.operand_types[2];
+
+      i.operands = 2;
+      i.imm_operands = 0;
+    }
 }
 
 /* Check whether the promoted (to address size) register is usable as index
--- a/gas/testsuite/gas/i386/optimize-1.d
+++ b/gas/testsuite/gas/i386/optimize-1.d
@@ -166,6 +166,10 @@  Disassembly of section .text:
  +[a-f0-9]+:	66 .*	movd   %xmm1,\(%edx\)
  +[a-f0-9]+:	c5 .*	vmovd  %xmm1,%edx
  +[a-f0-9]+:	c5 .*	vmovd  %xmm1,\(%edx\)
+ +[a-f0-9]+:	66 .*	movd   %xmm1,%edx
+ +[a-f0-9]+:	f3 .*	movss  %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovd  %xmm1,%edx
+ +[a-f0-9]+:	c5 .*	vmovss %xmm1,\(%edx\)
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/optimize-1.s
+++ b/gas/testsuite/gas/i386/optimize-1.s
@@ -194,6 +194,11 @@  _start:
 	vpextrd		$0, %xmm1, %edx
 	vpextrd		$0, %xmm1, (%edx)
 
+	extractps	$0, %xmm1, %edx
+	extractps	$0, %xmm1, (%edx)
+	vextractps	$0, %xmm1, %edx
+	vextractps	$0, %xmm1, (%edx)
+
 	bt	$15, %ax
 	bt	$16, %ax
 	btc	$15, %ax
--- a/gas/testsuite/gas/i386/optimize-1a.d
+++ b/gas/testsuite/gas/i386/optimize-1a.d
@@ -167,6 +167,10 @@  Disassembly of section .text:
  +[a-f0-9]+:	66 .*	movd   %xmm1,\(%edx\)
  +[a-f0-9]+:	c5 .*	vmovd  %xmm1,%edx
  +[a-f0-9]+:	c5 .*	vmovd  %xmm1,\(%edx\)
+ +[a-f0-9]+:	66 .*	movd   %xmm1,%edx
+ +[a-f0-9]+:	f3 .*	movss  %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovd  %xmm1,%edx
+ +[a-f0-9]+:	c5 .*	vmovss %xmm1,\(%edx\)
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/optimize-4.d
+++ b/gas/testsuite/gas/i386/optimize-4.d
@@ -166,6 +166,10 @@  Disassembly of section .text:
  +[a-f0-9]+:	66 .*	movd   %xmm1,\(%edx\)
  +[a-f0-9]+:	c5 .*	vmovd  %xmm1,%edx
  +[a-f0-9]+:	c5 .*	vmovd  %xmm1,\(%edx\)
+ +[a-f0-9]+:	66 .*	movd   %xmm1,%edx
+ +[a-f0-9]+:	f3 .*	movss  %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovd  %xmm1,%edx
+ +[a-f0-9]+:	c5 .*	vmovss %xmm1,\(%edx\)
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/optimize-5.d
+++ b/gas/testsuite/gas/i386/optimize-5.d
@@ -166,6 +166,10 @@  Disassembly of section .text:
  +[a-f0-9]+:	66 .*	movd   %xmm1,\(%edx\)
  +[a-f0-9]+:	c5 .*	vmovd  %xmm1,%edx
  +[a-f0-9]+:	c5 .*	vmovd  %xmm1,\(%edx\)
+ +[a-f0-9]+:	66 .*	movd   %xmm1,%edx
+ +[a-f0-9]+:	f3 .*	movss  %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovd  %xmm1,%edx
+ +[a-f0-9]+:	c5 .*	vmovss %xmm1,\(%edx\)
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/x86-64.exp
+++ b/gas/testsuite/gas/i386/x86-64.exp
@@ -594,6 +594,8 @@  run_dump_test "x86-64-optimize-7b"
 run_list_test "x86-64-optimize-8" "-I${srcdir}/$subdir -march=+noavx2 -al"
 run_list_test "x86-64-optimize-pextr" "-O -aln"
 run_dump_test "x86-64-optimize-pextr"
+run_list_test "x86-64-optimize-extractps" "-O -aln"
+run_dump_test "x86-64-optimize-extractps"
 run_dump_test "x86-64-apx-ndd-optimize"
 run_dump_test "x86-64-align-branch-1a"
 run_dump_test "x86-64-align-branch-1b"
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-optimize-extractps.d
@@ -0,0 +1,20 @@ 
+#as: -O -msse2avx
+#objdump: -drw
+#name: x86-64 EXTRACTPS optimized encoding with -msse2avx
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+0+ <extractps>:
+ +[a-f0-9]+:	c5 f9 7e ca          	vmovd  %xmm1,%edx
+ +[a-f0-9]+:	c5 fa 11 0a          	vmovss %xmm1,\(%rdx\)
+ +[a-f0-9]+:	62 f9 7d 08 7e ca    	vmovd  %xmm1,%r18d
+ +[a-f0-9]+:	62 f9 7e 08 11 0a    	vmovss %xmm1,\(%r18\)
+ +[a-f0-9]+:	c5 f9 7e ca          	vmovd  %xmm1,%edx
+ +[a-f0-9]+:	c5 fa 11 0a          	vmovss %xmm1,\(%rdx\)
+ +[a-f0-9]+:	62 e1 7d 08 7e ca    	vmovd  %xmm17,%edx
+ +[a-f0-9]+:	62 f9 7d 08 7e ca    	vmovd  %xmm1,%r18d
+ +[a-f0-9]+:	62 f9 7e 08 11 0a    	vmovss %xmm1,\(%r18\)
+#pass
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-optimize-extractps.l
@@ -0,0 +1,21 @@ 
+.*: Assembler messages:
+.*:6: Error: .*
+.*:7: Error: .*
+[ 	]*[0-9a-f]+[ 	]+\.text
+[ 	]*[0-9a-f]+[ 	]+extractps:
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? 660F7ECA[ 	]+extractps	\$0, %xmm1, %edx
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? F30F110A[ 	]+extractps	\$0, %xmm1, \(%rdx\)
+[ 	]*[0-9a-f]+[ 	]+
+[ 	]*[0-9a-f]+[ 	]+extractps	\$0, %xmm1, %r18d
+[ 	]*[0-9a-f]+[ 	]+extractps	\$0, %xmm1, \(%r18\)
+[ 	]*[0-9a-f]+[ 	]+
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? C5F97ECA[ 	]+vextractps	\$0, %xmm1, %edx
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? C5FA110A[ 	]+vextractps	\$0, %xmm1, \(%rdx\)
+[ 	]*[0-9a-f]+[ 	]+
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? 62E17D08[ 	]+vextractps	\$0, %xmm17, %edx
+[ 	]*[0-9a-f]+[ 	]+7ECA
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? 62F97D08[ 	]+vextractps	\$0, %xmm1, %r18d
+[ 	]*[0-9a-f]+[ 	]+7ECA
+[ 	]*[0-9a-f]+[ 	]+\?\?\?\? 62F97E08[ 	]+vextractps	\$0, %xmm1, \(%r18\)
+[ 	]*[0-9a-f]+[ 	]+110A
+#pass
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-optimize-extractps.s
@@ -0,0 +1,14 @@ 
+	.text
+extractps:
+	extractps	$0, %xmm1, %edx
+	extractps	$0, %xmm1, (%rdx)
+
+	extractps	$0, %xmm1, %r18d
+	extractps	$0, %xmm1, (%r18)
+
+	vextractps	$0, %xmm1, %edx
+	vextractps	$0, %xmm1, (%rdx)
+
+	vextractps	$0, %xmm17, %edx
+	vextractps	$0, %xmm1, %r18d
+	vextractps	$0, %xmm1, (%r18)
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -1491,10 +1491,10 @@  blendvp<sd>, 0x664a | <sd:opc>, AVX, Mod
 blendvp<sd>, 0x660f3814 | <sd:opc>, SSE4_1, Modrm|NoSuf, { Acc|Xmmword, RegXMM|Unspecified|BaseIndex, RegXMM }
 blendvp<sd>, 0x660f3814 | <sd:opc>, SSE4_1, Modrm|NoSuf, { RegXMM|Unspecified|BaseIndex, RegXMM }
 dpp<sd><sse41>, 0x660f3a40 | <sd:opc>, <sse41:cpu>, Modrm|<sse41:attr>|<sse41:vvvv>|NoSuf, { Imm8|Imm8S, RegXMM|Unspecified|BaseIndex, RegXMM }
-extractps, 0x6617, AVX|AVX512F, Modrm|Vex128|EVex128|Space0F3A|VexW0|Disp8MemShift=2|NoSuf|SSE2AVX, { Imm8, RegXMM, Reg32|Unspecified|BaseIndex }
-extractps, 0x6617, x64&(AVX|AVX512F), RegMem|Vex128|EVex128|Space0F3A|VexW1|NoSuf|SSE2AVX, { Imm8, RegXMM, Reg64 }
-extractps, 0x660f3a17, SSE4_1, Modrm|IgnoreSize|NoSuf, { Imm8, RegXMM, Reg32|Unspecified|BaseIndex }
-extractps, 0x660f3a17, SSE4_1&x64, RegMem|NoSuf|NoRex64, { Imm8, RegXMM, Reg64 }
+extractps, 0x6617, AVX|AVX512F, Modrm|Vex128|EVex128|Space0F3A|VexW0|Disp8MemShift=2|NoSuf|SSE2AVX|Optimize, { Imm8, RegXMM, Reg32|Unspecified|BaseIndex }
+extractps, 0x6617, x64&(AVX|AVX512F), RegMem|Vex128|EVex128|Space0F3A|VexW1|NoSuf|SSE2AVX|Optimize, { Imm8, RegXMM, Reg64 }
+extractps, 0x660f3a17, SSE4_1, Modrm|IgnoreSize|NoSuf|Optimize, { Imm8, RegXMM, Reg32|Unspecified|BaseIndex }
+extractps, 0x660f3a17, SSE4_1&x64, RegMem|NoSuf|Optimize|NoRex64, { Imm8, RegXMM, Reg64 }
 insertps<SSE41D>, 0x660f3a21, <SSE41D:cpu>, Modrm|<SSE41D:attr>|<SSE41D:vvvv>|Disp8MemShift|NoSuf, { Imm8, Dword|Unspecified|BaseIndex|RegXMM, RegXMM }
 movntdqa<SSE41D>, 0x660f382a, <SSE41D:cpu>, Modrm|<SSE41D:attr>|NoSuf, { Xmmword|Unspecified|BaseIndex, RegXMM }
 mpsadbw<sse41>, 0x660f3a42, <sse41:cpu>, Modrm|<sse41:attr>|<sse41:vvvv>|NoSuf, { Imm8|Imm8S, RegXMM|Unspecified|BaseIndex, RegXMM }
@@ -1669,8 +1669,8 @@  vdpps, 0x6640, AVX, Modrm|Vex|Space0F3A|
 vextractf128, 0x6619, AVX, Modrm|Vex256|Space0F3A|VexW0|NoSuf, { Imm8, RegYMM, Unspecified|BaseIndex|RegXMM }
 // vextractf32x4 in disguise (see vround{p,s}{s,d} comment)
 vextractf128, 0x6619, APX_F&AVX512VL, Modrm|EVex256|Space0F3A|VexW0|Disp8MemShift=4|NoSuf, { Imm8, RegYMM, Xmmword|Unspecified|BaseIndex }
-vextractps, 0x6617, AVX|AVX512F, Modrm|Vex128|EVex128|Space0F3A|VexWIG|Disp8MemShift=2|NoSuf, { Imm8, RegXMM, Reg32|Unspecified|BaseIndex }
-vextractps, 0x6617, x64&(AVX|AVX512F), RegMem|Vex128|EVex128|Space0F3A|VexWIG|NoSuf, { Imm8, RegXMM, Reg64 }
+vextractps, 0x6617, AVX|AVX512F, Modrm|Vex128|EVex128|Space0F3A|VexWIG|Disp8MemShift=2|NoSuf|Optimize, { Imm8, RegXMM, Reg32|Unspecified|BaseIndex }
+vextractps, 0x6617, x64&(AVX|AVX512F), RegMem|Vex128|EVex128|Space0F3A|VexWIG|NoSuf|Optimize, { Imm8, RegXMM, Reg64 }
 vhaddpd, 0x667c, AVX, Modrm|Vex|Space0F|Src1VVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vhaddps, 0xf27c, AVX, Modrm|Vex|Space0F|Src1VVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vhsubpd, 0x667d, AVX, Modrm|Vex|Space0F|Src1VVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }