[4/5] x86: optimize {,V}EXTRACT{F,I}{128,32x{4,8},64x{2,4}} with immediate 0

Message ID c8cddbd7-d8d1-4427-8c82-4b0f06fa659a@suse.com
State New
Series x86: (mainly) insert/extract optimizations

Checks

Context Check Description
linaro-tcwg-bot/tcwg_binutils_build--master-arm fail Patch failed to apply
linaro-tcwg-bot/tcwg_binutils_build--master-aarch64 fail Patch failed to apply

Commit Message

Jan Beulich Sept. 6, 2024, 11:53 a.m. UTC
They, too, are equivalent to simple moves, which are up to 3 bytes
shorter to encode (and maybe also cheaper to execute).
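
As a concrete illustration (64-bit mode, no masking; the vmovaps bytes
match the test expectations added below, while the vextractf32x4 bytes
are what would be emitted without the optimization):

	vextractf32x4 $0, %ymm1, %xmm2	# 62 f3 7d 28 19 ca 00 (EVEX, 7 bytes)
	vmovaps	%xmm1, %xmm2		# c5 f8 29 ca          (VEX2, 4 bytes)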
---
I don't really like using goto-s, yet at the latest here the question
arises whether the "shifting" of operands (the 3rd instance of which is
being added here) wouldn't better be folded with the one in the
{,V}PEXTR{D,Q} handling.
  

Patch

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -5545,6 +5545,80 @@  optimize_encoding (void)
       i.operands = 2;
       i.imm_operands = 0;
     }
+  else if ((i.tm.base_opcode | 0x22) == 0x3b
+	   && i.tm.opcode_space == SPACE_0F3A
+	   && i.op[0].imms->X_op == O_constant
+	   && i.op[0].imms->X_add_number == 0)
+    {
+      /* Optimize: -O:
+         vextractf128 $0, %ymmN, %xmmM      -> vmovaps %xmmN, %xmmM
+         vextractf128 $0, %ymmN, mem        -> vmovups %xmmN, mem
+         vextractf32x4 $0, %[yz]mmN, %xmmM  -> vmovaps %xmmN, %xmmM
+         vextractf32x4 $0, %[yz]mmN, mem    -> vmovups %xmmN, mem
+         vextractf64x2 $0, %[yz]mmN, %xmmM  -> vmovapd %xmmN, %xmmM
+         vextractf64x2 $0, %[yz]mmN, mem    -> vmovupd %xmmN, mem
+         vextractf32x8 $0, %zmmN, %ymmM     -> vmovaps %ymmN, %ymmM
+         vextractf32x8 $0, %zmmN, mem       -> vmovups %ymmN, mem
+         vextractf64x4 $0, %zmmN, %ymmM     -> vmovapd %ymmN, %ymmM
+         vextractf64x4 $0, %zmmN, mem       -> vmovupd %ymmN, mem
+         vextracti128 $0, %ymmN, %xmmM      -> vmovdqa %xmmN, %xmmM
+         vextracti128 $0, %ymmN, mem        -> vmovdqu %xmmN, mem
+         vextracti32x4 $0, %[yz]mmN, %xmmM  -> vmovdqa{,32} %xmmN, %xmmM
+         vextracti32x4 $0, %[yz]mmN, mem    -> vmovdqu{,32} %xmmN, mem
+         vextracti64x2 $0, %[yz]mmN, %xmmM  -> vmovdqa{,64} %xmmN, %xmmM
+         vextracti64x2 $0, %[yz]mmN, mem    -> vmovdqu{,64} %xmmN, mem
+         vextracti32x8 $0, %zmmN, %ymmM     -> vmovdqa{,32} %ymmN, %ymmM
+         vextracti32x8 $0, %zmmN, mem       -> vmovdqu{,32} %ymmN, mem
+         vextracti64x4 $0, %zmmN, %ymmM     -> vmovdqa{,64} %ymmN, %ymmM
+         vextracti64x4 $0, %zmmN, mem       -> vmovdqu{,64} %ymmN, mem
+       */
+      i.tm.opcode_space = SPACE_0F;
+
+      if (!i.mask.reg
+	  && (pp.encoding <= encoding_vex3
+	      || (pp.encoding == encoding_evex512
+		  && (!i.base_reg || !(i.base_reg->reg_flags & RegRex2))
+		  && (!i.index_reg || !(i.index_reg->reg_flags & RegRex2)))))
+	{
+	  i.tm.opcode_modifier.vex = i.tm.base_opcode & 2 ? VEX256 : VEX128;
+	  i.tm.opcode_modifier.evex = 0;
+	}
+      else
+	i.tm.opcode_modifier.evex = i.tm.base_opcode & 2 ? EVEX256 : EVEX128;
+
+      if (i.tm.base_opcode & 0x20)
+	{
+	  i.tm.base_opcode = 0x7f;
+	  if (i.reg_operands != 2)
+	    i.tm.opcode_modifier.opcodeprefix = PREFIX_0XF3;
+	}
+      else
+	{
+	  if (i.reg_operands == 2)
+	    i.tm.base_opcode = 0x29;
+	  else
+	    i.tm.base_opcode = 0x11;
+	  if (i.tm.opcode_modifier.vexw != VEXW1)
+	    i.tm.opcode_modifier.opcodeprefix = PREFIX_NONE;
+	}
+
+      if (i.tm.opcode_modifier.vex)
+	i.tm.opcode_modifier.vexw = VEXWIG;
+
+      i.op[0].regs = i.op[1].regs;
+      i.types[0] = i.types[1];
+      i.flags[0] = i.flags[1];
+      i.tm.operand_types[0] = i.tm.operand_types[1];
+
+      i.op[1].regs = i.op[2].regs;
+      i.types[1] = i.types[2];
+      i.flags[1] = i.flags[2];
+      i.reloc[1] = i.reloc[2];
+      i.tm.operand_types[1] = i.tm.operand_types[2];
+
+      i.operands = 2;
+      i.imm_operands = 0;
+    }
 }
 
 /* Check whether the promoted (to address size) register is usable as index
--- a/gas/testsuite/gas/i386/optimize-1.d
+++ b/gas/testsuite/gas/i386/optimize-1.d
@@ -170,6 +170,26 @@  Disassembly of section .text:
  +[a-f0-9]+:	f3 .*	movss  %xmm1,\(%edx\)
  +[a-f0-9]+:	c5 .*	vmovd  %xmm1,%edx
  +[a-f0-9]+:	c5 .*	vmovss %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovaps %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovups %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovaps %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovups %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovapd %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovupd %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovaps %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovups %ymm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %ymm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovapd %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovupd %ymm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %ymm1,\(%edx\)
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/optimize-1.s
+++ b/gas/testsuite/gas/i386/optimize-1.s
@@ -199,6 +199,31 @@  _start:
 	vextractps	$0, %xmm1, %edx
 	vextractps	$0, %xmm1, (%edx)
 
+	vextractf128 $0, %ymm1, %xmm2
+	vextractf128 $0, %ymm1, (%edx)
+	vextracti128 $0, %ymm1, %xmm2
+	vextracti128 $0, %ymm1, (%edx)
+
+	vextractf32x4 $0, %ymm1, %xmm2
+	vextractf32x4 $0, %ymm1, (%edx)
+	vextracti32x4 $0, %ymm1, %xmm2
+	vextracti32x4 $0, %ymm1, (%edx)
+
+	vextractf64x2 $0, %ymm1, %xmm2
+	vextractf64x2 $0, %ymm1, (%edx)
+	vextracti64x2 $0, %ymm1, %xmm2
+	vextracti64x2 $0, %ymm1, (%edx)
+
+	vextractf32x8 $0, %zmm1, %ymm2
+	vextractf32x8 $0, %zmm1, (%edx)
+	vextracti32x8 $0, %zmm1, %ymm2
+	vextracti32x8 $0, %zmm1, (%edx)
+
+	vextractf64x4 $0, %zmm1, %ymm2
+	vextractf64x4 $0, %zmm1, (%edx)
+	vextracti64x4 $0, %zmm1, %ymm2
+	vextracti64x4 $0, %zmm1, (%edx)
+
 	bt	$15, %ax
 	bt	$16, %ax
 	btc	$15, %ax
--- a/gas/testsuite/gas/i386/optimize-1a.d
+++ b/gas/testsuite/gas/i386/optimize-1a.d
@@ -171,6 +171,26 @@  Disassembly of section .text:
  +[a-f0-9]+:	f3 .*	movss  %xmm1,\(%edx\)
  +[a-f0-9]+:	c5 .*	vmovd  %xmm1,%edx
  +[a-f0-9]+:	c5 .*	vmovss %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovaps %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovups %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovaps %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovups %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovapd %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovupd %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovaps %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovups %ymm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %ymm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovapd %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovupd %ymm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %ymm1,\(%edx\)
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/optimize-4.d
+++ b/gas/testsuite/gas/i386/optimize-4.d
@@ -170,6 +170,26 @@  Disassembly of section .text:
  +[a-f0-9]+:	f3 .*	movss  %xmm1,\(%edx\)
  +[a-f0-9]+:	c5 .*	vmovd  %xmm1,%edx
  +[a-f0-9]+:	c5 .*	vmovss %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovaps %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovups %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovaps %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovups %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovapd %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovupd %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovaps %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovups %ymm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %ymm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovapd %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovupd %ymm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %ymm1,\(%edx\)
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/optimize-5.d
+++ b/gas/testsuite/gas/i386/optimize-5.d
@@ -170,6 +170,26 @@  Disassembly of section .text:
  +[a-f0-9]+:	f3 .*	movss  %xmm1,\(%edx\)
  +[a-f0-9]+:	c5 .*	vmovd  %xmm1,%edx
  +[a-f0-9]+:	c5 .*	vmovss %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovaps %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovups %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovaps %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovups %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovapd %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovupd %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %xmm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovaps %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovups %ymm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %ymm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovapd %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovupd %ymm1,\(%edx\)
+ +[a-f0-9]+:	c5 .*	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 .*	vmovdqu %ymm1,\(%edx\)
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/x86-64.exp
+++ b/gas/testsuite/gas/i386/x86-64.exp
@@ -596,6 +596,7 @@  run_list_test "x86-64-optimize-pextr" "-
 run_dump_test "x86-64-optimize-pextr"
 run_list_test "x86-64-optimize-extractps" "-O -aln"
 run_dump_test "x86-64-optimize-extractps"
+run_dump_test "x86-64-optimize-vextractNN"
 run_dump_test "x86-64-apx-ndd-optimize"
 run_dump_test "x86-64-align-branch-1a"
 run_dump_test "x86-64-align-branch-1b"
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-optimize-vextractNN.d
@@ -0,0 +1,61 @@ 
+#as: -O
+#objdump: -drw
+#name: x86-64 VEXTRACT{F,I}<nn> optimized encoding
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+0+ <vextract_128>:
+ +[a-f0-9]+:	c5 f8 29 ca          	vmovaps %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f8 11 0a          	vmovups %xmm1,\(%rdx\)
+ +[a-f0-9]+:	62 f9 7c 08 11 0a    	vmovups %xmm1,\(%r18\)
+ +[a-f0-9]+:	c5 f9 7f ca          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 7f 0a          	vmovdqu %xmm1,\(%rdx\)
+ +[a-f0-9]+:	62 f9 7e 08 7f 0a    	vmovdqu32 %xmm1,\(%r18\)
+
+0+[a-f0-9]+ <vextract_NNxM_XMM>:
+ +[a-f0-9]+:	c5 f8 29 ca          	vmovaps %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f8 11 0a          	vmovups %xmm1,\(%rdx\)
+ +[a-f0-9]+:	62 e1 7c 08 29 ca    	vmovaps %xmm17,%xmm2
+ +[a-f0-9]+:	62 e1 7c 08 11 0a    	vmovups %xmm17,\(%rdx\)
+ +[a-f0-9]+:	62 f9 7c 08 11 0a    	vmovups %xmm1,\(%r18\)
+ +[a-f0-9]+:	c5 f9 29 ca          	vmovapd %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 11 0a          	vmovupd %xmm1,\(%rdx\)
+ +[a-f0-9]+:	62 e1 fd 08 29 ca    	vmovapd %xmm17,%xmm2
+ +[a-f0-9]+:	62 e1 fd 08 11 0a    	vmovupd %xmm17,\(%rdx\)
+ +[a-f0-9]+:	62 f9 fd 08 11 0a    	vmovupd %xmm1,\(%r18\)
+ +[a-f0-9]+:	c5 f9 7f ca          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 7f 0a          	vmovdqu %xmm1,\(%rdx\)
+ +[a-f0-9]+:	62 e1 7d 08 7f ca    	vmovdqa32 %xmm17,%xmm2
+ +[a-f0-9]+:	62 e1 7e 08 7f 0a    	vmovdqu32 %xmm17,\(%rdx\)
+ +[a-f0-9]+:	62 f9 7e 08 7f 0a    	vmovdqu32 %xmm1,\(%r18\)
+ +[a-f0-9]+:	c5 f9 7f ca          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 7f 0a          	vmovdqu %xmm1,\(%rdx\)
+ +[a-f0-9]+:	62 e1 fd 08 7f ca    	vmovdqa64 %xmm17,%xmm2
+ +[a-f0-9]+:	62 e1 fe 08 7f 0a    	vmovdqu64 %xmm17,\(%rdx\)
+ +[a-f0-9]+:	62 f9 fe 08 7f 0a    	vmovdqu64 %xmm1,\(%r18\)
+
+0+[a-f0-9]+ <vextract_NNxM_YMM>:
+ +[a-f0-9]+:	c5 fc 29 ca          	vmovaps %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fc 11 0a          	vmovups %ymm1,\(%rdx\)
+ +[a-f0-9]+:	62 e1 7c 28 29 ca    	vmovaps %ymm17,%ymm2
+ +[a-f0-9]+:	62 e1 7c 28 11 0a    	vmovups %ymm17,\(%rdx\)
+ +[a-f0-9]+:	62 f9 7c 28 11 0a    	vmovups %ymm1,\(%r18\)
+ +[a-f0-9]+:	c5 fd 29 ca          	vmovapd %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 11 0a          	vmovupd %ymm1,\(%rdx\)
+ +[a-f0-9]+:	62 e1 fd 28 29 ca    	vmovapd %ymm17,%ymm2
+ +[a-f0-9]+:	62 e1 fd 28 11 0a    	vmovupd %ymm17,\(%rdx\)
+ +[a-f0-9]+:	62 f9 fd 28 11 0a    	vmovupd %ymm1,\(%r18\)
+ +[a-f0-9]+:	c5 fd 7f ca          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 7f 0a          	vmovdqu %ymm1,\(%rdx\)
+ +[a-f0-9]+:	62 e1 7d 28 7f ca    	vmovdqa32 %ymm17,%ymm2
+ +[a-f0-9]+:	62 e1 7e 28 7f 0a    	vmovdqu32 %ymm17,\(%rdx\)
+ +[a-f0-9]+:	62 f9 7e 28 7f 0a    	vmovdqu32 %ymm1,\(%r18\)
+ +[a-f0-9]+:	c5 fd 7f ca          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 7f 0a          	vmovdqu %ymm1,\(%rdx\)
+ +[a-f0-9]+:	62 e1 fd 28 7f ca    	vmovdqa64 %ymm17,%ymm2
+ +[a-f0-9]+:	62 e1 fe 28 7f 0a    	vmovdqu64 %ymm17,\(%rdx\)
+ +[a-f0-9]+:	62 f9 fe 28 7f 0a    	vmovdqu64 %ymm1,\(%r18\)
+#pass
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-optimize-vextractNN.s
@@ -0,0 +1,59 @@ 
+	.text
+vextract_128:
+	vextractf128 $0, %ymm1, %xmm2
+	vextractf128 $0, %ymm1, (%rdx)
+	vextractf128 $0, %ymm1, (%r18)
+
+	vextracti128 $0, %ymm1, %xmm2
+	vextracti128 $0, %ymm1, (%rdx)
+	vextracti128 $0, %ymm1, (%r18)
+
+vextract_NNxM_XMM:
+	vextractf32x4 $0, %ymm1, %xmm2
+	vextractf32x4 $0, %ymm1, (%rdx)
+	vextractf32x4 $0, %ymm17, %xmm2
+	vextractf32x4 $0, %ymm17, (%rdx)
+	vextractf32x4 $0, %ymm1, (%r18)
+
+	vextractf64x2 $0, %ymm1, %xmm2
+	vextractf64x2 $0, %ymm1, (%rdx)
+	vextractf64x2 $0, %ymm17, %xmm2
+	vextractf64x2 $0, %ymm17, (%rdx)
+	vextractf64x2 $0, %ymm1, (%r18)
+
+	vextracti32x4 $0, %ymm1, %xmm2
+	vextracti32x4 $0, %ymm1, (%rdx)
+	vextracti32x4 $0, %ymm17, %xmm2
+	vextracti32x4 $0, %ymm17, (%rdx)
+	vextracti32x4 $0, %ymm1, (%r18)
+
+	vextracti64x2 $0, %ymm1, %xmm2
+	vextracti64x2 $0, %ymm1, (%rdx)
+	vextracti64x2 $0, %ymm17, %xmm2
+	vextracti64x2 $0, %ymm17, (%rdx)
+	vextracti64x2 $0, %ymm1, (%r18)
+
+vextract_NNxM_YMM:
+	vextractf32x8 $0, %zmm1, %ymm2
+	vextractf32x8 $0, %zmm1, (%rdx)
+	vextractf32x8 $0, %zmm17, %ymm2
+	vextractf32x8 $0, %zmm17, (%rdx)
+	vextractf32x8 $0, %zmm1, (%r18)
+
+	vextractf64x4 $0, %zmm1, %ymm2
+	vextractf64x4 $0, %zmm1, (%rdx)
+	vextractf64x4 $0, %zmm17, %ymm2
+	vextractf64x4 $0, %zmm17, (%rdx)
+	vextractf64x4 $0, %zmm1, (%r18)
+
+	vextracti32x8 $0, %zmm1, %ymm2
+	vextracti32x8 $0, %zmm1, (%rdx)
+	vextracti32x8 $0, %zmm17, %ymm2
+	vextracti32x8 $0, %zmm17, (%rdx)
+	vextracti32x8 $0, %zmm1, (%r18)
+
+	vextracti64x4 $0, %zmm1, %ymm2
+	vextracti64x4 $0, %zmm1, (%rdx)
+	vextracti64x4 $0, %zmm17, %ymm2
+	vextracti64x4 $0, %zmm17, (%rdx)
+	vextracti64x4 $0, %zmm1, (%r18)
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -1666,9 +1666,9 @@  vcvttps2dq, 0xf35b, AVX, Modrm|Vex|Space
 vcvtts<sd>2si, 0x<sd:spfx>2c, AVX, Modrm|VexLIG|Space0F|No_bSuf|No_wSuf|No_sSuf, { <sd:elem>|Unspecified|BaseIndex|RegXMM, Reg32|Reg64 }
 vdppd, 0x6641, AVX, Modrm|Vex|Space0F3A|Src1VVVV|VexWIG|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
 vdpps, 0x6640, AVX, Modrm|Vex|Space0F3A|Src1VVVV|VexWIG|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vextractf128, 0x6619, AVX, Modrm|Vex256|Space0F3A|VexW0|NoSuf, { Imm8, RegYMM, Unspecified|BaseIndex|RegXMM }
+vextractf128, 0x6619, AVX, Modrm|Vex256|Space0F3A|VexW0|NoSuf|Optimize, { Imm8, RegYMM, Unspecified|BaseIndex|RegXMM }
 // vextractf32x4 in disguise (see vround{p,s}{s,d} comment)
-vextractf128, 0x6619, APX_F&AVX512VL, Modrm|EVex256|Space0F3A|VexW0|Disp8MemShift=4|NoSuf, { Imm8, RegYMM, Xmmword|Unspecified|BaseIndex }
+vextractf128, 0x6619, APX_F&AVX512VL, Modrm|EVex256|Space0F3A|VexW0|Disp8MemShift=4|NoSuf|Optimize, { Imm8, RegYMM, Xmmword|Unspecified|BaseIndex }
 vextractps, 0x6617, AVX|AVX512F, Modrm|Vex128|EVex128|Space0F3A|VexWIG|Disp8MemShift=2|NoSuf|Optimize, { Imm8, RegXMM, Reg32|Unspecified|BaseIndex }
 vextractps, 0x6617, x64&(AVX|AVX512F), RegMem|Vex128|EVex128|Space0F3A|VexWIG|NoSuf|Optimize, { Imm8, RegXMM, Reg64 }
 vhaddpd, 0x667c, AVX, Modrm|Vex|Space0F|Src1VVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
@@ -1879,9 +1879,9 @@  vpermd, 0x6636, AVX2|AVX512F, Modrm|Vex2
 vpermpd, 0x6601, AVX2|AVX512F, Modrm|Vex256|EVexDYN|Masking|Space0F3A|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegYMM|RegZMM }
 vpermps, 0x6616, AVX2|AVX512F, Modrm|Vex256|EVexDYN|Masking|Space0F38|Src1VVVV|VexW0|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegYMM|RegZMM|Dword|Unspecified|BaseIndex, RegYMM|RegZMM, RegYMM|RegZMM }
 vpermq, 0x6600, AVX2|AVX512F, Modrm|Vex256|EVexDYN|Masking|Space0F3A|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegYMM|RegZMM }
-vextracti128, 0x6639, AVX2, Modrm|Vex256|Space0F3A|VexW0|NoSuf, { Imm8, RegYMM, Unspecified|BaseIndex|RegXMM }
+vextracti128, 0x6639, AVX2, Modrm|Vex256|Space0F3A|VexW0|NoSuf|Optimize, { Imm8, RegYMM, Unspecified|BaseIndex|RegXMM }
 // vextracti32x4 in disguise (see vround{p,s}{s,d} comment)
-vextracti128, 0x6639, APX_F&AVX512VL, Modrm|EVex256|Space0F3A|VexW0|Disp8MemShift=4|NoSuf, { Imm8, RegYMM, Xmmword|Unspecified|BaseIndex }
+vextracti128, 0x6639, APX_F&AVX512VL, Modrm|EVex256|Space0F3A|VexW0|Disp8MemShift=4|NoSuf|Optimize, { Imm8, RegYMM, Xmmword|Unspecified|BaseIndex }
 vinserti128, 0x6638, AVX2, Modrm|Vex256|Space0F3A|Src1VVVV|VexW0|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegYMM, RegYMM }
 // vinserti32x4 in disguise (see vround{p,s}{s,d} comment)
 vinserti128, 0x6638, APX_F&AVX512VL, Modrm|EVex256|Space0F3A|Src1VVVV|VexW0|Disp8MemShift=4|NoSuf, { Imm8, Xmmword|Unspecified|BaseIndex, RegYMM, RegYMM }
@@ -2375,11 +2375,11 @@  vpexpandq, 0x6689, AVX512F, Modrm|Maskin
 vexpandps, 0x6688, AVX512F, Modrm|Masking|Space0F38|VexW=1|Disp8MemShift=2|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 vpexpandd, 0x6689, AVX512F, Modrm|Masking|Space0F38|VexW=1|Disp8MemShift=2|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 
-vextractf32x4, 0x6619, AVX512F, Modrm|Masking|Space0F3A|VexW=1|Disp8MemShift=4|NoSuf, { Imm8, RegYMM|RegZMM, RegXMM|Unspecified|BaseIndex }
-vextracti32x4, 0x6639, AVX512F, Modrm|Masking|Space0F3A|VexW=1|Disp8MemShift=4|NoSuf, { Imm8, RegYMM|RegZMM, RegXMM|Unspecified|BaseIndex }
+vextractf32x4, 0x6619, AVX512F, Modrm|Masking|Space0F3A|VexW=1|Disp8MemShift=4|NoSuf|Optimize, { Imm8, RegYMM|RegZMM, RegXMM|Unspecified|BaseIndex }
+vextracti32x4, 0x6639, AVX512F, Modrm|Masking|Space0F3A|VexW=1|Disp8MemShift=4|NoSuf|Optimize, { Imm8, RegYMM|RegZMM, RegXMM|Unspecified|BaseIndex }
 
-vextractf64x4, 0x661B, AVX512F, Modrm|EVex=1|Masking|Space0F3A|VexW=2|Disp8MemShift=5|NoSuf, { Imm8, RegZMM, RegYMM|Unspecified|BaseIndex }
-vextracti64x4, 0x663B, AVX512F, Modrm|EVex=1|Masking|Space0F3A|VexW=2|Disp8MemShift=5|NoSuf, { Imm8, RegZMM, RegYMM|Unspecified|BaseIndex }
+vextractf64x4, 0x661B, AVX512F, Modrm|EVex=1|Masking|Space0F3A|VexW=2|Disp8MemShift=5|NoSuf|Optimize, { Imm8, RegZMM, RegYMM|Unspecified|BaseIndex }
+vextracti64x4, 0x663B, AVX512F, Modrm|EVex=1|Masking|Space0F3A|VexW=2|Disp8MemShift=5|NoSuf|Optimize, { Imm8, RegZMM, RegYMM|Unspecified|BaseIndex }
 
 vfixupimmp<sd>, 0x6654, AVX512F, Modrm|Masking|Space0F3A|Src1VVVV|<sd:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|SAE, { Imm8|Imm8S, RegXMM|RegYMM|RegZMM|<sd:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 vfixupimms<sd>, 0x6655, AVX512F, Modrm|EVexLIG|Masking|Space0F3A|Src1VVVV|<sd:vexw>|Disp8MemShift|NoSuf|SAE, { Imm8|Imm8S, RegXMM|<sd:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
@@ -2833,16 +2833,16 @@  vcvttps2uqq, 0x6678, AVX512DQ&AVX512VL,
 
 vcvtuqq2ps<Exy>, 0xf27a, AVX512DQ&<Exy:vl>, Modrm|<Exy:attr>|Masking|Space0F|VexW1|Broadcast|NoSuf|<Exy:sr>, { <Exy:src>|Qword, <Exy:dst> }
 
-vextractf32x8, 0x661B, AVX512DQ, Modrm|EVex=1|Masking|Space0F3A|VexW=1|Disp8MemShift=5|NoSuf, { Imm8, RegZMM, RegYMM|Unspecified|BaseIndex }
-vextracti32x8, 0x663B, AVX512DQ, Modrm|EVex=1|Masking|Space0F3A|VexW=1|Disp8MemShift=5|NoSuf, { Imm8, RegZMM, RegYMM|Unspecified|BaseIndex }
+vextractf32x8, 0x661B, AVX512DQ, Modrm|EVex=1|Masking|Space0F3A|VexW=1|Disp8MemShift=5|NoSuf|Optimize, { Imm8, RegZMM, RegYMM|Unspecified|BaseIndex }
+vextracti32x8, 0x663B, AVX512DQ, Modrm|EVex=1|Masking|Space0F3A|VexW=1|Disp8MemShift=5|NoSuf|Optimize, { Imm8, RegZMM, RegYMM|Unspecified|BaseIndex }
 vinsertf32x8, 0x661A, AVX512DQ, Modrm|EVex512|Masking|Space0F3A|Src1VVVV|VexW0|Disp8MemShift=5|NoSuf, { Imm8, RegYMM|Unspecified|BaseIndex, RegZMM, RegZMM }
 vinserti32x8, 0x663A, AVX512DQ, Modrm|EVex512|Masking|Space0F3A|Src1VVVV|VexW0|Disp8MemShift=5|NoSuf, { Imm8, RegYMM|Unspecified|BaseIndex, RegZMM, RegZMM }
 
 vpextr<dq>, 0x6616, AVX512DQ&<dq:cpu64>, Modrm|EVex128|Space0F3A|<dq:vexw64>|Disp8MemShift|NoSuf|Optimize, { Imm8, RegXMM, <dq:gpr>|Unspecified|BaseIndex }
 vpinsr<dq>, 0x6622, AVX512DQ&<dq:cpu64>, Modrm|EVex128|Space0F3A|Src1VVVV|<dq:vexw64>|Disp8MemShift|NoSuf, { Imm8, <dq:gpr>|Unspecified|BaseIndex, RegXMM, RegXMM }
 
-vextractf64x2, 0x6619, AVX512DQ, Modrm|Masking|Space0F3A|VexW=2|Disp8MemShift=4|NoSuf, { Imm8, RegYMM|RegZMM, RegXMM|Unspecified|BaseIndex }
-vextracti64x2, 0x6639, AVX512DQ, Modrm|Masking|Space0F3A|VexW=2|Disp8MemShift=4|NoSuf, { Imm8, RegYMM|RegZMM, RegXMM|Unspecified|BaseIndex }
+vextractf64x2, 0x6619, AVX512DQ, Modrm|Masking|Space0F3A|VexW=2|Disp8MemShift=4|NoSuf|Optimize, { Imm8, RegYMM|RegZMM, RegXMM|Unspecified|BaseIndex }
+vextracti64x2, 0x6639, AVX512DQ, Modrm|Masking|Space0F3A|VexW=2|Disp8MemShift=4|NoSuf|Optimize, { Imm8, RegYMM|RegZMM, RegXMM|Unspecified|BaseIndex }
 vinsertf64x2, 0x6618, AVX512DQ, Modrm|Masking|Space0F3A|Src1VVVV|VexW1|Disp8MemShift=4|CheckOperandSize|NoSuf, { Imm8, RegXMM|Unspecified|BaseIndex, RegYMM|RegZMM, RegYMM|RegZMM }
 vinserti64x2, 0x6638, AVX512DQ, Modrm|Masking|Space0F3A|Src1VVVV|VexW1|Disp8MemShift=4|CheckOperandSize|NoSuf, { Imm8, RegXMM|Unspecified|BaseIndex, RegYMM|RegZMM, RegYMM|RegZMM }