[v2,6/8] x86/APX: optimize {nf}-form IMUL-by-power-of-2 to SHL

Message ID 7c01753d-2f2a-4642-bcbe-1cbcbe00cbc1@suse.com
State New
Headers
Series x86: a few more optimizations |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_binutils_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_binutils_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_binutils_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_binutils_check--master-arm success Test passed

Commit Message

Jan Beulich June 21, 2024, 12:52 p.m. UTC
  ..., for differing only in the resulting EFLAGS, which are left
untouched anyway. That's a shorter encoding, available as long as
certain constraints on operands are met; see code comments. (SHL-by-1
forms may then be subject to further optimization that was introduced
earlier.)

Note that, as something of a side effect, this also converts multiplication
by 1 to shift by 0, which is a plain move or even a no-op anyway. That could
be further shrunk (as could the presence of shifts/rotates by 0 in the
original code, as well as a fair set of other {nf}-form insns), yet the
expectation (for now) is that people won't write such code in the first
place.
---
RFC: Comparing i.op[2].regs against i.op[1].regs without first checking
     that operand 1 isn't a memory operand is at least UB-ish, since
     memory operands set i.op[].disps instead (if anything). Do we
     deem this tolerable?
---
v2: New.
  

Patch

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -5458,6 +5458,75 @@  optimize_nf_encoding (void)
       i.tm.operand_types[0].bitfield.imm1 = 1;
       i.imm_operands = 0;
     }
+  else if ((i.tm.base_opcode | 2) == 0x6b
+	   && i.op[0].imms->X_op == O_constant
+	   && (i.op[0].imms->X_add_number > 0
+	       ? !(i.op[0].imms->X_add_number & (i.op[0].imms->X_add_number - 1))
+	       /* optimize_imm() converts to sign-extended representation where
+		  possible (and input can also come with these specific numbers).  */
+	       : (i.types[i.operands - 1].bitfield.word
+		  && i.op[0].imms->X_add_number == -0x8000)
+		 || (i.types[i.operands - 1].bitfield.dword
+		     && i.op[0].imms->X_add_number + 1 == -0x7fffffff))
+	   /* 16-bit 3-operand non-ZU forms need leaving alone, to prevent
+	      zero-extension of the result.  Unless, of course, both non-
+	      immediate operands match (which can be converted to the non-NDD
+	      form).  */
+	   && (i.operands < 3
+	       || !i.types[2].bitfield.word
+	       || i.tm.mnem_off == MN_imulzu
+	       || i.op[2].regs == i.op[1].regs)
+	   /* When merely optimizing for size, exclude cases where we'd convert
+	      from Imm8S to Imm8 encoding, thus not actually reducing size.  */
+	   && (!optimize_for_space
+	       || i.tm.base_opcode == 0x69
+	       || !(i.op[0].imms->X_add_number & 0x7d)))
+    {
+      /* Optimize: -O:
+	   {nf} imul   $1<<N, ...   -> {nf} shl $N, ...
+	   {nf} imulzu $1<<N, ...   -> {nf} shl $N, ...
+       */
+      if (i.op[0].imms->X_add_number != 2)
+	{
+	  i.tm.base_opcode = 0xc0;
+	  i.op[0].imms->X_add_number = ffs (i.op[0].imms->X_add_number) - 1;
+	  i.tm.operand_types[0].bitfield.imm8 = 1;
+	  i.tm.operand_types[0].bitfield.imm16 = 0;
+	  i.tm.operand_types[0].bitfield.imm32 = 0;
+	  i.tm.operand_types[0].bitfield.imm32s = 0;
+	}
+      else
+	{
+	  i.tm.base_opcode = 0xd0;
+	  i.tm.operand_types[0].bitfield.imm1 = 1;
+	}
+      i.types[0] = i.tm.operand_types[0];
+      i.tm.extension_opcode = 4;
+      i.tm.opcode_modifier.w = 1;
+      i.tm.opcode_modifier.operandconstraint = 0;
+      if (i.operands == 3)
+	{
+	  if (i.op[2].regs == i.op[1].regs && i.tm.mnem_off != MN_imulzu)
+	    {
+	      /* Convert to non-NDD form.  This is required for 16-bit insns
+	         (to prevent zero-extension) and benign for others.  */
+	      i.operands = 2;
+	      i.reg_operands = 1;
+	    }
+	  else
+	    i.tm.opcode_modifier.vexvvvv = VexVVVV_DST;
+	}
+      else if (i.tm.mnem_off == MN_imulzu)
+	{
+	  /* Convert to NDD form, to effect zero-extension of the result.  */
+	  i.tm.opcode_modifier.vexvvvv = VexVVVV_DST;
+	  i.operands = 3;
+	  i.reg_operands = 2;
+	  i.op[2].regs = i.op[1].regs;
+ 	  i.tm.operand_types[2] = i.tm.operand_types[1];
+ 	  i.types[2] = i.types[1];
+	}
+    }
 
   if (optimize_for_space
       && i.encoding != encoding_evex
@@ -5604,6 +5673,7 @@  optimize_nf_encoding (void)
   else if (i.tm.base_opcode == 0x6b
 	   && !i.mem_operands
 	   && i.encoding != encoding_evex
+	   && i.tm.mnem_off != MN_imulzu
 	   && is_plausible_suffix (1)
 	   /* %rsp can't be the index.  */
 	   && is_index (i.op[1].regs)
--- a/gas/testsuite/gas/i386/x86-64-apx-nf.s
+++ b/gas/testsuite/gas/i386/x86-64-apx-nf.s
@@ -1472,4 +1472,40 @@  optimize:
 	{nf} imul $5, %r21w, %dx
 	{nf} imul $9, %r21w
 	.endif
+
+	# Note: 2-6 want leaving alone with -Os.
+	.irp n, 1, 2, 6, 7
+	# Note: 16-bit 3-operand src!=dst non-ZU form needs leaving alone.
+	{nf} imul $1<<\n, %\r\()dx, %\r\()cx
+	{nf} imul $1<<\n, (%rdx), %\r\()cx
+	{nf} imul $1<<\n, %\r\()cx, %\r\()cx
+	{nf} imul $1<<\n, %\r\()cx
+
+	.ifeqs "\r",""
+	{nf} imulzu $1<<\n, %dx, %cx
+	{nf} imulzu $1<<\n, (%rdx), %cx
+	{nf} imulzu $1<<\n, %cx, %cx
+	{nf} imulzu $1<<\n, %cx
+	.endif
+	.endr
+
+	.ifeqs "\r",""
+	# Note: 3-operand src!=dst non-ZU form needs leaving alone.
+	{nf} imul $1<<15, %dx, %cx
+	{nf} imul $-1<<15, (%rdx), %cx
+	{nf} imul $1<<15, %cx, %cx
+	{nf} imul $-1<<15, %cx
+	{nf} imulzu $1<<15, %cx
+	.endif
+
+	.ifeqs "\r","e"
+	{nf} imul $1<<31, %edx, %ecx
+	{nf} imul $-1<<31, (%rdx), %ecx
+	.endif
+
+	.ifeqs "\r","r"
+	{nf} imul $1<<30, %rdx, %rcx
+	# Needs leaving alone.
+	{nf} imul $-1<<31, %rdx, %rcx
+	.endif
 	.endr
--- a/gas/testsuite/gas/i386/x86-64-apx-nf-optimize.d
+++ b/gas/testsuite/gas/i386/x86-64-apx-nf-optimize.d
@@ -1522,14 +1522,87 @@  Disassembly of section \.text:
 [ 	]*[a-f0-9]+:[ 	]*66 d5 40 8d 44 6d 00[ 	]+lea    0x0\(%rbp,%rbp,2\),%r16w
 [ 	]*[a-f0-9]+:[ 	]*66 d5 30 8d 54 ad 00[ 	]+lea    0x0\(%r21,%r21,4\),%dx
 [ 	]*[a-f0-9]+:[ 	]*66 d5 70 8d 6c ed 00[ 	]+lea    0x0\(%r21,%r21,8\),%r21w
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b ca 02 	\{nf\} imul \$0x2,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b 0a 02 	\{nf\} imul \$0x2,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 01 c9    	\{nf\} add %cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 01 c9    	\{nf\} add %cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c 01 d2    	\{nf\} add %dx,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c d1 22    	\{nf\} shl \$1,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c 01 c9    	\{nf\} add %cx,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c 01 c9    	\{nf\} add %cx,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b ca 04 	\{nf\} imul \$0x4,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b 0a 04 	\{nf\} imul \$0x4,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c c1 e1 02 	\{nf\} shl \$0x2,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c c1 e1 02 	\{nf\} shl \$0x2,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e2 02 	\{nf\} shl \$0x2,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 22 02 	\{nf\} shl \$0x2,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e1 02 	\{nf\} shl \$0x2,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e1 02 	\{nf\} shl \$0x2,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b ca 40 	\{nf\} imul \$0x40,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b 0a 40 	\{nf\} imul \$0x40,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c c1 e1 06 	\{nf\} shl \$0x6,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c c1 e1 06 	\{nf\} shl \$0x6,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e2 06 	\{nf\} shl \$0x6,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 22 06 	\{nf\} shl \$0x6,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e1 06 	\{nf\} shl \$0x6,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e1 06 	\{nf\} shl \$0x6,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 69 ca 80 00 	\{nf\} imul \$0x80,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 69 0a 80 00 	\{nf\} imul \$0x80,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c c1 e1 07 	\{nf\} shl \$0x7,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c c1 e1 07 	\{nf\} shl \$0x7,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e2 07 	\{nf\} shl \$0x7,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 22 07 	\{nf\} shl \$0x7,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e1 07 	\{nf\} shl \$0x7,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e1 07 	\{nf\} shl \$0x7,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 69 ca 00 80 	\{nf\} imul \$0x8000,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 69 0a 00 80 	\{nf\} imul \$0x8000,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c c1 e1 0f 	\{nf\} shl \$0xf,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c c1 e1 0f 	\{nf\} shl \$0xf,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e1 0f 	\{nf\} shl \$0xf,%cx,%cx
 [ 	]*[a-f0-9]+:[ 	]*8d 14 49[ 	]+lea    \(%rcx,%rcx,2\),%edx
 [ 	]*[a-f0-9]+:[ 	]*8d 54 ad 00[ 	]+lea    0x0\(%rbp,%rbp,4\),%edx
 [ 	]*[a-f0-9]+:[ 	]*8d 2c c9[ 	]+lea    \(%rcx,%rcx,8\),%ebp
 [ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 6b d4 03[ 	]+\{nf\} imul \$0x3,%esp,%edx
 [ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 6b e4 05[ 	]+\{nf\} imul \$0x5,%esp,%esp
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c 01 d2    	\{nf\} add %edx,%edx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c d1 22    	\{nf\} shl \$1,\(%rdx\),%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 01 c9    	\{nf\} add %ecx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 01 c9    	\{nf\} add %ecx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c c1 e2 02 	\{nf\} shl \$0x2,%edx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c c1 22 02 	\{nf\} shl \$0x2,\(%rdx\),%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c c1 e1 02 	\{nf\} shl \$0x2,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c c1 e1 02 	\{nf\} shl \$0x2,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c c1 e2 06 	\{nf\} shl \$0x6,%edx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c c1 22 06 	\{nf\} shl \$0x6,\(%rdx\),%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c c1 e1 06 	\{nf\} shl \$0x6,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c c1 e1 06 	\{nf\} shl \$0x6,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c c1 e2 07 	\{nf\} shl \$0x7,%edx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c c1 22 07 	\{nf\} shl \$0x7,\(%rdx\),%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c c1 e1 07 	\{nf\} shl \$0x7,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c c1 e1 07 	\{nf\} shl \$0x7,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c c1 e2 1f 	\{nf\} shl \$0x1f,%edx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c c1 22 1f 	\{nf\} shl \$0x1f,\(%rdx\),%ecx
 [ 	]*[a-f0-9]+:[ 	]*48 8d 14 49[ 	]+lea    \(%rcx,%rcx,2\),%rdx
 [ 	]*[a-f0-9]+:[ 	]*48 8d 54 ad 00[ 	]+lea    0x0\(%rbp,%rbp,4\),%rdx
 [ 	]*[a-f0-9]+:[ 	]*48 8d 2c c9[ 	]+lea    \(%rcx,%rcx,8\),%rbp
 [ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 6b d4 03[ 	]+\{nf\} imul \$0x3,%rsp,%rdx
 [ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 6b e4 05[ 	]+\{nf\} imul \$0x5,%rsp,%rsp
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c 01 d2    	\{nf\} add %rdx,%rdx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c d1 22    	\{nf\} shl \$1,\(%rdx\),%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 01 c9    	\{nf\} add %rcx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 01 c9    	\{nf\} add %rcx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c c1 e2 02 	\{nf\} shl \$0x2,%rdx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c c1 22 02 	\{nf\} shl \$0x2,\(%rdx\),%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c c1 e1 02 	\{nf\} shl \$0x2,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c c1 e1 02 	\{nf\} shl \$0x2,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c c1 e2 06 	\{nf\} shl \$0x6,%rdx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c c1 22 06 	\{nf\} shl \$0x6,\(%rdx\),%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c c1 e1 06 	\{nf\} shl \$0x6,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c c1 e1 06 	\{nf\} shl \$0x6,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c c1 e2 07 	\{nf\} shl \$0x7,%rdx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c c1 22 07 	\{nf\} shl \$0x7,\(%rdx\),%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c c1 e1 07 	\{nf\} shl \$0x7,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c c1 e1 07 	\{nf\} shl \$0x7,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c c1 e2 1e 	\{nf\} shl \$0x1e,%rdx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 69 ca 00 00 00 80 	\{nf\} imul \$0xffffffff80000000,%rdx,%rcx
 #pass
--- a/gas/testsuite/gas/i386/x86-64-apx-nf-optimize-size.d
+++ b/gas/testsuite/gas/i386/x86-64-apx-nf-optimize-size.d
@@ -1522,14 +1522,87 @@  Disassembly of section \.text:
 [ 	]*[a-f0-9]+:[ 	]*62 e4 7d 0c 6b c5 03[ 	]+\{nf\} imul \$0x3,%bp,%r16w
 [ 	]*[a-f0-9]+:[ 	]*62 fc 7d 0c 6b d5 05[ 	]+\{nf\} imul \$0x5,%r21w,%dx
 [ 	]*[a-f0-9]+:[ 	]*62 ec 7d 0c 6b ed 09[ 	]+\{nf\} imul \$0x9,%r21w,%r21w
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b ca 02 	\{nf\} imul \$0x2,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b 0a 02 	\{nf\} imul \$0x2,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*66 8d 0c 09[ 	]+lea    \(%rcx,%rcx,1\),%cx
+[ 	]*[a-f0-9]+:[ 	]*66 8d 0c 09[ 	]+lea    \(%rcx,%rcx,1\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c d1 e2    	\{nf\} shl \$1,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c d1 22    	\{nf\} shl \$1,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c d1 e1    	\{nf\} shl \$1,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c d1 e1    	\{nf\} shl \$1,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b ca 04 	\{nf\} imul \$0x4,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b 0a 04 	\{nf\} imul \$0x4,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b c9 04 	\{nf\} imul \$0x4,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b c9 04 	\{nf\} imul \$0x4,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 1c 6b ca 04 	\{nf\} imulzu \$0x4,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 1c 6b 0a 04 	\{nf\} imulzu \$0x4,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 1c 6b c9 04 	\{nf\} imulzu \$0x4,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 1c 6b c9 04 	\{nf\} imulzu \$0x4,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b ca 40 	\{nf\} imul \$0x40,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b 0a 40 	\{nf\} imul \$0x40,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b c9 40 	\{nf\} imul \$0x40,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 6b c9 40 	\{nf\} imul \$0x40,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 1c 6b ca 40 	\{nf\} imulzu \$0x40,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 1c 6b 0a 40 	\{nf\} imulzu \$0x40,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 1c 6b c9 40 	\{nf\} imulzu \$0x40,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 1c 6b c9 40 	\{nf\} imulzu \$0x40,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 69 ca 80 00 	\{nf\} imul \$0x80,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 69 0a 80 00 	\{nf\} imul \$0x80,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c c1 e1 07 	\{nf\} shl \$0x7,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c c1 e1 07 	\{nf\} shl \$0x7,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e2 07 	\{nf\} shl \$0x7,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 22 07 	\{nf\} shl \$0x7,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e1 07 	\{nf\} shl \$0x7,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e1 07 	\{nf\} shl \$0x7,%cx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 69 ca 00 80 	\{nf\} imul \$0x8000,%dx,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c 69 0a 00 80 	\{nf\} imul \$0x8000,\(%rdx\),%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c c1 e1 0f 	\{nf\} shl \$0xf,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c c1 e1 0f 	\{nf\} shl \$0xf,%cx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 75 1c c1 e1 0f 	\{nf\} shl \$0xf,%cx,%cx
 [ 	]*[a-f0-9]+:[ 	]*8d 14 49[ 	]+lea    \(%rcx,%rcx,2\),%edx
 [ 	]*[a-f0-9]+:[ 	]*8d 54 ad 00[ 	]+lea    0x0\(%rbp,%rbp,4\),%edx
 [ 	]*[a-f0-9]+:[ 	]*8d 2c c9[ 	]+lea    \(%rcx,%rcx,8\),%ebp
 [ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 6b d4 03[ 	]+\{nf\} imul \$0x3,%esp,%edx
 [ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 6b e4 05[ 	]+\{nf\} imul \$0x5,%esp,%esp
+[ 	]*[a-f0-9]+:[ 	]*8d 0c 12[ 	]+lea    \(%rdx,%rdx,1\),%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c d1 22    	\{nf\} shl \$1,\(%rdx\),%ecx
+[ 	]*[a-f0-9]+:[ 	]*8d 0c 09[ 	]+lea    \(%rcx,%rcx,1\),%ecx
+[ 	]*[a-f0-9]+:[ 	]*8d 0c 09[ 	]+lea    \(%rcx,%rcx,1\),%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 6b ca 04 	\{nf\} imul \$0x4,%edx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 6b 0a 04 	\{nf\} imul \$0x4,\(%rdx\),%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 6b c9 04 	\{nf\} imul \$0x4,%ecx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 6b c9 04 	\{nf\} imul \$0x4,%ecx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 6b ca 40 	\{nf\} imul \$0x40,%edx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 6b 0a 40 	\{nf\} imul \$0x40,\(%rdx\),%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 6b c9 40 	\{nf\} imul \$0x40,%ecx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c 6b c9 40 	\{nf\} imul \$0x40,%ecx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c c1 e2 07 	\{nf\} shl \$0x7,%edx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c c1 22 07 	\{nf\} shl \$0x7,\(%rdx\),%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c c1 e1 07 	\{nf\} shl \$0x7,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c c1 e1 07 	\{nf\} shl \$0x7,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c c1 e2 1f 	\{nf\} shl \$0x1f,%edx,%ecx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c c1 22 1f 	\{nf\} shl \$0x1f,\(%rdx\),%ecx
 [ 	]*[a-f0-9]+:[ 	]*48 8d 14 49[ 	]+lea    \(%rcx,%rcx,2\),%rdx
 [ 	]*[a-f0-9]+:[ 	]*48 8d 54 ad 00[ 	]+lea    0x0\(%rbp,%rbp,4\),%rdx
 [ 	]*[a-f0-9]+:[ 	]*48 8d 2c c9[ 	]+lea    \(%rcx,%rcx,8\),%rbp
 [ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 6b d4 03[ 	]+\{nf\} imul \$0x3,%rsp,%rdx
 [ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 6b e4 05[ 	]+\{nf\} imul \$0x5,%rsp,%rsp
+[ 	]*[a-f0-9]+:[ 	]*48 8d 0c 12[ 	]+lea    \(%rdx,%rdx,1\),%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c d1 22    	\{nf\} shl \$1,\(%rdx\),%rcx
+[ 	]*[a-f0-9]+:[ 	]*48 8d 0c 09[ 	]+lea    \(%rcx,%rcx,1\),%rcx
+[ 	]*[a-f0-9]+:[ 	]*48 8d 0c 09[ 	]+lea    \(%rcx,%rcx,1\),%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 6b ca 04 	\{nf\} imul \$0x4,%rdx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 6b 0a 04 	\{nf\} imul \$0x4,\(%rdx\),%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 6b c9 04 	\{nf\} imul \$0x4,%rcx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 6b c9 04 	\{nf\} imul \$0x4,%rcx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 6b ca 40 	\{nf\} imul \$0x40,%rdx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 6b 0a 40 	\{nf\} imul \$0x40,\(%rdx\),%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 6b c9 40 	\{nf\} imul \$0x40,%rcx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 6b c9 40 	\{nf\} imul \$0x40,%rcx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c c1 e2 07 	\{nf\} shl \$0x7,%rdx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c c1 22 07 	\{nf\} shl \$0x7,\(%rdx\),%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c c1 e1 07 	\{nf\} shl \$0x7,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c c1 e1 07 	\{nf\} shl \$0x7,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 f4 1c c1 e2 1e 	\{nf\} shl \$0x1e,%rdx,%rcx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 69 ca 00 00 00 80 	\{nf\} imul \$0xffffffff80000000,%rdx,%rcx
 #pass
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -419,21 +419,21 @@  imul, 0xfaf, i386, Modrm|CheckOperandSiz
 imul, 0xaf, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4|NF, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 imul, 0x6b, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 imul, 0x6b, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4|NF|Optimize, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
-imulzu, 0x6b, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|NF|ZU, { Imm8S, Reg16|Unspecified|BaseIndex, Reg16 }
+imulzu, 0x6b, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|NF|ZU|Optimize, { Imm8S, Reg16|Unspecified|BaseIndex, Reg16 }
 imul, 0x69, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
-imul, 0x69, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4|NF, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
-imulzu, 0x69, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|NF|ZU, { Imm16, Reg16|Unspecified|BaseIndex, Reg16 }
+imul, 0x69, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|EVexMap4|NF|Optimize, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
+imulzu, 0x69, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|NF|ZU|Optimize, { Imm16, Reg16|Unspecified|BaseIndex, Reg16 }
 // imul with 2 operands mimics imul with 3 by putting the register in
 // both i.rm.reg & i.rm.regmem fields.  RegKludge enables this
 // transformation.
 imul, 0x6b, i186, Modrm|No_bSuf|No_sSuf|RegKludge, { Imm8S, Reg16|Reg32|Reg64 }
 imul, 0x6b, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF|Optimize, { Imm8S, Reg16|Reg32|Reg64 }
 imul, 0x69, i186, Modrm|No_bSuf|No_sSuf|RegKludge, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64 }
-imul, 0x69, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64 }
+imul, 0x69, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF|Optimize, { Imm16|Imm32|Imm32S, Reg16|Reg32|Reg64 }
 // ZU is omitted here, for colliding with RegKludge.  process_operands() will
 // replace the constraint value after processing RegKludge.
-imulzu, 0x6b, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF/*|ZU*/, { Imm8S, Reg16 }
-imulzu, 0x69, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF/*|ZU*/, { Imm16, Reg16 }
+imulzu, 0x6b, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF/*|ZU*/|Optimize, { Imm8S, Reg16 }
+imulzu, 0x69, APX_F, Modrm|No_bSuf|No_sSuf|RegKludge|EVexMap4|NF/*|ZU*/|Optimize, { Imm16, Reg16 }
 
 <mul>