[3/6] x86/APX: optimize {nf}-form rotate-by-width-less-1

Message ID 1e22a43f-03d6-4af5-a0a0-3d59b0c8eb10@suse.com
State New
Headers
Series x86: a few more optimizations |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_binutils_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_binutils_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_binutils_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_binutils_check--master-arm success Test passed

Commit Message

Jan Beulich June 14, 2024, 12:13 p.m. UTC
  Unlike for the legacy forms, where there's a difference in the resulting
EFLAGS.CF, for the NF variants a rotate by operand-width-less-1 can have
its immediate dropped by switching to a 1-bit rotate in the opposite
direction.
  

Patch

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -4844,6 +4844,26 @@  optimize_encoding (void)
       i.reg_operands++;
       i.imm_operands = 0;
     }
+  else if (i.has_nf && i.tm.base_opcode == 0xc0
+	   && i.op[0].imms->X_op == O_constant
+	   && i.op[0].imms->X_add_number
+	      == (i.types[i.operands - 1].bitfield.byte
+		  || i.suffix == BYTE_MNEM_SUFFIX
+		  ? 7 : i.types[i.operands - 1].bitfield.word
+			|| i.suffix == WORD_MNEM_SUFFIX
+			? 15 : 63 >> (i.types[i.operands - 1].bitfield.dword
+				      || i.suffix == LONG_MNEM_SUFFIX)))
+    {
+      /* Optimize: -O:
+	   {nf} rol $osz-1, ...   -> {nf} ror $1, ...
+	   {nf} ror $osz-1, ...   -> {nf} rol $1, ...
+       */
+      gas_assert (i.tm.extension_opcode <= 1);
+      i.tm.extension_opcode ^= 1;
+      i.tm.base_opcode = 0xd0;
+      i.tm.operand_types[0].bitfield.imm1 = 1;
+      i.imm_operands = 0;
+    }
   else if (i.tm.base_opcode == 0xba
 	   && i.tm.opcode_space == SPACE_0F
 	   && i.reg_operands == 1
--- a/gas/testsuite/gas/i386/x86-64-apx-nf.s
+++ b/gas/testsuite/gas/i386/x86-64-apx-nf.s
@@ -1399,3 +1399,23 @@  optimize:
 	{nf}	\op\()q	$128, (%rax)
 	{nf}	\op	$128, (%rax), %r9
 	.endr
+
+	.irp dir, l, r
+	{nf}	ro\dir	$7, %dl
+	{nf}	ro\dir	$7, %dl, %al
+	{nf}	ro\dir	$15, %dx
+	{nf}	ro\dir	$15, %dx, %ax
+	{nf}	ro\dir	$31, %edx
+	{nf}	ro\dir	$31, %edx, %eax
+	{nf}	ro\dir	$63, %rdx
+	{nf}	ro\dir	$63, %rdx, %rax
+
+	{nf}	ro\dir\()b	$7, (%rdx)
+	{nf}	ro\dir		$7, (%rdx), %al
+	{nf}	ro\dir\()w	$15, (%rdx)
+	{nf}	ro\dir		$15, (%rdx), %ax
+	{nf}	ro\dir\()l	$31, (%rdx)
+	{nf}	ro\dir		$31, (%rdx), %eax
+	{nf}	ro\dir\()q	$63, (%rdx)
+	{nf}	ro\dir		$63, (%rdx), %rax
+	.endr
--- a/gas/testsuite/gas/i386/x86-64-apx-nf-optimize.d
+++ b/gas/testsuite/gas/i386/x86-64-apx-nf-optimize.d
@@ -1416,4 +1416,36 @@  Disassembly of section \.text:
 [ 	]*[a-f0-9]+:[ 	]*62 f4 74 1c 83 00 80[ 	]+\{nf\} add \$0xf+80,\(%rax\),%ecx
 [ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c 83 00 80[ 	]+\{nf\} addq \$0xf+80,\(%rax\)
 [ 	]*[a-f0-9]+:[ 	]*62 f4 b4 1c 83 00 80[ 	]+\{nf\} add \$0xf+80,\(%rax\),%r9
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c d0 ca[ 	]+\{nf\} ror \$1,%dl
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 1c d0 ca[ 	]+\{nf\} ror \$1,%dl,%al
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c d1 ca[ 	]+\{nf\} ror \$1,%dx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 1c d1 ca[ 	]+\{nf\} ror \$1,%dx,%ax
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c d1 ca[ 	]+\{nf\} ror \$1,%edx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 1c d1 ca[ 	]+\{nf\} ror \$1,%edx,%eax
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c d1 ca[ 	]+\{nf\} ror \$1,%rdx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 1c d1 ca[ 	]+\{nf\} ror \$1,%rdx,%rax
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c d0 0a[ 	]+\{nf\} rorb \$1,\(%rdx\)
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 1c d0 0a[ 	]+\{nf\} ror \$1,\(%rdx\),%al
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c d1 0a[ 	]+\{nf\} rorw \$1,\(%rdx\)
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 1c d1 0a[ 	]+\{nf\} ror \$1,\(%rdx\),%ax
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c d1 0a[ 	]+\{nf\} rorl \$1,\(%rdx\)
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 1c d1 0a[ 	]+\{nf\} ror \$1,\(%rdx\),%eax
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c d1 0a[ 	]+\{nf\} rorq \$1,\(%rdx\)
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 1c d1 0a[ 	]+\{nf\} ror \$1,\(%rdx\),%rax
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c d0 c2[ 	]+\{nf\} rol \$1,%dl
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 1c d0 c2[ 	]+\{nf\} rol \$1,%dl,%al
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c d1 c2[ 	]+\{nf\} rol \$1,%dx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 1c d1 c2[ 	]+\{nf\} rol \$1,%dx,%ax
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c d1 c2[ 	]+\{nf\} rol \$1,%edx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 1c d1 c2[ 	]+\{nf\} rol \$1,%edx,%eax
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c d1 c2[ 	]+\{nf\} rol \$1,%rdx
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 1c d1 c2[ 	]+\{nf\} rol \$1,%rdx,%rax
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c d0 02[ 	]+\{nf\} rolb \$1,\(%rdx\)
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 1c d0 02[ 	]+\{nf\} rol \$1,\(%rdx\),%al
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 0c d1 02[ 	]+\{nf\} rolw \$1,\(%rdx\)
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7d 1c d1 02[ 	]+\{nf\} rol \$1,\(%rdx\),%ax
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 0c d1 02[ 	]+\{nf\} roll \$1,\(%rdx\)
+[ 	]*[a-f0-9]+:[ 	]*62 f4 7c 1c d1 02[ 	]+\{nf\} rol \$1,\(%rdx\),%eax
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 0c d1 02[ 	]+\{nf\} rolq \$1,\(%rdx\)
+[ 	]*[a-f0-9]+:[ 	]*62 f4 fc 1c d1 02[ 	]+\{nf\} rol \$1,\(%rdx\),%rax
 #pass
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -428,22 +428,22 @@  imulzu, 0x69, APX_F, Modrm|No_bSuf|No_sS
 
 <div>
 
-<sr:opc:imm8:opt1:nf, +
-    rol:0:Imm8|Imm8S::NF, +
-    ror:1:Imm8|Imm8S::NF, +
-    rcl:2:Imm8::, +
-    rcr:3:Imm8::, +
-    sal:4:Imm8:Optimize:NF, +
-    shl:4:Imm8:Optimize:NF, +
-    shr:5:Imm8::NF, +
-    sar:7:Imm8::NF>
+<sr:opc:imm8:opt1:opti:nf, +
+    rol:0:Imm8|Imm8S::Optimize:NF, +
+    ror:1:Imm8|Imm8S::Optimize:NF, +
+    rcl:2:Imm8:::, +
+    rcr:3:Imm8:::, +
+    sal:4:Imm8:Optimize::NF, +
+    shl:4:Imm8:Optimize::NF, +
+    shr:5:Imm8:::NF, +
+    sar:7:Imm8:::NF>
 
 <sr>, 0xd0/<sr:opc>, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVexMap4|<sr:opt1>|<sr:nf>, { Imm1, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 <sr>, 0xd0/<sr:opc>, 0, W|Modrm|No_sSuf|<sr:opt1>, { Imm1, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
 <sr>, 0xd0/<sr:opc>, APX_F, W|Modrm|No_sSuf|EVexMap4|<sr:opt1>|<sr:nf>, { Imm1, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
-<sr>, 0xc0/<sr:opc>, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVexMap4|<sr:nf>, { <sr:imm8>, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
+<sr>, 0xc0/<sr:opc>, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVexMap4|<sr:opti>|<sr:nf>, { <sr:imm8>, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 <sr>, 0xc0/<sr:opc>, i186, W|Modrm|No_sSuf, { <sr:imm8>, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
-<sr>, 0xc0/<sr:opc>, APX_F, W|Modrm|No_sSuf|EVexMap4|<sr:nf>, { <sr:imm8>, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
+<sr>, 0xc0/<sr:opc>, APX_F, W|Modrm|No_sSuf|EVexMap4|<sr:opti>|<sr:nf>, { <sr:imm8>, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
 <sr>, 0xd2/<sr:opc>, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVexMap4|<sr:nf>, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 <sr>, 0xd2/<sr:opc>, 0, W|Modrm|No_sSuf, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
 <sr>, 0xd2/<sr:opc>, APX_F, W|Modrm|No_sSuf|EVexMap4|<sr:nf>, { ShiftCount, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }