[v2,3/4] x86/APX: optimize certain XOR and SUB forms

Message ID ff9af77d-8d0f-4552-bd42-48c60188af49@suse.com
State New
Headers
Series x86/APX: misc adjustments |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_binutils_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_binutils_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_binutils_check--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_binutils_check--master-arm success Testing passed

Commit Message

Jan Beulich Feb. 23, 2024, 11:12 a.m. UTC
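  While most logic in optimize_encoding() already covers APX by way
of the earlier NDD->REX2 conversion, there is a remaining set of cases
that needs to be handled separately: three-operand (NDD) forms of XOR
and SUB whose two source operands are the same register. These can be
replaced by the shorter two-operand 32-bit form operating on just the
destination register (e.g. "xorq %rN, %rN, %rM" -> "xorl %rMd, %rMd").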
  

Patch

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -4693,6 +4693,34 @@  optimize_encoding (void)
 	    }
 	}
     }
+  else if (i.reg_operands == 3
+	   && i.op[0].regs == i.op[1].regs
+	   && i.encoding != encoding_evex
+	   && (i.tm.mnem_off == MN_xor
+	       || i.tm.mnem_off == MN_sub))
+    {
+      /* Optimize: -O:
+	   xorb %rNb, %rNb, %rMb  -> xorl %rMd, %rMd
+	   xorw %rNw, %rNw, %rMw  -> xorl %rMd, %rMd
+	   xorl %rNd, %rNd, %rMd  -> xorl %rMd, %rMd
+	   xorq %rN,  %rN,  %rM   -> xorl %rMd, %rMd
+	   subb %rNb, %rNb, %rMb  -> subl %rMd, %rMd
+	   subw %rNw, %rNw, %rMw  -> subl %rMd, %rMd
+	   subl %rNd, %rNd, %rMd  -> subl %rMd, %rMd
+	   subq %rN,  %rN,  %rM   -> subl %rMd, %rMd
+        */
+      i.tm.opcode_space = SPACE_BASE;
+      i.tm.opcode_modifier.evex = 0;
+      i.tm.opcode_modifier.size = SIZE32;
+      i.types[0].bitfield.byte = 0;
+      i.types[0].bitfield.word = 0;
+      i.types[0].bitfield.dword = 1;
+      i.types[0].bitfield.qword = 0;
+      i.op[0].regs = i.op[2].regs;
+      i.types[1] = i.types[0];
+      i.op[1].regs = i.op[2].regs;
+      i.reg_operands = 2;
+    }
   else if (optimize > 1
 	   && !optimize_for_space
 	   && i.reg_operands == 2
--- a/gas/testsuite/gas/i386/x86-64-optimize-1.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-1.d
@@ -71,4 +71,28 @@  Disassembly of section .text:
  +[a-f0-9]+:	48 0f ba f0 1f       	btr    \$0x1f,%rax
  +[a-f0-9]+:	66 0f ba e8 0f       	bts    \$0xf,%ax
  +[a-f0-9]+:	48 0f ba e8 1f       	bts    \$0x1f,%rax
+ +[a-f0-9]+:	31 c9                	xor    %ecx,%ecx
+ +[a-f0-9]+:	48 31 d1             	xor    %rdx,%rcx
+ +[a-f0-9]+:	31 c9                	xor    %ecx,%ecx
+ +[a-f0-9]+:	29 c9                	sub    %ecx,%ecx
+ +[a-f0-9]+:	48 29 d1             	sub    %rdx,%rcx
+ +[a-f0-9]+:	29 c9                	sub    %ecx,%ecx
+ +[a-f0-9]+:	d5 50 31 c9          	xor    %r17d,%r17d
+ +[a-f0-9]+:	d5 58 31 d1          	xor    %r18,%r17
+ +[a-f0-9]+:	d5 50 31 c9          	xor    %r17d,%r17d
+ +[a-f0-9]+:	d5 50 29 c9          	sub    %r17d,%r17d
+ +[a-f0-9]+:	d5 58 29 d1          	sub    %r18,%r17
+ +[a-f0-9]+:	d5 50 29 c9          	sub    %r17d,%r17d
+ +[a-f0-9]+:	31 c9                	xor    %ecx,%ecx
+ +[a-f0-9]+:	62 f4 75 18 31 d1    	xor    %dx,%cx,%cx
+ +[a-f0-9]+:	31 c9                	xor    %ecx,%ecx
+ +[a-f0-9]+:	29 c9                	sub    %ecx,%ecx
+ +[a-f0-9]+:	62 f4 75 18 29 d1    	sub    %dx,%cx,%cx
+ +[a-f0-9]+:	29 c9                	sub    %ecx,%ecx
+ +[a-f0-9]+:	d5 50 31 c9          	xor    %r17d,%r17d
+ +[a-f0-9]+:	62 ec 74 10 30 d1    	xor    %r18b,%r17b,%r17b
+ +[a-f0-9]+:	d5 50 31 c9          	xor    %r17d,%r17d
+ +[a-f0-9]+:	d5 50 29 c9          	sub    %r17d,%r17d
+ +[a-f0-9]+:	62 ec 74 10 28 d1    	sub    %r18b,%r17b,%r17b
+ +[a-f0-9]+:	d5 50 29 c9          	sub    %r17d,%r17d
 #pass
--- a/gas/testsuite/gas/i386/x86-64-optimize-1.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-1.s
@@ -65,3 +65,27 @@  _start:
 	btr	$31, %rax
 	bts	$15, %ax
 	bts	$31, %rax
+	xor	%rcx, %rcx, %rcx
+	xor	%rdx, %rcx, %rcx
+	xor	%rdx, %rdx, %rcx
+	sub	%rcx, %rcx, %rcx
+	sub	%rdx, %rcx, %rcx
+	sub	%rdx, %rdx, %rcx
+	xor	%r17, %r17, %r17
+	xor	%r18, %r17, %r17
+	xor	%r18, %r18, %r17
+	sub	%r17, %r17, %r17
+	sub	%r18, %r17, %r17
+	sub	%r18, %r18, %r17
+	xor	%cx, %cx, %cx
+	xor	%dx, %cx, %cx
+	xor	%dx, %dx, %cx
+	sub	%cx, %cx, %cx
+	sub	%dx, %cx, %cx
+	sub	%dx, %dx, %cx
+	xor	%r17b, %r17b, %r17b
+	xor	%r18b, %r17b, %r17b
+	xor	%r18b, %r18b, %r17b
+	sub	%r17b, %r17b, %r17b
+	sub	%r18b, %r17b, %r17b
+	sub	%r18b, %r18b, %r17b
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -320,7 +320,7 @@  inc, 0x40, No64, No_bSuf|No_sSuf|No_qSuf
 inc, 0xfe/0, APX_F, W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVexMap4|NF, {Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64}
 inc, 0xfe/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
 
-sub, 0x28, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64, }
+sub, 0x28, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|NF|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64, }
 sub, 0x28, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
 sub, 0x83/5, APX_F, Modrm|No_bSuf|No_sSuf|DstVVVV|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 sub, 0x83/5, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex }
@@ -366,7 +366,7 @@  or, 0xc, 0, W|No_sSuf, { Imm8|Imm16|Imm3
 or, 0x80/1, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVexMap4|NF, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 or, 0x80/1, 0, W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
 
-xor, 0x30, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|NF, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
+xor, 0x30, APX_F, D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|NF|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
 xor, 0x30, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
 xor, 0x83/6, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVexMap4|NF, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 xor, 0x83/6, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex }