[v2,8/8] x86/APX: apply NDD-to-legacy transformation to further CMOVcc forms

Message ID: b1964eb3-0b47-4339-89f1-843115b73dca@suse.com
State: New
Series: x86: a few more optimizations

Checks

Context Check Description
linaro-tcwg-bot/tcwg_binutils_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_binutils_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_binutils_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_binutils_check--master-arm success Test passed

Commit Message

Jan Beulich June 21, 2024, 12:53 p.m. UTC
With both sources being registers, these insns are almost commutative;
the only extra adjustment needed is inversion of the encoded condition.
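
For illustration (taken from the testsuite additions below; the semantic
annotations are informal), when the destination matches the first source
the 6-byte EVEX-encoded NDD form can be reduced to the 3-byte legacy
encoding by swapping the sources and inverting the condition:

	cmovo	%edx,%ecx,%edx	# NDD: %edx = OF ? %edx : %ecx
	cmovno	%ecx,%edx	# legacy equivalent (0f 41 d1)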
---
Down the road the same transformation will want doing for register-only
3-operand CFCMOVcc; there, though, re-using the NDD-to-legacy logic will
likely be less desirable.
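
For reference, the single-bit opcode XOR in the patch suffices to invert
the condition because the cc encodings pair every condition with its
inverse in adjacent even/odd slots, as is also visible in the testsuite
expectations below:

	0f 40	cmovo		0f 41	cmovno
	0f 44	cmove		0f 45	cmovne
	0f 4c	cmovl		0f 4d	cmovge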
---
v2: New.
  

Patch

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -456,6 +456,9 @@  struct _i386_insn
     /* Disable instruction size optimization.  */
     bool no_optimize;
 
+    /* Invert the condition encoded in a base opcode.  */
+    bool invert_cond;
+
     /* How to encode instructions.  */
     enum
       {
@@ -3918,6 +3921,11 @@  install_template (const insn_template *t
       i.tm.base_opcode >>= 8;
     }
 
+  /* For CMOVcc having undergone NDD-to-legacy optimization with its source
+     operands being swapped, we need to invert the encoded condition.  */
+  if (i.invert_cond)
+    i.tm.base_opcode ^= 1;
+
   /* Note that for pseudo prefixes this produces a length of 1. But for them
      the length isn't interesting at all.  */
   for (l = 1; l < 4; ++l)
@@ -9952,7 +9960,14 @@  match_template (char mnem_suffix)
 			  && !i.op[i.operands - 1].regs->reg_type.bitfield.qword)))
 		{
 		  if (i.operands > 2 && match_dest_op == i.operands - 3)
-		    swap_2_operands (match_dest_op, i.operands - 2);
+		    {
+		      swap_2_operands (match_dest_op, i.operands - 2);
+
+		      /* CMOVcc is marked commutative, but then also needs its
+			 encoded condition inverted.  */
+		      if ((t->base_opcode | 0xf) == 0x4f)
+			i.invert_cond = true;
+		    }
 
 		  --i.operands;
 		  --i.reg_operands;
--- a/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.d
+++ b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.d
@@ -118,6 +118,22 @@  Disassembly of section .text:
 \s*[a-f0-9]+:\s*67 0f 4d 90 90 90 90 90 	cmovge -0x6f6f6f70\(%eax\),%edx
 \s*[a-f0-9]+:\s*67 0f 4e 90 90 90 90 90 	cmovle -0x6f6f6f70\(%eax\),%edx
 \s*[a-f0-9]+:\s*67 0f 4f 90 90 90 90 90 	cmovg  -0x6f6f6f70\(%eax\),%edx
+\s*[a-f0-9]+:\s*0f 41 d1             	cmovno %ecx,%edx
+\s*[a-f0-9]+:\s*0f 40 d1             	cmovo  %ecx,%edx
+\s*[a-f0-9]+:\s*0f 43 d1             	cmovae %ecx,%edx
+\s*[a-f0-9]+:\s*0f 42 d1             	cmovb  %ecx,%edx
+\s*[a-f0-9]+:\s*0f 45 d1             	cmovne %ecx,%edx
+\s*[a-f0-9]+:\s*0f 44 d1             	cmove  %ecx,%edx
+\s*[a-f0-9]+:\s*0f 47 d1             	cmova  %ecx,%edx
+\s*[a-f0-9]+:\s*0f 46 d1             	cmovbe %ecx,%edx
+\s*[a-f0-9]+:\s*0f 49 d1             	cmovns %ecx,%edx
+\s*[a-f0-9]+:\s*0f 48 d1             	cmovs  %ecx,%edx
+\s*[a-f0-9]+:\s*0f 4b d1             	cmovnp %ecx,%edx
+\s*[a-f0-9]+:\s*0f 4a d1             	cmovp  %ecx,%edx
+\s*[a-f0-9]+:\s*0f 4d d1             	cmovge %ecx,%edx
+\s*[a-f0-9]+:\s*0f 4c d1             	cmovl  %ecx,%edx
+\s*[a-f0-9]+:\s*0f 4f d1             	cmovg  %ecx,%edx
+\s*[a-f0-9]+:\s*0f 4e d1             	cmovle %ecx,%edx
 \s*[a-f0-9]+:\s*62 f4 7d 08 60 c0    	movbe  %ax,%ax
 \s*[a-f0-9]+:\s*49 0f c8             	bswap  %r8
 \s*[a-f0-9]+:\s*d5 98 c8             	bswap  %r16
--- a/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s
+++ b/gas/testsuite/gas/i386/x86-64-apx-ndd-optimize.s
@@ -112,6 +112,22 @@  cmovl  0x90909090(%eax),%edx,%edx
 cmovge 0x90909090(%eax),%edx,%edx
 cmovle 0x90909090(%eax),%edx,%edx
 cmovg  0x90909090(%eax),%edx,%edx
+cmovo  %edx,%ecx,%edx
+cmovno %edx,%ecx,%edx
+cmovc  %edx,%ecx,%edx
+cmovnc %edx,%ecx,%edx
+cmovz  %edx,%ecx,%edx
+cmovnz %edx,%ecx,%edx
+cmovna %edx,%ecx,%edx
+cmovnbe %edx,%ecx,%edx
+cmovs  %edx,%ecx,%edx
+cmovns %edx,%ecx,%edx
+cmovpe %edx,%ecx,%edx
+cmovpo %edx,%ecx,%edx
+cmovnge %edx,%ecx,%edx
+cmovnl %edx,%ecx,%edx
+cmovng %edx,%ecx,%edx
+cmovnle %edx,%ecx,%edx
 movbe  %ax,%ax
 movbe  %r8,%r8
 movbe  %r16,%r16
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -985,7 +985,10 @@  ud2b, 0xfb9, i186, Modrm|CheckOperandSiz
 // 3rd official undefined instr (older CPUs don't take a ModR/M byte)
 ud0, 0xfff, i186, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 
-cmov<cc>, 0x4<cc:opc>, CMOV&APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVexMap4, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64 }
+// C (commutative) isn't quite correct here on its own; the condition also
+// needs inverting when source operands are swapped in order to convert to
+// legacy encoding.  The assembler will take care of that.
+cmov<cc>, 0x4<cc:opc>, CMOV&APX_F, C|Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVexMap4|Optimize, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64, Reg16|Reg32|Reg64 }
 cmov<cc>, 0xf4<cc:opc>, CMOV, Modrm|CheckOperandSize|No_bSuf|No_sSuf, { Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
 
 fcmovb, 0xda/0, i687, Modrm|NoSuf, { FloatReg, FloatAcc }