Zen5 tuning part 5: update instruction latencies in x86-tune-costs

Message ID ZtmiZ40NAFLc-97y@kam.mff.cuni.cz
State Committed
Commit 4292297a0f938ffc953422fa246ff00fe345fe3d
Headers
Series Zen5 tuning part 5: update instruction latencies in x86-tune-costs |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm warning Patch is already merged
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 warning Patch is already merged

Commit Message

Jan Hubicka Sept. 5, 2024, 12:21 p.m. UTC
  Hi,
there is nothing exciting in this patch.  I measured latencies and also
compared them with newly released optimization guide and it seems that
only important change is that addss is fastr now. It can be 2 cycles
instaead of 3 in some cases when the input parameter is computed by
another addition. The throughput has increased but we have no model for
that.

I added comments whic should make it easier to update the table for
future revisions.

I also increased the large insn bound since decoders seems no longer
require instructions to be 8 bytes or less.

Bootstrapped/rgtested x86_64-linux, comitted.

gcc/ChangeLog:

	* config/i386/x86-tune-costs.h (znver5_cost): Update instruction
	costs.
  

Patch

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index b90567fbbf2..1b3227ace16 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2034,6 +2034,7 @@  struct processor_costs znver5_cost = {
   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
   COSTS_N_INSNS (1),			/* variable shift costs.  */
   COSTS_N_INSNS (1),			/* constant shift costs.  */
+  /* mul has latency 3, executes in 3 integer units.  */
   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
    COSTS_N_INSNS (3),			/* 				 HI.  */
    COSTS_N_INSNS (3),			/*				 SI.  */
@@ -2041,6 +2042,8 @@  struct processor_costs znver5_cost = {
    COSTS_N_INSNS (3)},			/*			other.  */
   0,					/* cost of multiply per each bit
 					   set.  */
+  /* integer divide has latency of 8 cycles
+     plus 1 for every 9 bits of quotient.  */
   {COSTS_N_INSNS (10),			/* cost of a divide/mod for QI.  */
    COSTS_N_INSNS (11),			/* 			    HI.  */
    COSTS_N_INSNS (13),			/*			    SI.  */
@@ -2048,7 +2051,7 @@  struct processor_costs znver5_cost = {
    COSTS_N_INSNS (16)},			/*			    other.  */
   COSTS_N_INSNS (1),			/* cost of movsx.  */
   COSTS_N_INSNS (1),			/* cost of movzx.  */
-  8,					/* "large" insn.  */
+  15,					/* "large" insn.  */
   9,					/* MOVE_RATIO.  */
   6,					/* CLEAR_RATIO */
   {6, 6, 6},				/* cost of loading integer registers
@@ -2065,12 +2068,13 @@  struct processor_costs znver5_cost = {
   2, 2, 2,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
-  /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
-     throughput 5.  Approx 7 uops do not depend on vector size and every load
-     is 5 uops.  */
+
+  /* TODO: gather and scatter instructions are currently disabled in
+     x86-tune.def.  In some cases they are however a win, see PR116582
+     We however need good cost model for them.  */
   14, 10,				/* Gather load static, per_elt.  */
   14, 20,				/* Gather store static, per_elt.  */
-  32,					/* size of l1 cache.  */
+  48,					/* size of l1 cache.  */
   1024,					/* size of l2 cache.  */
   64,					/* size of prefetch block.  */
   /* New AMD processors never drop prefetches; if they cannot be performed
@@ -2080,6 +2084,8 @@  struct processor_costs znver5_cost = {
      time).  */
   100,					/* number of parallel prefetches.  */
   3,					/* Branch cost.  */
+  /* TODO x87 latencies are still based on znver4.
+     Probably not very important these days.  */
   COSTS_N_INSNS (7),			/* cost of FADD and FSUB insns.  */
   COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
   /* Latency of fdiv is 8-15.  */
@@ -2089,16 +2095,24 @@  struct processor_costs znver5_cost = {
   /* Latency of fsqrt is 4-10.  */
   COSTS_N_INSNS (25),			/* cost of FSQRT instruction.  */
 
+  /* SSE instructions have typical throughput 4 and latency 1.  */
   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  /* ADDSS has throughput 2 and latency 2
+     (in some cases when source is another addition).  */
+  COSTS_N_INSNS (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  /* MULSS has throughput 2 and latency 3.  */
   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
+  /* FMA had throughput 2 and latency 4.  */
   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
+  /* DIVSS has throughtput 0.4 and latency 10.  */
   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
-  /* 9-13.  */
+  /* DIVSD has throughtput 0.25 and latency 13.  */
   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
+  /* DIVSD has throughtput 0.22 and latency 14.  */
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
+  /* DIVSD has throughtput 0.13 and latency 20.  */
   COSTS_N_INSNS (20),			/* cost of SQRTSD instruction.  */
   /* Zen5 can execute:
       - integer ops: 6 per cycle, at most 3 multiplications.