[PATCH 2/4] x86: Update memcpy/memset inline strategies for -mtune=tremont
Commit Message
From: "H.J. Lu" <hjl.tools@gmail.com>
Simplify memcpy and memset inline strategies to avoid branches for
-mtune=tremont:
1. Create Tremont cost model from generic cost model.
2. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
load and store for up to 16 * 16 (256) bytes when the data size is
fixed and known.
3. Inline only if data size is known to be <= 256.
a. Use "rep movsb/stosb" with simple code sequence if the data size
is a constant.
b. Use loop if data size is not a constant.
4. Use memcpy/memset library function if data size is unknown or > 256.
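
For illustration, a minimal sketch of the size regimes this strategy
distinguishes, as seen from user code compiled with -mtune=tremont
(hypothetical example; the struct and function names are made up and not
part of the patch):

#include <string.h>

struct pkt { char hdr[64]; char payload[192]; };	/* 256 bytes total.  */

/* Fixed size <= 256: candidate for inline expansion, either as
   integer/vector moves (MOVE_RATIO/CLEAR_RATIO == 17) or as a short
   "rep movsb" sequence, instead of a call to memcpy.  */
void
copy_fixed (struct pkt *dst, const struct pkt *src)
{
  memcpy (dst, src, sizeof (struct pkt));
}

/* Fixed size <= 256: likewise for memset/"rep stosb".  */
void
clear_fixed (struct pkt *p)
{
  memset (p, 0, sizeof (struct pkt));
}

/* Size unknown at compile time (or possibly > 256): left as a call to
   the memcpy library function rather than expanded inline.  */
void
copy_variable (char *dst, const char *src, size_t n)
{
  memcpy (dst, src, n);
}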
* config/i386/i386-options.c (processor_cost_table): Use
tremont_cost for Tremont.
* config/i386/x86-tune-costs.h (tremont_memcpy): New.
(tremont_memset): Likewise.
(tremont_cost): Likewise.
* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
Enable for Tremont.
---
gcc/config/i386/i386-options.c | 2 +-
gcc/config/i386/x86-tune-costs.h | 124 +++++++++++++++++++++++++++++++
gcc/config/i386/x86-tune.def | 2 +-
3 files changed, 126 insertions(+), 2 deletions(-)
Comments
On Wed, Sep 15, 2021 at 10:10 AM <lili.cui@intel.com> wrote:
>
> From: "H.J. Lu" <hjl.tools@gmail.com>
>
> Simplify memcpy and memset inline strategies to avoid branches for
> -mtune=tremont:
>
> 1. Create Tremont cost model from generic cost model.
> 2. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
> load and store for up to 16 * 16 (256) bytes when the data size is
> fixed and known.
> 3. Inline only if data size is known to be <= 256.
> a. Use "rep movsb/stosb" with simple code sequence if the data size
> is a constant.
> b. Use loop if data size is not a constant.
> 4. Use memcpy/memset library function if data size is unknown or > 256.
>
> * config/i386/i386-options.c (processor_cost_table): Use
> tremont_cost for Tremont.
> * config/i386/x86-tune-costs.h (tremont_memcpy): New.
> (tremont_memset): Likewise.
> (tremont_cost): Likewise.
> * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
> Enable for Tremont.
OK, and also obvious as a tuning patch.
Thanks,
Uros.
> ---
> gcc/config/i386/i386-options.c | 2 +-
> gcc/config/i386/x86-tune-costs.h | 124 +++++++++++++++++++++++++++++++
> gcc/config/i386/x86-tune.def | 2 +-
> 3 files changed, 126 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
> index c0006b3674b..e7a3bd4aaea 100644
> --- a/gcc/config/i386/i386-options.c
> +++ b/gcc/config/i386/i386-options.c
> @@ -724,7 +724,7 @@ static const struct processor_costs *processor_cost_table[] =
> &slm_cost,
> &slm_cost,
> &slm_cost,
> - &slm_cost,
> + &tremont_cost,
> &slm_cost,
> &slm_cost,
> &skylake_cost,
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> index ffe810f2bcb..93644be9cb3 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -2734,6 +2734,130 @@ struct processor_costs slm_cost = {
> "16", /* Func alignment. */
> };
>
> +static stringop_algs tremont_memcpy[2] = {
> + {libcall,
> + {{256, rep_prefix_1_byte, true},
> + {256, loop, false},
> + {-1, libcall, false}}},
> + {libcall,
> + {{256, rep_prefix_1_byte, true},
> + {256, loop, false},
> + {-1, libcall, false}}}};
> +static stringop_algs tremont_memset[2] = {
> + {libcall,
> + {{256, rep_prefix_1_byte, true},
> + {256, loop, false},
> + {-1, libcall, false}}},
> + {libcall,
> + {{256, rep_prefix_1_byte, true},
> + {256, loop, false},
> + {-1, libcall, false}}}};
> +static const
> +struct processor_costs tremont_cost = {
> + {
> + /* Start of register allocator costs. integer->integer move cost is 2. */
> + 6, /* cost for loading QImode using movzbl */
> + {6, 6, 6}, /* cost of loading integer registers
> + in QImode, HImode and SImode.
> + Relative to reg-reg move (2). */
> + {6, 6, 6}, /* cost of storing integer registers */
> + 4, /* cost of reg,reg fld/fst */
> + {6, 6, 12}, /* cost of loading fp registers
> + in SFmode, DFmode and XFmode */
> + {6, 6, 12}, /* cost of storing fp registers
> + in SFmode, DFmode and XFmode */
> + 2, /* cost of moving MMX register */
> + {6, 6}, /* cost of loading MMX registers
> + in SImode and DImode */
> + {6, 6}, /* cost of storing MMX registers
> + in SImode and DImode */
> + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
> + {6, 6, 6, 10, 15}, /* cost of loading SSE registers
> + in 32,64,128,256 and 512-bit */
> + {6, 6, 6, 10, 15}, /* cost of storing SSE registers
> + in 32,64,128,256 and 512-bit */
> + 6, 6, /* SSE->integer and integer->SSE moves */
> + 6, 6, /* mask->integer and integer->mask moves */
> + {6, 6, 6}, /* cost of loading mask register
> + in QImode, HImode, SImode. */
> + {6, 6, 6}, /* cost of storing mask register
> + in QImode, HImode, SImode. */
> + 2, /* cost of moving mask register. */
> + /* End of register allocator costs. */
> + },
> +
> + COSTS_N_INSNS (1), /* cost of an add instruction */
> + /* Setting cost to 2 makes our current implementation of synth_mult result in
> + use of unnecessary temporary registers causing regression on several
> + SPECfp benchmarks. */
> + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
> + COSTS_N_INSNS (1), /* variable shift costs */
> + COSTS_N_INSNS (1), /* constant shift costs */
> + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
> + COSTS_N_INSNS (4), /* HI */
> + COSTS_N_INSNS (3), /* SI */
> + COSTS_N_INSNS (4), /* DI */
> + COSTS_N_INSNS (4)}, /* other */
> + 0, /* cost of multiply per each bit set */
> + {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
> + COSTS_N_INSNS (22), /* HI */
> + COSTS_N_INSNS (30), /* SI */
> + COSTS_N_INSNS (74), /* DI */
> + COSTS_N_INSNS (74)}, /* other */
> + COSTS_N_INSNS (1), /* cost of movsx */
> + COSTS_N_INSNS (1), /* cost of movzx */
> + 8, /* "large" insn */
> + 17, /* MOVE_RATIO */
> + 17, /* CLEAR_RATIO */
> + {6, 6, 6}, /* cost of loading integer registers
> + in QImode, HImode and SImode.
> + Relative to reg-reg move (2). */
> + {6, 6, 6}, /* cost of storing integer registers */
> + {6, 6, 6, 10, 15}, /* cost of loading SSE register
> + in 32bit, 64bit, 128bit, 256bit and 512bit */
> + {6, 6, 6, 10, 15}, /* cost of storing SSE register
> + in 32bit, 64bit, 128bit, 256bit and 512bit */
> + {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
> + {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
> + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
> + 6, /* cost of moving SSE register to integer. */
> + 18, 6, /* Gather load static, per_elt. */
> + 18, 6, /* Gather store static, per_elt. */
> + 32, /* size of l1 cache. */
> + 512, /* size of l2 cache. */
> + 64, /* size of prefetch block */
> + 6, /* number of parallel prefetches */
> + /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
> + value is increased to perhaps more appropriate value of 5. */
> + 3, /* Branch cost */
> + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
> + COSTS_N_INSNS (5), /* cost of FMUL instruction. */
> + COSTS_N_INSNS (17), /* cost of FDIV instruction. */
> + COSTS_N_INSNS (1), /* cost of FABS instruction. */
> + COSTS_N_INSNS (1), /* cost of FCHS instruction. */
> + COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
> +
> + COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
> + COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
> + COSTS_N_INSNS (4), /* cost of MULSS instruction. */
> + COSTS_N_INSNS (5), /* cost of MULSD instruction. */
> + COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
> + COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
> + COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
> + COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
> + COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
> + COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
> + 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
> + tremont_memcpy,
> + tremont_memset,
> + COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
> + COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
> + "16:11:8", /* Loop alignment. */
> + "16:11:8", /* Jump alignment. */
> + "0:0:8", /* Label alignment. */
> + "16", /* Func alignment. */
> +};
> +
> static stringop_algs intel_memcpy[2] = {
> {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
> {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index 385e275bbd9..088edb6c4ca 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -273,7 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
> move/set sequences of bytes with known size. */
> DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
> "prefer_known_rep_movsb_stosb",
> - m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
> + m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512)
>
> /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
> compact prologues and epilogues by issuing a misaligned moves. This
> --
> 2.17.1
>