diff mbox series

aarch64: Add the cost model for Neoverse N1

Message ID	6A93A02F-3719-4751-9055-C774F8FC1D78@icloud.com
State	New
Headers	DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org C9A853856954 Content-Type: text/plain; charset=utf-8 Mime-Version: 1.0 (Mac OS X Mail 16.0 \(3731.400.51.1.1\)) Subject: [PATCH] aarch64: Add the cost model for Neoverse N1 In-Reply-To: <8E0E3524-094D-43CD-93B1-B99D26ABD724@icloud.com> Date: Tue, 18 Apr 2023 16:41:47 -0500 Content-Transfer-Encoding: quoted-printable Message-Id: <6A93A02F-3719-4751-9055-C774F8FC1D78@icloud.com> References: <8E0E3524-094D-43CD-93B1-B99D26ABD724@icloud.com> To: gcc-patches@gcc.gnu.org Precedence: list From: Evandro Menezes via Gcc-patches <gcc-patches@gcc.gnu.org> Reply-To: evandro+gcc-patches@gcc.gnu.org Cc: Evandro Menezes <ebahapo@icloud.com>, Richard Sandiford <richard.sandiford@arm.com>, Kyrylo Tkachov <Kyrylo.Tkachov@arm.com> Errors-To: gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org Sender: "Gcc-patches" <gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org>
Series	aarch64: Add the cost model for Neoverse N1 \| aarch64: Add the cost model for Neoverse N1

Commit Message

Evandro Menezes April 18, 2023, 9:41 p.m. UTC

  This patch adds the cost model for Neoverse N1, based on the information from the "Arm Neoverse N1 Software Optimization Guide".

Comments

Tamar Christina April 24, 2023, 5:37 p.m. UTC | #1

Hi Evandro,

I wanted to give this patch a try, but the diff seems corrupt: the whitespace at the start of the context lines seems to have gone missing.

Could you try resending it?

Thanks,
Tamar

> -----Original Message-----
> From: Gcc-patches <gcc-patches-
> bounces+tamar.christina=arm.com@gcc.gnu.org> On Behalf Of Evandro
> Menezes via Gcc-patches
> Sent: Tuesday, April 18, 2023 10:42 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Evandro Menezes <ebahapo@icloud.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: [PATCH] aarch64: Add the cost model for Neoverse N1
> 
> This patch adds the cost model for Neoverse N1, based on the information
> from the "Arm Neoverse N1 Software Optimization Guide".
> 
> --
> Evandro Menezes
> 
> ===================================================================
> =============
> 
> gcc/ChangeLog:
> 
>        * config/aarch64/aarch64-cores.def: Use the Neoverse N1 cost model.
>        * config/aarch64/aarch64.cc
>        (cortexa76_tunings): Rename variable.
>        (neoversen1_addrcost_table): New variable.
>        (neoversen1_regmove_cost): Likewise.
>        (neoversen1_advsimd_vector_cost): Likewise.
>        (neoversen1_scalar_issue_info): Likewise.
>        (neoversen1_advsimd_issue_info): Likewise.
>        (neoversen1_vec_issue_info): Likewise.
>        (neoversen1_vector_cost): Likewise.
>        (neoversen1_tunings): Likewise.
>        * config/arm/aarch-cost-tables.h
>        (neoversen1_extra_costs): New variable.
> 
> Signed-off-by: Evandro Menezes <evandro@gcc.gnu.org>
> ---
> gcc/config/aarch64/aarch64-cores.def |  20 ++--
> gcc/config/aarch64/aarch64.cc        | 155 ++++++++++++++++++++++++---
> gcc/config/arm/aarch-cost-tables.h   | 107 ++++++++++++++++++
> 3 files changed, 259 insertions(+), 23 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64-cores.def
> b/gcc/config/aarch64/aarch64-cores.def
> index 2ec88c98400..e352e4077b1 100644
> --- a/gcc/config/aarch64/aarch64-cores.def
> +++ b/gcc/config/aarch64/aarch64-cores.def
> @@ -105,17 +105,17 @@ AARCH64_CORE("thunderx2t99",  thunderx2t99,
> thunderx2t99, V8_1A,  (CRYPTO), thu
> /* ARM ('A') cores. */
> AARCH64_CORE("cortex-a55",  cortexa55, cortexa53, V8_2A,  (F16, RCPC,
> DOTPROD), cortexa53, 0x41, 0xd05, -1) AARCH64_CORE("cortex-a75",
> cortexa75, cortexa57, V8_2A,  (F16, RCPC, DOTPROD), cortexa73, 0x41,
> 0xd0a, -1) -AARCH64_CORE("cortex-a76",  cortexa76, cortexa57, V8_2A,
> (F16, RCPC, DOTPROD), neoversen1, 0x41, 0xd0b, -1) -
> AARCH64_CORE("cortex-a76ae",  cortexa76ae, cortexa57, V8_2A,  (F16,
> RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0e, -1) -
> AARCH64_CORE("cortex-a77",  cortexa77, cortexa57, V8_2A,  (F16, RCPC,
> DOTPROD, SSBS), neoversen1, 0x41, 0xd0d, -1) -AARCH64_CORE("cortex-
> a78",  cortexa78, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE),
> neoversen1, 0x41, 0xd41, -1) -AARCH64_CORE("cortex-a78ae",  cortexa78ae,
> cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41,
> 0xd42, -1) -AARCH64_CORE("cortex-a78c",  cortexa78c, cortexa57, V8_2A,
> (F16, RCPC, DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), neoversen1, 0x41,
> 0xd4b, -1)
> +AARCH64_CORE("cortex-a76",  cortexa76, cortexa57, V8_2A,  (F16, RCPC,
> +DOTPROD), cortexa76, 0x41, 0xd0b, -1) AARCH64_CORE("cortex-a76ae",
> +cortexa76ae, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa76,
> +0x41, 0xd0e, -1) AARCH64_CORE("cortex-a77",  cortexa77, cortexa57,
> +V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0d, -1)
> +AARCH64_CORE("cortex-a78",  cortexa78, cortexa57, V8_2A,  (F16, RCPC,
> +DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd41, -1)
> +AARCH64_CORE("cortex-a78ae",  cortexa78ae, cortexa57, V8_2A,  (F16,
> +RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd42, -1)
> +AARCH64_CORE("cortex-a78c",  cortexa78c, cortexa57, V8_2A,  (F16, RCPC,
> +DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), cortexa76, 0x41, 0xd4b, -1)
> AARCH64_CORE("cortex-a65",  cortexa65, cortexa53, V8_2A,  (F16, RCPC,
> DOTPROD, SSBS), cortexa73, 0x41, 0xd06, -1) AARCH64_CORE("cortex-
> a65ae",  cortexa65ae, cortexa53, V8_2A,  (F16, RCPC, DOTPROD, SSBS),
> cortexa73, 0x41, 0xd43, -1) -AARCH64_CORE("cortex-x1",  cortexx1,
> cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41,
> 0xd44, -1) -AARCH64_CORE("cortex-x1c",  cortexx1c, cortexa57, V8_2A,
> (F16, RCPC, DOTPROD, SSBS, PROFILE, PAUTH), neoversen1, 0x41, 0xd4c, -1)
> -AARCH64_CORE("ares",  ares, cortexa57, V8_2A,  (F16, RCPC, DOTPROD,
> PROFILE), neoversen1, 0x41, 0xd0c, -1)
> +AARCH64_CORE("cortex-x1",  cortexx1, cortexa57, V8_2A,  (F16, RCPC,
> +DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd44, -1)
> +AARCH64_CORE("cortex-x1c",  cortexx1c, cortexa57, V8_2A,  (F16, RCPC,
> +DOTPROD, SSBS, PROFILE, PAUTH), cortexa76, 0x41, 0xd4c, -1)
> +AARCH64_CORE("ares",  ares, cortexa57, V8_2A,  (F16, RCPC, DOTPROD,
> +PROFILE), cortexa76, 0x41, 0xd0c, -1)
> AARCH64_CORE("neoverse-n1",  neoversen1, cortexa57, V8_2A,  (F16, RCPC,
> DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
> AARCH64_CORE("neoverse-e1",  neoversee1, cortexa53, V8_2A,  (F16, RCPC,
> DOTPROD, SSBS), cortexa73, 0x41, 0xd4a, -1)
> 
> @@ -160,7 +160,7 @@ AARCH64_CORE("cortex-a73.cortex-a53",
> cortexa73cortexa53, cortexa53, V8A,  (CRC
> /* ARM DynamIQ big.LITTLE configurations.  */
> 
> AARCH64_CORE("cortex-a75.cortex-a55",  cortexa75cortexa55, cortexa53,
> V8_2A,  (F16, RCPC, DOTPROD), cortexa73, 0x41, AARCH64_BIG_LITTLE
> (0xd0a, 0xd05), -1) -AARCH64_CORE("cortex-a76.cortex-a55",
> cortexa76cortexa55, cortexa53, V8_2A,  (F16, RCPC, DOTPROD), neoversen1,
> 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1)
> +AARCH64_CORE("cortex-a76.cortex-a55",  cortexa76cortexa55, cortexa53,
> +V8_2A,  (F16, RCPC, DOTPROD), cortexa76, 0x41, AARCH64_BIG_LITTLE
> +(0xd0b, 0xd05), -1)
> 
> /* Armv8-R Architecture Processors.  */
> AARCH64_CORE("cortex-r82", cortexr82, cortexa53, V8R, (), cortexa53, 0x41,
> 0xd15, -1) diff --git a/gcc/config/aarch64/aarch64.cc
> b/gcc/config/aarch64/aarch64.cc index 42617ced73a..46710490a39
> 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -1867,7 +1867,7 @@ static const struct tune_params
> thunderx3t110_tunings =
>   &thunderx3t110_prefetch_tune
> };
> 
> -static const struct tune_params neoversen1_tunings =
> +static const struct tune_params cortexa76_tunings =
> {
>   &cortexa76_extra_costs,
>   &generic_addrcost_table,
> @@ -1885,18 +1885,18 @@ static const struct tune_params
> neoversen1_tunings =
>   }, /* memmov_cost.  */
>   3, /* issue_rate  */
>   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /*
> fusible_ops  */
> -  "32:16", /* function_align.  */
> -  "4", /* jump_align.  */
> -  "32:16", /* loop_align.  */
> -  2, /* int_reassoc_width.  */
> -  4, /* fp_reassoc_width.  */
> -  1, /* fma_reassoc_width.  */
> -  2, /* vec_reassoc_width.  */
> -  2, /* min_div_recip_mul_sf.  */
> -  2, /* min_div_recip_mul_df.  */
> -  0, /* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags.  */
> +  "32:16",     /* function_align.  */
> +  "4",         /* jump_align.  */
> +  "32:16",     /* loop_align.  */
> +  2,   /* int_reassoc_width.  */
> +  4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
> +  2,   /* vec_reassoc_width.  */
> +  2,   /* min_div_recip_mul_sf.  */
> +  2,   /* min_div_recip_mul_df.  */
> +  0,   /* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),     /* tune_flags.  */
>   &generic_prefetch_tune
> };
> 
> @@ -2293,6 +2293,135 @@ static const struct tune_params
> neoverse512tvb_tunings =
>   &generic_prefetch_tune
> };
> 
> +static const struct cpu_addrcost_table neoversen1_addrcost_table = {
> +    {
> +      0, /* hi  */
> +      0, /* si  */
> +      0, /* di  */
> +      1, /* ti  */
> +    },
> +  0, /* pre_modify  */
> +  0, /* post_modify  */
> +  1, /* post_modify_ld3_st3  */
> +  1, /* post_modify_ld4_st4  */
> +  0, /* register_offset  */
> +  0, /* register_sextend  */
> +  0, /* register_zextend  */
> +  0 /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost neoversen1_regmove_cost = {
> +  1, /* GP2GP  */
> +  /* Avoid the use of slow int<->fp moves for spilling by setting
> +     their cost higher than memmov_cost.  */
> +  3, /* GP2FP  */
> +  2, /* FP2GP  */
> +  2 /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost neoversen1_advsimd_vector_cost = {
> +  2, /* int_stmt_cost  */
> +  2, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  3, /* permute_cost  */
> +  6, /* reduc_i8_cost  */
> +  5, /* reduc_i16_cost  */
> +  3, /* reduc_i32_cost  */
> +  3, /* reduc_i64_cost  */
> +  8, /* reduc_f16_cost  */
> +  5, /* reduc_f32_cost  */
> +  5, /* reduc_f64_cost  */
> +  0, /* store_elt_extra_cost  */
> +  2, /* vec_to_scalar_cost  */
> +  2, /* scalar_to_vec_cost  */
> +  4, /* align_load_cost  */
> +  4, /* unalign_load_cost  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +static const aarch64_scalar_vec_issue_info neoversen1_scalar_issue_info
> += {
> +  2, /* loads_stores_per_cycle  */
> +  2, /* stores_per_cycle  */
> +  2, /* general_ops_per_cycle  */
> +  0, /* fp_simd_load_general_ops  */
> +  1 /* fp_simd_store_general_ops  */
> +};
> +
> +static const aarch64_advsimd_vec_issue_info
> +neoversen1_advsimd_issue_info = {
> +  {
> +    2, /* loads_stores_per_cycle  */
> +    2, /* stores_per_cycle  */
> +    2, /* general_ops_per_cycle  */
> +    0, /* fp_simd_load_general_ops  */
> +    1 /* fp_simd_store_general_ops  */
> +  },
> +  3, /* ld2_st2_general_ops  */
> +  5, /* ld3_st3_general_ops  */
> +  11 /* ld4_st4_general_ops  */
> +};
> +
> +static const aarch64_vec_issue_info neoversen1_vec_issue_info = {
> +  &neoversen1_scalar_issue_info, /* scalar  */
> +  &neoversen1_advsimd_issue_info, /* advsimd  */
> +  nullptr /* sve  */
> +};
> +
> +
> +static const struct cpu_vector_cost neoversen1_vector_cost = {
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
> +  4, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  1, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &neoversen1_advsimd_vector_cost, /* advsimd  */
> +  nullptr, /* sve  */
> +  &neoversen1_vec_issue_info /* issue_info  */ };
> +
> +static const struct tune_params neoversen1_tunings = {
> +  &neoversen1_extra_costs,
> +  &neoversen1_addrcost_table,
> +  &neoversen1_regmove_cost,
> +  &neoversen1_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    2, /* store_int.  */
> +    5, /* load_fp.  */
> +    2, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  4, /* issue_rate  */
> +  AARCH64_FUSE_AES_AESMC, /* fusible_ops  */
> +  "32:16", /* function_align.  */
> +  "4", /* jump_align.  */
> +  "32:16", /* loop_align.  */
> +  2, /* int_reassoc_width.  */
> +  4, /* fp_reassoc_width.  */
> +  1, /* fma_reassoc_width.  */
> +  2, /* vec_reassoc_width.  */
> +  2, /* min_div_recip_mul_sf.  */
> +  2, /* min_div_recip_mul_df.  */
> +  0, /* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
> +  AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, /* tune_flags.  */
> +  &generic_prefetch_tune
> +};
> +
> static const advsimd_vec_cost neoversen2_advsimd_vector_cost = {
>   2, /* int_stmt_cost  */
> diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-
> tables.h
> index e3848214728..fce6da6bbcc 100644
> --- a/gcc/config/arm/aarch-cost-tables.h
> +++ b/gcc/config/arm/aarch-cost-tables.h
> @@ -450,6 +450,113 @@ const struct cpu_cost_table cortexa76_extra_costs
> =
>   }
> };
> 
> +const struct cpu_cost_table neoversen1_extra_costs = {
> +  /* ALU */
> +  {
> +    0,                 /* arith.  */
> +    0,                 /* logical.  */
> +    0,                 /* shift.  */
> +    0,                 /* shift_reg.  */
> +    COSTS_N_INSNS (1), /* arith_shift.  */
> +    COSTS_N_INSNS (1), /* arith_shift_reg.  */
> +    0,       /* log_shift.  */
> +    COSTS_N_INSNS (1), /* log_shift_reg.  */
> +    0,                 /* extend.  */
> +    COSTS_N_INSNS (1), /* extend_arith.  */
> +    COSTS_N_INSNS (1), /* bfi.  */
> +    0,                 /* bfx.  */
> +    0,                 /* clz.  */
> +    0,                 /* rev.  */
> +    0,                 /* non_exec.  */
> +    true               /* non_exec_costs_exec.  */
> +  },
> +  {
> +    /* MULT SImode */
> +    {
> +      COSTS_N_INSNS (1),       /* simple.  */
> +      COSTS_N_INSNS (2),       /* flag_setting.  */
> +      COSTS_N_INSNS (1),       /* extend.  */
> +      COSTS_N_INSNS (1),       /* add.  */
> +      COSTS_N_INSNS (1),       /* extend_add.  */
> +      COSTS_N_INSNS (11)       /* idiv.  */
> +    },
> +    /* MULT DImode */
> +    {
> +      COSTS_N_INSNS (3),       /* simple.  */
> +      0,                       /* flag_setting (N/A).  */
> +      COSTS_N_INSNS (1),       /* extend.  */
> +      COSTS_N_INSNS (3),       /* add.  */
> +      COSTS_N_INSNS (1),       /* extend_add.  */
> +      COSTS_N_INSNS (19)       /* idiv.  */
> +    }
> +  },
> +  /* LD/ST */
> +  {
> +    COSTS_N_INSNS (3),         /* load.  */
> +    COSTS_N_INSNS (3),         /* load_sign_extend.  */
> +    COSTS_N_INSNS (3),         /* ldrd.  */
> +    COSTS_N_INSNS (2),         /* ldm_1st.  */
> +    1,                         /* ldm_regs_per_insn_1st.  */
> +    2,                         /* ldm_regs_per_insn_subsequent.  */
> +    COSTS_N_INSNS (4),         /* loadf.  */
> +    COSTS_N_INSNS (4),         /* loadd.  */
> +    COSTS_N_INSNS (3),         /* load_unaligned.  */
> +    0,                         /* store.  */
> +    0,                         /* strd.  */
> +    0,                         /* stm_1st.  */
> +    1,                         /* stm_regs_per_insn_1st.  */
> +    2,                         /* stm_regs_per_insn_subsequent.  */
> +    0,                         /* storef.  */
> +    0,                         /* stored.  */
> +    COSTS_N_INSNS (1),         /* store_unaligned.  */
> +    COSTS_N_INSNS (1),         /* loadv.  */
> +    COSTS_N_INSNS (1)          /* storev.  */
> +  },
> +  {
> +    /* FP SFmode */
> +    {
> +      COSTS_N_INSNS (9),       /* div.  */
> +      COSTS_N_INSNS (2),       /* mult.  */
> +      COSTS_N_INSNS (3),       /* mult_addsub.  */
> +      COSTS_N_INSNS (3),       /* fma.  */
> +      COSTS_N_INSNS (1),       /* addsub.  */
> +      COSTS_N_INSNS (1),       /* fpconst.  */
> +      0,                       /* neg.  */
> +      0,                       /* compare.  */
> +      COSTS_N_INSNS (1),       /* widen.  */
> +      COSTS_N_INSNS (1),       /* narrow.  */
> +      COSTS_N_INSNS (1),       /* toint.  */
> +      COSTS_N_INSNS (1),       /* fromint.  */
> +      COSTS_N_INSNS (1)        /* roundint.  */
> +    },
> +    /* FP DFmode */
> +    {
> +      COSTS_N_INSNS (14),      /* div.  */
> +      COSTS_N_INSNS (2),       /* mult.  */
> +      COSTS_N_INSNS (3),       /* mult_addsub.  */
> +      COSTS_N_INSNS (3),       /* fma.  */
> +      COSTS_N_INSNS (1),       /* addsub.  */
> +      COSTS_N_INSNS (1),       /* fpconst.  */
> +      0,                       /* neg.  */
> +      0,                       /* compare.  */
> +      COSTS_N_INSNS (1),       /* widen.  */
> +      COSTS_N_INSNS (1),       /* narrow.  */
> +      COSTS_N_INSNS (1),       /* toint.  */
> +      COSTS_N_INSNS (1),       /* fromint.  */
> +      COSTS_N_INSNS (1)        /* roundint.  */
> +    }
> +  },
> +  /* Vector */
> +  {
> +    COSTS_N_INSNS (1),  /* alu.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (1),  /* dup.  */
> +    COSTS_N_INSNS (1)   /* extract.  */
> +  }
> +};
> +
> const struct cpu_cost_table exynosm1_extra_costs = {
>   /* ALU */
> --
> 2.39.2 (Apple Git-143)
> 
> 
> 
> 
> --
> Evandro Menezes ◊ evandro@yahoo.com ◊ Austin, TX
> Άγιος ο Θεός ⁂ ܩܕܝܫܐ ܐܢ̱ܬ ܠܐ ܡܝܘܬܐ ⁂ Sanctus Deus

Evandro Menezes April 24, 2023, 10:48 p.m. UTC | #2

Hi, Tamar.

Does this work?

Thank you,

Evandro Menezes April 24, 2023, 10:51 p.m. UTC | #3

Sorry, but it seems that, before sending, the email client is stripping leading spaces.  I’m attaching the file here.

Tamar Christina April 25, 2023, 10:03 a.m. UTC | #4

Thanks Evandro,

That one works.  I’ll run the new cost model and sched modules through a number of workloads and come back with the results.

Cheers,
Tamar

From: Evandro Menezes <ebahapo@icloud.com>
Sent: Monday, April 24, 2023 11:52 PM
To: Evandro Menezes <ebahapo@icloud.com>
Cc: Tamar Christina <Tamar.Christina@arm.com>; evandro+gcc-patches@gcc.gnu.org; gcc-patches@gcc.gnu.org; Richard Sandiford <Richard.Sandiford@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
Subject: Re: [PATCH] aarch64: Add the cost model for Neoverse N1

Sorry, but it seems that, before sending, the email client is stripping leading spaces.  I’m attaching the file here.

--
Evandro Menezes ◊ evandro@yahoo.com<mailto:evandro@yahoo.com> ◊ Austin, TX
Άγιος ο Θεός ⁂ ܩܕܝܫܐ ܐܢ̱ܬ ܠܐ ܡܝܘܬܐ ⁂ Sanctus Deus

Em 24 de abr. de 2023, à(s) 17:48, Evandro Menezes <ebahapo@icloud.com<mailto:ebahapo@icloud.com>> escreveu:

Hi, Tamar.

Does this work?

Thank you,

--
Evandro Menezes ◊ evandro@yahoo.com<mailto:evandro@yahoo.com> ◊ Austin, TX
Άγιος ο Θεός ⁂ ܩܕܝܫܐ ܐܢ̱ܬ ܠܐ ܡܝܘܬܐ ⁂ Sanctus Deus

Em 24 de abr. de 2023, à(s) 12:37, Tamar Christina <tamar.christina@arm.com<mailto:tamar.christina@arm.com>> escreveu:

Hi Evandro,

I wanted to give this patch a try, but the diff seems corrupt: the whitespace at the start of the context lines seems to have gone missing.

Could you try resending it?

Thanks,
Tamar

diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 2ec88c98400..e352e4077b1 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -105,17 +105,17 @@  AARCH64_CORE("thunderx2t99",  thunderx2t99,  thunderx2t99, V8_1A,  (CRYPTO), thu
/* ARM ('A') cores. */
AARCH64_CORE("cortex-a55",  cortexa55, cortexa53, V8_2A,  (F16, RCPC, DOTPROD), cortexa53, 0x41, 0xd05, -1)
AARCH64_CORE("cortex-a75",  cortexa75, cortexa57, V8_2A,  (F16, RCPC, DOTPROD), cortexa73, 0x41, 0xd0a, -1)
-AARCH64_CORE("cortex-a76",  cortexa76, cortexa57, V8_2A,  (F16, RCPC, DOTPROD), neoversen1, 0x41, 0xd0b, -1)
-AARCH64_CORE("cortex-a76ae",  cortexa76ae, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0e, -1)
-AARCH64_CORE("cortex-a77",  cortexa77, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0d, -1)
-AARCH64_CORE("cortex-a78",  cortexa78, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd41, -1)
-AARCH64_CORE("cortex-a78ae",  cortexa78ae, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd42, -1)
-AARCH64_CORE("cortex-a78c",  cortexa78c, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), neoversen1, 0x41, 0xd4b, -1)
+AARCH64_CORE("cortex-a76",  cortexa76, cortexa57, V8_2A,  (F16, RCPC, DOTPROD), cortexa76, 0x41, 0xd0b, -1)
+AARCH64_CORE("cortex-a76ae",  cortexa76ae, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0e, -1)
+AARCH64_CORE("cortex-a77",  cortexa77, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0d, -1)
+AARCH64_CORE("cortex-a78",  cortexa78, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd41, -1)
+AARCH64_CORE("cortex-a78ae",  cortexa78ae, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd42, -1)
+AARCH64_CORE("cortex-a78c",  cortexa78c, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), cortexa76, 0x41, 0xd4b, -1)
AARCH64_CORE("cortex-a65",  cortexa65, cortexa53, V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa73, 0x41, 0xd06, -1)
AARCH64_CORE("cortex-a65ae",  cortexa65ae, cortexa53, V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa73, 0x41, 0xd43, -1)
-AARCH64_CORE("cortex-x1",  cortexx1, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd44, -1)
-AARCH64_CORE("cortex-x1c",  cortexx1c, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE, PAUTH), neoversen1, 0x41, 0xd4c, -1)
-AARCH64_CORE("ares",  ares, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
+AARCH64_CORE("cortex-x1",  cortexx1, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd44, -1)
+AARCH64_CORE("cortex-x1c",  cortexx1c, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE, PAUTH), cortexa76, 0x41, 0xd4c, -1)
+AARCH64_CORE("ares",  ares, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, PROFILE), cortexa76, 0x41, 0xd0c, -1)
AARCH64_CORE("neoverse-n1",  neoversen1, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
AARCH64_CORE("neoverse-e1",  neoversee1, cortexa53, V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa73, 0x41, 0xd4a, -1)

@@ -160,7 +160,7 @@  AARCH64_CORE("cortex-a73.cortex-a53",  cortexa73cortexa53, cortexa53, V8A,  (CRC
/* ARM DynamIQ big.LITTLE configurations.  */

AARCH64_CORE("cortex-a75.cortex-a55",  cortexa75cortexa55, cortexa53, V8_2A,  (F16, RCPC, DOTPROD), cortexa73, 0x41, AARCH64_BIG_LITTLE (0xd0a, 0xd05), -1)
-AARCH64_CORE("cortex-a76.cortex-a55",  cortexa76cortexa55, cortexa53, V8_2A,  (F16, RCPC, DOTPROD), neoversen1, 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1)
+AARCH64_CORE("cortex-a76.cortex-a55",  cortexa76cortexa55, cortexa53, V8_2A,  (F16, RCPC, DOTPROD), cortexa76, 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1)

/* Armv8-R Architecture Processors.  */
AARCH64_CORE("cortex-r82", cortexr82, cortexa53, V8R, (), cortexa53, 0x41, 0xd15, -1)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 42617ced73a..46710490a39 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1867,7 +1867,7 @@  static const struct tune_params thunderx3t110_tunings =
  &thunderx3t110_prefetch_tune
};

-static const struct tune_params neoversen1_tunings =
+static const struct tune_params cortexa76_tunings =
{
  &cortexa76_extra_costs,
  &generic_addrcost_table,
@@ -1885,18 +1885,18 @@  static const struct tune_params neoversen1_tunings =
  }, /* memmov_cost.  */
  3, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16", /* function_align.  */
-  "4", /* jump_align.  */
-  "32:16", /* loop_align.  */
-  2, /* int_reassoc_width.  */
-  4, /* fp_reassoc_width.  */
-  1, /* fma_reassoc_width.  */
-  2, /* vec_reassoc_width.  */
-  2, /* min_div_recip_mul_sf.  */
-  2, /* min_div_recip_mul_df.  */
-  0, /* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags.  */
+  "32:16",     /* function_align.  */
+  "4",         /* jump_align.  */
+  "32:16",     /* loop_align.  */
+  2,   /* int_reassoc_width.  */
+  4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
+  2,   /* vec_reassoc_width.  */
+  2,   /* min_div_recip_mul_sf.  */
+  2,   /* min_div_recip_mul_df.  */
+  0,   /* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),     /* tune_flags.  */
  &generic_prefetch_tune
};

@@ -2293,6 +2293,135 @@  static const struct tune_params neoverse512tvb_tunings =
  &generic_prefetch_tune
};

+static const struct cpu_addrcost_table neoversen1_addrcost_table =
+{
+    {
+      0, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  1, /* post_modify_ld3_st3  */
+  1, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost neoversen1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  3, /* GP2FP  */
+  2, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost neoversen1_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  6, /* reduc_i8_cost  */
+  5, /* reduc_i16_cost  */
+  3, /* reduc_i32_cost  */
+  3, /* reduc_i64_cost  */
+  8, /* reduc_f16_cost  */
+  5, /* reduc_f32_cost  */
+  5, /* reduc_f64_cost  */
+  0, /* store_elt_extra_cost  */
+  2, /* vec_to_scalar_cost  */
+  2, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const aarch64_scalar_vec_issue_info neoversen1_scalar_issue_info =
+{
+  2, /* loads_stores_per_cycle  */
+  2, /* stores_per_cycle  */
+  2, /* general_ops_per_cycle  */
+  0, /* fp_simd_load_general_ops  */
+  1 /* fp_simd_store_general_ops  */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversen1_advsimd_issue_info =
+{
+  {
+    2, /* loads_stores_per_cycle  */
+    2, /* stores_per_cycle  */
+    2, /* general_ops_per_cycle  */
+    0, /* fp_simd_load_general_ops  */
+    1 /* fp_simd_store_general_ops  */
+  },
+  3, /* ld2_st2_general_ops  */
+  5, /* ld3_st3_general_ops  */
+  11 /* ld4_st4_general_ops  */
+};
+
+static const aarch64_vec_issue_info neoversen1_vec_issue_info =
+{
+  &neoversen1_scalar_issue_info, /* scalar  */
+  &neoversen1_advsimd_issue_info, /* advsimd  */
+  nullptr /* sve  */
+};
+
+
+static const struct cpu_vector_cost neoversen1_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &neoversen1_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  &neoversen1_vec_issue_info /* issue_info  */
+};
+
+static const struct tune_params neoversen1_tunings =
+{
+  &neoversen1_extra_costs,
+  &neoversen1_addrcost_table,
+  &neoversen1_regmove_cost,
+  &neoversen1_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    2, /* store_int.  */
+    5, /* load_fp.  */
+    2, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  AARCH64_FUSE_AES_AESMC, /* fusible_ops  */
+  "32:16", /* function_align.  */
+  "4", /* jump_align.  */
+  "32:16", /* loop_align.  */
+  2, /* int_reassoc_width.  */
+  4, /* fp_reassoc_width.  */
+  1, /* fma_reassoc_width.  */
+  2, /* vec_reassoc_width.  */
+  2, /* min_div_recip_mul_sf.  */
+  2, /* min_div_recip_mul_df.  */
+  0, /* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
+  AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, /* tune_flags.  */
+  &generic_prefetch_tune
+};
+
static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index e3848214728..fce6da6bbcc 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -450,6 +450,113 @@  const struct cpu_cost_table cortexa76_extra_costs =
  }
};

+const struct cpu_cost_table neoversen1_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    0,                 /* shift_reg.  */
+    COSTS_N_INSNS (1), /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    0,       /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    0,                 /* extend.  */
+    COSTS_N_INSNS (1), /* extend_arith.  */
+    COSTS_N_INSNS (1), /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,                 /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (1),       /* simple.  */
+      COSTS_N_INSNS (2),       /* flag_setting.  */
+      COSTS_N_INSNS (1),       /* extend.  */
+      COSTS_N_INSNS (1),       /* add.  */
+      COSTS_N_INSNS (1),       /* extend_add.  */
+      COSTS_N_INSNS (11)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (1),       /* extend.  */
+      COSTS_N_INSNS (3),       /* add.  */
+      COSTS_N_INSNS (1),       /* extend_add.  */
+      COSTS_N_INSNS (19)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (3),         /* load.  */
+    COSTS_N_INSNS (3),         /* load_sign_extend.  */
+    COSTS_N_INSNS (3),         /* ldrd.  */
+    COSTS_N_INSNS (2),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    2,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (4),         /* loadf.  */
+    COSTS_N_INSNS (4),         /* loadd.  */
+    COSTS_N_INSNS (3),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    2,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    COSTS_N_INSNS (1),         /* store_unaligned.  */
+    COSTS_N_INSNS (1),         /* loadv.  */
+    COSTS_N_INSNS (1)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (9),       /* div.  */
+      COSTS_N_INSNS (2),       /* mult.  */
+      COSTS_N_INSNS (3),       /* mult_addsub.  */
+      COSTS_N_INSNS (3),       /* fma.  */
+      COSTS_N_INSNS (1),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst.  */
+      0,                       /* neg.  */
+      0,                       /* compare.  */
+      COSTS_N_INSNS (1),       /* widen.  */
+      COSTS_N_INSNS (1),       /* narrow.  */
+      COSTS_N_INSNS (1),       /* toint.  */
+      COSTS_N_INSNS (1),       /* fromint.  */
+      COSTS_N_INSNS (1)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (14),      /* div.  */
+      COSTS_N_INSNS (2),       /* mult.  */
+      COSTS_N_INSNS (3),       /* mult_addsub.  */
+      COSTS_N_INSNS (3),       /* fma.  */
+      COSTS_N_INSNS (1),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst.  */
+      0,                       /* neg.  */
+      0,                       /* compare.  */
+      COSTS_N_INSNS (1),       /* widen.  */
+      COSTS_N_INSNS (1),       /* narrow.  */
+      COSTS_N_INSNS (1),       /* toint.  */
+      COSTS_N_INSNS (1),       /* fromint.  */
+      COSTS_N_INSNS (1)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (1),  /* alu.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
+  }
+};
+
const struct cpu_cost_table exynosm1_extra_costs =
{
  /* ALU */