From patchwork Wed Mar 16 14:46:14 2022
From: "Andre Vieira (lists)"
Date: Wed, 16 Mar 2022 14:46:14 +0000
To: gcc-patches@gcc.gnu.org
Cc: Richard Sandiford
Subject: [aarch64] Add Neoverse N2 tuning structs

Hi,

This patch adds tuning structures for Neoverse N2.

2022-03-16  Tamar Christina
	    Andre Vieira

	* config/aarch64/aarch64.cc (neoversen2_addrcost_table,
	neoversen2_regmove_cost, neoversen2_advsimd_vector_cost,
	neoversen2_sve_vector_cost, neoversen2_scalar_issue_info,
	neoversen2_advsimd_issue_info, neoversen2_sve_issue_info,
	neoversen2_vec_issue_info, neoversen2_vector_cost): New structs.
	(neoversen2_tunings): Use new structs and update tuning flags.
	(aarch64_vec_op_count::rename_cycles_per_iter): Enable for
	neoversen2 tuning.
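For reference, every SVE reduction cost in the new table below follows the
same latency-vs-cost comparison spelt out in its comments: take the cost of
the equivalent scalar tree reduction, then adjust by the cycle difference
between the SVE reduction instruction and the scalar sequence.  A minimal
sketch of that arithmetic (the helper is illustrative only, not part of the
patch):

  /* Cost of an SVE reduction, given the cost and approximate latency of
     the equivalent scalar reduction and the latency of the SVE
     instruction.  */
  static int
  sve_reduc_cost_sketch (int scalar_cost, int scalar_cycles, int sve_cycles)
  {
    return scalar_cost + (sve_cycles - scalar_cycles);
  }

  /* reduc_i8_cost: 15 scalar ADDs (~5 cycles) vs. [SU]ADDV in 11 cycles:
       sve_reduc_cost_sketch (15, 5, 11) == 21.
     reduc_f16_cost: 7 scalar FADDs at cost 2 each (~8 cycles) vs. FADDV
       in 6 cycles: sve_reduc_cost_sketch (14, 8, 6) == 12.  */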
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d504fe7607b66a9c9ed9b183a2d3c03d34fb0f80..e0bb447beb9eae74551d863505eb265737d36334 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -519,6 +519,24 @@ static const struct cpu_addrcost_table neoversev1_addrcost_table =
   0 /* imm_offset */
 };
 
+static const struct cpu_addrcost_table neoversen2_addrcost_table =
+{
+  {
+    1, /* hi */
+    0, /* si */
+    0, /* di */
+    1, /* ti */
+  },
+  0, /* pre_modify */
+  0, /* post_modify */
+  2, /* post_modify_ld3_st3 */
+  2, /* post_modify_ld4_st4 */
+  0, /* register_offset */
+  0, /* register_sextend */
+  0, /* register_zextend */
+  0 /* imm_offset */
+};
+
 static const struct cpu_regmove_cost generic_regmove_cost =
 {
   1, /* GP2GP */
@@ -624,6 +642,16 @@ static const struct cpu_regmove_cost a64fx_regmove_cost =
   2 /* FP2FP */
 };
 
+static const struct cpu_regmove_cost neoversen2_regmove_cost =
+{
+  1, /* GP2GP */
+  /* Spilling to int<->fp instead of memory is recommended so set
+     realistic costs compared to memmov_cost.  */
+  3, /* GP2FP */
+  2, /* FP2GP */
+  2 /* FP2FP */
+};
+
 /* Generic costs for Advanced SIMD vector operations.  */
 static const advsimd_vec_cost generic_advsimd_vector_cost =
 {
@@ -2174,12 +2202,166 @@ static const struct tune_params neoverse512tvb_tunings =
   &generic_prefetch_tune
 };
 
+static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost */
+  2, /* fp_stmt_cost */
+  2, /* ld2_st2_permute_cost */
+  2, /* ld3_st3_permute_cost */
+  3, /* ld4_st4_permute_cost */
+  3, /* permute_cost */
+  4, /* reduc_i8_cost */
+  4, /* reduc_i16_cost */
+  2, /* reduc_i32_cost */
+  2, /* reduc_i64_cost */
+  6, /* reduc_f16_cost */
+  4, /* reduc_f32_cost */
+  2, /* reduc_f64_cost */
+  2, /* store_elt_extra_cost */
+  /* This value is just inherited from the Cortex-A57 table.  */
+  8, /* vec_to_scalar_cost */
+  /* This depends very much on what the scalar value is and
+     where it comes from.  E.g. some constants take two dependent
+     instructions or a load, while others might be moved from a GPR.
+     4 seems to be a reasonable compromise in practice.  */
+  4, /* scalar_to_vec_cost */
+  4, /* align_load_cost */
+  4, /* unalign_load_cost */
+  /* Although stores have a latency of 2 and compete for the
+     vector pipes, in practice it's better not to model that.  */
+  1, /* unalign_store_cost */
+  1 /* store_cost */
+};
+
+static const sve_vec_cost neoversen2_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost */
+    2, /* fp_stmt_cost */
+    3, /* ld2_st2_permute_cost */
+    4, /* ld3_st3_permute_cost */
+    4, /* ld4_st4_permute_cost */
+    3, /* permute_cost */
+    /* Theoretically, a reduction involving 15 scalar ADDs could
+       complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
+       completes in 11 cycles, so give it a cost of 15 + 6.  */
+    21, /* reduc_i8_cost */
+    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
+    13, /* reduc_i16_cost */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
+    9, /* reduc_i32_cost */
+    /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1.  */
+    2, /* reduc_i64_cost */
+    /* Theoretically, a reduction involving 7 scalar FADDs could
+       complete in ~8 cycles and would have a cost of 14.  FADDV
+       completes in 6 cycles, so give it a cost of 14 - 2.  */
+    12, /* reduc_f16_cost */
+    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
+    6, /* reduc_f32_cost */
+    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
+    2, /* reduc_f64_cost */
+    2, /* store_elt_extra_cost */
+    /* This value is just inherited from the Cortex-A57 table.  */
+    8, /* vec_to_scalar_cost */
+    /* See the comment above the Advanced SIMD versions.  */
+    4, /* scalar_to_vec_cost */
+    4, /* align_load_cost */
+    4, /* unalign_load_cost */
+    /* Although stores have a latency of 2 and compete for the
+       vector pipes, in practice it's better not to model that.  */
+    1, /* unalign_store_cost */
+    1 /* store_cost */
+  },
+  3, /* clast_cost */
+  10, /* fadda_f16_cost */
+  6, /* fadda_f32_cost */
+  4, /* fadda_f64_cost */
+  /* A strided Advanced SIMD x64 load would take two parallel FP loads
+     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
+     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
+     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
+     (cost 2) to that, to avoid the difference being lost in rounding.
+
+     There is no easy comparison between a strided Advanced SIMD x32 load
+     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
+     operation more than a 64-bit gather.  */
+  14, /* gather_load_x32_cost */
+  12, /* gather_load_x64_cost */
+  3 /* scatter_store_elt_cost */
+};
+
+static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
+{
+  3, /* loads_stores_per_cycle */
+  2, /* stores_per_cycle */
+  4, /* general_ops_per_cycle */
+  0, /* fp_simd_load_general_ops */
+  1 /* fp_simd_store_general_ops */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
+{
+  {
+    3, /* loads_stores_per_cycle */
+    2, /* stores_per_cycle */
+    2, /* general_ops_per_cycle */
+    0, /* fp_simd_load_general_ops */
+    1 /* fp_simd_store_general_ops */
+  },
+  2, /* ld2_st2_general_ops */
+  2, /* ld3_st3_general_ops */
+  3 /* ld4_st4_general_ops */
+};
+
+static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
+{
+  {
+    {
+      3, /* loads_per_cycle */
+      2, /* stores_per_cycle */
+      2, /* general_ops_per_cycle */
+      0, /* fp_simd_load_general_ops */
+      1 /* fp_simd_store_general_ops */
+    },
+    2, /* ld2_st2_general_ops */
+    3, /* ld3_st3_general_ops */
+    3 /* ld4_st4_general_ops */
+  },
+  2, /* pred_ops_per_cycle */
+  2, /* while_pred_ops */
+  2, /* int_cmp_pred_ops */
+  1, /* fp_cmp_pred_ops */
+  1, /* gather_scatter_pair_general_ops */
+  1 /* gather_scatter_pair_pred_ops */
+};
+
+static const aarch64_vec_issue_info neoversen2_vec_issue_info =
+{
+  &neoversen2_scalar_issue_info,
+  &neoversen2_advsimd_issue_info,
+  &neoversen2_sve_issue_info
+};
+
+/* Neoverse N2 costs for vector insn classes.  */
+static const struct cpu_vector_cost neoversen2_vector_cost =
+{
+  1, /* scalar_int_stmt_cost */
+  2, /* scalar_fp_stmt_cost */
+  4, /* scalar_load_cost */
+  1, /* scalar_store_cost */
+  1, /* cond_taken_branch_cost */
+  1, /* cond_not_taken_branch_cost */
+  &neoversen2_advsimd_vector_cost, /* advsimd */
+  &neoversen2_sve_vector_cost, /* sve */
+  &neoversen2_vec_issue_info /* issue_info */
+};
+
 static const struct tune_params neoversen2_tunings =
 {
   &cortexa76_extra_costs,
-  &generic_addrcost_table,
-  &generic_regmove_cost,
-  &cortexa57_vector_cost,
+  &neoversen2_addrcost_table,
+  &neoversen2_regmove_cost,
+  &neoversen2_vector_cost,
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_128, /* sve_width */
@@ -2202,7 +2384,10 @@ static const struct tune_params neoversen2_tunings =
   2,	/* min_div_recip_mul_df.  */
   0,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
+   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
   &generic_prefetch_tune
 };
 
@@ -15131,7 +15316,8 @@ aarch64_vec_op_count::sve_issue_info () const
 fractional_cost
 aarch64_vec_op_count::rename_cycles_per_iter () const
 {
-  if (sve_issue_info () == &neoverse512tvb_sve_issue_info)
+  if (sve_issue_info () == &neoverse512tvb_sve_issue_info
+      || sve_issue_info () == &neoversen2_sve_issue_info)
     /* + 1 for an addition.  We've already counted a general op for each
        store, so we don't need to account for stores separately.  The branch
       reads no registers and so does not need to be counted either.
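The gather costs in neoversen2_sve_vector_cost are likewise just the sum of
the parts named in their comment; a quick check (the constant names are
mine, with values taken from neoversen2_vector_cost above):

  enum { scalar_load_cost = 4, vector_op_cost = 2 };
  /* 64-bit gather: two scalar FP loads, a vec_construct and one full
     vector op of headroom: 4 + 4 + 2 + 2 == 12 == gather_load_x64_cost.
     32-bit gather: one vector op more: 12 + 2 == 14 == gather_load_x32_cost.  */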
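The new tables take effect when tuning for Neoverse N2, e.g. on an AArch64
toolchain (file name and loop are illustrative only):

  $ gcc -O3 -mcpu=neoverse-n2 -S reduc.c

  /* reduc.c: a reduction loop whose vectorised form is now costed with
     neoversen2_sve_vector_cost and the neoversen2 issue info rather than
     the Cortex-A57 tables.  */
  int
  sum (int *x, int n)
  {
    int s = 0;
    for (int i = 0; i < n; i++)
      s += x[i];
    return s;
  }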