[1/2] Implementation of new RISC-V optimization pass: fold-mem-offsets.

Message ID 20230525123550.1072506-2-manolis.tsamis@vrull.eu
State New
Series RISC-V: New pass to optimize calculation of offsets for memory operations.

Commit Message

Manolis Tsamis May 25, 2023, 12:35 p.m. UTC
  Implementation of the new RISC-V optimization pass for memory offset
calculations, documentation and testcases.
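
As a quick illustration (a sketch for this description, not part of the
patch itself; it mirrors the fold-mem-offsets-1.c testcase added below),
the typical source pattern that benefits is a stack-allocated array
indexed by a runtime value:

    /* Hypothetical example modeled on fold-mem-offsets-1.c: the address of
       arr[i] is typically built from an "addi tX,sp,<frame offset>" plus a
       scaled index, and the pass folds that constant frame offset into the
       lw/sw instructions so the addi can be deleted.  */
    void sink (int arr[2]);

    void
    foo (int a, int b, int i)
    {
      int arr[2] = {a, b};
      arr[i]++;
      sink (arr);
    }

Built with -O2 -mfold-mem-offsets (the options used by the testcases),
the expectation encoded in the test is that no lw/sw is left with a
negative offset.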

gcc/ChangeLog:

	* config.gcc: Add riscv-fold-mem-offsets.o to extra_objs.
	* config/riscv/riscv-passes.def (INSERT_PASS_AFTER): Schedule a new
	pass.
	* config/riscv/riscv-protos.h (make_pass_fold_mem_offsets): Declare.
	* config/riscv/riscv.opt: New options.
	* config/riscv/t-riscv: New build rule.
	* doc/invoke.texi: Document new option.
	* config/riscv/riscv-fold-mem-offsets.cc: New file.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/fold-mem-offsets-1.c: New test.
	* gcc.target/riscv/fold-mem-offsets-2.c: New test.
	* gcc.target/riscv/fold-mem-offsets-3.c: New test.

Signed-off-by: Manolis Tsamis <manolis.tsamis@vrull.eu>
---

 gcc/config.gcc                                |   2 +-
 gcc/config/riscv/riscv-fold-mem-offsets.cc    | 637 ++++++++++++++++++
 gcc/config/riscv/riscv-passes.def             |   1 +
 gcc/config/riscv/riscv-protos.h               |   1 +
 gcc/config/riscv/riscv.opt                    |   4 +
 gcc/config/riscv/t-riscv                      |   4 +
 gcc/doc/invoke.texi                           |   8 +
 .../gcc.target/riscv/fold-mem-offsets-1.c     |  16 +
 .../gcc.target/riscv/fold-mem-offsets-2.c     |  24 +
 .../gcc.target/riscv/fold-mem-offsets-3.c     |  17 +
 10 files changed, 713 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/riscv/riscv-fold-mem-offsets.cc
 create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c
  

Comments

Richard Biener May 25, 2023, 1:01 p.m. UTC | #1
On Thu, May 25, 2023 at 2:36 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
>
> Implementation of the new RISC-V optimization pass for memory offset
> calculations, documentation and testcases.

Why do fwprop or combine not do what you want?

> gcc/ChangeLog:
>
>         * config.gcc: Add riscv-fold-mem-offsets.o to extra_objs.
>         * config/riscv/riscv-passes.def (INSERT_PASS_AFTER): Schedule a new
>         pass.
>         * config/riscv/riscv-protos.h (make_pass_fold_mem_offsets): Declare.
>         * config/riscv/riscv.opt: New options.
>         * config/riscv/t-riscv: New build rule.
>         * doc/invoke.texi: Document new option.
>         * config/riscv/riscv-fold-mem-offsets.cc: New file.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/fold-mem-offsets-1.c: New test.
>         * gcc.target/riscv/fold-mem-offsets-2.c: New test.
>         * gcc.target/riscv/fold-mem-offsets-3.c: New test.
>
> Signed-off-by: Manolis Tsamis <manolis.tsamis@vrull.eu>
> ---
>
>  gcc/config.gcc                                |   2 +-
>  gcc/config/riscv/riscv-fold-mem-offsets.cc    | 637 ++++++++++++++++++
>  gcc/config/riscv/riscv-passes.def             |   1 +
>  gcc/config/riscv/riscv-protos.h               |   1 +
>  gcc/config/riscv/riscv.opt                    |   4 +
>  gcc/config/riscv/t-riscv                      |   4 +
>  gcc/doc/invoke.texi                           |   8 +
>  .../gcc.target/riscv/fold-mem-offsets-1.c     |  16 +
>  .../gcc.target/riscv/fold-mem-offsets-2.c     |  24 +
>  .../gcc.target/riscv/fold-mem-offsets-3.c     |  17 +
>  10 files changed, 713 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/config/riscv/riscv-fold-mem-offsets.cc
>  create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c
>
> diff --git a/gcc/config.gcc b/gcc/config.gcc
> index d88071773c9..5dffd21b4c8 100644
> --- a/gcc/config.gcc
> +++ b/gcc/config.gcc
> @@ -529,7 +529,7 @@ pru-*-*)
>         ;;
>  riscv*)
>         cpu_type=riscv
> -       extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o riscv-shorten-memrefs.o riscv-selftests.o riscv-v.o riscv-vsetvl.o"
> +       extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o riscv-shorten-memrefs.o riscv-fold-mem-offsets.o riscv-selftests.o riscv-v.o riscv-vsetvl.o"
>         extra_objs="${extra_objs} riscv-vector-builtins.o riscv-vector-builtins-shapes.o riscv-vector-builtins-bases.o"
>         extra_objs="${extra_objs} thead.o"
>         d_target_objs="riscv-d.o"
> diff --git a/gcc/config/riscv/riscv-fold-mem-offsets.cc b/gcc/config/riscv/riscv-fold-mem-offsets.cc
> new file mode 100644
> index 00000000000..81325bb3beb
> --- /dev/null
> +++ b/gcc/config/riscv/riscv-fold-mem-offsets.cc
> @@ -0,0 +1,637 @@
> +/* Fold memory offsets pass for RISC-V.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify
> +it under the terms of the GNU General Public License as published by
> +the Free Software Foundation; either version 3, or (at your option)
> +any later version.
> +
> +GCC is distributed in the hope that it will be useful,
> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +GNU General Public License for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#define IN_TARGET_CODE 1
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "tm.h"
> +#include "rtl.h"
> +#include "tree.h"
> +#include "expr.h"
> +#include "backend.h"
> +#include "regs.h"
> +#include "target.h"
> +#include "memmodel.h"
> +#include "emit-rtl.h"
> +#include "insn-config.h"
> +#include "recog.h"
> +#include "predict.h"
> +#include "df.h"
> +#include "tree-pass.h"
> +#include "cfgrtl.h"
> +
> +/* This pass tries to optimize memory offset calculations by moving them
> +   from add immediate instructions to the memory loads/stores.
> +   For example it can transform this:
> +
> +     addi t4,sp,16
> +     add  t2,a6,t4
> +     shl  t3,t2,1
> +     ld   a2,0(t3)
> +     addi a2,1
> +     sd   a2,8(t2)
> +
> +   into the following (one instruction less):
> +
> +     add  t2,a6,sp
> +     shl  t3,t2,1
> +     ld   a2,32(t3)
> +     addi a2,1
> +     sd   a2,24(t2)
> +
> +   Usually, the code generated by earlier passes already places the
> +   offsets in the memory instructions, but this pass is still beneficial
> +   because:
> +
> +    - There are cases where add instructions are added in a late rtl pass
> +      and the rest of the pipeline cannot eliminate them.  Specifically,
> +      arrays and structs allocated on the stack can result in multiple
> +      unnecessary add instructions that cannot be eliminated easily
> +      otherwise.
> +
> +    - The existing mechanisms that move offsets to memory instructions
> +      usually apply only to specific patterns or have other limitations.
> +      This pass is very generic and can fold offsets through complex
> +      calculations with multiple memory uses and partially overlapping
> +      calculations.  As a result it can eliminate more instructions than
> +      what is possible otherwise.
> +
> +   This pass operates on single basic blocks and consists of 4 phases:
> +
> +    - Phase 1 (Analysis): Find "foldable" instructions.
> +      Foldable instructions are those that we know how to propagate
> +      a constant addition through (add, slli, mv, ...) and only have other
> +      foldable instructions for uses.  In that phase a DFS traversal on the
> +      definition tree is performed and foldable instructions are marked on
> +      a bitmap.  The add immediate instructions that are reachable in this
> +      DFS are candidates for removal since all the intermediate
> +      calculations affected by them are also foldable.
> +
> +    - Phase 2 (Validity): Traverse again, this time calculating the
> +      offsets that would result from folding all add immediate instructions
> +      found.  Also keep track of which instructions will be folded for this
> +      particular offset because folding can be partially or completely
> +      shared across a number of different memory instructions.  At this point,
> +      since we have calculated the actual offset resulting from folding, we check
> +      and keep track of whether it's a valid 12-bit immediate.
> +
> +    - Phase 3 (Commit offsets): Traverse again.  This time it is known if
> +      a particular fold is valid so actually fold the offset by changing
> +      the RTL statement.  It's important that this phase is separate from the
> +      previous because one instruction that is foldable with a valid offset
> +      can result in an invalid offset for another instruction later on.
> +
> +    - Phase 4 (Commit instruction deletions): Scan all insns and delete
> +      all add immediate instructions that were folded.  */
> +
> +namespace {
> +
> +const pass_data pass_data_fold_mem =
> +{
> +  RTL_PASS, /* type */
> +  "fold_mem_offsets", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_NONE, /* tv_id */
> +  0, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_df_finish, /* todo_flags_finish */
> +};
> +
> +class pass_fold_mem_offsets : public rtl_opt_pass
> +{
> +public:
> +  pass_fold_mem_offsets (gcc::context *ctxt)
> +    : rtl_opt_pass (pass_data_fold_mem, ctxt)
> +  {}
> +
> +  /* opt_pass methods: */
> +  virtual bool gate (function *)
> +    {
> +      return riscv_mfold_mem_offsets
> +              && optimize >= 2;
> +    }
> +
> +  virtual unsigned int execute (function *);
> +}; // class pass_fold_mem_offsets
> +
> +/* Bitmap that tracks which instructions are reachable through sequences
> +   of foldable instructions.  */
> +static bitmap_head can_fold_insn;
> +
> +/* Bitmap with instructions marked for deletion due to folding.  */
> +static bitmap_head pending_remove_insn;
> +
> +/* Bitmap with instructions that cannot be deleted because that would
> +   require folding an offset that's invalid in some memory access.
> +   An instruction can be in both PENDING_REMOVE_INSN and CANNOT_REMOVE_INSN
> +   at the same time, in which case it cannot be safely deleted.  */
> +static bitmap_head cannot_remove_insn;
> +
> +/* The number of folded addi instructions of the form "addi reg, sp, X".  */
> +static int stats_folded_sp;
> +
> +/* The number of other folded addi instructions.  */
> +static int stats_folded_other;
> +
> +enum fold_mem_phase
> +{
> +  FM_PHASE_ANALYSIS,
> +  FM_PHASE_VALIDITY,
> +  FM_PHASE_COMMIT_OFFSETS,
> +  FM_PHASE_COMMIT_INSNS
> +};
> +
> +/* Helper function for fold_offsets.
> +  Get the single reaching definition of an instruction inside a BB.
> +  The definition is desired for REG used in INSN.
> +  Return the definition insn or NULL if there's no definition with
> +  the desired criteria.  */
> +static rtx_insn*
> +get_single_def_in_bb (rtx_insn *insn, rtx reg)
> +{
> +  df_ref use;
> +  struct df_link *ref_chain, *ref_link;
> +
> +  FOR_EACH_INSN_USE (use, insn)
> +    {
> +      if (GET_CODE (DF_REF_REG (use)) == SUBREG)
> +       return NULL;
> +      if (REGNO (DF_REF_REG (use)) == REGNO (reg))
> +       break;
> +    }
> +
> +  if (!use)
> +    return NULL;
> +
> +  ref_chain = DF_REF_CHAIN (use);
> +
> +  if (!ref_chain)
> +    return NULL;
> +
> +  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
> +    {
> +      /* Problem getting some definition for this instruction.  */
> +      if (ref_link->ref == NULL)
> +       return NULL;
> +      if (DF_REF_INSN_INFO (ref_link->ref) == NULL)
> +       return NULL;
> +      if (global_regs[REGNO (reg)]
> +         && !set_of (reg, DF_REF_INSN (ref_link->ref)))
> +       return NULL;
> +    }
> +
> +  if (ref_chain->next)
> +    return NULL;
> +
> +  rtx_insn* def = DF_REF_INSN (ref_chain->ref);
> +
> +  if (BLOCK_FOR_INSN (def) != BLOCK_FOR_INSN (insn))
> +    return NULL;
> +
> +  if (DF_INSN_LUID (def) > DF_INSN_LUID (insn))
> +    return NULL;
> +
> +  return def;
> +}
> +
> +/* Helper function for fold_offsets.
> +   Get all the reaching uses of an instruction.  The uses are desired for REG
> +   set in INSN.  Return use list or NULL if a use is missing or irregular.
> +   If SUCCESS is not NULL then its value is set to false if there are
> +   missing or irregular uses and to true otherwise.  */
> +static struct df_link*
> +get_uses (rtx_insn *insn, rtx reg, bool* success)
> +{
> +  df_ref def;
> +  struct df_link *ref_chain, *ref_link;
> +
> +  if (success != NULL)
> +    *success = false;
> +
> +  FOR_EACH_INSN_DEF (def, insn)
> +    if (REGNO (DF_REF_REG (def)) == REGNO (reg))
> +      break;
> +
> +  if (!def)
> +    return NULL;
> +
> +  ref_chain = DF_REF_CHAIN (def);
> +
> +  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
> +    {
> +      /* Problem getting some use for this instruction.  */
> +      if (ref_link->ref == NULL)
> +       return NULL;
> +      if (DF_REF_CLASS (ref_link->ref) != DF_REF_REGULAR)
> +       return NULL;
> +    }
> +
> +  if (success != NULL)
> +    *success = true;
> +
> +  return ref_chain;
> +}
> +
> +/* Recursive function that computes the foldable offsets through the
> +   definitions of REG in INSN given an integer scale factor SCALE.
> +   Returns the offset that would have to be added if all instructions
> +   in PENDING_DELETES were to be deleted.
> +
> +  - if ANALYZE is true then it recurses through definitions with the common
> +    code and marks eligible for folding instructions in the bitmap
> +    can_fold_insn.  An instruction is eligible if all its uses are also
> +    eligible.  Initially can_fold_insn is true for memory accesses.
> +
> +  - if ANALYZE is false then it recurses through definitions with the common
> +    code and computes and returns the offset that would result if the
> +    instructions in PENDING_DELETES were to be deleted.  */
> +static HOST_WIDE_INT
> +fold_offsets (rtx_insn* insn, rtx reg, int scale, bool analyze,
> +             bitmap pending_deletes)
> +{
> +  rtx_insn* def = get_single_def_in_bb (insn, reg);
> +
> +  if (!def)
> +    return 0;
> +
> +  rtx set = single_set (def);
> +
> +  if (!set)
> +    return 0;
> +
> +  rtx src = SET_SRC (set);
> +  rtx dest = SET_DEST (set);
> +
> +  enum rtx_code code = GET_CODE (src);
> +
> +  /* Return early for SRC codes that we don't know how to handle.  */
> +  if (code != PLUS && code != ASHIFT && code != REG)
> +    return 0;
> +
> +  unsigned int dest_regno = REGNO (dest);
> +
> +  /* We don't want to fold offsets from instructions that change some
> +     particular registers with potentially global side effects.  */
> +  if (!GP_REG_P (dest_regno)
> +      || dest_regno == STACK_POINTER_REGNUM
> +      || (frame_pointer_needed && dest_regno == HARD_FRAME_POINTER_REGNUM)
> +      || dest_regno == GP_REGNUM
> +      || dest_regno == THREAD_POINTER_REGNUM
> +      || dest_regno == RETURN_ADDR_REGNUM)
> +    return 0;
> +
> +  if (analyze)
> +    {
> +      /* We can only fold through instructions that are eventually used as
> +        memory addresses and do not have other uses.  Use the same logic
> +        from the offset calculation to visit instructions that can
> +        propagate offsets and keep track in can_fold_insn of which ones have
> +        uses that always end in memory instructions.  */
> +
> +      if (REG_P (dest))
> +       {
> +         bool success;
> +         struct df_link *uses = get_uses (def, dest, &success), *ref_link;
> +
> +         if (!success)
> +           return 0;
> +
> +         for (ref_link = uses; ref_link; ref_link = ref_link->next)
> +           {
> +             rtx_insn* use = DF_REF_INSN (ref_link->ref);
> +
> +             /* Ignore debug insns during analysis.  */
> +             if (DEBUG_INSN_P (use))
> +               continue;
> +
> +             if (!bitmap_bit_p (&can_fold_insn, INSN_UID (use)))
> +               return 0;
> +
> +             rtx use_set = single_set (use);
> +
> +             /* Prevent folding when a memory store uses the dest register.  */
> +             if (use_set
> +                 && MEM_P (SET_DEST (use_set))
> +                 && REG_P (SET_SRC (use_set))
> +                 && REGNO (SET_SRC (use_set)) == REGNO (dest))
> +               return 0;
> +           }
> +
> +         bitmap_set_bit (&can_fold_insn, INSN_UID (def));
> +       }
> +    }
> +
> +  if (!bitmap_bit_p (&can_fold_insn, INSN_UID (def)))
> +    return 0;
> +
> +  switch (code)
> +    {
> +    case PLUS:
> +      {
> +       /* Propagate through add.  */
> +       rtx arg1 = XEXP (src, 0);
> +       rtx arg2 = XEXP (src, 1);
> +
> +       HOST_WIDE_INT offset = 0;
> +
> +       if (REG_P (arg1))
> +         offset += fold_offsets (def, arg1, 1, analyze, pending_deletes);
> +       else if (GET_CODE (arg1) == ASHIFT && REG_P (XEXP (arg1, 0))
> +                && CONST_INT_P (XEXP (arg1, 1)))
> +         {
> +           /* Also handle shift-and-add from the zba extension.  */
> +           int shift_scale = (1 << (int) INTVAL (XEXP (arg1, 1)));
> +           offset += fold_offsets (def, XEXP (arg1, 0), shift_scale, analyze,
> +                                   pending_deletes);
> +         }
> +
> +       if (REG_P (arg2))
> +         offset += fold_offsets (def, arg2, 1, analyze, pending_deletes);
> +       else if (CONST_INT_P (arg2) && !analyze)
> +         {
> +           offset += INTVAL (arg2);
> +           bitmap_set_bit (pending_deletes, INSN_UID (def));
> +         }
> +
> +       return scale * offset;
> +      }
> +    case ASHIFT:
> +      {
> +       /* Propagate through sll.  */
> +       rtx arg1 = XEXP (src, 0);
> +       rtx arg2 = XEXP (src, 1);
> +
> +       if (REG_P (arg1) && CONST_INT_P (arg2))
> +         {
> +           int shift_scale = (1 << (int) INTVAL (arg2));
> +           return scale * fold_offsets (def, arg1, shift_scale, analyze,
> +                                        pending_deletes);
> +         }
> +
> +       return 0;
> +      }
> +    case REG:
> +      /* Propagate through mv.  */
> +      return scale * fold_offsets (def, src, 1, analyze, pending_deletes);
> +    default:
> +      /* Cannot propagate.  */
> +      return 0;
> +    }
> +}
> +
> +/* Helper function for fold_offset_mem.
> +   If INSN is a set rtx that loads from or stores to
> +   some memory location that could have an offset folded
> +   to it, return the rtx for the memory operand.  */
> +static rtx
> +get_foldable_mem_rtx (rtx_insn* insn)
> +{
> +  rtx set = single_set (insn);
> +
> +  if (set != NULL_RTX)
> +    {
> +      rtx src = SET_SRC (set);
> +      rtx dest = SET_DEST (set);
> +
> +      /* We don't want folding if the memory has
> +        unspec/unspec volatile in either src or dest.
> +        In particular this also prevents folding
> +        when atomics are involved.  */
> +      if (GET_CODE (src) == UNSPEC
> +         || GET_CODE (src) == UNSPEC_VOLATILE
> +         || GET_CODE (dest) == UNSPEC
> +         || GET_CODE (dest) == UNSPEC_VOLATILE)
> +       return NULL;
> +
> +      if (MEM_P (src))
> +       return src;
> +      else if (MEM_P (dest))
> +       return dest;
> +      else if ((
> +               GET_CODE (src) == SIGN_EXTEND
> +               || GET_CODE (src) == ZERO_EXTEND
> +             )
> +             && MEM_P (XEXP (src, 0)))
> +       return XEXP (src, 0);
> +    }
> +
> +  return NULL;
> +}
> +
> +/* Driver function that performs the actions defined by PHASE for INSN.  */
> +static void
> +fold_offset_mem (rtx_insn* insn, int phase)
> +{
> +  if (phase == FM_PHASE_COMMIT_INSNS)
> +    {
> +      if (bitmap_bit_p (&pending_remove_insn, INSN_UID (insn))
> +         && !bitmap_bit_p (&cannot_remove_insn, INSN_UID (insn)))
> +       {
> +         rtx set = single_set (insn);
> +         rtx src = SET_SRC (set);
> +         rtx dest = SET_DEST (set);
> +         rtx arg1 = XEXP (src, 0);
> +
> +         /* INSN is an add immediate addi DEST, SRC1, SRC2 that we
> +            must replace with addi DEST, SRC1, 0.  */
> +         if (XEXP (src, 0) == stack_pointer_rtx)
> +           stats_folded_sp++;
> +         else
> +           stats_folded_other++;
> +
> +         if (dump_file)
> +           {
> +             fprintf (dump_file, "Instruction deleted from folding:");
> +             print_rtl_single (dump_file, insn);
> +           }
> +
> +         if (REGNO (dest) != REGNO (arg1))
> +           {
> +             /* If the dest register is different from the first argument
> +                then the addition with constant 0 is equivalent to a move
> +                instruction.  We emit the move and let the subsequent
> +                pass cprop_hardreg eliminate that if possible.  */
> +             rtx arg1_reg_rtx = gen_rtx_REG (GET_MODE (dest), REGNO (arg1));
> +             rtx mov_rtx = gen_move_insn (dest, arg1_reg_rtx);
> +             df_insn_rescan (emit_insn_after (mov_rtx, insn));
> +           }
> +
> +         /* If the dest register is the same as the first argument
> +            then the addition with constant 0 is a no-op.
> +            We can now delete the original add immediate instruction.  */
> +         delete_insn (insn);
> +       }
> +    }
> +  else
> +    {
> +      rtx mem = get_foldable_mem_rtx (insn);
> +
> +      if (!mem)
> +       return;
> +
> +      rtx mem_addr = XEXP (mem, 0);
> +      rtx reg;
> +      HOST_WIDE_INT cur_off;
> +
> +      if (REG_P (mem_addr))
> +       {
> +         reg = mem_addr;
> +         cur_off = 0;
> +       }
> +      else if (GET_CODE (mem_addr) == PLUS
> +              && REG_P (XEXP (mem_addr, 0))
> +              && CONST_INT_P (XEXP (mem_addr, 1)))
> +       {
> +         reg = XEXP (mem_addr, 0);
> +         cur_off = INTVAL (XEXP (mem_addr, 1));
> +       }
> +      else
> +       return;
> +
> +      if (phase == FM_PHASE_ANALYSIS)
> +       {
> +         bitmap_set_bit (&can_fold_insn, INSN_UID (insn));
> +         fold_offsets (insn, reg, 1, true, NULL);
> +       }
> +      else if (phase == FM_PHASE_VALIDITY)
> +       {
> +         bitmap_head new_pending_deletes;
> +         bitmap_initialize (&new_pending_deletes, NULL);
> +         HOST_WIDE_INT offset = cur_off + fold_offsets (insn, reg, 1, false,
> +                                                       &new_pending_deletes);
> +
> +         /* Temporarily change the offset in MEM to test whether
> +            it results in a valid instruction.  */
> +         machine_mode mode = GET_MODE (mem_addr);
> +         XEXP (mem, 0) = gen_rtx_PLUS (mode, reg, GEN_INT (offset));
> +
> +         bool valid_change = recog (PATTERN (insn), insn, 0) >= 0;
> +
> +         /* Restore the instruction.  */
> +         XEXP (mem, 0) = mem_addr;
> +
> +         if (valid_change)
> +           bitmap_ior_into (&pending_remove_insn, &new_pending_deletes);
> +         else
> +           bitmap_ior_into (&cannot_remove_insn, &new_pending_deletes);
> +         bitmap_release (&new_pending_deletes);
> +       }
> +      else if (phase == FM_PHASE_COMMIT_OFFSETS)
> +       {
> +         bitmap_head required_deletes;
> +         bitmap_initialize (&required_deletes, NULL);
> +         HOST_WIDE_INT offset = cur_off + fold_offsets (insn, reg, 1, false,
> +                                                        &required_deletes);
> +         bool illegal = bitmap_intersect_p (&required_deletes,
> +                                            &cannot_remove_insn);
> +
> +         if (offset == cur_off)
> +           return;
> +
> +         gcc_assert (!bitmap_empty_p (&required_deletes));
> +
> +         /* We have to update CANNOT_REMOVE_INSN again if transforming
> +            this instruction is illegal.  */
> +         if (illegal)
> +           bitmap_ior_into (&cannot_remove_insn, &required_deletes);
> +         else
> +           {
> +             machine_mode mode = GET_MODE (mem_addr);
> +             XEXP (mem, 0) = gen_rtx_PLUS (mode, reg, GEN_INT (offset));
> +             df_insn_rescan (insn);
> +
> +             if (dump_file)
> +               {
> +                 fprintf (dump_file, "Memory offset changed from "
> +                                     HOST_WIDE_INT_PRINT_DEC
> +                                     " to "
> +                                     HOST_WIDE_INT_PRINT_DEC
> +                                     " for instruction:\n", cur_off, offset);
> +                       print_rtl_single (dump_file, insn);
> +               }
> +           }
> +         bitmap_release (&required_deletes);
> +       }
> +    }
> +}
> +
> +unsigned int
> +pass_fold_mem_offsets::execute (function *fn)
> +{
> +  basic_block bb;
> +  rtx_insn *insn;
> +
> +  df_set_flags (DF_RD_PRUNE_DEAD_DEFS | DF_DEFER_INSN_RESCAN);
> +  df_chain_add_problem (DF_UD_CHAIN + DF_DU_CHAIN);
> +  df_analyze ();
> +
> +  bitmap_initialize (&can_fold_insn, NULL);
> +  bitmap_initialize (&pending_remove_insn, NULL);
> +  bitmap_initialize (&cannot_remove_insn, NULL);
> +
> +  stats_folded_sp = 0;
> +  stats_folded_other = 0;
> +
> +  FOR_ALL_BB_FN (bb, fn)
> +    {
> +      /* The shorten-memrefs pass runs when a BB is optimized for size
> +        and moves offsets from multiple memory instructions to a common
> +        add instruction.  Disable folding if optimizing for size because
> +        this pass will cancel the effects of shorten-memrefs.  */
> +      if (optimize_bb_for_size_p (bb))
> +       continue;
> +
> +      bitmap_clear (&can_fold_insn);
> +      bitmap_clear (&pending_remove_insn);
> +      bitmap_clear (&cannot_remove_insn);
> +
> +      FOR_BB_INSNS (bb, insn)
> +       fold_offset_mem (insn, FM_PHASE_ANALYSIS);
> +
> +      FOR_BB_INSNS (bb, insn)
> +       fold_offset_mem (insn, FM_PHASE_VALIDITY);
> +
> +      FOR_BB_INSNS (bb, insn)
> +       fold_offset_mem (insn, FM_PHASE_COMMIT_OFFSETS);
> +
> +      FOR_BB_INSNS (bb, insn)
> +       fold_offset_mem (insn, FM_PHASE_COMMIT_INSNS);
> +    }
> +
> +  statistics_counter_event (cfun, "addi with sp fold", stats_folded_sp);
> +  statistics_counter_event (cfun, "other addi fold", stats_folded_other);
> +
> +  bitmap_release (&can_fold_insn);
> +  bitmap_release (&pending_remove_insn);
> +  bitmap_release (&cannot_remove_insn);
> +
> +  return 0;
> +}
> +
> +} // anon namespace
> +
> +rtl_opt_pass *
> +make_pass_fold_mem_offsets (gcc::context *ctxt)
> +{
> +  return new pass_fold_mem_offsets (ctxt);
> +}
> diff --git a/gcc/config/riscv/riscv-passes.def b/gcc/config/riscv/riscv-passes.def
> index 4084122cf0a..dc08daadc66 100644
> --- a/gcc/config/riscv/riscv-passes.def
> +++ b/gcc/config/riscv/riscv-passes.def
> @@ -18,4 +18,5 @@
>     <http://www.gnu.org/licenses/>.  */
>
>  INSERT_PASS_AFTER (pass_rtl_store_motion, 1, pass_shorten_memrefs);
> +INSERT_PASS_AFTER (pass_regrename, 1, pass_fold_mem_offsets);
>  INSERT_PASS_BEFORE (pass_fast_rtl_dce, 1, pass_vsetvl);
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 5f78fd579bb..b89a82adb0e 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -104,6 +104,7 @@ extern void riscv_parse_arch_string (const char *, struct gcc_options *, locatio
>  extern bool riscv_hard_regno_rename_ok (unsigned, unsigned);
>
>  rtl_opt_pass * make_pass_shorten_memrefs (gcc::context *ctxt);
> +rtl_opt_pass * make_pass_fold_mem_offsets (gcc::context *ctxt);
>  rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
>
>  /* Information about one CPU we know about.  */
> diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
> index 63d4710cb15..5e1fbdbedcc 100644
> --- a/gcc/config/riscv/riscv.opt
> +++ b/gcc/config/riscv/riscv.opt
> @@ -105,6 +105,10 @@ Convert BASE + LARGE_OFFSET addresses to NEW_BASE + SMALL_OFFSET to allow more
>  memory accesses to be generated as compressed instructions.  Currently targets
>  32-bit integer load/stores.
>
> +mfold-mem-offsets
> +Target Bool Var(riscv_mfold_mem_offsets) Init(1)
> +Fold instructions calculating memory offsets to the memory access instruction if possible.
> +
>  mcmodel=
>  Target RejectNegative Joined Enum(code_model) Var(riscv_cmodel) Init(TARGET_DEFAULT_CMODEL)
>  Specify the code model.
> diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
> index 1252d6f851a..f29cf463867 100644
> --- a/gcc/config/riscv/t-riscv
> +++ b/gcc/config/riscv/t-riscv
> @@ -76,6 +76,10 @@ riscv-shorten-memrefs.o: $(srcdir)/config/riscv/riscv-shorten-memrefs.cc \
>         $(COMPILE) $<
>         $(POSTCOMPILE)
>
> +riscv-fold-mem-offsets.o: $(srcdir)/config/riscv/riscv-fold-mem-offsets.cc
> +       $(COMPILE) $<
> +       $(POSTCOMPILE)
> +
>  riscv-selftests.o: $(srcdir)/config/riscv/riscv-selftests.cc \
>    $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(TREE_H) output.h \
>    $(C_COMMON_H) $(TARGET_H) $(OPTABS_H) $(EXPR_H) $(INSN_ATTR_H) $(EMIT_RTL_H)
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index ee78591c73e..39b57cab595 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -1218,6 +1218,7 @@ See RS/6000 and PowerPC Options.
>  -msmall-data-limit=@var{N-bytes}
>  -msave-restore  -mno-save-restore
>  -mshorten-memrefs  -mno-shorten-memrefs
> +-mfold-mem-offsets  -mno-fold-mem-offsets
>  -mstrict-align  -mno-strict-align
>  -mcmodel=medlow  -mcmodel=medany
>  -mexplicit-relocs  -mno-explicit-relocs
> @@ -29048,6 +29049,13 @@ of 'new base + small offset'.  If the new base gets stored in a compressed
>  register, then the new load/store can be compressed.  Currently targets 32-bit
>  integer load/stores only.
>
> +@opindex mfold-mem-offsets
> +@item -mfold-mem-offsets
> +@itemx -mno-fold-mem-offsets
> +Do or do not attempt to move constant addition calculations used for memory
> +offsets to the corresponding memory instructions.  The default is
> +@option{-mfold-mem-offsets} at levels @option{-O2} and @option{-O3}.
> +
>  @opindex mstrict-align
>  @item -mstrict-align
>  @itemx -mno-strict-align
> diff --git a/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
> new file mode 100644
> index 00000000000..574cc92b6ab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mfold-mem-offsets" } */
> +
> +void sink(int arr[2]);
> +
> +void
> +foo(int a, int b, int i)
> +{
> +  int arr[2] = {a, b};
> +  arr[i]++;
> +  sink(arr);
> +}
> +
> +// Should compile without negative memory offsets when using -mfold-mem-offsets
> +/* { dg-final { scan-assembler-not "lw\t.*,-.*\\(.*\\)" } } */
> +/* { dg-final { scan-assembler-not "sw\t.*,-.*\\(.*\\)" } } */
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
> new file mode 100644
> index 00000000000..e6c251d3a3c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mfold-mem-offsets" } */
> +
> +void sink(int arr[3]);
> +
> +void
> +foo(int a, int b, int c, int i)
> +{
> +  int arr1[3] = {a, b, c};
> +  int arr2[3] = {a, c, b};
> +  int arr3[3] = {c, b, a};
> +
> +  arr1[i]++;
> +  arr2[i]++;
> +  arr3[i]++;
> +
> +  sink(arr1);
> +  sink(arr2);
> +  sink(arr3);
> +}
> +
> +// Should compile without negative memory offsets when using -mfold-mem-offsets
> +/* { dg-final { scan-assembler-not "lw\t.*,-.*\\(.*\\)" } } */
> +/* { dg-final { scan-assembler-not "sw\t.*,-.*\\(.*\\)" } } */
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c
> new file mode 100644
> index 00000000000..8586d3e3a29
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mfold-mem-offsets" } */
> +
> +void load(int arr[2]);
> +
> +int
> +foo(long unsigned int i)
> +{
> +  int arr[2];
> +  load(arr);
> +
> +  return arr[3 * i + 77];
> +}
> +
> +// Should compile without negative memory offsets when using -mfold-mem-offsets
> +/* { dg-final { scan-assembler-not "lw\t.*,-.*\\(.*\\)" } } */
> +/* { dg-final { scan-assembler-not "addi\t.*,.*,77" } } */
> \ No newline at end of file
> --
> 2.34.1
>
  
Manolis Tsamis May 25, 2023, 1:25 p.m. UTC | #2
On Thu, May 25, 2023 at 4:03 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Thu, May 25, 2023 at 2:36 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> >
> > Implementation of the new RISC-V optimization pass for memory offset
> > calculations, documentation and testcases.
>
> Why do fwprop or combine not do what you want?
>

Hi Richard,

At least from my experiments, the existing mechanisms (fwprop,
combine, ...) cannot handle the more difficult cases for which this
was created.

As can be seen in the example presented in the cover letter, this pass
is designed to work with partially-overlapping offset calculations,
multiple memory operations sharing some intermediate calculations
while also taking care of the offset range restrictions.
Also, some offset calculations are introduced late enough (mostly
involving the stack pointer, local vars, etc.) that I think fwprop
cannot do anything about them. Please correct me if I am wrong.

Prior to implementing this I did analyze the code generated for
benchmarks and found out that a lot of the harder cases are missed,
but they require powerful analysis and cannot be handled with combine.

Thanks,
Manolis
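
A sketch of the kind of case meant here, modeled on the
fold-mem-offsets-2.c testcase from the patch: several stack arrays
indexed by the same variable share the scaled-index part of the address
computation, but each access needs a different constant folded into its
lw/sw, and every fold must still fit the signed 12-bit immediate range:

    void sink (int arr[3]);

    void
    foo (int a, int b, int c, int i)
    {
      /* arr1, arr2 and arr3 live at different frame offsets, yet all
         three accesses reuse the computation of the scaled index i.
         Folding has to push a different constant into each memory
         access while leaving the shared part of the calculation
         intact, which is the partially-overlapping situation described
         above.  */
      int arr1[3] = {a, b, c};
      int arr2[3] = {a, c, b};
      int arr3[3] = {c, b, a};

      arr1[i]++;
      arr2[i]++;
      arr3[i]++;

      sink (arr1);
      sink (arr2);
      sink (arr3);
    }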

> > --- a/gcc/config/riscv/riscv-passes.def
> > +++ b/gcc/config/riscv/riscv-passes.def
> > @@ -18,4 +18,5 @@
> >     <http://www.gnu.org/licenses/>.  */
> >
> >  INSERT_PASS_AFTER (pass_rtl_store_motion, 1, pass_shorten_memrefs);
> > +INSERT_PASS_AFTER (pass_regrename, 1, pass_fold_mem_offsets);
> >  INSERT_PASS_BEFORE (pass_fast_rtl_dce, 1, pass_vsetvl);
> > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > index 5f78fd579bb..b89a82adb0e 100644
> > --- a/gcc/config/riscv/riscv-protos.h
> > +++ b/gcc/config/riscv/riscv-protos.h
> > @@ -104,6 +104,7 @@ extern void riscv_parse_arch_string (const char *, struct gcc_options *, locatio
> >  extern bool riscv_hard_regno_rename_ok (unsigned, unsigned);
> >
> >  rtl_opt_pass * make_pass_shorten_memrefs (gcc::context *ctxt);
> > +rtl_opt_pass * make_pass_fold_mem_offsets (gcc::context *ctxt);
> >  rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
> >
> >  /* Information about one CPU we know about.  */
> > diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
> > index 63d4710cb15..5e1fbdbedcc 100644
> > --- a/gcc/config/riscv/riscv.opt
> > +++ b/gcc/config/riscv/riscv.opt
> > @@ -105,6 +105,10 @@ Convert BASE + LARGE_OFFSET addresses to NEW_BASE + SMALL_OFFSET to allow more
> >  memory accesses to be generated as compressed instructions.  Currently targets
> >  32-bit integer load/stores.
> >
> > +mfold-mem-offsets
> > +Target Bool Var(riscv_mfold_mem_offsets) Init(1)
> > +Fold instructions calculating memory offsets to the memory access instruction if possible.
> > +
> >  mcmodel=
> >  Target RejectNegative Joined Enum(code_model) Var(riscv_cmodel) Init(TARGET_DEFAULT_CMODEL)
> >  Specify the code model.
> > diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
> > index 1252d6f851a..f29cf463867 100644
> > --- a/gcc/config/riscv/t-riscv
> > +++ b/gcc/config/riscv/t-riscv
> > @@ -76,6 +76,10 @@ riscv-shorten-memrefs.o: $(srcdir)/config/riscv/riscv-shorten-memrefs.cc \
> >         $(COMPILE) $<
> >         $(POSTCOMPILE)
> >
> > +riscv-fold-mem-offsets.o: $(srcdir)/config/riscv/riscv-fold-mem-offsets.cc
> > +       $(COMPILE) $<
> > +       $(POSTCOMPILE)
> > +
> >  riscv-selftests.o: $(srcdir)/config/riscv/riscv-selftests.cc \
> >    $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(TREE_H) output.h \
> >    $(C_COMMON_H) $(TARGET_H) $(OPTABS_H) $(EXPR_H) $(INSN_ATTR_H) $(EMIT_RTL_H)
> > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > index ee78591c73e..39b57cab595 100644
> > --- a/gcc/doc/invoke.texi
> > +++ b/gcc/doc/invoke.texi
> > @@ -1218,6 +1218,7 @@ See RS/6000 and PowerPC Options.
> >  -msmall-data-limit=@var{N-bytes}
> >  -msave-restore  -mno-save-restore
> >  -mshorten-memrefs  -mno-shorten-memrefs
> > +-mfold-mem-offsets  -mno-fold-mem-offsets
> >  -mstrict-align  -mno-strict-align
> >  -mcmodel=medlow  -mcmodel=medany
> >  -mexplicit-relocs  -mno-explicit-relocs
> > @@ -29048,6 +29049,13 @@ of 'new base + small offset'.  If the new base gets stored in a compressed
> >  register, then the new load/store can be compressed.  Currently targets 32-bit
> >  integer load/stores only.
> >
> > +@opindex mfold-mem-offsets
> > +@item -mfold-mem-offsets
> > +@itemx -mno-fold-mem-offsets
> > +Do or do not attempt to move constant addition calculations used for memory
> > +offsets to the corresponding memory instructions.  The default is
> > +@option{-mfold-mem-offsets} at levels @option{-O2}, @option{-O3}.
> > +
> >  @opindex mstrict-align
> >  @item -mstrict-align
> >  @itemx -mno-strict-align
> > diff --git a/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
> > new file mode 100644
> > index 00000000000..574cc92b6ab
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
> > @@ -0,0 +1,16 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mfold-mem-offsets" } */
> > +
> > +void sink(int arr[2]);
> > +
> > +void
> > +foo(int a, int b, int i)
> > +{
> > +  int arr[2] = {a, b};
> > +  arr[i]++;
> > +  sink(arr);
> > +}
> > +
> > +// Should compile without negative memory offsets when using -mfold-mem-offsets
> > +/* { dg-final { scan-assembler-not "lw\t.*,-.*\\(.*\\)" } } */
> > +/* { dg-final { scan-assembler-not "sw\t.*,-.*\\(.*\\)" } } */
> > \ No newline at end of file
> > diff --git a/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
> > new file mode 100644
> > index 00000000000..e6c251d3a3c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
> > @@ -0,0 +1,24 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mfold-mem-offsets" } */
> > +
> > +void sink(int arr[3]);
> > +
> > +void
> > +foo(int a, int b, int c, int i)
> > +{
> > +  int arr1[3] = {a, b, c};
> > +  int arr2[3] = {a, c, b};
> > +  int arr3[3] = {c, b, a};
> > +
> > +  arr1[i]++;
> > +  arr2[i]++;
> > +  arr3[i]++;
> > +
> > +  sink(arr1);
> > +  sink(arr2);
> > +  sink(arr3);
> > +}
> > +
> > +// Should compile without negative memory offsets when using -mfold-mem-offsets
> > +/* { dg-final { scan-assembler-not "lw\t.*,-.*\\(.*\\)" } } */
> > +/* { dg-final { scan-assembler-not "sw\t.*,-.*\\(.*\\)" } } */
> > \ No newline at end of file
> > diff --git a/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c
> > new file mode 100644
> > index 00000000000..8586d3e3a29
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mfold-mem-offsets" } */
> > +
> > +void load(int arr[2]);
> > +
> > +int
> > +foo(long unsigned int i)
> > +{
> > +  int arr[2];
> > +  load(arr);
> > +
> > +  return arr[3 * i + 77];
> > +}
> > +
> > +// Should compile without negative memory offsets when using -mfold-mem-offsets
> > +/* { dg-final { scan-assembler-not "lw\t.*,-.*\\(.*\\)" } } */
> > +/* { dg-final { scan-assembler-not "addi\t.*,.*,77" } } */
> > \ No newline at end of file
> > --
> > 2.34.1
> >
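For reference, one way to exercise the new option against the added tests
(a sketch only; the cross-compiler name below is an assumption, while the
flags and the -O2 default come from the patch itself):

  riscv64-unknown-linux-gnu-gcc -O2 -mfold-mem-offsets    -S fold-mem-offsets-1.c
  riscv64-unknown-linux-gnu-gcc -O2 -mno-fold-mem-offsets -S fold-mem-offsets-1.c

The dg-final patterns in the tests expect no negative lw/sw offsets in the
first variant; the second is only useful for comparing the generated code.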


  
Jeff Law May 25, 2023, 1:31 p.m. UTC | #3
On 5/25/23 07:01, Richard Biener via Gcc-patches wrote:
> On Thu, May 25, 2023 at 2:36 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
>>
>> Implementation of the new RISC-V optimization pass for memory offset
>> calculations, documentation and testcases.
> 
> Why do fwprop or combine not what you want to do?
I think a lot of them end up coming from register elimination.

jeff
  
Richard Biener May 25, 2023, 1:50 p.m. UTC | #4
On Thu, May 25, 2023 at 3:32 PM Jeff Law via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
>
> On 5/25/23 07:01, Richard Biener via Gcc-patches wrote:
> > On Thu, May 25, 2023 at 2:36 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> >>
> >> Implementation of the new RISC-V optimization pass for memory offset
> >> calculations, documentation and testcases.
> >
> > Why do fwprop or combine not what you want to do?
> I think a lot of them end up coming from register elimination.

Why isn't this a problem for other targets then?  Or maybe it is and this
shouldn't be a machine specific pass?  Maybe postreload-gcse should
perform strength reduction (I can't think of any other post reload pass
that would do something even remotely related).

Richard.

> jeff
  
Manolis Tsamis May 25, 2023, 2:02 p.m. UTC | #5
On Thu, May 25, 2023 at 4:53 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Thu, May 25, 2023 at 3:32 PM Jeff Law via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> >
> >
> > On 5/25/23 07:01, Richard Biener via Gcc-patches wrote:
> > > On Thu, May 25, 2023 at 2:36 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> > >>
> > >> Implementation of the new RISC-V optimization pass for memory offset
> > >> calculations, documentation and testcases.
> > >
> > > Why do fwprop or combine not what you want to do?
> > I think a lot of them end up coming from register elimination.
>
> Why isn't this a problem for other targets then?  Or maybe it is and this
> shouldn't be a machine specific pass?  Maybe postreload-gcse should
> perform strength reduction (I can't think of any other post reload pass
> that would do something even remotely related).
>
> Richard.
>

It should be a problem for other targets as well (especially RISC-style ISAs).

It can be easily seen by comparing the generated code for the
testcases: Example for testcase-2 on AArch64:
https://godbolt.org/z/GMT1K7Ebr
Although the test cases only show the simple patterns (the complex ones
manifest in larger programs), the point still holds.
The code for this pass is quite generic and could work for most/all
targets if that would be interesting.
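To illustrate the point (my sketch only, not code from this series): the
RISC-V-specific parts are essentially the gate and the hard-register checks
in fold_offsets; the offset validity test already goes through recog, so it
is target-neutral.  A hypothetical target-independent gate, assuming a
generic -ffold-mem-offsets flag that does not exist in this series, would
not need more than:

  /* Hypothetical generic gate; flag_fold_mem_offsets is an assumed
     command-line flag, not part of the posted patch.  */
  virtual bool gate (function *)
  {
    return flag_fold_mem_offsets && optimize >= 2;
  }

The 12-bit RISC-V range never has to be spelled out, because the pass
temporarily rewrites the address and asks recog whether the resulting insn
still matches a pattern.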

Manolis

> > jeff
  
Jeff Law May 25, 2023, 2:13 p.m. UTC | #6
On 5/25/23 07:50, Richard Biener wrote:
> On Thu, May 25, 2023 at 3:32 PM Jeff Law via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>>
>>
>> On 5/25/23 07:01, Richard Biener via Gcc-patches wrote:
>>> On Thu, May 25, 2023 at 2:36 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
>>>>
>>>> Implementation of the new RISC-V optimization pass for memory offset
>>>> calculations, documentation and testcases.
>>>
>>> Why do fwprop or combine not what you want to do?
>> I think a lot of them end up coming from register elimination.
> 
> Why isn't this a problem for other targets then?  Or maybe it is and this
> shouldn't be a machine specific pass?  Maybe postreload-gcse should
> perform strength reduction (I can't think of any other post reload pass
> that would do something even remotely related).
It is to some degree.  I ran into similar problems at my prior employer. 
  We ended up working around it in the target files in a different way 
-- which didn't work when I quickly tried it on RISC-V.

Seems like it would be worth another investigative step as part of the 
evaluation of this patch.  I wasn't at 100% when I did that poking 
around many months ago.

Jeff
  
Philipp Tomsich May 25, 2023, 2:18 p.m. UTC | #7
On Thu, 25 May 2023 at 16:14, Jeff Law via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
>
> On 5/25/23 07:50, Richard Biener wrote:
> > On Thu, May 25, 2023 at 3:32 PM Jeff Law via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >>
> >>
> >> On 5/25/23 07:01, Richard Biener via Gcc-patches wrote:
> >>> On Thu, May 25, 2023 at 2:36 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> >>>>
> >>>> Implementation of the new RISC-V optimization pass for memory offset
> >>>> calculations, documentation and testcases.
> >>>
> >>> Why do fwprop or combine not what you want to do?

At least for stack variables, the virtual-stack-vars register is not resolved
until reload.
So combine will be running much too early to be of any use (and I
haven't recently looked at whether one of the propagation passes runs
after).
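To make that concrete, here is a minimal illustration (mine, not taken from
the patch or its testsuite) of the kind of access involved; before reload the
address of the local array is still formed against the frame pseudo, and only
register elimination turns it into sp plus a concrete constant, which is
where the extra addi tends to appear:

  void sink (int *);

  int
  use_local (int i, int v)
  {
    int buf[16];
    buf[i] = v;     /* address built from the frame pseudo before reload  */
    sink (buf);
    return buf[i];  /* the same base is reused after elimination          */
  }

Since the constant only materializes at reload time, a pass that wants to
fold it into the memory accesses has to run post-reload, which is where this
one is scheduled (after regrename).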

Philipp.

> >> I think a lot of them end up coming from register elimination.
> >
> > Why isn't this a problem for other targets then?  Or maybe it is and this
> > shouldn't be a machine specific pass?  Maybe postreload-gcse should
> > perform strength reduction (I can't think of any other post reload pass
> > that would do something even remotely related).
> It is to some degree.  I ran into similar problems at my prior employer.
>   We ended up working around it in the target files in a different way
> -- which didn't work when I quickly tried it on RISC-V.
>
> Seems like it would be worth another investigative step as part of the
> evaluation of this patch.  I wasn't at 100% when I did that poking
> around many months ago.
>
> Jeff
  
Jeff Law May 29, 2023, 11:30 p.m. UTC | #8
On 5/25/23 08:02, Manolis Tsamis wrote:
> On Thu, May 25, 2023 at 4:53 PM Richard Biener via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> On Thu, May 25, 2023 at 3:32 PM Jeff Law via Gcc-patches
>> <gcc-patches@gcc.gnu.org> wrote:
>>>
>>>
>>>
>>> On 5/25/23 07:01, Richard Biener via Gcc-patches wrote:
>>>> On Thu, May 25, 2023 at 2:36 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
>>>>>
>>>>> Implementation of the new RISC-V optimization pass for memory offset
>>>>> calculations, documentation and testcases.
>>>>
>>>> Why do fwprop or combine not what you want to do?
>>> I think a lot of them end up coming from register elimination.
>>
>> Why isn't this a problem for other targets then?  Or maybe it is and this
>> shouldn't be a machine specific pass?  Maybe postreload-gcse should
>> perform strength reduction (I can't think of any other post reload pass
>> that would do something even remotely related).
>>
>> Richard.
>>
> 
> It should be a problem for other targets as well (especially RISC-style ISAs).
> 
> It can be easily seen by comparing the generated code for the
> testcases: Example for testcase-2 on AArch64:
> https://godbolt.org/z/GMT1K7Ebr
> Although the patterns in the test cases are the ones that are simple
> as the complex ones manifest in complex programs, the case still
> holds.
> The code for this pass is quite generic and could work for most/all
> targets if that would be interesting.
Interestingly enough, fold-mem-offsets seems to interact strangely with the 
load/store pair support on aarch64.  Note how store2a uses 2 stp 
instructions on the trunk, but 4 str instructions with fold-mem-offsets. 
  Yet in load1r we're able to generate a load-quad rather than two load 
pairs.  Weird.

jeff
  
Manolis Tsamis May 31, 2023, 12:19 p.m. UTC | #9
On Tue, May 30, 2023 at 2:30 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 5/25/23 08:02, Manolis Tsamis wrote:
> > On Thu, May 25, 2023 at 4:53 PM Richard Biener via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> On Thu, May 25, 2023 at 3:32 PM Jeff Law via Gcc-patches
> >> <gcc-patches@gcc.gnu.org> wrote:
> >>>
> >>>
> >>>
> >>> On 5/25/23 07:01, Richard Biener via Gcc-patches wrote:
> >>>> On Thu, May 25, 2023 at 2:36 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> >>>>>
> >>>>> Implementation of the new RISC-V optimization pass for memory offset
> >>>>> calculations, documentation and testcases.
> >>>>
> >>>> Why do fwprop or combine not what you want to do?
> >>> I think a lot of them end up coming from register elimination.
> >>
> >> Why isn't this a problem for other targets then?  Or maybe it is and this
> >> shouldn't be a machine specific pass?  Maybe postreload-gcse should
> >> perform strength reduction (I can't think of any other post reload pass
> >> that would do something even remotely related).
> >>
> >> Richard.
> >>
> >
> > It should be a problem for other targets as well (especially RISC-style ISAs).
> >
> > It can be easily seen by comparing the generated code for the
> > testcases: Example for testcase-2 on AArch64:
> > https://godbolt.org/z/GMT1K7Ebr
> > Although the patterns in the test cases are the ones that are simple
> > as the complex ones manifest in complex programs, the case still
> > holds.
> > The code for this pass is quite generic and could work for most/all
> > targets if that would be interesting.
> Interestly enough, fold-mem-offsets seems to interact strangely with the
> load/store pair support on aarch64.  Note show store2a uses 2 stp
> instructions on the trunk, but 4 str instructions with fold-mem-offsets.
>   Yet in load1r we're able to generate a load-quad rather than two load
> pairs.  Weird.
>

I'm confused; where is this comparison from?
The fold-mem-offsets pass is only run on RISC-V and doesn't (shouldn't)
affect AArch64.

I only see the 2x stp / 4x str in the godbolt link, but that is gcc vs
clang, no fold-mem-offsets involved here.

Manolis

> jeff
  
Jeff Law May 31, 2023, 2 p.m. UTC | #10
On 5/31/23 06:19, Manolis Tsamis wrote:
> On Tue, May 30, 2023 at 2:30 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
>>
>>
>>
>> On 5/25/23 08:02, Manolis Tsamis wrote:
>>> On Thu, May 25, 2023 at 4:53 PM Richard Biener via Gcc-patches
>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>
>>>> On Thu, May 25, 2023 at 3:32 PM Jeff Law via Gcc-patches
>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 5/25/23 07:01, Richard Biener via Gcc-patches wrote:
>>>>>> On Thu, May 25, 2023 at 2:36 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
>>>>>>>
>>>>>>> Implementation of the new RISC-V optimization pass for memory offset
>>>>>>> calculations, documentation and testcases.
>>>>>>
>>>>>> Why do fwprop or combine not what you want to do?
>>>>> I think a lot of them end up coming from register elimination.
>>>>
>>>> Why isn't this a problem for other targets then?  Or maybe it is and this
>>>> shouldn't be a machine specific pass?  Maybe postreload-gcse should
>>>> perform strength reduction (I can't think of any other post reload pass
>>>> that would do something even remotely related).
>>>>
>>>> Richard.
>>>>
>>>
>>> It should be a problem for other targets as well (especially RISC-style ISAs).
>>>
>>> It can be easily seen by comparing the generated code for the
>>> testcases: Example for testcase-2 on AArch64:
>>> https://godbolt.org/z/GMT1K7Ebr
>>> Although the patterns in the test cases are the ones that are simple
>>> as the complex ones manifest in complex programs, the case still
>>> holds.
>>> The code for this pass is quite generic and could work for most/all
>>> targets if that would be interesting.
>> Interestly enough, fold-mem-offsets seems to interact strangely with the
>> load/store pair support on aarch64.  Note show store2a uses 2 stp
>> instructions on the trunk, but 4 str instructions with fold-mem-offsets.
>>    Yet in load1r we're able to generate a load-quad rather than two load
>> pairs.  Weird.
>>
> 
> I'm confused, where is this comparison from?
> The fold-mem-offsets pass is only run on RISCV and doesn't (shouldn't)
> affect AArch64.
> 
> I only see the 2x stp / 4x str in the godbolt link, but that is gcc vs
> clang, no fold-mem-offsets involved here.
My bad!  I should have looked at the headings more closely.  I thought 
you'd set up a with/without fold-mem-offsets comparison.

jeff
  
Jeff Law June 8, 2023, 5:37 a.m. UTC | #11
On 5/25/23 06:35, Manolis Tsamis wrote:
> Implementation of the new RISC-V optimization pass for memory offset
> calculations, documentation and testcases.
> 
> gcc/ChangeLog:
> 
> 	* config.gcc: Add riscv-fold-mem-offsets.o to extra_objs.
> 	* config/riscv/riscv-passes.def (INSERT_PASS_AFTER): Schedule a new
> 	pass.
> 	* config/riscv/riscv-protos.h (make_pass_fold_mem_offsets): Declare.
> 	* config/riscv/riscv.opt: New options.
> 	* config/riscv/t-riscv: New build rule.
> 	* doc/invoke.texi: Document new option.
> 	* config/riscv/riscv-fold-mem-offsets.cc: New file.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/riscv/fold-mem-offsets-1.c: New test.
> 	* gcc.target/riscv/fold-mem-offsets-2.c: New test.
> 	* gcc.target/riscv/fold-mem-offsets-3.c: New test.
So not going into the guts of the patch yet.

From a benchmark standpoint the only two that get out of the +-0.05% 
range are mcf and deepsjeng (from a dynamic instruction standpoint).  So 
from an evaluation standpoint we can probably focus our efforts there. 
And as we know, mcf is actually memory bound, so while improving its 
dynamic instruction count is good, the end performance improvement may 
be marginal.

As I mentioned to Philipp many months ago this reminds me a lot of a 
problem I've seen before.  Basically register elimination emits code 
that can be terrible in some circumstances.  So I went and poked at this 
again.

I think the key difference between now and what I was dealing with 
before is that, for the cases that really matter for rv64, we have a shNadd 
insn in the sequence.  That private port I was working on before did not 
have shNadd (don't ask, I probably can't tell).  Our target also had 
reg+reg addressing modes.  What I can't remember was if we were trying 
harder to fold the constant terms into the memory reference or if we 
were more focused on the reg+reg.  Ultimately it's probably not that 
important to remember -- the key is there are very significant 
differences in the target's capabilities which impact how we should be 
generating code in this case.  Those differences affect the code we 
generate *and* the places where we can potentially get control and do 
some address rewriting.

A  key sequence in mcf looks something like this in IRA, others have 
similar structure:

> (insn 237 234 239 26 (set (reg:DI 377)
>         (plus:DI (ashift:DI (reg:DI 200 [ _173 ])
>                 (const_int 3 [0x3]))
>             (reg/f:DI 65 frame))) "pbeampp.c":139:15 333 {*shNadd}
>      (nil))
> (insn 239 237 235 26 (set (reg/f:DI 380)
>         (plus:DI (reg:DI 513)
>             (reg:DI 377))) "pbeampp.c":139:15 5 {adddi3}
>      (expr_list:REG_DEAD (reg:DI 377)
>         (expr_list:REG_EQUAL (plus:DI (reg:DI 377)
>                 (const_int -32768 [0xffffffffffff8000]))
>             (nil))))
[ ... ]
> (insn 240 235 255 26 (set (reg/f:DI 204 [ _177 ])
>         (mem/f:DI (plus:DI (reg/f:DI 380)
>                 (const_int 280 [0x118])) [7 *_176+0 S8 A64])) "pbeampp.c":139:15 179 {*movdi_64bit}
>      (expr_list:REG_DEAD (reg/f:DI 380)
>         (nil)))


The key here is insn 237.  It's generally going to be bad to have FP 
show up in a shadd insn because it's going to be eliminated into 
sp+offset.  That'll generate an input reload before insn 237 and we 
can't do any combination with the constant in insn 239.

After LRA it looks like this:

> (insn 1540 234 1541 26 (set (reg:DI 11 a1 [750])
>         (const_int 32768 [0x8000])) "pbeampp.c":139:15 179 {*movdi_64bit}
>      (nil))
> (insn 1541 1540 1611 26 (set (reg:DI 12 a2 [749])
>         (plus:DI (reg:DI 11 a1 [750])
>             (const_int -272 [0xfffffffffffffef0]))) "pbeampp.c":139:15 5 {adddi3}
>      (expr_list:REG_EQUAL (const_int 32496 [0x7ef0])
>         (nil))) 
> (insn 1611 1541 1542 26 (set (reg:DI 29 t4 [795])
>         (plus:DI (reg/f:DI 2 sp)
>             (const_int 64 [0x40]))) "pbeampp.c":139:15 5 {adddi3}
>      (nil))
> (insn 1542 1611 237 26 (set (reg:DI 12 a2 [749])
>         (plus:DI (reg:DI 12 a2 [749])
>             (reg:DI 29 t4 [795]))) "pbeampp.c":139:15 5 {adddi3}
>      (nil))
> (insn 237 1542 239 26 (set (reg:DI 12 a2 [377])
>         (plus:DI (ashift:DI (reg:DI 14 a4 [orig:200 _173 ] [200])
>                 (const_int 3 [0x3]))
>             (reg:DI 12 a2 [749]))) "pbeampp.c":139:15 333 {*shNadd}
>      (nil))
> (insn 239 237 235 26 (set (reg/f:DI 12 a2 [380])
>         (plus:DI (reg:DI 10 a0 [513])
>             (reg:DI 12 a2 [377]))) "pbeampp.c":139:15 5 {adddi3}
>      (expr_list:REG_EQUAL (plus:DI (reg:DI 12 a2 [377])
>             (const_int -32768 [0xffffffffffff8000]))
>         (nil))) 
[ ... ]
> (insn 240 235 255 26 (set (reg/f:DI 14 a4 [orig:204 _177 ] [204])
>         (mem/f:DI (plus:DI (reg/f:DI 12 a2 [380])
>                 (const_int 280 [0x118])) [7 *_176+0 S8 A64])) "pbeampp.c":139:15 179 {*movdi_64bit}
>      (nil))


Reload/LRA made an absolute mess of that code.

But before we add a new pass (target specific or generic), I think it 
may be in our best interest to experiment with a bit of creative rewriting to 
preserve the shadd, but without the frame pointer.  Perhaps also looking 
for a way to fold the constants, both the explicit ones and the implicit 
one from FP elimination.

This looks particularly promising:

> Trying 237, 239 -> 240:
>   237: r377:DI=r200:DI<<0x3+frame:DI
>       REG_DEAD r200:DI
>   239: r380:DI=r513:DI+r377:DI
>       REG_DEAD r377:DI
>       REG_EQUAL r377:DI-0x8000
>   240: r204:DI=[r380:DI+0x118]
>       REG_DEAD r380:DI
> Failed to match this instruction:
> (set (reg/f:DI 204 [ _177 ])
>     (mem/f:DI (plus:DI (plus:DI (plus:DI (mult:DI (reg:DI 200 [ _173 ])
>                         (const_int 8 [0x8]))
>                     (reg/f:DI 65 frame))
>                 (reg:DI 513))
>             (const_int 280 [0x118])) [7 *_176+0 S8 A64]))


We could reassociate this as

t1 = r200 * 8 + r513
t2 = frame + 280
t3 = t1 + t2
r204 = *t3

Which after elimination would be

t1 = r200 * 8 + r513
t2 = sp + C + 280
t3 = t1 + t2
r204 = *t3

C + 280 will simplify.  And we'll probably end up in the addptr3 case 
which I think gives us a chance to rewrite this a bit so that we end up with
t1 = r200 * 8 + r513
t2 = sp + t1
r204 = *(t2 + 280 + C)


Or at least I *think* we might be able to get there.  Anyway, as I said, 
I think this deserves a bit of playing around before we jump straight 
into adding a new pass.

jeff
  
Jeff Law June 9, 2023, 12:57 a.m. UTC | #12
On 5/25/23 06:35, Manolis Tsamis wrote:
> Implementation of the new RISC-V optimization pass for memory offset
> calculations, documentation and testcases.
> 
> gcc/ChangeLog:
> 
> 	* config.gcc: Add riscv-fold-mem-offsets.o to extra_objs.
> 	* config/riscv/riscv-passes.def (INSERT_PASS_AFTER): Schedule a new
> 	pass.
> 	* config/riscv/riscv-protos.h (make_pass_fold_mem_offsets): Declare.
> 	* config/riscv/riscv.opt: New options.
> 	* config/riscv/t-riscv: New build rule.
> 	* doc/invoke.texi: Document new option.
> 	* config/riscv/riscv-fold-mem-offsets.cc: New file.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/riscv/fold-mem-offsets-1.c: New test.
> 	* gcc.target/riscv/fold-mem-offsets-2.c: New test.
> 	* gcc.target/riscv/fold-mem-offsets-3.c: New test.
So a followup.

While I think we probably could create a variety of backend patterns 
(perhaps disallowing the frame pointer as the addend argument to a shadd 
pattern and the like) and capture the key cases from mcf and probably 
deepsjeng, it's probably not the best direction.

What I suspect would ultimately happen is we'd be presented with 
additional cases over time that would require an ever increasing number 
of patterns.  sign vs zero extension, increasing depth of search space 
to find reassociation opportunities, different variants with and without 
shadd/zbb, etc etc.

So with that space explored a bit, the next big question is target 
specific or generic.  I'll poke in there a bit over the coming days.  In 
the meantime I do have some questions/comments on the code itself. 
There may be more over time..



> +static rtx_insn*
> +get_single_def_in_bb (rtx_insn *insn, rtx reg)
[ ... ]


> +  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
> +    {
> +      /* Problem getting some definition for this instruction.  */
> +      if (ref_link->ref == NULL)
> +	return NULL;
> +      if (DF_REF_INSN_INFO (ref_link->ref) == NULL)
> +	return NULL;
> +      if (global_regs[REGNO (reg)]
> +	  && !set_of (reg, DF_REF_INSN (ref_link->ref)))
> +	return NULL;
> +    }
That last condition feels a bit odd.  It would seem that you wanted an 
OR boolean rather than AND.


> +
> +  unsigned int dest_regno = REGNO (dest);
> +
> +  /* We don't want to fold offsets from instructions that change some
> +     particular registers with potentially global side effects.  */
> +  if (!GP_REG_P (dest_regno)
> +      || dest_regno == STACK_POINTER_REGNUM
> +      || (frame_pointer_needed && dest_regno == HARD_FRAME_POINTER_REGNUM)
> +      || dest_regno == GP_REGNUM
> +      || dest_regno == THREAD_POINTER_REGNUM
> +      || dest_regno == RETURN_ADDR_REGNUM)
> +    return 0;
I'd think most of this would be captured by testing fixed_registers 
rather than trying to list each register individually.  In fact, if we 
need to generalize this to work on other targets we almost certainly 
want a more general test.


> +      else if ((
> +		GET_CODE (src) == SIGN_EXTEND
> +		|| GET_CODE (src) == ZERO_EXTEND
> +	      )
> +	      && MEM_P (XEXP (src, 0)))
Formatting is goofy above...



> +
> +	  if (dump_file)
> +	    {
> +	      fprintf (dump_file, "Instruction deleted from folding:");
> +	      print_rtl_single (dump_file, insn);
> +	    }
> +
> +	  if (REGNO (dest) != REGNO (arg1))
> +	    {
> +	      /* If the dest register is different than the fisrt argument
> +		 then the addition with constant 0 is equivalent to a move
> +		 instruction.  We emit the move and let the subsequent
> +		 pass cprop_hardreg eliminate that if possible.  */
> +	      rtx arg1_reg_rtx = gen_rtx_REG (GET_MODE (dest), REGNO (arg1));
> +	      rtx mov_rtx = gen_move_insn (dest, arg1_reg_rtx);
> +	      df_insn_rescan (emit_insn_after (mov_rtx, insn));
> +	    }
> +
> +	  /* If the dest register is the same with the first argument
> +	     then the addition with constant 0 is a no-op.
> +	     We can now delete the original add immidiate instruction.  */
> +	  delete_insn (insn);
The debugging message is a bit misleading.  Yea, we always delete 
something here, but in one case we end up emitting a copy.



> +
> +	  /* Temporarily change the offset in MEM to test whether
> +	     it results in a valid instruction.  */
> +	  machine_mode mode = GET_MODE (mem_addr);
> +	  XEXP (mem, 0) = gen_rtx_PLUS (mode, reg, GEN_INT (offset));
> +
> +	  bool valid_change = recog (PATTERN (insn), insn, 0) >= 0;
> +
> +	  /* Restore the instruction.  */
> +	  XEXP (mem, 0) = mem_addr;
You need to reset the INSN_CODE after restoring the instruction.  That's 
generally a bad thing to do, but I've seen it done enough (and been 
guilty myself in the past) that we should just assume some ports are 
broken in this regard.


Anyway, just wanted to get those issues raised so that you can address them.

jeff
  
Jeff Law June 10, 2023, 3:49 p.m. UTC | #13
On 5/25/23 06:35, Manolis Tsamis wrote:
> Implementation of the new RISC-V optimization pass for memory offset
> calculations, documentation and testcases.
> 
> gcc/ChangeLog:
> 
> 	* config.gcc: Add riscv-fold-mem-offsets.o to extra_objs.
> 	* config/riscv/riscv-passes.def (INSERT_PASS_AFTER): Schedule a new
> 	pass.
> 	* config/riscv/riscv-protos.h (make_pass_fold_mem_offsets): Declare.
> 	* config/riscv/riscv.opt: New options.
> 	* config/riscv/t-riscv: New build rule.
> 	* doc/invoke.texi: Document new option.
> 	* config/riscv/riscv-fold-mem-offsets.cc: New file.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/riscv/fold-mem-offsets-1.c: New test.
> 	* gcc.target/riscv/fold-mem-offsets-2.c: New test.
> 	* gcc.target/riscv/fold-mem-offsets-3.c: New test.

So I made a small number of changes so that this could be run on other 
targets.


I had an hppa compiler handy, so it was trivial to do some light testing 
with that.  f-m-o didn't help at all on the included tests.  But I think 
that's more likely an artifact of the port supporting scaled indexed 
loads and doing fairly aggressive address rewriting to encourage that 
addressing mode.

Next I had an H8 compiler handy.  All three included tests showed 
improvement, both in terms of instruction count and size.  What was most 
interesting here is that f-m-o removed some redundant address 
calculations without needing to adjust the memory references which was a 
pleasant surprise.

Given the fact that both ports worked and the H8 showed an improvement, 
the next step was to put the patch into my tester.  It tests 30+ 
distinct processor families.  The goal wasn't to evaluate effectiveness, 
but to validate that those targets could still build their target 
libraries and successfully run their testsuites.

That's run through the various crosses.  Things like the hppa, alpha, 
m68k bootstraps only run once a week as they take many hours each.  The 
result is quite encouraging.  None of the crosses had any build issues 
or regressions.

The net result I think is we should probably move this to a target 
independent optimization pass.  We only need to generalize a few things.

Most importantly we need to get a resolution on the conditional I asked 
about inside get_single_def_in_bb.   There's some other refactoring I 
think we should do, but I'd really like to get a resolution on the code 
in get_single_def_in_bb first, then we ought to be able to move forward 
pretty quickly on the refactoring and integration.

jeff
  
Manolis Tsamis June 12, 2023, 7:32 a.m. UTC | #14
On Fri, Jun 9, 2023 at 3:57 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 5/25/23 06:35, Manolis Tsamis wrote:
> > Implementation of the new RISC-V optimization pass for memory offset
> > calculations, documentation and testcases.
> >
> > gcc/ChangeLog:
> >
> >       * config.gcc: Add riscv-fold-mem-offsets.o to extra_objs.
> >       * config/riscv/riscv-passes.def (INSERT_PASS_AFTER): Schedule a new
> >       pass.
> >       * config/riscv/riscv-protos.h (make_pass_fold_mem_offsets): Declare.
> >       * config/riscv/riscv.opt: New options.
> >       * config/riscv/t-riscv: New build rule.
> >       * doc/invoke.texi: Document new option.
> >       * config/riscv/riscv-fold-mem-offsets.cc: New file.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/riscv/fold-mem-offsets-1.c: New test.
> >       * gcc.target/riscv/fold-mem-offsets-2.c: New test.
> >       * gcc.target/riscv/fold-mem-offsets-3.c: New test.
> So a followup.
>
> While I think we probably could create a variety of backend patterns,
> perhaps disallow the frame pointer as the addend argument to a shadd
> pattern and the like and capture the key cases from mcf and probably
> deepsjeng it's probably not the best direction.
>
> What I suspect would ultimately happen is we'd be presented with
> additional cases over time that would require an ever increasing number
> of patterns.  sign vs zero extension, increasing depth of search space
> to find reassociation opportunities, different variants with and without
> shadd/zbb, etc etc.
>
> So with that space explored a bit the next big question is target
> specific or generic.  I'll poke in there a it over the coming days.  In
> the mean time I do have some questions/comments on the code itself.
> There may be more over time..
>
>
>
> > +static rtx_insn*
> > +get_single_def_in_bb (rtx_insn *insn, rtx reg)
> [ ... ]
>
>
> > +  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
> > +    {
> > +      /* Problem getting some definition for this instruction.  */
> > +      if (ref_link->ref == NULL)
> > +     return NULL;
> > +      if (DF_REF_INSN_INFO (ref_link->ref) == NULL)
> > +     return NULL;
> > +      if (global_regs[REGNO (reg)]
> > +       && !set_of (reg, DF_REF_INSN (ref_link->ref)))
> > +     return NULL;
> > +    }
> That last condition feels a bit odd.  It would seem that you wanted an
> OR boolean rather than AND.
>

Most of this function I didn't write myself; I used existing code to
get the definitions, taken from REE's get_defs.
In the original there's a comment about this line that explains it:

  As global regs are assumed to be defined at each function call
  dataflow can report a call_insn as being a definition of REG.
  But we can't do anything with that in this pass so proceed only
  if the instruction really sets REG in a way that can be deduced
  from the RTL structure.

This function is the only one I copied without changing much (because
I didn't quite understand it), so I don't know if that condition is
of any use for f-m-o.
Also the code duplication here is a bit unfortunate; maybe it would be
preferable to create a generic version that can be used in both?

>
> > +
> > +  unsigned int dest_regno = REGNO (dest);
> > +
> > +  /* We don't want to fold offsets from instructions that change some
> > +     particular registers with potentially global side effects.  */
> > +  if (!GP_REG_P (dest_regno)
> > +      || dest_regno == STACK_POINTER_REGNUM
> > +      || (frame_pointer_needed && dest_regno == HARD_FRAME_POINTER_REGNUM)
> > +      || dest_regno == GP_REGNUM
> > +      || dest_regno == THREAD_POINTER_REGNUM
> > +      || dest_regno == RETURN_ADDR_REGNUM)
> > +    return 0;
> I'd think most of this would be captured by testing fixed_registers
> rather than trying to list each register individually.  In fact, if we
> need to generalize this to work on other targets we almost certainly
> want a more general test.
>

Thanks, I knew there would be some proper way to test this but wasn't
aware which is the correct one.
Should this look like below? Or is the GP_REG_P redundant and just
fixed_regs will do?

  if (!GP_REG_P (dest_regno) || fixed_regs[dest_regno])
    return 0;

>
> > +      else if ((
> > +             GET_CODE (src) == SIGN_EXTEND
> > +             || GET_CODE (src) == ZERO_EXTEND
> > +           )
> > +           && MEM_P (XEXP (src, 0)))
> Formatting is goofy above...
>

Noted.

>
>
> > +
> > +       if (dump_file)
> > +         {
> > +           fprintf (dump_file, "Instruction deleted from folding:");
> > +           print_rtl_single (dump_file, insn);
> > +         }
> > +
> > +       if (REGNO (dest) != REGNO (arg1))
> > +         {
> > +           /* If the dest register is different than the fisrt argument
> > +              then the addition with constant 0 is equivalent to a move
> > +              instruction.  We emit the move and let the subsequent
> > +              pass cprop_hardreg eliminate that if possible.  */
> > +           rtx arg1_reg_rtx = gen_rtx_REG (GET_MODE (dest), REGNO (arg1));
> > +           rtx mov_rtx = gen_move_insn (dest, arg1_reg_rtx);
> > +           df_insn_rescan (emit_insn_after (mov_rtx, insn));
> > +         }
> > +
> > +       /* If the dest register is the same with the first argument
> > +          then the addition with constant 0 is a no-op.
> > +          We can now delete the original add immidiate instruction.  */
> > +       delete_insn (insn);
> The debugging message is a bit misleading.  Yea, we always delete
> something here, but in one case we end up emitting a copy.
>

Indeed. Maybe "Instruction reduced to move: ..."?

>
>
> > +
> > +       /* Temporarily change the offset in MEM to test whether
> > +          it results in a valid instruction.  */
> > +       machine_mode mode = GET_MODE (mem_addr);
> > +       XEXP (mem, 0) = gen_rtx_PLUS (mode, reg, GEN_INT (offset));
> > +
> > +       bool valid_change = recog (PATTERN (insn), insn, 0) >= 0;
> > +
> > +       /* Restore the instruction.  */
> > +       XEXP (mem, 0) = mem_addr;
> You need to reset the INSN_CODE after restoring the instruction.  That's
> generally a bad thing to do, but I've seen it done enough (and been
> guilty myself in the past) that we should just assume some ports are
> broken in this regard.
>

Ok thanks, I didn't know that one.

>
> Anyway, just wanted to get those issues raised so that you can address them.
>
> jeff
  
Manolis Tsamis June 12, 2023, 7:36 a.m. UTC | #15
On Thu, Jun 8, 2023 at 8:37 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 5/25/23 06:35, Manolis Tsamis wrote:
> > Implementation of the new RISC-V optimization pass for memory offset
> > calculations, documentation and testcases.
> >
> > gcc/ChangeLog:
> >
> >       * config.gcc: Add riscv-fold-mem-offsets.o to extra_objs.
> >       * config/riscv/riscv-passes.def (INSERT_PASS_AFTER): Schedule a new
> >       pass.
> >       * config/riscv/riscv-protos.h (make_pass_fold_mem_offsets): Declare.
> >       * config/riscv/riscv.opt: New options.
> >       * config/riscv/t-riscv: New build rule.
> >       * doc/invoke.texi: Document new option.
> >       * config/riscv/riscv-fold-mem-offsets.cc: New file.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/riscv/fold-mem-offsets-1.c: New test.
> >       * gcc.target/riscv/fold-mem-offsets-2.c: New test.
> >       * gcc.target/riscv/fold-mem-offsets-3.c: New test.
> So not going into the guts of the patch yet.
>
>  From a benchmark standpoint the only two that get out of the +-0.05%
> range are mcf and deepsjeng (from a dynamic instruction standpoint).  So
> from an evaluation standpoint we can probably focus our efforts there.
> And as we know, mcf is actually memory bound, so while improving its
> dynamic instruction count is good, the end performance improvement may
> be marginal.
>

Even if it's late, one question about the dynamic instruction numbers.
Was this measured just with f-m-o or with the stack pointer fold patch
applied too?
I remember I was getting better improvements in the past, but most of
the cases had to do with the stack pointer so the second patch is
necessary.

> As I mentioned to Philipp many months ago this reminds me a lot of a
> problem I've seen before.  Basically register elimination emits code
> that can be terrible in some circumstances.  So I went and poked at this
> again.
>
> I think the key difference between now and what I was dealing with
> before is for the cases that really matter for rv64 we have a shNadd
> insn in the sequence.  That private port I was working on before did not
> have shNadd (don't ask, I probably can't tell).  Our target also had
> reg+reg addressing modes.  What I can't remember was if we were trying
> harder to fold the constant terms into the memory reference or if we
> were more focused on the reg+reg.  Ultimately it's probably not that
> important to remember -- the key is there are very significant
> differences in the target's capabilities which impact how we should be
> generating code in this case.  Those differences affect the code we
> generate *and* the places where we can potentially get control and do
> some address rewriting.
>
> A  key sequence in mcf looks something like this in IRA, others have
> similar structure:
>
> > (insn 237 234 239 26 (set (reg:DI 377)
> >         (plus:DI (ashift:DI (reg:DI 200 [ _173 ])
> >                 (const_int 3 [0x3]))
> >             (reg/f:DI 65 frame))) "pbeampp.c":139:15 333 {*shNadd}
> >      (nil))
> > (insn 239 237 235 26 (set (reg/f:DI 380)
> >         (plus:DI (reg:DI 513)
> >             (reg:DI 377))) "pbeampp.c":139:15 5 {adddi3}
> >      (expr_list:REG_DEAD (reg:DI 377)
> >         (expr_list:REG_EQUAL (plus:DI (reg:DI 377)
> >                 (const_int -32768 [0xffffffffffff8000]))
> >             (nil))))
> [ ... ]
> > (insn 240 235 255 26 (set (reg/f:DI 204 [ _177 ])
> >         (mem/f:DI (plus:DI (reg/f:DI 380)
> >                 (const_int 280 [0x118])) [7 *_176+0 S8 A64])) "pbeampp.c":139:15 179 {*movdi_64bit}
> >      (expr_list:REG_DEAD (reg/f:DI 380)
> >         (nil)))
>
>
> The key here is insn 237.  It's generally going to be bad to have FP
> show up in a shadd insn because its going to be eliminated into
> sp+offset.  That'll generate an input reload before insn 237 and we
> can't do any combination with the constant in insn 239.
>
> After LRA it looks like this:
>
> > (insn 1540 234 1541 26 (set (reg:DI 11 a1 [750])
> >         (const_int 32768 [0x8000])) "pbeampp.c":139:15 179 {*movdi_64bit}
> >      (nil))
> > (insn 1541 1540 1611 26 (set (reg:DI 12 a2 [749])
> >         (plus:DI (reg:DI 11 a1 [750])
> >             (const_int -272 [0xfffffffffffffef0]))) "pbeampp.c":139:15 5 {adddi3}
> >      (expr_list:REG_EQUAL (const_int 32496 [0x7ef0])
> >         (nil)))
> > (insn 1611 1541 1542 26 (set (reg:DI 29 t4 [795])
> >         (plus:DI (reg/f:DI 2 sp)
> >             (const_int 64 [0x40]))) "pbeampp.c":139:15 5 {adddi3}
> >      (nil))
> > (insn 1542 1611 237 26 (set (reg:DI 12 a2 [749])
> >         (plus:DI (reg:DI 12 a2 [749])
> >             (reg:DI 29 t4 [795]))) "pbeampp.c":139:15 5 {adddi3}
> >      (nil))
> > (insn 237 1542 239 26 (set (reg:DI 12 a2 [377])
> >         (plus:DI (ashift:DI (reg:DI 14 a4 [orig:200 _173 ] [200])
> >                 (const_int 3 [0x3]))
> >             (reg:DI 12 a2 [749]))) "pbeampp.c":139:15 333 {*shNadd}
> >      (nil))
> > (insn 239 237 235 26 (set (reg/f:DI 12 a2 [380])
> >         (plus:DI (reg:DI 10 a0 [513])
> >             (reg:DI 12 a2 [377]))) "pbeampp.c":139:15 5 {adddi3}
> >      (expr_list:REG_EQUAL (plus:DI (reg:DI 12 a2 [377])
> >             (const_int -32768 [0xffffffffffff8000]))
> >         (nil)))
> [ ... ]
> > (insn 240 235 255 26 (set (reg/f:DI 14 a4 [orig:204 _177 ] [204])
> >         (mem/f:DI (plus:DI (reg/f:DI 12 a2 [380])
> >                 (const_int 280 [0x118])) [7 *_176+0 S8 A64])) "pbeampp.c":139:15 179 {*movdi_64bit}
> >      (nil))
>
>
> Reload/LRA made an absolute mess of that code.
>
> But before we add a new pass (target specific or generic), I think it
> may be in our best interest experiment a bit of creative rewriting to
> preserve the shadd, but without the frame pointer.  Perhaps also looking
> for a way to fold the constants, both the explicit ones and the implicit
> one from FP elimination.
>
> This looks particularly promising:
>
> > Trying 237, 239 -> 240:
> >   237: r377:DI=r200:DI<<0x3+frame:DI
> >       REG_DEAD r200:DI
> >   239: r380:DI=r513:DI+r377:DI
> >       REG_DEAD r377:DI
> >       REG_EQUAL r377:DI-0x8000
> >   240: r204:DI=[r380:DI+0x118]
> >       REG_DEAD r380:DI
> > Failed to match this instruction:
> > (set (reg/f:DI 204 [ _177 ])
> >     (mem/f:DI (plus:DI (plus:DI (plus:DI (mult:DI (reg:DI 200 [ _173 ])
> >                         (const_int 8 [0x8]))
> >                     (reg/f:DI 65 frame))
> >                 (reg:DI 513))
> >             (const_int 280 [0x118])) [7 *_176+0 S8 A64]))
>
>
> We could reassociate this as
>
> t1 = r200 * 8 + r513
> t2 = frame + 280
> t3 = t1 + t2
> r204 = *t3
>
> Which after elimination would be
>
> t1 = r2000 * 8 + r513
> t2 = sp + C + 280
> t3 = t1 + t2
> r204 = *t3
>
> C + 280 will simplify.  And we'll probably end up in the addptr3 case
> which I think gives us a chance to write this a bit so that we end up wit
> t1 = r200 * 8 + r513
> t2 = sp + t1
> r204 = *(t2 + 280 + C)
>
>
> Or at least I *think* we might be able to get there.  Anyway, as I said,
> I think this deserves a bit of playing around before we jump straight
> into adding a new pass.
>
> jeff
>
  
Manolis Tsamis June 12, 2023, 7:41 a.m. UTC | #16
On Sat, Jun 10, 2023 at 6:49 PM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 5/25/23 06:35, Manolis Tsamis wrote:
> > Implementation of the new RISC-V optimization pass for memory offset
> > calculations, documentation and testcases.
> >
> > gcc/ChangeLog:
> >
> >       * config.gcc: Add riscv-fold-mem-offsets.o to extra_objs.
> >       * config/riscv/riscv-passes.def (INSERT_PASS_AFTER): Schedule a new
> >       pass.
> >       * config/riscv/riscv-protos.h (make_pass_fold_mem_offsets): Declare.
> >       * config/riscv/riscv.opt: New options.
> >       * config/riscv/t-riscv: New build rule.
> >       * doc/invoke.texi: Document new option.
> >       * config/riscv/riscv-fold-mem-offsets.cc: New file.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/riscv/fold-mem-offsets-1.c: New test.
> >       * gcc.target/riscv/fold-mem-offsets-2.c: New test.
> >       * gcc.target/riscv/fold-mem-offsets-3.c: New test.
>
> So I made a small number of changes so that this could be run on other
> targets.
>
>
> I had an hppa compiler handy, so it was trivial to do some light testing
> with that.  f-m-o didn't help at all on the included tests.  But I think
> that's more likely an artifact of the port supporting scaled indexed
> loads and doing fairly aggressive address rewriting to encourage that
> addressing mode.
>
> Next I had an H8 compiler handy.  All three included tests showed
> improvement, both in terms of instruction count and size.  What was most
> interesting here is that f-m-o removed some redundant address
> calculations without needing to adjust the memory references which was a
> pleasant surprise.
>
> Given the fact that both ports worked and the H8 showed an improvement,
> the next step was to put the patch into my tester.  It tests 30+
> distinct processor families.  The goal wasn't to evaluate effectiveness,
> but to validate that those targets could still build their target
> libraries and successfully run their testsuites.
>
> That's run through the various crosses.  Things like the hppa, alpha,
> m68k bootstraps only run once a week as they take many hours each.  The
> result is quite encouraging.  None of the crosses had any build issues
> or regressions.
>

That's all great news!

> The net result I think is we should probably move this to a target
> independent optimization pass.  We only need to generalize a few things.
>

I also think that's where this should end up since most of the pass is
target independent anyway.
I just couldn't figure out what would be a proper way to model the
propagation rules for each target.
Is a target hook necessary for that?

> Most importantly we need to get a resolution on the conditional I asked
> about inside get_single_def_in_bb.   There's some other refactoring I
> think we should do, but I'd really like to get a resolution on the code
> in get_single_def_in_bb first, then we ought to be able to move forward
> pretty quickly on the refactoring and integration.
>

Just replied to that in my previous response :)

> jeff

Thanks,
Manolis
  
Jeff Law June 12, 2023, 2:37 p.m. UTC | #17
On 6/12/23 01:36, Manolis Tsamis wrote:

>>
> 
> Even if late, one question for the dynamic instruction numbers.
> Was this measured just with f-m-o or with the stack pointer fold patch
> applied too?
> I remember I was getting better improvements in the past, but most of
> the cases had to do with the stack pointer so the second patch is
> necessary.
It was just the main f-m-o patch, so there'll be additional benefits 
with the ability to cprop the stack pointer.   And even if we don't get 
measurable wins for something like mcf due to its memory bound nature, 
smaller, tighter code is always preferable.

Jeff
  
Jeff Law June 12, 2023, 9:36 p.m. UTC | #18
On 6/12/23 01:41, Manolis Tsamis wrote:

> 
> I also think that's where this should end up since most of the pass is
> target independent anyway.
> I just couldn't figure out what would be a proper way to model the
> propagation rules for each target.
> Is a target hook necessary for that?
No hook should be necessary.  You're already checking that the result is 
recognized.  In theory you shouldn't have to, but checking the 
constraints seems advisable as well.

Costing is a different matter.  You might end up changing an offset in such 
a way as to create a longer instruction on targets that have variable 
length encoding.  If we see that we'll likely have to add some rtx cost 
calls and compare the before/after.

But I suspect those cases are going to be limited in practice, and in 
general if we're able to delete an earlier instruction we're going to win 
even if the offset in the MEM changes and perhaps even results in a 
longer instruction.
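
If it does turn out to matter, I'd expect something roughly like this in
the validity check (sketch only, from memory, not tested):

	  /* Reject the fold if the rewritten memory insn would be costlier
	     than the original, e.g. because the new offset needs a longer
	     encoding.  Restore XEXP (mem, 0) and INSN_CODE afterwards as
	     before.  */
	  bool speed_p = optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn));
	  int old_cost = insn_cost (insn, speed_p);
	  XEXP (mem, 0) = gen_rtx_PLUS (mode, reg, GEN_INT (offset));
	  bool valid_change = (recog (PATTERN (insn), insn, 0) >= 0
			       && insn_cost (insn, speed_p) <= old_cost);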

Jeff
  
Jeff Law June 12, 2023, 9:58 p.m. UTC | #19
On 6/12/23 01:32, Manolis Tsamis wrote:

>>
>>> +  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
>>> +    {
>>> +      /* Problem getting some definition for this instruction.  */
>>> +      if (ref_link->ref == NULL)
>>> +     return NULL;
>>> +      if (DF_REF_INSN_INFO (ref_link->ref) == NULL)
>>> +     return NULL;
>>> +      if (global_regs[REGNO (reg)]
>>> +       && !set_of (reg, DF_REF_INSN (ref_link->ref)))
>>> +     return NULL;
>>> +    }
>> That last condition feels a bit odd.  It would seem that you wanted an
>> OR boolean rather than AND.
>>
> 
> Most of this function I didn't write by myself, I used existing code
> to get definitions taken from REE's get_defs.
> In the original there's a comment about this line this comment that explains it:
> 
>    As global regs are assumed to be defined at each function call
>    dataflow can report a call_insn as being a definition of REG.
>    But we can't do anything with that in this pass so proceed only
>    if the instruction really sets REG in a way that can be deduced
>    from the RTL structure.
> 
> This function is the only one I copied without changing much (because
> I didn't quite understand it), so I don't know if that condition is
> any useful for f-m-o.
> Also the code duplication here is a bit unfortunate, maybe it would be
> preferred to create a generic version that can be used in both?
Ah.  So I think the code is meant to filter out things that DF will say 
are set vs those which are actually exposed explicitly in the RTL (and 
which REE might be able to modify).  So we're probably good.

Those routines are pretty close to each other in implementation.  I 
bet we could take everything up to the loop over the ref links and 
factor that into a common function.   Both your function and get_defs 
would be able to use that and then do a bit of processing afterwards.
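
Roughly something like this (hypothetical name, untested sketch based on
the two existing routines):

/* Return the UD chain for REG as used in INSN, or NULL if any definition
   is missing or can't be deduced from the RTL structure.  Callers
   (get_defs in ree.cc and get_single_def_in_bb here) would then do their
   own post-processing on the returned chain.  */
static struct df_link *
get_def_chain (rtx_insn *insn, rtx reg)
{
  df_ref use;
  struct df_link *ref_chain, *ref_link;

  FOR_EACH_INSN_USE (use, insn)
    {
      if (GET_CODE (DF_REF_REG (use)) == SUBREG)
	return NULL;
      if (REGNO (DF_REF_REG (use)) == REGNO (reg))
	break;
    }

  if (!use)
    return NULL;

  ref_chain = DF_REF_CHAIN (use);
  if (!ref_chain)
    return NULL;

  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
    {
      /* Problem getting some definition for this instruction.  */
      if (ref_link->ref == NULL
	  || DF_REF_INSN_INFO (ref_link->ref) == NULL
	  || (global_regs[REGNO (reg)]
	      && !set_of (reg, DF_REF_INSN (ref_link->ref))))
	return NULL;
    }

  return ref_chain;
}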


> 
>>
>>> +
>>> +  unsigned int dest_regno = REGNO (dest);
>>> +
>>> +  /* We don't want to fold offsets from instructions that change some
>>> +     particular registers with potentially global side effects.  */
>>> +  if (!GP_REG_P (dest_regno)
>>> +      || dest_regno == STACK_POINTER_REGNUM
>>> +      || (frame_pointer_needed && dest_regno == HARD_FRAME_POINTER_REGNUM)
>>> +      || dest_regno == GP_REGNUM
>>> +      || dest_regno == THREAD_POINTER_REGNUM
>>> +      || dest_regno == RETURN_ADDR_REGNUM)
>>> +    return 0;
>> I'd think most of this would be captured by testing fixed_registers
>> rather than trying to list each register individually.  In fact, if we
>> need to generalize this to work on other targets we almost certainly
>> want a more general test.
>>
> 
> Thanks, I knew there would be some proper way to test this but wasn't
> aware which is the correct one.
> Should this look like below? Or is the GP_REG_P redundant and just
> fixed_regs will do?
If you want to verify it's a general register, then you have to ask if 
the regno is in the GENERAL_REGS class.  Something like:

TEST_HARD_REG_BIT (reg_class_contents[GENERAL_REGS], dest_regno)

GP_REG_P is a risc-v specific macro, so we can't use it here.

So something like
   if (fixed_regs[dest_regno]
       || !TEST_HARD_REG_BIT (reg_class_contents[GENERAL_REGS], dest_regno))



>> The debugging message is a bit misleading.  Yea, we always delete
>> something here, but in one case we end up emitting a copy.
>>
> 
> Indeed. Maybe "Instruction reduced to move: ..."?
Works for me.

> 
>>
>>
>>> +
>>> +       /* Temporarily change the offset in MEM to test whether
>>> +          it results in a valid instruction.  */
>>> +       machine_mode mode = GET_MODE (mem_addr);
>>> +       XEXP (mem, 0) = gen_rtx_PLUS (mode, reg, GEN_INT (offset));
>>> +
>>> +       bool valid_change = recog (PATTERN (insn), insn, 0) >= 0;
>>> +
>>> +       /* Restore the instruction.  */
>>> +       XEXP (mem, 0) = mem_addr;
>> You need to reset the INSN_CODE after restoring the instruction.  That's
>> generally a bad thing to do, but I've seen it done enough (and been
>> guilty myself in the past) that we should just assume some ports are
>> broken in this regard.
>>
> 
> Ok thanks, I didn't knew that one.
It's pretty obscure and I probably would have missed it if I hadn't 
debugged a problem related to this just a few months back.  It shouldn't 
be necessary due to rules about how the movXX patterns are supposed to 
work.  But I've seen it mucked up enough that it's the right thing to do.

Essentially when you call into recog, if the pattern is recognized, then 
a cookie is cached so that we know what pattern was recognized within 
the backend.

As far as generalizing to a target independent pass goes, you'll need to 
declare the new pass in tree-pass.h, add it to passes.def, wire up the 
option in common.opt, document it in invoke.texi and turn it on for -O2 
and above.  WRT the interaction with shorten-memrefs, I think that can 
be handled in override-options.  I think we want to have shorten-memrefs 
override.  So if shorten-memrefs is on, then we turn off f-m-o in the RV 
backend.  This should probably be documented in invoke.texi as well.
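
Roughly, the wiring would look something like this (the exact option and
variable spellings below are my guesses, untested):

  ; common.opt
  ffold-mem-offsets
  Common Var(flag_fold_mem_offsets) Optimization Init(0)
  Fold instructions calculating memory offsets to the memory access instruction if possible.

  /* tree-pass.h */
  extern rtl_opt_pass *make_pass_fold_mem_offsets (gcc::context *ctxt);

  /* passes.def, somewhere late in the RTL pipeline.  */
  NEXT_PASS (pass_fold_mem_offsets);

with the -O2 default presumably going through default_options_table in
opts.cc, and the RISC-V option-override hook doing something like

  if (riscv_mshorten_memrefs)
    flag_fold_mem_offsets = 0;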

It sounds like a lot, but I think each of those is a relatively simple 
change.  It'd be best if you could tackle those changes.  I think with 
that done and a bootstrap/regression test round we'd be ready to integrate 
your work.

Thanks!

jeff
  
Manolis Tsamis June 15, 2023, 5:34 p.m. UTC | #20
The new target-independent implementation of the fold-mem-offsets pass can
be found on the list (link is
https://gcc.gnu.org/pipermail/gcc-patches/2023-June/621920.html)

Aside from now being target independent, I have fixed a number of new
bugs that emerged when running this on other targets, as well as a minor
memory leak.
I have also improved the propagation logic in fold_offsets to work
with more patterns found in other targets (e.g. LEA instructions in
x86).
Finally I improved the naming of things (e.g. replaced uses of
'delete'/'remove' with 'fold', made bitmap names more meaningful) and
reduced unnecessary verbosity in some comments.

Thanks,
Manolis

On Tue, Jun 13, 2023 at 12:58 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 6/12/23 01:32, Manolis Tsamis wrote:
>
> >>
> >>> +  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
> >>> +    {
> >>> +      /* Problem getting some definition for this instruction.  */
> >>> +      if (ref_link->ref == NULL)
> >>> +     return NULL;
> >>> +      if (DF_REF_INSN_INFO (ref_link->ref) == NULL)
> >>> +     return NULL;
> >>> +      if (global_regs[REGNO (reg)]
> >>> +       && !set_of (reg, DF_REF_INSN (ref_link->ref)))
> >>> +     return NULL;
> >>> +    }
> >> That last condition feels a bit odd.  It would seem that you wanted an
> >> OR boolean rather than AND.
> >>
> >
> > Most of this function I didn't write by myself, I used existing code
> > to get definitions taken from REE's get_defs.
> > In the original there's a comment about this line this comment that explains it:
> >
> >    As global regs are assumed to be defined at each function call
> >    dataflow can report a call_insn as being a definition of REG.
> >    But we can't do anything with that in this pass so proceed only
> >    if the instruction really sets REG in a way that can be deduced
> >    from the RTL structure.
> >
> > This function is the only one I copied without changing much (because
> > I didn't quite understand it), so I don't know if that condition is
> > any useful for f-m-o.
> > Also the code duplication here is a bit unfortunate, maybe it would be
> > preferred to create a generic version that can be used in both?
> Ah.  So I think the code is meant to filter out things that DF will say
> are set vs those which are actually exposed explicitly in the RTL (and
> which REE might be able to modify).  So we're probably good.
>
> Those routines are are pretty close to each other in implementation.  I
> bet we could take everything up to the loop over the ref links and
> factor that into a common function.   Both your function and get_defs
> would be able to use that and then do bit of processing afterwards.
>
>
> >
> >>
> >>> +
> >>> +  unsigned int dest_regno = REGNO (dest);
> >>> +
> >>> +  /* We don't want to fold offsets from instructions that change some
> >>> +     particular registers with potentially global side effects.  */
> >>> +  if (!GP_REG_P (dest_regno)
> >>> +      || dest_regno == STACK_POINTER_REGNUM
> >>> +      || (frame_pointer_needed && dest_regno == HARD_FRAME_POINTER_REGNUM)
> >>> +      || dest_regno == GP_REGNUM
> >>> +      || dest_regno == THREAD_POINTER_REGNUM
> >>> +      || dest_regno == RETURN_ADDR_REGNUM)
> >>> +    return 0;
> >> I'd think most of this would be captured by testing fixed_registers
> >> rather than trying to list each register individually.  In fact, if we
> >> need to generalize this to work on other targets we almost certainly
> >> want a more general test.
> >>
> >
> > Thanks, I knew there would be some proper way to test this but wasn't
> > aware which is the correct one.
> > Should this look like below? Or is the GP_REG_P redundant and just
> > fixed_regs will do?
> If you want to verify it's a general register, then you have to ask if
> the regno is in the GENERAL_REGS class.  Something like:
>
> TEST_HARD_REG_BIT (reg_class_contents[GENERAL_REGS], dest_regno)
>
> GP_REG_P is a risc-v specific macro, so we can't use it here.
>
> So something like
>    if (fixed_regs[dest_regno]
>        || !TEST_HARD_REG_BIT (reg_class_contents[GENERAL_REGS], dest_regno))
>
>
>
> >> The debugging message is a bit misleading.  Yea, we always delete
> >> something here, but in one case we end up emitting a copy.
> >>
> >
> > Indeed. Maybe "Instruction reduced to move: ..."?
> Works for me.
>
> >
> >>
> >>
> >>> +
> >>> +       /* Temporarily change the offset in MEM to test whether
> >>> +          it results in a valid instruction.  */
> >>> +       machine_mode mode = GET_MODE (mem_addr);
> >>> +       XEXP (mem, 0) = gen_rtx_PLUS (mode, reg, GEN_INT (offset));
> >>> +
> >>> +       bool valid_change = recog (PATTERN (insn), insn, 0) >= 0;
> >>> +
> >>> +       /* Restore the instruction.  */
> >>> +       XEXP (mem, 0) = mem_addr;
> >> You need to reset the INSN_CODE after restoring the instruction.  That's
> >> generally a bad thing to do, but I've seen it done enough (and been
> >> guilty myself in the past) that we should just assume some ports are
> >> broken in this regard.
> >>
> >
> > Ok thanks, I didn't knew that one.
> It's pretty obscure and I probably would have missed it if I hadn't
> debugged a problem related to this just a few months back.  It shouldn't
> be necessary due to rules about how the movXX patterns are supposed to
> work.  But I've seen it mucked up enough that it's the right thing to do.
>
> Essentially when you call into recog, if the pattern is recognized, then
> a cookie is cached so that we know what pattern was recognized within
> the backend.
>
> As far as generalizing to a target independent pass.  You'll need to
> declare the new pass in tree-pass.h, add it to passes.def, wire up the
> option in common.opt, document it in invoke.texi and turn it on for -O2
> and above.  WRT the interaction with shorten-memrefs, I think that can
> be handled in override-options.  I think we want to have shorten-memrefs
> override.  So if shorten-memrefs is on, then we turn off f-o-m in the RV
> backend.  This should probably be documented in invoke.texi as well.
>
> It sounds like a lot, but I think each of those is a relatively simple
> change.  It'd be best if you could tackle those changes.  I think with
> that done and bootstrap/regression test round we'd be ready to integrate
> your work.
>
> Thanks!
>
> jeff
  

Patch

diff --git a/gcc/config.gcc b/gcc/config.gcc
index d88071773c9..5dffd21b4c8 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -529,7 +529,7 @@  pru-*-*)
 	;;
 riscv*)
 	cpu_type=riscv
-	extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o riscv-shorten-memrefs.o riscv-selftests.o riscv-v.o riscv-vsetvl.o"
+	extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o riscv-shorten-memrefs.o riscv-fold-mem-offsets.o riscv-selftests.o riscv-v.o riscv-vsetvl.o"
 	extra_objs="${extra_objs} riscv-vector-builtins.o riscv-vector-builtins-shapes.o riscv-vector-builtins-bases.o"
 	extra_objs="${extra_objs} thead.o"
 	d_target_objs="riscv-d.o"
diff --git a/gcc/config/riscv/riscv-fold-mem-offsets.cc b/gcc/config/riscv/riscv-fold-mem-offsets.cc
new file mode 100644
index 00000000000..81325bb3beb
--- /dev/null
+++ b/gcc/config/riscv/riscv-fold-mem-offsets.cc
@@ -0,0 +1,637 @@ 
+/* Fold memory offsets pass for RISC-V.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define IN_TARGET_CODE 1
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "rtl.h"
+#include "tree.h"
+#include "expr.h"
+#include "backend.h"
+#include "regs.h"
+#include "target.h"
+#include "memmodel.h"
+#include "emit-rtl.h"
+#include "insn-config.h"
+#include "recog.h"
+#include "predict.h"
+#include "df.h"
+#include "tree-pass.h"
+#include "cfgrtl.h"
+
+/* This pass tries to optimize memory offset calculations by moving them
+   from add immediate instructions to the memory loads/stores.
+   For example it can transform this:
+
+     addi t4,sp,16
+     add  t2,a6,t4
+     shl  t3,t2,1
+     ld   a2,0(t3)
+     addi a2,1
+     sd   a2,8(t2)
+
+   into the following (one instruction less):
+
+     add  t2,a6,sp
+     shl  t3,t2,1
+     ld   a2,32(t3)
+     addi a2,1
+     sd   a2,24(t2)
+
+   Usually, the code generated from the previous passes tries to have the
+   offsets in the memory instructions but this pass is still beneficial
+   because:
+
+    - There are cases where add instructions are added in a late rtl pass
+      and the rest of the pipeline cannot eliminate them.  Specifically,
+      arrays and structs allocated on the stack can result in multiple
+      unnecessary add instructions that cannot be eliminated easily
+      otherwise.
+
+    - The existing mechanisms that move offsets to memory instructions
+      usually apply only to specific patterns or have other limitations.
+      This pass is very generic and can fold offsets through complex
+      calculations with multiple memory uses and partially overlapping
+      calculations.  As a result it can eliminate more instructions than
+      what is possible otherwise.
+
+   This pass runs within single basic blocks and consists of 4 phases:
+
+    - Phase 1 (Analysis): Find "foldable" instructions.
+      Foldable instructions are those that we know how to propagate
+      a constant addition through (add, slli, mv, ...) and only have other
+      foldable instructions for uses.  In that phase a DFS traversal on the
+      definition tree is performed and foldable instructions are marked on
+      a bitmap.  The add immediate instructions that are reachable in this
+      DFS are candidates for removal since all the intermediate
+      calculations affected by them are also foldable.
+
+    - Phase 2 (Validity): Traverse again, this time calculating the
+      offsets that would result from folding all add immediate instructions
+      found.  Also keep track of which instructions will be folded for this
+      particular offset because folding can be partially or completely
+      shared across a number of different memory instructions.  At this point,
+      since we calculated the actual offset resulting from folding, we check
+      and keep track if it's a valid 12-bit immediate.
+
+    - Phase 3 (Commit offsets): Traverse again.  This time it is known if
+      a particular fold is valid so actually fold the offset by changing
+      the RTL statement.  It's important that this phase is separate from the
+      previous because one instruction that is foldable with a valid offset
+      can result in an invalid offset for another instruction later on.
+
+    - Phase 4 (Commit instruction deletions): Scan all insns and delete
+      all add immediate instructions that were folded.  */
+
+namespace {
+
+const pass_data pass_data_fold_mem =
+{
+  RTL_PASS, /* type */
+  "fold_mem_offsets", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_fold_mem_offsets : public rtl_opt_pass
+{
+public:
+  pass_fold_mem_offsets (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_fold_mem, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      return riscv_mfold_mem_offsets
+	       && optimize >= 2;
+    }
+
+  virtual unsigned int execute (function *);
+}; // class pass_fold_mem_offsets
+
+/* Bitmap that tracks which instructions are reachable through sequences
+   of foldable instructions.  */
+static bitmap_head can_fold_insn;
+
+/* Bitmap with instructions marked for deletion due to folding.  */
+static bitmap_head pending_remove_insn;
+
+/* Bitmap with instructions that cannot be deleted because that would
+   require folding an offset that's invalid in some memory access.
+   An instruction can be in both PENDING_REMOVE_INSN and CANNOT_REMOVE_INSN
+   at the same time, in which case it cannot be safely deleted.  */
+static bitmap_head cannot_remove_insn;
+
+/* The number of folded addi instructions of the form "addi reg, sp, X".  */
+static int stats_folded_sp;
+
+/* The number of other folded addi instructions.  */
+static int stats_folded_other;
+
+enum fold_mem_phase
+{
+  FM_PHASE_ANALYSIS,
+  FM_PHASE_VALIDITY,
+  FM_PHASE_COMMIT_OFFSETS,
+  FM_PHASE_COMMIT_INSNS
+};
+
+/* Helper function for fold_offsets.
+  Get the single reaching definition of an instruction inside a BB.
+  The definition is desired for REG used in INSN.
+  Return the definition insn or NULL if there's no definition with
+  the desired criteria.  */
+static rtx_insn*
+get_single_def_in_bb (rtx_insn *insn, rtx reg)
+{
+  df_ref use;
+  struct df_link *ref_chain, *ref_link;
+
+  FOR_EACH_INSN_USE (use, insn)
+    {
+      if (GET_CODE (DF_REF_REG (use)) == SUBREG)
+	return NULL;
+      if (REGNO (DF_REF_REG (use)) == REGNO (reg))
+	break;
+    }
+
+  if (!use)
+    return NULL;
+
+  ref_chain = DF_REF_CHAIN (use);
+
+  if (!ref_chain)
+    return NULL;
+
+  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
+    {
+      /* Problem getting some definition for this instruction.  */
+      if (ref_link->ref == NULL)
+	return NULL;
+      if (DF_REF_INSN_INFO (ref_link->ref) == NULL)
+	return NULL;
+      if (global_regs[REGNO (reg)]
+	  && !set_of (reg, DF_REF_INSN (ref_link->ref)))
+	return NULL;
+    }
+
+  if (ref_chain->next)
+    return NULL;
+
+  rtx_insn* def = DF_REF_INSN (ref_chain->ref);
+
+  if (BLOCK_FOR_INSN (def) != BLOCK_FOR_INSN (insn))
+    return NULL;
+
+  if (DF_INSN_LUID (def) > DF_INSN_LUID (insn))
+    return NULL;
+
+  return def;
+}
+
+/* Helper function for fold_offsets.
+   Get all the reaching uses of an instruction.  The uses are desired for REG
+   set in INSN.  Return use list or NULL if a use is missing or irregular.
+   If SUCCESS is not NULL then its value is set to false if there are
+   missing or irregular uses and to true otherwise.  */
+static struct df_link*
+get_uses (rtx_insn *insn, rtx reg, bool* success)
+{
+  df_ref def;
+  struct df_link *ref_chain, *ref_link;
+
+  if (success != NULL)
+    *success = false;
+
+  FOR_EACH_INSN_DEF (def, insn)
+    if (REGNO (DF_REF_REG (def)) == REGNO (reg))
+      break;
+
+  if (!def)
+    return NULL;
+
+  ref_chain = DF_REF_CHAIN (def);
+
+  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
+    {
+      /* Problem getting some use for this instruction.  */
+      if (ref_link->ref == NULL)
+	return NULL;
+      if (DF_REF_CLASS (ref_link->ref) != DF_REF_REGULAR)
+	return NULL;
+    }
+
+  if (success != NULL)
+    *success = true;
+
+  return ref_chain;
+}
+
+/* Recursive function that computes the foldable offsets through the
+   definitions of REG in INSN given an integer scale factor SCALE.
+   Returns the offset that would have to be added if all instructions
+   in PENDING_DELETES were to be deleted.
+
+  - if ANALYZE is true then it recurses through definitions with the common
+    code and marks instructions that are eligible for folding in the bitmap
+    can_fold_insn.  An instruction is eligible if all its uses are also
+    eligible.  Initially can_fold_insn is true for memory accesses.
+
+  - if ANALYZE is false then it recurses through definitions with the common
+    code and computes and returns the offset that would result if the
+    instructions in PENDING_DELETES were to be deleted.  */
+static HOST_WIDE_INT
+fold_offsets (rtx_insn* insn, rtx reg, int scale, bool analyze,
+	      bitmap pending_deletes)
+{
+  rtx_insn* def = get_single_def_in_bb (insn, reg);
+
+  if (!def)
+    return 0;
+
+  rtx set = single_set (def);
+
+  if (!set)
+    return 0;
+
+  rtx src = SET_SRC (set);
+  rtx dest = SET_DEST (set);
+
+  enum rtx_code code = GET_CODE (src);
+
+  /* Return early for SRC codes that we don't know how to handle.  */
+  if (code != PLUS && code != ASHIFT && code != REG)
+    return 0;
+
+  unsigned int dest_regno = REGNO (dest);
+
+  /* We don't want to fold offsets from instructions that change some
+     particular registers with potentially global side effects.  */
+  if (!GP_REG_P (dest_regno)
+      || dest_regno == STACK_POINTER_REGNUM
+      || (frame_pointer_needed && dest_regno == HARD_FRAME_POINTER_REGNUM)
+      || dest_regno == GP_REGNUM
+      || dest_regno == THREAD_POINTER_REGNUM
+      || dest_regno == RETURN_ADDR_REGNUM)
+    return 0;
+
+  if (analyze)
+    {
+      /* We can only fold through instructions that are eventually used as
+	 memory addresses and do not have other uses.  Use the same logic
+	 from the offset calculation to visit instructions that can
+	 propagate offsets and keep track in can_fold_insn of which ones have
+	 uses that always end in memory instructions.  */
+
+      if (REG_P (dest))
+	{
+	  bool success;
+	  struct df_link *uses = get_uses (def, dest, &success), *ref_link;
+
+	  if (!success)
+	    return 0;
+
+	  for (ref_link = uses; ref_link; ref_link = ref_link->next)
+	    {
+	      rtx_insn* use = DF_REF_INSN (ref_link->ref);
+
+	      /* Ignore debug insns during analysis.  */
+	      if (DEBUG_INSN_P (use))
+		continue;
+
+	      if (!bitmap_bit_p (&can_fold_insn, INSN_UID (use)))
+		return 0;
+
+	      rtx use_set = single_set (use);
+
+	      /* Prevent folding when a memory store uses the dest register.  */
+	      if (use_set
+		  && MEM_P (SET_DEST (use_set))
+		  && REG_P (SET_SRC (use_set))
+		  && REGNO (SET_SRC (use_set)) == REGNO (dest))
+		return 0;
+	    }
+
+	  bitmap_set_bit (&can_fold_insn, INSN_UID (def));
+	}
+    }
+
+  if (!bitmap_bit_p (&can_fold_insn, INSN_UID (def)))
+    return 0;
+
+  switch (code)
+    {
+    case PLUS:
+      {
+	/* Propagate through add.  */
+	rtx arg1 = XEXP (src, 0);
+	rtx arg2 = XEXP (src, 1);
+
+	HOST_WIDE_INT offset = 0;
+
+	if (REG_P (arg1))
+	  offset += fold_offsets (def, arg1, 1, analyze, pending_deletes);
+	else if (GET_CODE (arg1) == ASHIFT && REG_P (XEXP (arg1, 0))
+		 && CONST_INT_P (XEXP (arg1, 1)))
+	  {
+	    /* Also handle shift-and-add (shNadd) from the Zba extension.  */
+	    int shift_scale = (1 << (int) INTVAL (XEXP (arg1, 1)));
+	    offset += fold_offsets (def, XEXP (arg1, 0), shift_scale, analyze,
+				    pending_deletes);
+	  }
+
+	if (REG_P (arg2))
+	  offset += fold_offsets (def, arg2, 1, analyze, pending_deletes);
+	else if (CONST_INT_P (arg2) && !analyze)
+	  {
+	    offset += INTVAL (arg2);
+	    bitmap_set_bit (pending_deletes, INSN_UID (def));
+	  }
+
+	return scale * offset;
+      }
+    case ASHIFT:
+      {
+	/* Propagate through sll.  */
+	rtx arg1 = XEXP (src, 0);
+	rtx arg2 = XEXP (src, 1);
+
+	if (REG_P (arg1) && CONST_INT_P (arg2))
+	  {
+	    int shift_scale = (1 << (int) INTVAL (arg2));
+	    return scale * fold_offsets (def, arg1, shift_scale, analyze,
+					 pending_deletes);
+	  }
+
+	return 0;
+      }
+    case REG:
+      /* Propagate through mv.  */
+      return scale * fold_offsets (def, src, 1, analyze, pending_deletes);
+    default:
+      /* Cannot propagate.  */
+      return 0;
+    }
+}
+
+/* Helper function for fold_offset_mem.
+   If INSN is a set rtx that loads from or stores to
+   some memory location that could have an offset folded
+   to it, return the rtx for the memory operand.  */
+static rtx
+get_foldable_mem_rtx (rtx_insn* insn)
+{
+  rtx set = single_set (insn);
+
+  if (set != NULL_RTX)
+    {
+      rtx src = SET_SRC (set);
+      rtx dest = SET_DEST (set);
+
+      /* We don't want folding if the memory has
+	 unspec/unspec volatile in either src or dest.
+	 In particular this also prevents folding
+	 when atomics are involved.  */
+      if (GET_CODE (src) == UNSPEC
+	  || GET_CODE (src) == UNSPEC_VOLATILE
+	  || GET_CODE (dest) == UNSPEC
+	  || GET_CODE (dest) == UNSPEC_VOLATILE)
+	return NULL;
+
+      if (MEM_P (src))
+	return src;
+      else if (MEM_P (dest))
+	return dest;
+      else if ((
+		GET_CODE (src) == SIGN_EXTEND
+		|| GET_CODE (src) == ZERO_EXTEND
+	      )
+	      && MEM_P (XEXP (src, 0)))
+	return XEXP (src, 0);
+    }
+
+  return NULL;
+}
+
+/* Driver function that performs the actions defined by PHASE for INSN.  */
+static void
+fold_offset_mem (rtx_insn* insn, int phase)
+{
+  if (phase == FM_PHASE_COMMIT_INSNS)
+    {
+      if (bitmap_bit_p (&pending_remove_insn, INSN_UID (insn))
+	  && !bitmap_bit_p (&cannot_remove_insn, INSN_UID (insn)))
+	{
+	  rtx set = single_set (insn);
+	  rtx src = SET_SRC (set);
+	  rtx dest = SET_DEST (set);
+	  rtx arg1 = XEXP (src, 0);
+
+	  /* INSN is an add immediate addi DEST, SRC1, SRC2 that we
+	     must replace with addi DEST, SRC1, 0.  */
+	  if (XEXP (src, 0) == stack_pointer_rtx)
+	    stats_folded_sp++;
+	  else
+	    stats_folded_other++;
+
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Instruction deleted from folding:");
+	      print_rtl_single (dump_file, insn);
+	    }
+
+	  if (REGNO (dest) != REGNO (arg1))
+	    {
+	      /* If the dest register is different from the first argument
+		 then the addition with constant 0 is equivalent to a move
+		 instruction.  We emit the move and let the subsequent
+		 pass cprop_hardreg eliminate that if possible.  */
+	      rtx arg1_reg_rtx = gen_rtx_REG (GET_MODE (dest), REGNO (arg1));
+	      rtx mov_rtx = gen_move_insn (dest, arg1_reg_rtx);
+	      df_insn_rescan (emit_insn_after (mov_rtx, insn));
+	    }
+
+	  /* If the dest register is the same as the first argument
+	     then the addition with constant 0 is a no-op.
+	     We can now delete the original add immediate instruction.  */
+	  delete_insn (insn);
+	}
+    }
+  else
+    {
+      rtx mem = get_foldable_mem_rtx (insn);
+
+      if (!mem)
+	return;
+
+      rtx mem_addr = XEXP (mem, 0);
+      rtx reg;
+      HOST_WIDE_INT cur_off;
+
+      if (REG_P (mem_addr))
+	{
+	  reg = mem_addr;
+	  cur_off = 0;
+	}
+      else if (GET_CODE (mem_addr) == PLUS
+	       && REG_P (XEXP (mem_addr, 0))
+	       && CONST_INT_P (XEXP (mem_addr, 1)))
+	{
+	  reg = XEXP (mem_addr, 0);
+	  cur_off = INTVAL (XEXP (mem_addr, 1));
+	}
+      else
+	return;
+
+      if (phase == FM_PHASE_ANALYSIS)
+	{
+	  bitmap_set_bit (&can_fold_insn, INSN_UID (insn));
+	  fold_offsets (insn, reg, 1, true, NULL);
+	}
+      else if (phase == FM_PHASE_VALIDITY)
+	{
+	  bitmap_head new_pending_deletes;
+	  bitmap_initialize (&new_pending_deletes, NULL);
+	  HOST_WIDE_INT offset = cur_off + fold_offsets (insn, reg, 1, false,
+							&new_pending_deletes);
+
+	  /* Temporarily change the offset in MEM to test whether
+	     it results in a valid instruction.  */
+	  machine_mode mode = GET_MODE (mem_addr);
+	  XEXP (mem, 0) = gen_rtx_PLUS (mode, reg, GEN_INT (offset));
+
+	  bool valid_change = recog (PATTERN (insn), insn, 0) >= 0;
+
+	  /* Restore the instruction.  */
+	  XEXP (mem, 0) = mem_addr;
+
+	  if (valid_change)
+	    bitmap_ior_into (&pending_remove_insn, &new_pending_deletes);
+	  else
+	    bitmap_ior_into (&cannot_remove_insn, &new_pending_deletes);
+	  bitmap_release (&new_pending_deletes);
+	}
+      else if (phase == FM_PHASE_COMMIT_OFFSETS)
+	{
+	  bitmap_head required_deletes;
+	  bitmap_initialize (&required_deletes, NULL);
+	  HOST_WIDE_INT offset = cur_off + fold_offsets (insn, reg, 1, false,
+							 &required_deletes);
+	  bool illegal = bitmap_intersect_p (&required_deletes,
+					     &cannot_remove_insn);
+
+	  if (offset == cur_off)
+	    return;
+
+	  gcc_assert (!bitmap_empty_p (&required_deletes));
+
+	  /* We have to update CANNOT_REMOVE_INSN again if transforming
+	     this instruction is illegal.  */
+	  if (illegal)
+	    bitmap_ior_into (&cannot_remove_insn, &required_deletes);
+	  else
+	    {
+	      machine_mode mode = GET_MODE (mem_addr);
+	      XEXP (mem, 0) = gen_rtx_PLUS (mode, reg, GEN_INT (offset));
+	      df_insn_rescan (insn);
+
+	      if (dump_file)
+		{
+		  fprintf (dump_file, "Memory offset changed from "
+				      HOST_WIDE_INT_PRINT_DEC
+				      " to "
+				      HOST_WIDE_INT_PRINT_DEC
+				      " for instruction:\n", cur_off, offset);
+		  print_rtl_single (dump_file, insn);
+		}
+	    }
+	  bitmap_release (&required_deletes);
+	}
+    }
+}
+
+unsigned int
+pass_fold_mem_offsets::execute (function *fn)
+{
+  basic_block bb;
+  rtx_insn *insn;
+
+  df_set_flags (DF_RD_PRUNE_DEAD_DEFS | DF_DEFER_INSN_RESCAN);
+  df_chain_add_problem (DF_UD_CHAIN + DF_DU_CHAIN);
+  df_analyze ();
+
+  bitmap_initialize (&can_fold_insn, NULL);
+  bitmap_initialize (&pending_remove_insn, NULL);
+  bitmap_initialize (&cannot_remove_insn, NULL);
+
+  stats_folded_sp = 0;
+  stats_folded_other = 0;
+
+  FOR_ALL_BB_FN (bb, fn)
+    {
+      /* The shorten-memrefs pass runs when a BB is optimized for size
+	 and moves offsets from multiple memory instructions to a common
+	 add instruction.  Disable folding if optimizing for size because
+	 this pass will cancel the effects of shorten-memrefs.  */
+      if (optimize_bb_for_size_p (bb))
+	continue;
+
+      bitmap_clear (&can_fold_insn);
+      bitmap_clear (&pending_remove_insn);
+      bitmap_clear (&cannot_remove_insn);
+
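+      /* First pass: find memory accesses whose address could have an
+	 offset folded into it and analyze the instructions computing it.  */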
+      FOR_BB_INSNS (bb, insn)
+	fold_offset_mem (insn, FM_PHASE_ANALYSIS);
+
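+      /* Second pass: compute the folded offset for each candidate and use
+	 recog to check that the resulting address is valid, recording which
+	 offset-calculating instructions can or cannot be removed.  */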
+      FOR_BB_INSNS (bb, insn)
+	fold_offset_mem (insn, FM_PHASE_VALIDITY);
+
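+      /* Third pass: rewrite the memory addresses whose folding does not
+	 depend on instructions that must be kept.  */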
+      FOR_BB_INSNS (bb, insn)
+	fold_offset_mem (insn, FM_PHASE_COMMIT_OFFSETS);
+
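+      /* Fourth pass: delete the add-immediate instructions whose constants
+	 were folded, emitting a move instead where the destination differs
+	 from the source register.  */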
+      FOR_BB_INSNS (bb, insn)
+	fold_offset_mem (insn, FM_PHASE_COMMIT_INSNS);
+    }
+
+  statistics_counter_event (cfun, "addi with sp fold", stats_folded_sp);
+  statistics_counter_event (cfun, "other addi fold", stats_folded_other);
+
+  bitmap_release (&can_fold_insn);
+  bitmap_release (&pending_remove_insn);
+  bitmap_release (&cannot_remove_insn);
+
+  return 0;
+}
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_fold_mem_offsets (gcc::context *ctxt)
+{
+  return new pass_fold_mem_offsets (ctxt);
+}
diff --git a/gcc/config/riscv/riscv-passes.def b/gcc/config/riscv/riscv-passes.def
index 4084122cf0a..dc08daadc66 100644
--- a/gcc/config/riscv/riscv-passes.def
+++ b/gcc/config/riscv/riscv-passes.def
@@ -18,4 +18,5 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 INSERT_PASS_AFTER (pass_rtl_store_motion, 1, pass_shorten_memrefs);
+INSERT_PASS_AFTER (pass_regrename, 1, pass_fold_mem_offsets);
 INSERT_PASS_BEFORE (pass_fast_rtl_dce, 1, pass_vsetvl);
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 5f78fd579bb..b89a82adb0e 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -104,6 +104,7 @@  extern void riscv_parse_arch_string (const char *, struct gcc_options *, locatio
 extern bool riscv_hard_regno_rename_ok (unsigned, unsigned);
 
 rtl_opt_pass * make_pass_shorten_memrefs (gcc::context *ctxt);
+rtl_opt_pass * make_pass_fold_mem_offsets (gcc::context *ctxt);
 rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
 
 /* Information about one CPU we know about.  */
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index 63d4710cb15..5e1fbdbedcc 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -105,6 +105,10 @@  Convert BASE + LARGE_OFFSET addresses to NEW_BASE + SMALL_OFFSET to allow more
 memory accesses to be generated as compressed instructions.  Currently targets
 32-bit integer load/stores.
 
+mfold-mem-offsets
+Target Bool Var(riscv_mfold_mem_offsets) Init(1)
+Fold instructions calculating memory offsets to the memory access instruction if possible.
+
 mcmodel=
 Target RejectNegative Joined Enum(code_model) Var(riscv_cmodel) Init(TARGET_DEFAULT_CMODEL)
 Specify the code model.
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index 1252d6f851a..f29cf463867 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -76,6 +76,10 @@  riscv-shorten-memrefs.o: $(srcdir)/config/riscv/riscv-shorten-memrefs.cc \
 	$(COMPILE) $<
 	$(POSTCOMPILE)
 
+riscv-fold-mem-offsets.o: $(srcdir)/config/riscv/riscv-fold-mem-offsets.cc
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+
 riscv-selftests.o: $(srcdir)/config/riscv/riscv-selftests.cc \
   $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(TREE_H) output.h \
   $(C_COMMON_H) $(TARGET_H) $(OPTABS_H) $(EXPR_H) $(INSN_ATTR_H) $(EMIT_RTL_H)
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index ee78591c73e..39b57cab595 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1218,6 +1218,7 @@  See RS/6000 and PowerPC Options.
 -msmall-data-limit=@var{N-bytes}
 -msave-restore  -mno-save-restore
 -mshorten-memrefs  -mno-shorten-memrefs
+-mfold-mem-offsets  -mno-fold-mem-offsets
 -mstrict-align  -mno-strict-align
 -mcmodel=medlow  -mcmodel=medany
 -mexplicit-relocs  -mno-explicit-relocs
@@ -29048,6 +29049,13 @@  of 'new base + small offset'.  If the new base gets stored in a compressed
 register, then the new load/store can be compressed.  Currently targets 32-bit
 integer load/stores only.
 
+@opindex mfold-mem-offsets
+@item -mfold-mem-offsets
+@itemx -mno-fold-mem-offsets
+Do or do not attempt to move constant additions used for memory offsets into
+the corresponding memory instructions.  The default is
+@option{-mfold-mem-offsets} at levels @option{-O2} and @option{-O3}.
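+For example, in a sequence such as the following (the registers shown are
+only illustrative) the address is computed with a separate @code{addi}:
+@smallexample
+        addi    a0,sp,512
+        lw      a1,0(a0)
+@end smallexample
+With @option{-mfold-mem-offsets} the constant can typically be folded into
+the memory access itself, allowing the @code{addi} to be removed:
+@smallexample
+        lw      a1,512(sp)
+@end smallexample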
+
 @opindex mstrict-align
 @item -mstrict-align
 @itemx -mno-strict-align
diff --git a/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
new file mode 100644
index 00000000000..574cc92b6ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfold-mem-offsets" } */
+
+void sink(int arr[2]);
+
+void
+foo(int a, int b, int i)
+{
+  int arr[2] = {a, b};
+  arr[i]++;
+  sink(arr);
+}
+
+// Should compile without negative memory offsets when using -mfold-mem-offsets
+/* { dg-final { scan-assembler-not "lw\t.*,-.*\\(.*\\)" } } */
+/* { dg-final { scan-assembler-not "sw\t.*,-.*\\(.*\\)" } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
new file mode 100644
index 00000000000..e6c251d3a3c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
@@ -0,0 +1,24 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfold-mem-offsets" } */
+
+void sink(int arr[3]);
+
+void
+foo(int a, int b, int c, int i)
+{
+  int arr1[3] = {a, b, c};
+  int arr2[3] = {a, c, b};
+  int arr3[3] = {c, b, a};
+
+  arr1[i]++;
+  arr2[i]++;
+  arr3[i]++;
+
+  sink(arr1);
+  sink(arr2);
+  sink(arr3);
+}
+
+// Should compile without negative memory offsets when using -mfold-mem-offsets
+/* { dg-final { scan-assembler-not "lw\t.*,-.*\\(.*\\)" } } */
+/* { dg-final { scan-assembler-not "sw\t.*,-.*\\(.*\\)" } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c
new file mode 100644
index 00000000000..8586d3e3a29
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfold-mem-offsets" } */
+
+void load(int arr[2]);
+
+int
+foo(long unsigned int i)
+{
+  int arr[2];
+  load(arr);
+
+  return arr[3 * i + 77];
+}
+
+// Should compile without negative memory offsets when using -mfold-mem-offsets
+/* { dg-final { scan-assembler-not "lw\t.*,-.*\\(.*\\)" } } */
+/* { dg-final { scan-assembler-not "addi\t.*,.*,77" } } */
\ No newline at end of file