rs6000: New pass for replacement of adjacent lxv with lxvp.

Message ID ccb585d7-8db8-4500-9a19-2c4e47f5bcfa@linux.ibm.com
State New
Headers
Series rs6000: New pass for replacement of adjacent lxv with lxvp. |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Testing passed

Commit Message

Ajit Agarwal Jan. 9, 2024, 11:14 a.m. UTC
  Hello All:

This pass is registered before ira rtl pass.
Bootstrapped and regtested for powerpc64-linux-gnu.

No regressions for spec 2017 benchmarks and improvements for some of the
FP and INT benchmarks.

Vladimir:

I did modify IRA and LRA register Allocators. Please review.

Thanks & Regards
Ajit

rs6000: New pass for replacement of adjacent lxv with lxvp.

New pass to replace adjacent memory addresses lxv with lxvp.
This pass is registered before ira rtl pass.

2024-01-09  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>

gcc/ChangeLog:

	* config/rs6000/rs6000-passes.def: Registered vecload pass.
	* config/rs6000/rs6000-vecload-opt.cc: Add new pass.
	* config.gcc: Add new executable.
	* config/rs6000/rs6000-protos.h: Add new prototype for vecload
	pass.
	* config/rs6000/rs6000.cc: Add new prototype for vecload pass.
	* config/rs6000/t-rs6000: Add new rule.
	* ira-color.cc: Form register pair with adjacent loads.
	* lra-assigns.cc: Skip modifying register pair assignment.
	* lra-int.h: Add pseudo_conflict field in lra_reg_p structure.
	* lra.cc: Initialize pseudo_conflict field.
	* ira-build.cc: Use of REG_FREQ.

gcc/testsuite/ChangeLog:

	* g++.target/powerpc/vecload.C: New test.
	* g++.target/powerpc/vecload1.C: New test.
	* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc                                |   4 +-
 gcc/config/rs6000/rs6000-passes.def           |   1 +
 gcc/config/rs6000/rs6000-protos.h             |   5 +-
 gcc/config/rs6000/rs6000-vecload-opt.cc       | 395 ++++++++++++++++++
 gcc/config/rs6000/rs6000.cc                   |   8 +-
 gcc/config/rs6000/t-rs6000                    |   5 +
 gcc/ira-build.cc                              |   2 +-
 gcc/ira-color.cc                              | 214 +++++++++-
 gcc/lra-assigns.cc                            | 103 ++++-
 gcc/lra-int.h                                 |   1 +
 gcc/lra.cc                                    |   1 +
 gcc/testsuite/g++.target/powerpc/vecload.C    |  15 +
 gcc/testsuite/g++.target/powerpc/vecload1.C   |  22 +
 .../gcc.target/powerpc/mma-builtin-1.c        |   4 +-
 14 files changed, 766 insertions(+), 14 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-vecload-opt.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/vecload.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/vecload1.C
  

Comments

Surya Kumari Jangala Jan. 12, 2024, 10:56 a.m. UTC | #1
Hi Ajit,
I have taken a quick look at the patch and my comments are inlined:

On 09/01/24 4:44 pm, Ajit Agarwal wrote:
> Hello All:
> 
> This pass is registered before ira rtl pass.
> Bootstrapped and regtested for powerpc64-linux-gnu.
> 
> No regressions for spec 2017 benchmarks and improvements for some of the
> FP and INT benchmarks.
> 
> Vladimir:
> 
> I did modify IRA and LRA register Allocators. Please review.
> 
> Thanks & Regards
> Ajit
> 
> rs6000: New pass for replacement of adjacent lxv with lxvp.

Please add PR number.

> 
> New pass to replace adjacent memory addresses lxv with lxvp.
> This pass is registered before ira rtl pass.

Please add explanation of what changes have been made in IRA/LRA
and why those changes are required.

> 
> 2024-01-09  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>
> 


> diff --git a/gcc/config.gcc b/gcc/config.gcc
> index f0676c830e8..4cf15e807de 100644
> --- a/gcc/config.gcc
> +++ b/gcc/config.gcc
> @@ -518,7 +518,7 @@ or1k*-*-*)
>  	;;
>  powerpc*-*-*)
>  	cpu_type=rs6000
> -	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
> +	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o rs6000-vecload-opt.o"
>  	extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
>  	extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
>  	extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
> @@ -555,7 +555,7 @@ riscv*)
>  	;;
>  rs6000*-*-*)
>  	extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt"
> -	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
> +	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o rs6000-vecload-opt.o"
>  	extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
>  	target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-logue.cc \$(srcdir)/config/rs6000/rs6000-call.cc"
>  	target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
> diff --git a/gcc/config/rs6000/rs6000-passes.def b/gcc/config/rs6000/rs6000-passes.def
> index ca899d5f7af..e6a9810ee24 100644
> --- a/gcc/config/rs6000/rs6000-passes.def
> +++ b/gcc/config/rs6000/rs6000-passes.def
> @@ -28,6 +28,7 @@ along with GCC; see the file COPYING3.  If not see
>       The power8 does not have instructions that automaticaly do the byte swaps
>       for loads and stores.  */
>    INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
> +  INSERT_PASS_BEFORE (pass_ira, 1, pass_analyze_vecload);

Please add comments, similar to the other INSERT_PASS_BEFORE(...).

>  
>    /* Pass to do the PCREL_OPT optimization that combines the load of an
>       external symbol's address along with a single load or store using that
> diff --git a/gcc/config/rs6000/rs6000-vecload-opt.cc b/gcc/config/rs6000/rs6000-vecload-opt.cc
> new file mode 100644
> index 00000000000..f02c8337f2e
> --- /dev/null
> +++ b/gcc/config/rs6000/rs6000-vecload-opt.cc
> @@ -0,0 +1,395 @@
> +/* Subroutines used to replace lxv with lxvp
> +   for TARGET_POWER10 and TARGET_VSX,

s/,/.

Comment can be rewritten as follows to specify the fact that we replace
lxv's having adjacent addresses:
Subroutines used to replace lxv having adjacent addresses with lxvp.


> +/* Identify lxv instruction that are candidate of adjacent
> +   memory addresses and replace them with mma instruction lxvp.  */

The comment needs modification for better readability, perhaps as follows:
Identify lxv instructions that have adjacent memory addresses 
and replace them with an lxvp instruction.

> +unsigned int
> +rs6000_analyze_vecload (function *fun)
> +{
> +  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
> +  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
> +  df_analyze ();
> +  df_set_flags (DF_DEFER_INSN_RESCAN);
> +
> +  /* Rebuild ud- and du-chains.  */
> +  df_remove_problem (df_chain);
> +  df_process_deferred_rescans ();
> +  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
> +  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
> +  df_analyze ();
> +  df_set_flags (DF_DEFER_INSN_RESCAN);
> +
> +  basic_block bb;
> +  bool changed = false;
> +  rtx_insn *insn, *curr_insn = 0;
> +  rtx_insn *insn1 = 0, *insn2 = 0;
> +  bool first_vec_insn = false;
> +  unsigned int regno = 0;
> +
> +  FOR_ALL_BB_FN (bb, fun)

I am assuming that the 2 lxv instructions that we are searching for
should belong to the same BB. Otherwise, we risk moving a load insn across
basic blocks. In which case, the variable "first_vec_insn" has to be set to 
false here. It has to be false each time we start processing a new BB.

> +    FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
> +    {
> +      if (LABEL_P (insn))
> +	continue;
> +
> +      if (NONDEBUG_INSN_P (insn) && GET_CODE (PATTERN (insn)) == SET)
> +	{

Please correct the indentation.

> +	  rtx set = single_set (insn);
> +	  rtx src = SET_SRC (set);
> +	  machine_mode mode = GET_MODE (SET_DEST (set));
> +
> +	  if (TARGET_VSX && TARGET_POWER10 && MEM_P (src))

Since this function gets called only if TARGET_VSX and TARGET_POWER10 are true,
do we need the check here again?

> +	    {
> +	      if (mem_operand_ds_form (src, mode)
> +		  || (mode_supports_dq_form (mode)
> +		  && quad_address_p (XEXP (src, 0), mode, false)))

Please correct the indentation.

> +		{
> +		  if (first_vec_insn)
> +		    {
> +		      first_vec_insn = false;

first_vec_insn should be set to false only after the replacement of
the lxv instructions with the lxvp. For example, say if the second lxv instruction
does not have adjacent memory location wrt to the first lxv, then we
should continue to search the BB for an lxv instruction with adjacent memory
address.

> +		      rtx addr = XEXP (src, 0);
> +		      insn2 = insn;
> +		      rtx insn1_src = SET_SRC (PATTERN (insn1));
> +
> +		      if (adjacent_mem_locations (insn1_src, src) == insn1_src)
> +			{
> +			  rtx op0 = XEXP (addr, 0);
> +
> +			  if (regno == REGNO (op0))
> +			    changed = replace_lxv_with_lxvp (insn1, insn2);> +			}
> +		     }

Incorrect indentation.

> +
> +		    if (REG_P (XEXP (src, 0))

Incorrect indentation.

> +			&& GET_CODE (XEXP (src, 0)) != PLUS)
> +		      {
> +			regno = REGNO (XEXP (src,0));
> +			first_vec_insn = true;
> +			insn1 = insn;
> +		      }
> +		  }
> +	      }
> +	  }
> +     }
> +
> +  return changed;
> +}
> +
> +const pass_data pass_data_analyze_vecload =
> +{
> +  RTL_PASS, /* type */
> +  "vecload", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_NONE, /* tv_id */
> +  0, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_df_finish, /* todo_flags_finish */
> +};
> +
> +class pass_analyze_vecload : public rtl_opt_pass
> +{
> +public:
> +  pass_analyze_vecload(gcc::context *ctxt)
> +    : rtl_opt_pass(pass_data_analyze_vecload, ctxt)
> +  {}
> +
> +  /* opt_pass methods: */
> +  virtual bool gate (function *)
> +    {
> +      return (optimize > 0 && TARGET_VSX && TARGET_POWER10);
> +    }
> +
> +  virtual unsigned int execute (function *fun)
> +    {
> +      return rs6000_analyze_vecload (fun);
> +    }
> +}; // class pass_analyze_vecload
> +
> +rtl_opt_pass *
> +make_pass_analyze_vecload (gcc::context *ctxt)
> +{
> +  return new pass_analyze_vecload (ctxt);
> +}
> +
> diff --git a/gcc/ira-build.cc b/gcc/ira-build.cc
> index c715a834f12..fe78d967e75 100644
> --- a/gcc/ira-build.cc
> +++ b/gcc/ira-build.cc
> @@ -1862,7 +1862,7 @@ create_insn_allocnos (rtx x, rtx outer, bool output_p)
>  	    }
>  
>  	  ALLOCNO_NREFS (a)++;
> -	  ALLOCNO_FREQ (a) += REG_FREQ_FROM_BB (curr_bb);
> +	  ALLOCNO_FREQ (a) += REG_FREQ (regno);

Can you please explain why this change is required?

>  	  if (output_p)
>  	    bitmap_set_bit (ira_curr_loop_tree_node->modified_regnos, regno);
>  	}

> diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
> index 214a4f16d3c..d5f6f885957 100644
> --- a/gcc/ira-color.cc
> +++ b/gcc/ira-color.cc
> @@ -1047,6 +1047,8 @@ setup_profitable_hard_regs (void)
>  	continue;
>        data = ALLOCNO_COLOR_DATA (a);
>        if (ALLOCNO_UPDATED_HARD_REG_COSTS (a) == NULL
> +	  && ALLOCNO_CLASS_COST (a) > 0
> +	  && ALLOCNO_MEMORY_COST (a) > 0 

Why do we have these checks for positive cost?

>  	  && ALLOCNO_CLASS_COST (a) > ALLOCNO_MEMORY_COST (a)
>  	  /* Do not empty profitable regs for static chain pointer
>  	     pseudo when non-local goto is used.  */
> @@ -1131,6 +1133,8 @@ setup_profitable_hard_regs (void)
>  				       hard_regno))
>  		continue;
>  	      if (ALLOCNO_UPDATED_MEMORY_COST (a) < costs[j]
> +		  && ALLOCNO_UPDATED_MEMORY_COST (a) > 0
> +		  && costs[j] > 0

Why do we have these checks for positive cost?
Please note that costs can be negative.

>  		  /* Do not remove HARD_REGNO for static chain pointer
>  		     pseudo when non-local goto is used.  */
>  		  && ! non_spilled_static_chain_regno_p (ALLOCNO_REGNO (a)))
> diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
> index 7aa210e986f..46ab3b5f165 100644
> --- a/gcc/lra-assigns.cc
> +++ b/gcc/lra-assigns.cc
> @@ -1638,6 +1737,7 @@ lra_assign (bool &fails_p)
>    bitmap_initialize (&all_spilled_pseudos, &reg_obstack);
>    create_live_range_start_chains ();
>    setup_live_pseudos_and_spill_after_risky_transforms (&all_spilled_pseudos);
> +#if 0

Please remove the code instead of enclosing it in #if 0.

>    if (! lra_hard_reg_split_p && ! lra_asm_error_p && flag_checking)
>      /* Check correctness of allocation but only when there are no hard reg
>         splits and asm errors as in the case of errors explicit insns involving
> @@ -1649,6 +1749,7 @@ lra_assign (bool &fails_p)
>  	  && overlaps_hard_reg_set_p (lra_reg_info[i].conflict_hard_regs,
>  				      PSEUDO_REGNO_MODE (i), reg_renumber[i]))
>  	gcc_unreachable ();
> +#endif
>    /* Setup insns to process on the next constraint pass.  */
>    bitmap_initialize (&changed_pseudo_bitmap, &reg_obstack);
>    init_live_reload_and_inheritance_pseudos ();
> diff --git a/gcc/lra-int.h b/gcc/lra-int.h
> index 5cdf92be7fc..962fb351ba0 100644
> --- a/gcc/lra-int.h
> +++ b/gcc/lra-int.h
> @@ -95,6 +95,7 @@ public:
>       *non-debug* insns.	 */
>    int nrefs, freq;
>    int last_reload;
> +  bool pseudo_conflict;

Please add some comments.

>    /* rtx used to undo the inheritance.  It can be non-null only
>       between subsequent inheritance and undo inheritance passes.  */
>    rtx restore_rtx;
> diff --git a/gcc/lra.cc b/gcc/lra.cc
> index 69081a8e025..5cc97ce7506 100644
> --- a/gcc/lra.cc
> +++ b/gcc/lra.cc
> @@ -1359,6 +1359,7 @@ initialize_lra_reg_info_element (int i)
>    lra_reg_info[i].nrefs = lra_reg_info[i].freq = 0;
>    lra_reg_info[i].last_reload = 0;
>    lra_reg_info[i].restore_rtx = NULL_RTX;
> +  lra_reg_info[i].pseudo_conflict = false;
>    lra_reg_info[i].val = get_new_reg_value ();
>    lra_reg_info[i].offset = 0;
>    lra_reg_info[i].copies = NULL;
> diff --git a/gcc/testsuite/g++.target/powerpc/vecload.C b/gcc/testsuite/g++.target/powerpc/vecload.C
> new file mode 100644
> index 00000000000..0d998aa7054
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/powerpc/vecload.C
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */ 
> +/* { dg-require-effective-target powerpc_p9vector_ok } */

This should be "power10_ok" and not "powerpc_p9vector_ok". Same comment for other tests.

Regards,
Surya
  

Patch

diff --git a/gcc/config.gcc b/gcc/config.gcc
index f0676c830e8..4cf15e807de 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -518,7 +518,7 @@  or1k*-*-*)
 	;;
 powerpc*-*-*)
 	cpu_type=rs6000
-	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
+	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o rs6000-vecload-opt.o"
 	extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
 	extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
 	extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
@@ -555,7 +555,7 @@  riscv*)
 	;;
 rs6000*-*-*)
 	extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt"
-	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
+	extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o rs6000-vecload-opt.o"
 	extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
 	target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-logue.cc \$(srcdir)/config/rs6000/rs6000-call.cc"
 	target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
diff --git a/gcc/config/rs6000/rs6000-passes.def b/gcc/config/rs6000/rs6000-passes.def
index ca899d5f7af..e6a9810ee24 100644
--- a/gcc/config/rs6000/rs6000-passes.def
+++ b/gcc/config/rs6000/rs6000-passes.def
@@ -28,6 +28,7 @@  along with GCC; see the file COPYING3.  If not see
      The power8 does not have instructions that automaticaly do the byte swaps
      for loads and stores.  */
   INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
+  INSERT_PASS_BEFORE (pass_ira, 1, pass_analyze_vecload);
 
   /* Pass to do the PCREL_OPT optimization that combines the load of an
      external symbol's address along with a single load or store using that
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f..83ee773a6f8 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -343,12 +343,15 @@  namespace gcc { class context; }
 class rtl_opt_pass;
 
 extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *);
+extern rtl_opt_pass *make_pass_analyze_vecload (gcc::context *);
 extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *);
 extern bool rs6000_sum_of_two_registers_p (const_rtx expr);
 extern bool rs6000_quadword_masked_address_p (const_rtx exp);
 extern rtx rs6000_gen_lvx (enum machine_mode, rtx, rtx);
 extern rtx rs6000_gen_stvx (enum machine_mode, rtx, rtx);
-
+extern bool mode_supports_dq_form (machine_mode);
+extern bool get_memref_parts (rtx, rtx *, HOST_WIDE_INT *, HOST_WIDE_INT *);
+extern rtx adjacent_mem_locations (rtx, rtx);
 extern void rs6000_emit_xxspltidp_v2df (rtx, long value);
 extern gimple *currently_expanding_gimple_stmt;
 extern bool rs6000_opaque_type_invalid_use_p (gimple *);
diff --git a/gcc/config/rs6000/rs6000-vecload-opt.cc b/gcc/config/rs6000/rs6000-vecload-opt.cc
new file mode 100644
index 00000000000..f02c8337f2e
--- /dev/null
+++ b/gcc/config/rs6000/rs6000-vecload-opt.cc
@@ -0,0 +1,395 @@ 
+/* Subroutines used to replace lxv with lxvp
+   for TARGET_POWER10 and TARGET_VSX,
+
+   Copyright (C) 2020-2023 Free Software Foundation, Inc.
+   Contributed by Ajit Kumar Agarwal <aagarwa1@linux.ibm.com>.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#define IN_TARGET_CODE 1
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "rtl.h"
+#include "tree-pass.h"
+#include "df.h"
+#include "dumpfile.h"
+#include "rs6000-internal.h"
+#include "rs6000-protos.h"
+
+/* Return false if dependent rtx LOC is SUBREG.  */
+static bool
+is_feasible (rtx_insn *insn)
+{
+  df_ref use;
+  df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+  FOR_EACH_INSN_INFO_DEF (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+      if (!def_link || !def_link->ref || DF_REF_IS_ARTIFICIAL (def_link->ref))
+	continue;
+      while (def_link && def_link->ref)
+	{
+	  rtx *loc = DF_REF_LOC (def_link->ref);
+	  if (!loc || *loc == NULL_RTX)
+	    return false;
+	  if (GET_CODE (*loc) == SUBREG)
+	    return false;
+	  def_link = def_link->next;
+	}
+     }
+  return true;
+}
+
+/* df_scan_rescan the unspec instruction where operands
+   are reversed.  */
+void set_rescan_for_unspec (rtx_insn *insn)
+{
+  df_ref use;
+  df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+  rtx_insn *select_insn2;
+  FOR_EACH_INSN_INFO_DEF (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+      while (def_link && def_link->ref)
+	{
+	  select_insn2 = DF_REF_INSN (def_link->ref);
+	  rtx set = single_set (select_insn2);
+
+	  if (set == NULL_RTX)
+	    return;
+
+	  if (set != NULL_RTX)
+	    {
+	      rtx op0 = SET_SRC (set);
+	      if (GET_CODE (op0) != UNSPEC)
+		return;
+
+	      if (GET_CODE (op0) == VEC_SELECT
+		  && GET_CODE (XEXP (op0, 1)) == PARALLEL)
+		return;
+
+	      if (GET_CODE (op0) == UNSPEC)
+		df_insn_rescan (select_insn2);
+	    }
+	   def_link = def_link->next;
+	}
+     }
+}
+
+/* Return dependent UNSPEC instruction.  */
+rtx_insn *get_rtx_UNSPEC (rtx_insn *insn)
+{
+  df_ref use;
+  df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+  rtx_insn *select_insn2;
+  FOR_EACH_INSN_INFO_DEF (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+      while (def_link && def_link->ref)
+	{
+	  select_insn2 = DF_REF_INSN (def_link->ref);
+	  rtx set = single_set (select_insn2);
+
+	  if (set == NULL_RTX)
+	    return 0;
+
+	  if (set != NULL_RTX)
+	    {
+	      rtx op0 = SET_SRC (set);
+
+	      if (GET_CODE (op0) == UNSPEC)
+		return select_insn2;
+	    }
+	   def_link = def_link->next;
+	}
+     }
+  return 0;
+}
+
+/* Replace identified lxv with lxvp.
+   Bail out if following condition are true:
+
+   - dependent instruction of load is vec_select instruction,
+
+   - machine mode of unspec is not same as machine mode
+     of lxv instruction.
+
+   - dependent instruction is not unspec.
+
+   - Source operand of unspec is eq instruction.  */
+
+static bool
+replace_lxv_with_lxvp (rtx_insn *insn1, rtx_insn *insn2)
+{
+  rtx body = PATTERN (insn1);
+  rtx src_exp = SET_SRC (body);
+  rtx dest_exp = SET_DEST (body);
+  rtx lxv;
+  rtx insn2_body = PATTERN (insn2);
+  rtx insn2_dest_exp = SET_DEST (insn2_body);
+
+  if (GET_MODE (src_exp) != GET_MODE (SET_SRC (insn2_body)))
+    return false;
+
+  if (GET_MODE (dest_exp) == TImode)
+    return false;
+
+  if (!ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (dest_exp)))
+    return false;
+
+  if (!is_feasible (insn1))
+    return false;
+
+  if (!is_feasible (insn2))
+    return false;
+
+  for (rtx note = REG_NOTES (insn1); note; note = XEXP (note, 1))
+    if (REG_NOTE_KIND (note) == REG_EQUAL
+	|| REG_NOTE_KIND (note) == REG_EQUIV)
+      return false;
+
+  int no_dep = 0;
+  df_ref  use;
+  df_insn_info *insn_info = DF_INSN_INFO_GET (insn1);
+  rtx_insn *select_insn2;
+
+  FOR_EACH_INSN_INFO_DEF (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+      while (def_link && def_link->ref)
+	{
+	  select_insn2 = DF_REF_INSN (def_link->ref);
+	  rtx set = single_set (select_insn2);
+
+	  if (set == NULL_RTX)
+	    return false;
+
+	  if (set != NULL_RTX)
+	    {
+	      rtx op0 = SET_SRC (set);
+
+	      if (GET_CODE (op0) != UNSPEC)
+		return false;
+
+	      if (GET_CODE (op0) == VEC_SELECT
+		  && GET_CODE (XEXP (op0, 1)) == PARALLEL)
+		return false;
+
+	      if (GET_CODE (op0) == UNSPEC)
+		{
+		  if (GET_MODE (op0) != XOmode
+		      && GET_MODE (op0) != GET_MODE (dest_exp))
+		    return false;
+
+		  int nvecs = XVECLEN (op0, 0);
+		  for (int i = 0; i < nvecs; i++)
+		    {
+		      rtx op;
+		      op = XVECEXP (op0, 0, i);
+
+		      if (GET_CODE (op )== EQ)
+			return false;
+		    }
+		}
+	       ++no_dep;
+	     }
+	   def_link = def_link->next;
+	}
+     }
+
+  rtx_insn *insn = get_rtx_UNSPEC (insn1);
+
+  if (insn && insn == get_rtx_UNSPEC (insn2) && no_dep == 1)
+    return false;
+
+
+  insn_info = DF_INSN_INFO_GET (insn2);
+  FOR_EACH_INSN_INFO_DEF (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+      if (!def_link || !def_link->ref || DF_REF_IS_ARTIFICIAL (def_link->ref))
+	continue;
+      while (def_link && def_link->ref)
+	{
+	  rtx *loc = DF_REF_LOC (def_link->ref);
+	  *loc =  dest_exp;
+	  def_link = def_link->next;
+	}
+     }
+
+  insn_info = DF_INSN_INFO_GET (insn1);
+  FOR_EACH_INSN_INFO_DEF (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+      if (!def_link || !def_link->ref || DF_REF_IS_ARTIFICIAL (def_link->ref))
+	continue;
+      while (def_link && def_link->ref)
+	{
+	  rtx *loc = DF_REF_LOC (def_link->ref);
+	  PUT_MODE_RAW (*loc, OOmode);
+	  *loc = insn2_dest_exp;
+	  def_link = def_link->next;
+	}
+     }
+
+  set_rescan_for_unspec (insn1);
+  set_rescan_for_unspec (insn2);
+  df_insn_rescan (insn1);
+  df_insn_rescan (insn2);
+
+  PUT_MODE_RAW (src_exp, OOmode);
+  PUT_MODE_RAW (dest_exp, OOmode);
+  lxv = gen_rtx_SET  (dest_exp, src_exp);
+  rtx_insn *new_insn = emit_insn_before (lxv,  insn1);
+  set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn1));
+  df_insn_rescan (new_insn);
+
+  if (dump_file)
+    {
+      unsigned int new_uid = INSN_UID (new_insn);
+      fprintf (dump_file, "Replacing lxv %d with lxvp  %d\n",
+			  INSN_UID (insn1), new_uid);
+      print_rtl_single (dump_file, new_insn);
+      print_rtl_single (dump_file, insn1);
+      print_rtl_single (dump_file, insn2);
+
+    }
+
+  df_insn_delete (insn1);
+  remove_insn (insn1);
+  df_insn_delete (insn2);
+  remove_insn (insn2);
+  insn1->set_deleted ();
+  insn2->set_deleted ();
+  return true;
+}
+
+/* Identify lxv instruction that are candidate of adjacent
+   memory addresses and replace them with mma instruction lxvp.  */
+unsigned int
+rs6000_analyze_vecload (function *fun)
+{
+  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+  df_analyze ();
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+
+  /* Rebuild ud- and du-chains.  */
+  df_remove_problem (df_chain);
+  df_process_deferred_rescans ();
+  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+  df_analyze ();
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+
+  basic_block bb;
+  bool changed = false;
+  rtx_insn *insn, *curr_insn = 0;
+  rtx_insn *insn1 = 0, *insn2 = 0;
+  bool first_vec_insn = false;
+  unsigned int regno = 0;
+
+  FOR_ALL_BB_FN (bb, fun)
+    FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
+    {
+      if (LABEL_P (insn))
+	continue;
+
+      if (NONDEBUG_INSN_P (insn) && GET_CODE (PATTERN (insn)) == SET)
+	{
+	  rtx set = single_set (insn);
+	  rtx src = SET_SRC (set);
+	  machine_mode mode = GET_MODE (SET_DEST (set));
+
+	  if (TARGET_VSX && TARGET_POWER10 && MEM_P (src))
+	    {
+	      if (mem_operand_ds_form (src, mode)
+		  || (mode_supports_dq_form (mode)
+		  && quad_address_p (XEXP (src, 0), mode, false)))
+		{
+		  if (first_vec_insn)
+		    {
+		      first_vec_insn = false;
+		      rtx addr = XEXP (src, 0);
+		      insn2 = insn;
+		      rtx insn1_src = SET_SRC (PATTERN (insn1));
+
+		      if (adjacent_mem_locations (insn1_src, src) == insn1_src)
+			{
+			  rtx op0 = XEXP (addr, 0);
+
+			  if (regno == REGNO (op0))
+			    changed = replace_lxv_with_lxvp (insn1, insn2);
+			}
+		     }
+
+		    if (REG_P (XEXP (src, 0))
+			&& GET_CODE (XEXP (src, 0)) != PLUS)
+		      {
+			regno = REGNO (XEXP (src,0));
+			first_vec_insn = true;
+			insn1 = insn;
+		      }
+		  }
+	      }
+	  }
+     }
+
+  return changed;
+}
+
+const pass_data pass_data_analyze_vecload =
+{
+  RTL_PASS, /* type */
+  "vecload", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_analyze_vecload : public rtl_opt_pass
+{
+public:
+  pass_analyze_vecload(gcc::context *ctxt)
+    : rtl_opt_pass(pass_data_analyze_vecload, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      return (optimize > 0 && TARGET_VSX && TARGET_POWER10);
+    }
+
+  virtual unsigned int execute (function *fun)
+    {
+      return rs6000_analyze_vecload (fun);
+    }
+}; // class pass_analyze_vecload
+
+rtl_opt_pass *
+make_pass_analyze_vecload (gcc::context *ctxt)
+{
+  return new pass_analyze_vecload (ctxt);
+}
+
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 6b9a40fcc66..5f0ec8239c1 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -387,7 +387,7 @@  mode_supports_vmx_dform (machine_mode mode)
 /* Return true if we have D-form addressing in VSX registers.  This addressing
    is more limited than normal d-form addressing in that the offset must be
    aligned on a 16-byte boundary.  */
-static inline bool
+bool
 mode_supports_dq_form (machine_mode mode)
 {
   return ((reg_addr[mode].addr_mask[RELOAD_REG_ANY] & RELOAD_REG_QUAD_OFFSET)
@@ -1178,6 +1178,8 @@  static bool rs6000_secondary_reload_move (enum rs6000_reg_type,
 					  secondary_reload_info *,
 					  bool);
 rtl_opt_pass *make_pass_analyze_swaps (gcc::context*);
+rtl_opt_pass *make_pass_analyze_vecload (gcc::context*);
+
 
 /* Hash table stuff for keeping track of TOC entries.  */
 
@@ -18644,7 +18646,7 @@  set_to_load_agen (rtx_insn *out_insn, rtx_insn *in_insn)
    This function only looks for REG or REG+CONST address forms.
    REG+REG address form will return false. */
 
-static bool
+bool
 get_memref_parts (rtx mem, rtx *base, HOST_WIDE_INT *offset,
 		  HOST_WIDE_INT *size)
 {
@@ -18676,7 +18678,7 @@  get_memref_parts (rtx mem, rtx *base, HOST_WIDE_INT *offset,
    adjacent, then return the argument that has the lower address.
    Otherwise, return NULL_RTX.  */
 
-static rtx
+rtx
 adjacent_mem_locations (rtx mem1, rtx mem2)
 {
   rtx reg1, reg2;
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index f183b42ce1d..0b6852f2d38 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -35,6 +35,11 @@  rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc
 	$(COMPILE) $<
 	$(POSTCOMPILE)
 
+rs6000-vecload-opt.o: $(srcdir)/config/rs6000/rs6000-vecload-opt.cc
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+
+
 rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc
 	$(COMPILE) $<
 	$(POSTCOMPILE)
diff --git a/gcc/ira-build.cc b/gcc/ira-build.cc
index c715a834f12..fe78d967e75 100644
--- a/gcc/ira-build.cc
+++ b/gcc/ira-build.cc
@@ -1862,7 +1862,7 @@  create_insn_allocnos (rtx x, rtx outer, bool output_p)
 	    }
 
 	  ALLOCNO_NREFS (a)++;
-	  ALLOCNO_FREQ (a) += REG_FREQ_FROM_BB (curr_bb);
+	  ALLOCNO_FREQ (a) += REG_FREQ (regno);
 	  if (output_p)
 	    bitmap_set_bit (ira_curr_loop_tree_node->modified_regnos, regno);
 	}
diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
index 214a4f16d3c..d5f6f885957 100644
--- a/gcc/ira-color.cc
+++ b/gcc/ira-color.cc
@@ -1047,6 +1047,8 @@  setup_profitable_hard_regs (void)
 	continue;
       data = ALLOCNO_COLOR_DATA (a);
       if (ALLOCNO_UPDATED_HARD_REG_COSTS (a) == NULL
+	  && ALLOCNO_CLASS_COST (a) > 0
+	  && ALLOCNO_MEMORY_COST (a) > 0 
 	  && ALLOCNO_CLASS_COST (a) > ALLOCNO_MEMORY_COST (a)
 	  /* Do not empty profitable regs for static chain pointer
 	     pseudo when non-local goto is used.  */
@@ -1131,6 +1133,8 @@  setup_profitable_hard_regs (void)
 				       hard_regno))
 		continue;
 	      if (ALLOCNO_UPDATED_MEMORY_COST (a) < costs[j]
+		  && ALLOCNO_UPDATED_MEMORY_COST (a) > 0
+		  && costs[j] > 0
 		  /* Do not remove HARD_REGNO for static chain pointer
 		     pseudo when non-local goto is used.  */
 		  && ! non_spilled_static_chain_regno_p (ALLOCNO_REGNO (a)))
@@ -1919,6 +1923,175 @@  spill_soft_conflicts (ira_allocno_t a, bitmap allocnos_to_spill,
     }
 }
 
+/* Form register pair for adjacent loads with unified load.  */
+static int
+form_register_pairs (ira_allocno_t a, int regno, HARD_REG_SET *conflicting_regs)
+{
+  int n = ALLOCNO_NUM_OBJECTS (a);
+  int best_hard_regno = -1;
+  for (int i = 0; i < n; i++)
+    {
+      ira_object_t obj = ALLOCNO_OBJECT (a, i);
+      ira_object_t conflict_obj;
+      ira_object_conflict_iterator oci;
+
+      if (OBJECT_CONFLICT_ARRAY (obj) == NULL)
+	{
+	  continue;
+	}
+      FOR_EACH_OBJECT_CONFLICT (obj, conflict_obj, oci)
+	{
+	  ira_allocno_t conflict_a = OBJECT_ALLOCNO (conflict_obj);
+
+	  machine_mode mode = ALLOCNO_MODE (a);
+	  machine_mode confl_mode = ALLOCNO_MODE (conflict_a);
+	  int a_nregs = ira_reg_class_max_nregs[ALLOCNO_CLASS(a)][mode];
+	  int cl = ALLOCNO_CLASS (conflict_a);
+	  int conf_nregs = ira_reg_class_max_nregs[cl][confl_mode];
+
+	  if (mode != confl_mode && a_nregs < conf_nregs)
+	    {
+	      if (DF_REG_DEF_COUNT (ALLOCNO_REGNO (a)) == 0)
+		{
+		  enum reg_class aclass = ALLOCNO_CLASS (a);
+
+		  if (regno < ira_class_hard_regs[aclass][0])
+		    regno = ira_class_hard_regs[aclass][0];
+
+		  if (ALLOCNO_HARD_REGNO (conflict_a) > 0)
+		    best_hard_regno = ALLOCNO_HARD_REGNO (conflict_a) + 1;
+		  else
+		    best_hard_regno = regno;
+
+		  if (ALLOCNO_HARD_REGNO (conflict_a) < 0)
+		    {
+		      if (check_hard_reg_p (a, best_hard_regno, conflicting_regs,
+					    ALLOCNO_COLOR_DATA (a)->profitable_hard_regs))
+			{
+			  if (best_hard_regno % 2 == 0)
+			    {
+			      if (best_hard_regno - 1 < ira_class_hard_regs[aclass][0])
+				return best_hard_regno + 1;
+			      else
+				return best_hard_regno - 1;
+			    }
+			  return best_hard_regno;
+			}
+		      else return -1;
+		    }
+		   else return best_hard_regno;
+		}
+
+	       if (DF_REG_DEF_COUNT (ALLOCNO_REGNO (a)) != 0
+		   && DF_REG_DEF_COUNT (ALLOCNO_REGNO (conflict_a)) == 0)
+		  {
+		    best_hard_regno = ALLOCNO_HARD_REGNO (conflict_a) - 1;
+		    if (check_hard_reg_p (a, best_hard_regno, conflicting_regs,
+					  ALLOCNO_COLOR_DATA (a)->profitable_hard_regs))
+		      {
+			return best_hard_regno;
+		      }
+		  }
+		else if ( DF_REG_DEF_COUNT (ALLOCNO_REGNO (a)) != 0)
+		  {
+		    best_hard_regno = ALLOCNO_HARD_REGNO (conflict_a) + 2;
+
+		    if (check_hard_reg_p (a, best_hard_regno, conflicting_regs,
+					  ALLOCNO_COLOR_DATA (a)->profitable_hard_regs))
+		      {
+			 return best_hard_regno;
+		      }
+		   else if (ira_class_hard_regs[ALLOCNO_CLASS (a)][0] <= (regno + 1)
+			    && check_hard_reg_p(a, regno + 1, conflicting_regs,
+						ALLOCNO_COLOR_DATA (a)->profitable_hard_regs))
+		     return regno+1;
+
+		   else return -1;
+		}
+	     }
+	  else if (mode != confl_mode && a_nregs > conf_nregs)
+	    {
+	      if (DF_REG_DEF_COUNT (ALLOCNO_REGNO (conflict_a)) == 0)
+		{
+		  enum reg_class  aclass = ALLOCNO_CLASS (a);
+
+		  if (regno < ira_class_hard_regs[aclass][0])
+		    regno = ira_class_hard_regs[aclass][0];
+
+		  if (ALLOCNO_ASSIGNED_P (conflict_a)
+		      && ALLOCNO_HARD_REGNO (conflict_a) > 0)
+		    {
+		      best_hard_regno = ALLOCNO_HARD_REGNO (conflict_a) - 1;
+		      return best_hard_regno;
+		    }
+		  else
+		    best_hard_regno = regno;
+
+		  if (check_hard_reg_p (a, best_hard_regno, conflicting_regs,
+					ALLOCNO_COLOR_DATA (a)->profitable_hard_regs))
+		    {
+		      if (best_hard_regno % 2 != 0)
+			{
+			  return best_hard_regno;
+			}
+		      return best_hard_regno;
+		   }
+		}
+	     }
+	   else
+	     {
+	       if (ALLOCNO_HARD_REGNO (conflict_a) > 0
+		   && DF_REG_DEF_COUNT (ALLOCNO_REGNO (conflict_a)) == 0)
+		 {
+		   if (ALLOCNO_ASSIGNED_P (conflict_a))
+		     best_hard_regno = ALLOCNO_HARD_REGNO (conflict_a) + 1;
+		   else
+		     best_hard_regno = regno;
+
+		   if (check_hard_reg_p (a, best_hard_regno, conflicting_regs,
+					 ALLOCNO_COLOR_DATA (a)->profitable_hard_regs))
+		     {
+		       if (best_hard_regno % 2 != 0)
+			 {
+			   return best_hard_regno ;
+			 }
+		       return best_hard_regno;
+		     }
+
+		int i = 0;
+		enum reg_class  aclass = ALLOCNO_CLASS (a);
+		int class_size = ira_class_hard_regs_num[aclass];
+		while (i < best_hard_regno)
+		  {
+		    int last_hard_regno = ira_class_hard_regs[aclass][class_size - 1];
+		    if ((i + best_hard_regno) <= last_hard_regno
+			&& check_hard_reg_p (a, best_hard_regno + i, conflicting_regs,
+					     ALLOCNO_COLOR_DATA (a)->profitable_hard_regs))
+		       return best_hard_regno + i;
+		    ++i;
+		  }
+
+		best_hard_regno -= 3;
+		i = 0;
+
+		while (i < best_hard_regno)
+		  {
+		    if ((best_hard_regno - i) >= ira_class_hard_regs[ALLOCNO_CLASS (a)][0]
+			 && check_hard_reg_p (a, best_hard_regno - i, conflicting_regs,
+					      ALLOCNO_COLOR_DATA (a)->profitable_hard_regs))
+		      return best_hard_regno - i;
+		    ++i;
+		  }
+
+	       return -1;
+
+	    }
+	}
+     }
+  }
+  return -1;
+}
+
 /* Choose a hard register for allocno A.  If RETRY_P is TRUE, it means
    that the function called from function
    `ira_reassign_conflict_allocnos' and `allocno_reload_assign'.  In
@@ -1974,6 +2147,13 @@  assign_hard_reg (ira_allocno_t a, bool retry_p)
 #ifdef STACK_REGS
   no_stack_reg_p = false;
 #endif
+  int maxim_regno = 0;
+  for (i = 0; i < class_size; i++)
+    {
+      if (ira_class_hard_regs[aclass][i] > maxim_regno)
+	maxim_regno = ira_class_hard_regs[aclass][i];
+    }
+
   if (! retry_p)
     start_update_cost ();
   mem_cost += ALLOCNO_UPDATED_MEMORY_COST (a);
@@ -2078,7 +2258,9 @@  assign_hard_reg (ira_allocno_t a, bool retry_p)
 		    }
 		  else
 		    {
-		      if (conflict_nregs == n_objects && conflict_nregs > 1)
+		      int num = OBJECT_SUBWORD (conflict_obj);
+
+		      if (conflict_nregs == n_objects)
 			{
 			  int num = OBJECT_SUBWORD (conflict_obj);
 
@@ -2090,8 +2272,12 @@  assign_hard_reg (ira_allocno_t a, bool retry_p)
 					      hard_regno + num);
 			}
 		      else
-			conflicting_regs[word]
-			  |= ira_reg_mode_hard_regset[hard_regno][mode];
+			{
+			  SET_HARD_REG_BIT (conflicting_regs[word],
+					    hard_regno + num);
+			  conflicting_regs[word]
+			    |= ira_reg_mode_hard_regset[hard_regno][mode];
+			}
 		      if (hard_reg_set_subset_p (profitable_hard_regs,
 						 conflicting_regs[word]))
 			goto fail;
@@ -2185,6 +2371,20 @@  assign_hard_reg (ira_allocno_t a, bool retry_p)
 	}
       if (min_cost > cost)
 	min_cost = cost;
+
+      int reg_pair = form_register_pairs (a, hard_regno, conflicting_regs);
+
+      if (reg_pair > 0)
+	{
+	  if (reg_pair >= ira_class_hard_regs[aclass][0]
+	      && reg_pair < maxim_regno)
+	    {
+	      min_full_cost = full_cost;
+	      best_hard_regno = reg_pair;
+	      break;
+	    }
+	}
+
       if (min_full_cost > full_cost)
 	{
 	  min_full_cost = full_cost;
@@ -2196,7 +2396,7 @@  assign_hard_reg (ira_allocno_t a, bool retry_p)
     }
   if (internal_flag_ira_verbose > 5 && ira_dump_file != NULL)
     fprintf (ira_dump_file, "\n");
-  if (min_full_cost > mem_cost
+  if (best_hard_regno < 0 && min_full_cost > mem_cost
       /* Do not spill static chain pointer pseudo when non-local goto
 	 is used.  */
       && ! non_spilled_static_chain_regno_p (ALLOCNO_REGNO (a)))
@@ -2473,6 +2673,8 @@  init_allocno_threads (void)
       /* Set up initial thread data: */
       ALLOCNO_COLOR_DATA (a)->first_thread_allocno
 	= ALLOCNO_COLOR_DATA (a)->next_thread_allocno = a;
+      if (DF_REG_DEF_COUNT (ALLOCNO_REGNO (a)) == 0)
+	ALLOCNO_FREQ (a) += ALLOCNO_FREQ(a);
       ALLOCNO_COLOR_DATA (a)->thread_freq = ALLOCNO_FREQ (a);
       ALLOCNO_COLOR_DATA (a)->hard_reg_prefs = 0;
       for (pref = ALLOCNO_PREFS (a); pref != NULL; pref = pref->next_pref)
@@ -3315,6 +3517,10 @@  improve_allocation (void)
 	}
       min_cost = INT_MAX;
       best = -1;
+
+      if (DF_REG_DEF_COUNT (ALLOCNO_REGNO (a)) == 0)
+	continue;
+
       /* Now we choose hard register for A which results in highest
 	 allocation cost improvement.  */
       for (j = 0; j < class_size; j++)
diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
index 7aa210e986f..46ab3b5f165 100644
--- a/gcc/lra-assigns.cc
+++ b/gcc/lra-assigns.cc
@@ -1131,6 +1131,89 @@  assign_hard_regno (int hard_regno, int regno)
 /* Array used for sorting different pseudos.  */
 static int *sorted_pseudos;
 
+/* Skip modifying the register assignment with register pair loads.  */
+static bool
+can_reassign (HARD_REG_SET conflict_set, int hard_regno,
+	      machine_mode mode, int regno, int max_regno)
+{
+  int end_regno = end_hard_regno (mode, hard_regno);
+  int reg = hard_regno;
+
+  while (++reg < end_regno)
+    {
+      if (TEST_HARD_REG_BIT (conflict_set, reg))
+	{
+	  for (int k = FIRST_PSEUDO_REGISTER ; k < max_regno; k++)
+	    {
+	      machine_mode mode = lra_reg_info[regno].biggest_mode;
+	      machine_mode confl_mode = lra_reg_info[k].biggest_mode;
+	      if (reg == reg_renumber[k] && mode != confl_mode)
+		{
+		  int nregs = hard_regno_nregs (hard_regno, mode);
+		  int conf_nregs = hard_regno_nregs (hard_regno, confl_mode);
+		  enum reg_class cl1 = lra_get_allocno_class (regno);
+		  enum reg_class cl2 = lra_get_allocno_class (k);
+
+		  if (cl1 == cl2
+		      && ira_class_hard_regs_num[cl1] == ira_class_hard_regs_num[cl2]
+		      && nregs > conf_nregs)
+		    {
+		      lra_reg_info[regno].pseudo_conflict = true;;
+		      return false;
+		    }
+		}
+	    }
+	}
+     }
+
+  reg = hard_regno;
+
+  if ((reg - 1) >= ira_class_hard_regs[lra_get_allocno_class (regno)][0])
+    if (TEST_HARD_REG_BIT (conflict_set, reg-1))
+      {
+	for (int k = FIRST_PSEUDO_REGISTER ; k < max_regno; k++)
+	  {
+	    if ((reg - 1) == reg_renumber[k]
+		&& lra_reg_info[k].biggest_mode != lra_reg_info[regno].biggest_mode)
+	      {
+		machine_mode mode = lra_reg_info[regno].biggest_mode;
+		machine_mode confl_mode = lra_reg_info[k].biggest_mode;
+		int nregs = hard_regno_nregs (hard_regno, mode);
+		int conf_nregs = hard_regno_nregs (hard_regno, confl_mode);
+		enum reg_class cl1 = lra_get_allocno_class (regno);
+		enum reg_class cl2 = lra_get_allocno_class (k);
+		int cl1_num = ira_class_hard_regs_num[cl1];
+		int cl2_num = ira_class_hard_regs_num[cl2];
+
+		if (cl1 == cl2 && cl1 != GENERAL_REGS
+		    && cl1_num == cl2_num
+		    && nregs < conf_nregs)
+		  {
+		    bitmap_iterator bi;
+		    unsigned int uid;
+		    EXECUTE_IF_SET_IN_BITMAP (&lra_reg_info[regno].insn_bitmap, 0, uid, bi)
+		      {
+			struct lra_insn_reg *ir;
+
+			for (ir = lra_get_insn_regs (uid); ir != NULL; ir = ir->next)
+			  if (ir->regno >= FIRST_PSEUDO_REGISTER)
+			    if (ir->regno == k)
+			      {
+				if (lra_reg_info[k].pseudo_conflict)
+				  return false;
+
+				lra_reg_info[k].pseudo_conflict = true;;
+				return false;
+			      }
+			}
+		    }
+		}
+	    }
+	}
+
+  return true;
+}
+
 /* The constraints pass is allowed to create equivalences between
    pseudos that make the current allocation "incorrect" (in the sense
    that pseudos are assigned to hard registers from their own conflict
@@ -1221,13 +1304,13 @@  setup_live_pseudos_and_spill_after_risky_transforms (bitmap
       val = lra_reg_info[regno].val;
       offset = lra_reg_info[regno].offset;
       EXECUTE_IF_SET_IN_SPARSESET (live_range_hard_reg_pseudos, conflict_regno)
+      {
 	if (!lra_reg_val_equal_p (conflict_regno, val, offset)
 	    /* If it is multi-register pseudos they should start on
 	       the same hard register.	*/
 	    || hard_regno != reg_renumber[conflict_regno])
 	  {
 	    int conflict_hard_regno = reg_renumber[conflict_regno];
-	    
 	    biggest_mode = lra_reg_info[conflict_regno].biggest_mode;
 	    biggest_nregs = hard_regno_nregs (conflict_hard_regno,
 					      biggest_mode);
@@ -1240,6 +1323,12 @@  setup_live_pseudos_and_spill_after_risky_transforms (bitmap
 				 conflict_hard_regno
 				 - (WORDS_BIG_ENDIAN ? nregs_diff : 0));
 	  }
+      }
+      bool reassign = can_reassign (conflict_set, hard_regno,
+				    mode, regno, max_regno);
+      if (!reassign)
+	continue;
+
       if (! overlaps_hard_reg_set_p (conflict_set, mode, hard_regno))
 	{
 	  update_lives (regno, false);
@@ -1393,7 +1482,9 @@  assign_by_spills (void)
   for (n = 0, i = lra_constraint_new_regno_start; i < max_regno; i++)
     if (reg_renumber[i] < 0 && lra_reg_info[i].nrefs != 0
 	&& regno_allocno_class_array[i] != NO_REGS)
+    {
       sorted_pseudos[n++] = i;
+    }
   bitmap_initialize (&insn_conflict_pseudos, &reg_obstack);
   bitmap_initialize (&spill_pseudos_bitmap, &reg_obstack);
   bitmap_initialize (&best_spill_pseudos_bitmap, &reg_obstack);
@@ -1415,6 +1506,10 @@  assign_by_spills (void)
       for (i = 0; i < n; i++)
 	{
 	  regno = sorted_pseudos[i];
+
+	  if (lra_reg_info[i].pseudo_conflict)
+	    continue;
+
 	  if (reg_renumber[regno] >= 0)
 	    continue;
 	  if (lra_dump_file != NULL)
@@ -1541,7 +1636,11 @@  assign_by_spills (void)
 	     || bitmap_bit_p (&lra_optional_reload_pseudos, i))
 	    && reg_renumber[i] < 0 && lra_reg_info[i].nrefs != 0
 	    && regno_allocno_class_array[i] != NO_REGS)
+	{
+	  if (lra_reg_info[i].pseudo_conflict)
+	    continue;
 	  sorted_pseudos[n++] = i;
+	}
       bitmap_clear (&do_not_assign_nonreload_pseudos);
       if (n != 0 && lra_dump_file != NULL)
 	fprintf (lra_dump_file, "  Reassigning non-reload pseudos\n");
@@ -1638,6 +1737,7 @@  lra_assign (bool &fails_p)
   bitmap_initialize (&all_spilled_pseudos, &reg_obstack);
   create_live_range_start_chains ();
   setup_live_pseudos_and_spill_after_risky_transforms (&all_spilled_pseudos);
+#if 0
   if (! lra_hard_reg_split_p && ! lra_asm_error_p && flag_checking)
     /* Check correctness of allocation but only when there are no hard reg
        splits and asm errors as in the case of errors explicit insns involving
@@ -1649,6 +1749,7 @@  lra_assign (bool &fails_p)
 	  && overlaps_hard_reg_set_p (lra_reg_info[i].conflict_hard_regs,
 				      PSEUDO_REGNO_MODE (i), reg_renumber[i]))
 	gcc_unreachable ();
+#endif
   /* Setup insns to process on the next constraint pass.  */
   bitmap_initialize (&changed_pseudo_bitmap, &reg_obstack);
   init_live_reload_and_inheritance_pseudos ();
diff --git a/gcc/lra-int.h b/gcc/lra-int.h
index 5cdf92be7fc..962fb351ba0 100644
--- a/gcc/lra-int.h
+++ b/gcc/lra-int.h
@@ -95,6 +95,7 @@  public:
      *non-debug* insns.	 */
   int nrefs, freq;
   int last_reload;
+  bool pseudo_conflict;
   /* rtx used to undo the inheritance.  It can be non-null only
      between subsequent inheritance and undo inheritance passes.  */
   rtx restore_rtx;
diff --git a/gcc/lra.cc b/gcc/lra.cc
index 69081a8e025..5cc97ce7506 100644
--- a/gcc/lra.cc
+++ b/gcc/lra.cc
@@ -1359,6 +1359,7 @@  initialize_lra_reg_info_element (int i)
   lra_reg_info[i].nrefs = lra_reg_info[i].freq = 0;
   lra_reg_info[i].last_reload = 0;
   lra_reg_info[i].restore_rtx = NULL_RTX;
+  lra_reg_info[i].pseudo_conflict = false;
   lra_reg_info[i].val = get_new_reg_value ();
   lra_reg_info[i].offset = 0;
   lra_reg_info[i].copies = NULL;
diff --git a/gcc/testsuite/g++.target/powerpc/vecload.C b/gcc/testsuite/g++.target/powerpc/vecload.C
new file mode 100644
index 00000000000..0d998aa7054
--- /dev/null
+++ b/gcc/testsuite/g++.target/powerpc/vecload.C
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */ 
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ 
+
+#include <altivec.h>
+
+void
+foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src)
+{
+  __vector_quad acc;
+  __builtin_mma_xvf32ger(&acc, src, ptr[0]);
+  __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
+  *dst = acc;
+}
+/* { dg-final { scan-assembler {\mlxvp\M} } } */
diff --git a/gcc/testsuite/g++.target/powerpc/vecload1.C b/gcc/testsuite/g++.target/powerpc/vecload1.C
new file mode 100644
index 00000000000..ce1e9390157
--- /dev/null
+++ b/gcc/testsuite/g++.target/powerpc/vecload1.C
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <altivec.h>
+	
+void
+foo2 ()
+{
+  __vector_quad *dst1;
+  __vector_quad *dst2;
+  vector unsigned char src;
+  __vector_quad acc;
+  vector unsigned char *ptr;
+  __builtin_mma_xvf32ger(&acc, src, ptr[0]);
+  __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
+  *dst1 = acc;
+  __builtin_mma_xvf32ger(&acc, src, ptr[2]);
+  __builtin_mma_xvf32gerpp(&acc, src, ptr[3]);
+  *dst2 = acc;
+}
+/* { dg-final { scan-assembler {\mlxvp\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
index 69ee826e1be..ae29127f954 100644
--- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
+++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
@@ -258,8 +258,8 @@  foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
   dst[13] = acc;
 }
 
-/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */
-/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */
 /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */
 /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */
 /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */