[02/10] aarch64: Add backend support for DFP

Message ID 20220509143507.239804-3-christophe.lyon@arm.com
State Superseded
Series: Enable Decimal Floating Point (DFP) on AArch64

Commit Message

Christophe Lyon May 9, 2022, 2:34 p.m. UTC
  This patch updates the aarch64 backend as needed to support DFP modes
(SD, DD and TD).
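
For reference, the C-level types behind these modes (illustration only,
not part of the patch):

  _Decimal32  s = 1.5df;   /* SDmode: 32-bit decimal float.  */
  _Decimal64  d = 1.5dd;   /* DDmode: 64-bit decimal float.  */
  _Decimal128 t = 1.5dl;   /* TDmode: 128-bit decimal float.  */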

2022-03-31  Christophe Lyon  <christophe.lyon@arm.com>

	gcc/
	* config/aarch64/aarch64.cc
	(aarch64_split_128bit_move): Handle DFP modes.
	(aarch64_gen_storewb_pair): Likewise.
	(aarch64_gen_loadwb_pair): Likewise.
	(aarch64_gen_store_pair): Likewise.
	(aarch64_gen_load_pair): Likewise.
	(aarch64_mode_valid_for_sched_fusion_p): Likewise.
	(aarch64_classify_address): Likewise.
	(aarch64_legitimize_address_displacement): Likewise.
	(aarch64_can_const_movi_rtx_p): Likewise.
	(aarch64_anchor_offset): Likewise.
	(aarch64_secondary_reload): Likewise.
	(aarch64_rtx_costs): Likewise.
	(aarch64_gimplify_va_arg_expr): Likewise.
	(aapcs_vfp_sub_candidate): Likewise.
	(aarch64_vfp_is_call_or_return_candidate): Likewise.
	(aarch64_output_scalar_simd_mov_immediate): Likewise.
	(aarch64_gen_adjusted_ldpstp): Likewise.
	(aarch64_scalar_mode_supported_p): Accept DFP modes if enabled.
	* config/aarch64/aarch64.md
	(movsf_aarch64): Use SFD iterator and rename to
	mov<mode>_aarch64.
	(movdf_aarch64): Use DFD iterator and rename to
	mov<mode>_aarch64.
	(movtf_aarch64): Use TFD iterator and rename to
	mov<mode>_aarch64.
	(split pattern for move TF mode): Use TFD iterator.
	(load_pair_dw_tftf): Use TX iterator and rename to
	load_pair_dw_<mode><mode>.
	(store_pair_dw_tftf): Likewise.
	(loadwb_pair<GPF:mode>_<P:mode>): Use GPF_PAIR iterator.
	(storewb_pair<GPF:mode>_<P:mode>): Likewise.
	* config/aarch64/iterators.md
	(GPF_PAIR): New iterator.
	(GPF_TF_F16_MOV): Add DFP modes.
	(SFD, DFD, TFD): New iterators.
	(GPF_TF): Add DFP modes.
	(TX, DX, DX2): Likewise.
---
 gcc/config/aarch64/aarch64.cc   | 95 +++++++++++++++++++++++++--------
 gcc/config/aarch64/aarch64.md   | 86 ++++++++++++++---------------
 gcc/config/aarch64/iterators.md | 28 +++++++---
 3 files changed, 136 insertions(+), 73 deletions(-)
  

Comments

Richard Sandiford May 10, 2022, 9:23 a.m. UTC | #1
Christophe Lyon via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> @@ -8464,10 +8464,18 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
>        return gen_storewb_pairdf_di (base, base, reg, reg2,
>  				    GEN_INT (-adjustment),
>  				    GEN_INT (UNITS_PER_WORD - adjustment));
> +    case E_DDmode:
> +      return gen_storewb_pairdd_di (base, base, reg, reg2,
> +				    GEN_INT (-adjustment),
> +				    GEN_INT (UNITS_PER_WORD - adjustment));
>      case E_TFmode:
>        return gen_storewb_pairtf_di (base, base, reg, reg2,
>  				    GEN_INT (-adjustment),
>  				    GEN_INT (UNITS_PER_VREG - adjustment));
> +    case E_TDmode:
> +      return gen_storewb_pairtd_di (base, base, reg, reg2,
> +				    GEN_INT (-adjustment),
> +				    GEN_INT (UNITS_PER_VREG - adjustment));
>      default:
>        gcc_unreachable ();
>      }
> @@ -8510,9 +8518,15 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
>      case E_DFmode:
>        return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
>  				   GEN_INT (UNITS_PER_WORD));
> +    case E_DDmode:
> +      return gen_loadwb_pairdd_di (base, base, reg, reg2, GEN_INT (adjustment),
> +				   GEN_INT (UNITS_PER_WORD));
>      case E_TFmode:
>        return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
>  				   GEN_INT (UNITS_PER_VREG));
> +    case E_TDmode:
> +      return gen_loadwb_pairtd_di (base, base, reg, reg2, GEN_INT (adjustment),
> +				   GEN_INT (UNITS_PER_VREG));
>      default:
>        gcc_unreachable ();
>      }

Are these changes needed?  I would only have expected them to be
used for stack pushes and pops.

> @@ -8561,9 +8575,15 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
>      case E_DFmode:
>        return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
> 
> +    case E_DDmode:
> +      return gen_store_pair_dw_dddd (mem1, reg1, mem2, reg2);
> +
>      case E_TFmode:
>        return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
> 
> +    case E_TDmode:
> +      return gen_store_pair_dw_tdtd (mem1, reg1, mem2, reg2);
> +
>      case E_V4SImode:
>        return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
> 
> @@ -8590,9 +8610,15 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
>      case E_DFmode:
>        return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
> 
> +    case E_DDmode:
> +      return gen_load_pair_dw_dddd (reg1, mem1, reg2, mem2);
> +
>      case E_TFmode:
>        return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
> 
> +    case E_TDmode:
> +      return gen_load_pair_dw_tdtd (reg1, mem1, reg2, mem2);
> +
>      case E_V4SImode:
>        return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
 
Same idea here, in that I think these would only be used for prologue/epilogue
and block operations.

> @@ -20362,7 +20407,8 @@ aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
>    machine_mode new_mode = VOIDmode;
>    bool composite_p = aarch64_composite_type_p (type, mode);
>  
> -  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
> +  if ((!composite_p && ((GET_MODE_CLASS (mode) == MODE_FLOAT)
> +			|| GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))

Formatting nit: doubled brackets.  Now that the condition is spread over
multiple lines, the && should start a new line:

  if ((!composite_p
       && (GET_MODE_CLASS (mode) == MODE_FLOAT
	   || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))

> -;; Iterator for all scalar floating point modes (SF, DF and TF)
> -(define_mode_iterator GPF_TF [SF DF TF])
> +;; Iterator for all scalar floating point modes (SF, DF, TF SD, DD, and TD)
> +(define_mode_iterator GPF_TF [SF DF TF SD DD TD])

Missing comma after “TF”.

The patch has some changes to the rtx costs for constants.  What happens
with aarch64_reinterpret_float_as_int?  (Genuine question.)  Do DFP
constants get through, and do they get converted to an integer?
If so, I guess we need to handle DDmode like DFmode there.
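
For concreteness, the payoff there would be materializing a decimal
constant with an integer move rather than a literal-pool load, e.g.
(sketch only; the immediate is a placeholder, not a real encoding):

  mov	x0, #<bits of the _Decimal64 constant>
  fmov	d0, x0

instead of an adrp/ldr from the constant pool.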

Related, aarch64_legitimate_constant_p has:

  /* Support CSE and rematerialization of common constants.  */
  if (CONST_INT_P (x)
      || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT))
    return true;

which looks like it would exclude DFP constants.  Perhaps we should
simply remove the MODE_FLOAT condition, since on AArch64 all
CONST_DOUBLEs are/must be FP constants.
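
i.e. something like (untested):

  /* Support CSE and rematerialization of common constants.  */
  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
    return true;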

The patch changes the scalar handling in aapcs_vfp_sub_candidate,
but not the complex handling.  Is that deliberate?

I was wondering whether it would be worth adding some abstraction,
since the same checks appear multiple times.  But the only things
I could think of would have made things worse, not better, so I agree
open-coding the mode checks is the way to go.

The other parts of the patch look good.

Thanks,
Richard
  
Richard Sandiford May 10, 2022, 9:30 a.m. UTC | #2
Richard Sandiford <richard.sandiford@arm.com> writes:
> The patch changes the scalar handling in aapcs_vfp_sub_candidate,
> but not the complex handling.  Is that deliberate?

TIL: we don't support complex decimal floats.  So never mind that :-)
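
(For the record, neither TS 18661-2 nor C23 defines complex decimal
types, so e.g.

  _Complex _Decimal64 z;

is rejected by the front end.)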

Richard
  
Christophe Lyon May 10, 2022, 10:18 a.m. UTC | #3
On 5/10/22 11:30, Richard Sandiford wrote:
> Richard Sandiford <richard.sandiford@arm.com> writes:
>> The patch changes the scalar handling in aapcs_vfp_sub_candidate,
>> but not the complex handling.  Is that deliberate?
> 
> TIL: we don't support complex decimal floats.  So never mind that :-)
> 
Indeed. Sorry, maybe I should have made this clear in the commit message.

> Richard
  

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index bd855758778..0f1b3c04158 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4828,7 +4828,7 @@  aarch64_split_128bit_move (rtx dst, rtx src)
 
   machine_mode mode = GET_MODE (dst);
 
-  gcc_assert (mode == TImode || mode == TFmode);
+  gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
 
@@ -8464,10 +8464,18 @@  aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
       return gen_storewb_pairdf_di (base, base, reg, reg2,
 				    GEN_INT (-adjustment),
 				    GEN_INT (UNITS_PER_WORD - adjustment));
+    case E_DDmode:
+      return gen_storewb_pairdd_di (base, base, reg, reg2,
+				    GEN_INT (-adjustment),
+				    GEN_INT (UNITS_PER_WORD - adjustment));
     case E_TFmode:
       return gen_storewb_pairtf_di (base, base, reg, reg2,
 				    GEN_INT (-adjustment),
 				    GEN_INT (UNITS_PER_VREG - adjustment));
+    case E_TDmode:
+      return gen_storewb_pairtd_di (base, base, reg, reg2,
+				    GEN_INT (-adjustment),
+				    GEN_INT (UNITS_PER_VREG - adjustment));
     default:
       gcc_unreachable ();
     }
@@ -8510,9 +8518,15 @@  aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
     case E_DFmode:
       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
 				   GEN_INT (UNITS_PER_WORD));
+    case E_DDmode:
+      return gen_loadwb_pairdd_di (base, base, reg, reg2, GEN_INT (adjustment),
+				   GEN_INT (UNITS_PER_WORD));
     case E_TFmode:
       return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
 				   GEN_INT (UNITS_PER_VREG));
+    case E_TDmode:
+      return gen_loadwb_pairtd_di (base, base, reg, reg2, GEN_INT (adjustment),
+				   GEN_INT (UNITS_PER_VREG));
     default:
       gcc_unreachable ();
     }
@@ -8561,9 +8575,15 @@  aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
     case E_DFmode:
       return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
 
+    case E_DDmode:
+      return gen_store_pair_dw_dddd (mem1, reg1, mem2, reg2);
+
     case E_TFmode:
       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
 
+    case E_TDmode:
+      return gen_store_pair_dw_tdtd (mem1, reg1, mem2, reg2);
+
     case E_V4SImode:
       return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
 
@@ -8590,9 +8610,15 @@  aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
     case E_DFmode:
       return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
 
+    case E_DDmode:
+      return gen_load_pair_dw_dddd (reg1, mem1, reg2, mem2);
+
     case E_TFmode:
       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
 
+    case E_TDmode:
+      return gen_load_pair_dw_tdtd (reg1, mem1, reg2, mem2);
+
     case E_V4SImode:
       return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
 
@@ -10568,6 +10594,7 @@  aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
 {
   return mode == SImode || mode == DImode
 	 || mode == SFmode || mode == DFmode
+	 || mode == SDmode || mode == DDmode
 	 || (aarch64_vector_mode_supported_p (mode)
 	     && (known_eq (GET_MODE_SIZE (mode), 8)
 		 || (known_eq (GET_MODE_SIZE (mode), 16)
@@ -10610,12 +10637,13 @@  aarch64_classify_address (struct aarch64_address_info *info,
   vec_flags &= ~VEC_PARTIAL;
 
   /* On BE, we use load/store pair for all large int mode load/stores.
-     TI/TFmode may also use a load/store pair.  */
+     TI/TF/TDmode may also use a load/store pair.  */
   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
 			    || type == ADDR_QUERY_LDP_STP_N
 			    || mode == TImode
 			    || mode == TFmode
+			    || mode == TDmode
 			    || (BYTES_BIG_ENDIAN && advsimd_struct_p));
   /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
      corresponds to the actual size of the memory being loaded/stored and the
@@ -10689,7 +10717,7 @@  aarch64_classify_address (struct aarch64_address_info *info,
 	  info->offset = op1;
 	  info->const_offset = offset;
 
-	  /* TImode and TFmode values are allowed in both pairs of X
+	  /* TImode, TFmode and TDmode values are allowed in both pairs of X
 	     registers and individual Q registers.  The available
 	     address modes are:
 	     X,X: 7-bit signed scaled offset
@@ -10698,7 +10726,7 @@  aarch64_classify_address (struct aarch64_address_info *info,
 	     When performing the check for pairs of X registers i.e.  LDP/STP
 	     pass down DImode since that is the natural size of the LDP/STP
 	     instruction memory accesses.  */
-	  if (mode == TImode || mode == TFmode)
+	  if (mode == TImode || mode == TFmode || mode == TDmode)
 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
 		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
 			|| offset_12bit_unsigned_scaled_p (mode, offset)));
@@ -10821,14 +10849,14 @@  aarch64_classify_address (struct aarch64_address_info *info,
 	  info->offset = XEXP (XEXP (x, 1), 1);
 	  info->const_offset = offset;
 
-	  /* TImode and TFmode values are allowed in both pairs of X
+	  /* TImode, TFmode and TDmode values are allowed in both pairs of X
 	     registers and individual Q registers.  The available
 	     address modes are:
 	     X,X: 7-bit signed scaled offset
 	     Q:   9-bit signed offset
 	     We conservatively require an offset representable in either mode.
 	   */
-	  if (mode == TImode || mode == TFmode)
+	  if (mode == TImode || mode == TFmode || mode == TDmode)
 	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
 		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
 
@@ -10990,9 +11018,9 @@  aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
 	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
 	 range otherwise to increase opportunities for sharing the base
 	 address of different sizes.  Unaligned accesses use the signed
-	 9-bit range, TImode/TFmode use the intersection of signed
+	 9-bit range, TImode/TFmode/TDmode use the intersection of signed
 	 scaled 7-bit and signed 9-bit offset.  */
-      if (mode == TImode || mode == TFmode)
+      if (mode == TImode || mode == TFmode || mode == TDmode)
 	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
       else if ((const_offset & (size - 1)) != 0)
 	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
@@ -11156,7 +11184,7 @@  aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
   else
     return false;
 
-   /* use a 64 bit mode for everything except for DI/DF mode, where we use
+   /* use a 64 bit mode for everything except for DI/DF/DD mode, where we use
      a 128 bit vector mode.  */
   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
 
@@ -12356,7 +12384,7 @@  aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
   if (IN_RANGE (offset, -256, 0))
     return 0;
 
-  if (mode == TImode || mode == TFmode)
+  if (mode == TImode || mode == TFmode || mode == TDmode)
     return (offset + 0x100) & ~0x1ff;
 
   /* Use 12-bit offset by access size.  */
@@ -12465,7 +12493,9 @@  aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
 
   /* Without the TARGET_SIMD instructions we cannot move a Q register
      to a Q register directly.  We need a scratch.  */
-  if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
+  if (REG_P (x)
+      && (mode == TFmode || mode == TImode || mode == TDmode)
+      && mode == GET_MODE (x)
       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
       && reg_class_subset_p (rclass, FP_REGS))
     {
@@ -12473,14 +12503,16 @@  aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
       return NO_REGS;
     }
 
-  /* A TFmode or TImode memory access should be handled via an FP_REGS
+  /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
      because AArch64 has richer addressing modes for LDR/STR instructions
      than LDP/STP instructions.  */
   if (TARGET_FLOAT && rclass == GENERAL_REGS
       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
     return FP_REGS;
 
-  if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
+  if (rclass == FP_REGS
+      && (mode == TImode || mode == TFmode || mode == TDmode)
+      && CONSTANT_P(x))
       return GENERAL_REGS;
 
   return NO_REGS;
@@ -13611,9 +13643,9 @@  aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
 		*cost += extra_cost->ldst.storev;
 	      else if (GET_MODE_CLASS (mode) == MODE_INT)
 		*cost += extra_cost->ldst.store;
-	      else if (mode == SFmode)
+	      else if (mode == SFmode || mode == SDmode)
 		*cost += extra_cost->ldst.storef;
-	      else if (mode == DFmode)
+	      else if (mode == DFmode || mode == DDmode)
 		*cost += extra_cost->ldst.stored;
 
 	      *cost +=
@@ -13737,11 +13769,11 @@  aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
 	  /* mov[df,sf]_aarch64.  */
 	  if (aarch64_float_const_representable_p (x))
 	    /* FMOV (scalar immediate).  */
-	    *cost += extra_cost->fp[mode == DFmode].fpconst;
+	    *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
 	  else if (!aarch64_float_const_zero_rtx_p (x))
 	    {
 	      /* This will be a load from memory.  */
-	      if (mode == DFmode)
+	      if (mode == DFmode || mode == DDmode)
 		*cost += extra_cost->ldst.loadd;
 	      else
 		*cost += extra_cost->ldst.loadf;
@@ -13767,9 +13799,9 @@  aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
 	    *cost += extra_cost->ldst.loadv;
 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
 	    *cost += extra_cost->ldst.load;
-	  else if (mode == SFmode)
+	  else if (mode == SFmode || mode == SDmode)
 	    *cost += extra_cost->ldst.loadf;
-	  else if (mode == DFmode)
+	  else if (mode == DFmode || mode == DDmode)
 	    *cost += extra_cost->ldst.loadd;
 
 	  *cost +=
@@ -19795,6 +19827,18 @@  aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
 	  field_t = long_double_type_node;
 	  field_ptr_t = long_double_ptr_type_node;
 	  break;
+	case SDmode:
+	  field_t = dfloat32_type_node;
+	  field_ptr_t = build_pointer_type (dfloat32_type_node);
+	  break;
+	case DDmode:
+	  field_t = dfloat64_type_node;
+	  field_ptr_t = build_pointer_type (dfloat64_type_node);
+	  break;
+	case TDmode:
+	  field_t = dfloat128_type_node;
+	  field_ptr_t = build_pointer_type (dfloat128_type_node);
+	  break;
 	case E_HFmode:
 	  field_t = aarch64_fp16_type_node;
 	  field_ptr_t = aarch64_fp16_ptr_type_node;
@@ -20046,7 +20090,8 @@  aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
     case REAL_TYPE:
       mode = TYPE_MODE (type);
       if (mode != DFmode && mode != SFmode
-	  && mode != TFmode && mode != HFmode)
+	  && mode != TFmode && mode != HFmode
+	  && mode != SDmode && mode != DDmode && mode != TDmode)
 	return -1;
 
       if (*modep == VOIDmode)
@@ -20362,7 +20407,8 @@  aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
   machine_mode new_mode = VOIDmode;
   bool composite_p = aarch64_composite_type_p (type, mode);
 
-  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
+  if ((!composite_p && ((GET_MODE_CLASS (mode) == MODE_FLOAT)
+			|| GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
       || aarch64_short_vector_p (type, mode))
     {
       *count = 1;
@@ -23264,7 +23310,7 @@  aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
     }
 
   machine_mode vmode;
-  /* use a 64 bit mode for everything except for DI/DF mode, where we use
+  /* use a 64 bit mode for everything except for DI/DF/DD mode, where we use
      a 128 bit vector mode.  */
   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
 
@@ -26082,7 +26128,7 @@  aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
     base_off = (off_val_1 + off_val_3) / 2;
   else
     /* However, due to issues with negative LDP/STP offset generation for
-       larger modes, for DF, DI and vector modes. we must not use negative
+       larger modes, for DF, DD, DI and vector modes. we must not use negative
        addresses smaller than 9 signed unadjusted bits can store.  This
        provides the most range in this case.  */
     base_off = off_val_1;
@@ -26360,6 +26406,9 @@  aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
 static bool
 aarch64_scalar_mode_supported_p (scalar_mode mode)
 {
+  if (DECIMAL_FLOAT_MODE_P (mode))
+    return default_decimal_float_supported_p ();
+
   return (mode == HFmode
 	  ? true
 	  : default_scalar_mode_supported_p (mode));
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f5c635938ad..b5b69da4fab 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1477,11 +1477,11 @@  (define_insn "*mov<mode>_aarch64"
    (set_attr "arch" "simd,fp16,simd,simd,simd,fp16,simd,*,*,*,*,*")]
 )
 
-(define_insn "*movsf_aarch64"
-  [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
-	(match_operand:SF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r,M"))]
-  "TARGET_FLOAT && (register_operand (operands[0], SFmode)
-    || aarch64_reg_or_fp_zero (operands[1], SFmode))"
+(define_insn "*mov<mode>_aarch64"
+  [(set (match_operand:SFD 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
+	(match_operand:SFD 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r,M"))]
+  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
+    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
   "@
    movi\\t%0.2s, #0
    fmov\\t%s0, %w1
@@ -1501,11 +1501,11 @@  (define_insn "*movsf_aarch64"
    (set_attr "arch" "simd,*,*,*,*,simd,*,*,*,*,*,*")]
 )
 
-(define_insn "*movdf_aarch64"
-  [(set (match_operand:DF 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
-	(match_operand:DF 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
-  "TARGET_FLOAT && (register_operand (operands[0], DFmode)
-    || aarch64_reg_or_fp_zero (operands[1], DFmode))"
+(define_insn "*mov<mode>_aarch64"
+  [(set (match_operand:DFD 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
+	(match_operand:DFD 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
+  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
+    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
   "@
    movi\\t%d0, #0
    fmov\\t%d0, %x1
@@ -1545,13 +1545,13 @@  (define_split
   }
 )
 
-(define_insn "*movtf_aarch64"
-  [(set (match_operand:TF 0
+(define_insn "*mov<mode>_aarch64"
+  [(set (match_operand:TFD 0
 	 "nonimmediate_operand" "=w,?r ,w ,?r,w,?w,w,m,?r,m ,m")
-	(match_operand:TF 1
+	(match_operand:TFD 1
 	 "general_operand"      " w,?rY,?r,w ,Y,Y ,m,w,m ,?r,Y"))]
-  "TARGET_FLOAT && (register_operand (operands[0], TFmode)
-    || aarch64_reg_or_fp_zero (operands[1], TFmode))"
+  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
+    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
   "@
    mov\\t%0.16b, %1.16b
    #
@@ -1571,8 +1571,8 @@  (define_insn "*movtf_aarch64"
 )
 
 (define_split
-   [(set (match_operand:TF 0 "register_operand" "")
-	 (match_operand:TF 1 "nonmemory_operand" ""))]
+   [(set (match_operand:TFD 0 "register_operand" "")
+	 (match_operand:TFD 1 "nonmemory_operand" ""))]
   "reload_completed && aarch64_split_128bit_move_p (operands[0], operands[1])"
   [(const_int 0)]
   {
@@ -1746,16 +1746,16 @@  (define_insn "load_pair_dw_<DX:mode><DX2:mode>"
    (set_attr "arch" "*,fp")]
 )
 
-(define_insn "load_pair_dw_tftf"
-  [(set (match_operand:TF 0 "register_operand" "=w")
-	(match_operand:TF 1 "aarch64_mem_pair_operand" "Ump"))
-   (set (match_operand:TF 2 "register_operand" "=w")
-	(match_operand:TF 3 "memory_operand" "m"))]
+(define_insn "load_pair_dw_<mode><mode>"
+  [(set (match_operand:TX 0 "register_operand" "=w")
+	(match_operand:TX 1 "aarch64_mem_pair_operand" "Ump"))
+   (set (match_operand:TX 2 "register_operand" "=w")
+	(match_operand:TX 3 "memory_operand" "m"))]
    "TARGET_SIMD
     && rtx_equal_p (XEXP (operands[3], 0),
 		    plus_constant (Pmode,
 				   XEXP (operands[1], 0),
-				   GET_MODE_SIZE (TFmode)))"
+				   GET_MODE_SIZE (<MODE>mode)))"
   "ldp\\t%q0, %q2, %z1"
   [(set_attr "type" "neon_ldp_q")
    (set_attr "fp" "yes")]
@@ -1796,16 +1796,16 @@  (define_insn "store_pair_dw_<DX:mode><DX2:mode>"
    (set_attr "arch" "*,fp")]
 )
 
-(define_insn "store_pair_dw_tftf"
-  [(set (match_operand:TF 0 "aarch64_mem_pair_operand" "=Ump")
-	(match_operand:TF 1 "register_operand" "w"))
-   (set (match_operand:TF 2 "memory_operand" "=m")
-	(match_operand:TF 3 "register_operand" "w"))]
+(define_insn "store_pair_dw_<mode><mode>"
+  [(set (match_operand:TX 0 "aarch64_mem_pair_operand" "=Ump")
+	(match_operand:TX 1 "register_operand" "w"))
+   (set (match_operand:TX 2 "memory_operand" "=m")
+	(match_operand:TX 3 "register_operand" "w"))]
    "TARGET_SIMD &&
     rtx_equal_p (XEXP (operands[2], 0),
 		 plus_constant (Pmode,
 				XEXP (operands[0], 0),
-				GET_MODE_SIZE (TFmode)))"
+				GET_MODE_SIZE (<MODE>mode)))"
   "stp\\t%q1, %q3, %z0"
   [(set_attr "type" "neon_stp_q")
    (set_attr "fp" "yes")]
@@ -1828,18 +1828,18 @@  (define_insn "loadwb_pair<GPI:mode>_<P:mode>"
   [(set_attr "type" "load_<GPI:ldpstp_sz>")]
 )
 
-(define_insn "loadwb_pair<GPF:mode>_<P:mode>"
+(define_insn "loadwb_pair<GPF_PAIR:mode>_<P:mode>"
   [(parallel
     [(set (match_operand:P 0 "register_operand" "=k")
           (plus:P (match_operand:P 1 "register_operand" "0")
                   (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
-     (set (match_operand:GPF 2 "register_operand" "=w")
-          (mem:GPF (match_dup 1)))
-     (set (match_operand:GPF 3 "register_operand" "=w")
-          (mem:GPF (plus:P (match_dup 1)
+     (set (match_operand:GPF_PAIR 2 "register_operand" "=w")
+          (mem:GPF_PAIR (match_dup 1)))
+     (set (match_operand:GPF_PAIR 3 "register_operand" "=w")
+          (mem:GPF_PAIR (plus:P (match_dup 1)
                    (match_operand:P 5 "const_int_operand" "n"))))])]
-  "INTVAL (operands[5]) == GET_MODE_SIZE (<GPF:MODE>mode)"
-  "ldp\\t%<GPF:w>2, %<GPF:w>3, [%1], %4"
+  "INTVAL (operands[5]) == GET_MODE_SIZE (<GPF_PAIR:MODE>mode)"
+  "ldp\\t%<GPF_PAIR:w>2, %<GPF_PAIR:w>3, [%1], %4"
   [(set_attr "type" "neon_load1_2reg")]
 )
 
@@ -1876,19 +1876,19 @@  (define_insn "storewb_pair<GPI:mode>_<P:mode>"
   [(set_attr "type" "store_<GPI:ldpstp_sz>")]
 )
 
-(define_insn "storewb_pair<GPF:mode>_<P:mode>"
+(define_insn "storewb_pair<GPF_PAIR:mode>_<P:mode>"
   [(parallel
     [(set (match_operand:P 0 "register_operand" "=&k")
           (plus:P (match_operand:P 1 "register_operand" "0")
                   (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
-     (set (mem:GPF (plus:P (match_dup 0)
+     (set (mem:GPF_PAIR (plus:P (match_dup 0)
                    (match_dup 4)))
-          (match_operand:GPF 2 "register_operand" "w"))
-     (set (mem:GPF (plus:P (match_dup 0)
+          (match_operand:GPF_PAIR 2 "register_operand" "w"))
+     (set (mem:GPF_PAIR (plus:P (match_dup 0)
                    (match_operand:P 5 "const_int_operand" "n")))
-          (match_operand:GPF 3 "register_operand" "w"))])]
-  "INTVAL (operands[5]) == INTVAL (operands[4]) + GET_MODE_SIZE (<GPF:MODE>mode)"
-  "stp\\t%<GPF:w>2, %<GPF:w>3, [%0, %4]!"
+          (match_operand:GPF_PAIR 3 "register_operand" "w"))])]
+  "INTVAL (operands[5]) == INTVAL (operands[4]) + GET_MODE_SIZE (<GPF_PAIR:MODE>mode)"
+  "stp\\t%<GPF_PAIR:w>2, %<GPF_PAIR:w>3, [%0, %4]!"
   [(set_attr "type" "neon_store1_2reg<q>")]
 )
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 88af964de7e..8a6801a6716 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -54,6 +54,10 @@  (define_mode_iterator ALLX [QI HI SI])
 ;; Iterator for General Purpose Floating-point registers (32- and 64-bit modes)
 (define_mode_iterator GPF [SF DF])
 
+;; Iterator for General Purpose Floating-point registers suitable for
+;; load/store by pairs
+(define_mode_iterator GPF_PAIR [SF DF SD DD])
+
 ;; Iterator for all scalar floating point modes (HF, SF, DF)
 (define_mode_iterator GPF_F16 [(HF "AARCH64_ISA_F16") SF DF])
 
@@ -64,14 +68,24 @@  (define_mode_iterator GPF_HF [HF SF DF])
 (define_mode_iterator HFBF [HF BF])
 
 ;; Iterator for all scalar floating point modes suitable for moving, including
-;; special BF type (HF, SF, DF, TF and BF)
-(define_mode_iterator GPF_TF_F16_MOV [HF BF SF DF TF])
+;; special BF type and decimal floating point types (HF, SF, DF, TF, BF,
+;; SD, DD and TD)
+(define_mode_iterator GPF_TF_F16_MOV [HF BF SF DF TF SD DD TD])
+
+;; Iterator for scalar 32bit fp modes (SF, SD)
+(define_mode_iterator SFD [SD SF])
+
+;; Iterator for scalar 64bit fp modes (DF, DD)
+(define_mode_iterator DFD [DD DF])
+
+;; Iterator for scalar 128bit fp modes (TF, TD)
+(define_mode_iterator TFD [TD TF])
 
 ;; Double vector modes.
 (define_mode_iterator VDF [V2SF V4HF])
 
-;; Iterator for all scalar floating point modes (SF, DF and TF)
-(define_mode_iterator GPF_TF [SF DF TF])
+;; Iterator for all scalar floating point modes (SF, DF, TF SD, DD, and TD)
+(define_mode_iterator GPF_TF [SF DF TF SD DD TD])
 
 ;; Integer Advanced SIMD modes.
 (define_mode_iterator VDQ_I [V8QI V16QI V4HI V8HI V2SI V4SI V2DI])
@@ -298,7 +312,7 @@  (define_mode_iterator VB [V8QI V16QI])
 ;; 2 and 4 lane SI modes.
 (define_mode_iterator VS [V2SI V4SI])
 
-(define_mode_iterator TX [TI TF])
+(define_mode_iterator TX [TI TF TD])
 
 ;; Advanced SIMD opaque structure modes.
 (define_mode_iterator VSTRUCT [OI CI XI])
@@ -400,10 +414,10 @@  (define_mode_iterator VSTRUCT_QD [V2x8QI V2x4HI V2x2SI V2x1DI
 				  V4x8HF V4x4SF V4x2DF V4x8BF])
 
 ;; Double scalar modes
-(define_mode_iterator DX [DI DF])
+(define_mode_iterator DX [DI DF DD])
 
 ;; Duplicate of the above
-(define_mode_iterator DX2 [DI DF])
+(define_mode_iterator DX2 [DI DF DD])
 
 ;; Single scalar modes
 (define_mode_iterator SX [SI SF])