[Aarch64] : Use fmov for some low-lane FP SIMD constant vectors [PR113856]

Message ID 20260505035837.3905094-1-naveen.siddegowda@oss.qualcomm.com
State Accepted
Delegated to: Wilco Dijkstra
Headers
Series [Aarch64] : Use fmov for some low-lane FP SIMD constant vectors [PR113856] |

Commit Message

Naveen May 5, 2026, 3:58 a.m. UTC
  Extend AdvSIMD constant materialization to recognize vectors where only
the low element is a representable floating-point constant and all other
elements are zero.
Bootstrapped and tested on aarch64-linux-gnu.

PR target/113856

gcc/ChangeLog:

        * config/aarch64/aarch64-protos.h
	  (aarch64_output_simd_mov_imm_low): New.
	  (aarch64_const_vec_fmov_p): New.
	* config/aarch64/aarch64-simd.md (mov<mode>): Do not expand constant
	vectors handled by aarch64_const_vec_fmov_p into VDUP.
	(*aarch64_simd_mov<VDMOV:mode>): Add Dc alternatives for FMOV based
	SIMD constant moves.
	(*aarch64_simd_mov<VQMOV:mode>): Likewise.
	* config/aarch64/aarch64.cc (aarch64_const_vec_fmov_p): New function.
	(aarch64_output_simd_mov_imm_low): New function.
	* config/aarch64/constraints.md (Dc): New constraint.

gcc/testsuite/ChangeLog:
	* gcc.target/aarch64/pr113856.c: New test.

Signed-off-by: Naveen <naveen.siddegowda@oss.qualcomm.com>
---
 gcc/config/aarch64/aarch64-protos.h         |  2 +
 gcc/config/aarch64/aarch64-simd.md          |  5 +-
 gcc/config/aarch64/aarch64.cc               | 79 +++++++++++++++++++++
 gcc/config/aarch64/constraints.md           |  7 ++
 gcc/testsuite/gcc.target/aarch64/pr113856.c | 70 ++++++++++++++++++
 5 files changed, 162 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr113856.c
  

Comments

Wilco Dijkstra May 5, 2026, 3:19 p.m. UTC | #1
Hi Naveen,

> Extend AdvSIMD constant materialization to recognize vectors where only
> the low element is a representable floating-point constant and all other
> elements are zero.
> Bootstrapped and tested on aarch64-linux-gnu.

LGTM with one minor nit, see below. Do you have commit rights?

Cheers,
Wilco


+char *
+aarch64_output_simd_mov_imm_low (rtx *operands)
+{
+  machine_mode mode = GET_MODE (operands[1]);
+  scalar_mode inner_mode = GET_MODE_INNER (mode);
+  unsigned int nunits = GET_MODE_NUNITS (mode).to_constant ();
+  unsigned int const_idx = BYTES_BIG_ENDIAN ? nunits - 1 : 0;
+  rtx elt = CONST_VECTOR_ELT (operands[1], const_idx);
+  rtx xop[2];
+
+  xop[0] = lowpart_subreg (inner_mode, operands[0], mode);

We should just use operands[0] here: the templates print explicit register
names (%h0/%s0/%d0), so the lowpart_subreg is unnecessary.
  
Stefan Schulze Frielinghaus May 11, 2026, 7:43 a.m. UTC | #2
On Mon, May 04, 2026 at 08:58:37PM -0700, Naveen wrote:
> Extend AdvSIMD constant materialization to recognize vectors where only
> the low element is a representable floating-point constant and all other
> elements are zero.
> Bootstrapped and tested on aarch64-linux-gnu.
> 
> PR target/113856
> 
> gcc/ChangeLog:
> 
>         * config/aarch64/aarch64-protos.h
> 	  (aarch64_output_simd_mov_imm_low): New.
> 	  (aarch64_const_vec_fmov_p): New.
> 	* config/aarch64/aarch64-simd.md (mov<mode>): Do not expand constant
> 	vectors handled by aarch64_const_vec_fmov_p into VDUP.
> 	(*aarch64_simd_mov<VDMOV:mode>): Add Dc alternatives for FMOV based
> 	SIMD constant moves.
> 	(*aarch64_simd_mov<VQMOV:mode>): Likewise.
> 	* config/aarch64/aarch64.cc (aarch64_const_vec_fmov_p): New function.
> 	(aarch64_output_simd_mov_imm_low): New function.
> 	* config/aarch64/constraints.md (Dc): New constraint.
> 
> gcc/testsuite/ChangeLog:
> 	* gcc.target/aarch64/pr113856.c: New test.
> 
> Signed-off-by: Naveen <naveen.siddegowda@oss.qualcomm.com>
> ---
>  gcc/config/aarch64/aarch64-protos.h         |  2 +
>  gcc/config/aarch64/aarch64-simd.md          |  5 +-
>  gcc/config/aarch64/aarch64.cc               | 79 +++++++++++++++++++++
>  gcc/config/aarch64/constraints.md           |  7 ++
>  gcc/testsuite/gcc.target/aarch64/pr113856.c | 70 ++++++++++++++++++
>  5 files changed, 162 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr113856.c
> 
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 16b58f39a97..0798546809d 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -952,8 +952,10 @@ char *aarch64_output_simd_and_imm (rtx, unsigned);
>  char *aarch64_output_simd_xor_imm (rtx, unsigned);
>  char *aarch64_output_fmov (rtx);
>  
> +char *aarch64_output_simd_mov_imm_low (rtx *);
>  char *aarch64_output_sve_mov_immediate (rtx);
>  char *aarch64_output_sve_ptrues (rtx);
> +bool aarch64_const_vec_fmov_p (rtx);
>  bool aarch64_pad_reg_upward (machine_mode, const_tree, bool);
>  bool aarch64_regno_ok_for_base_p (int, bool);
>  bool aarch64_regno_ok_for_index_p (int, bool);
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index c314e85927d..2e142b1e1ee 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -79,7 +79,8 @@
>  	}
>        else if (!aarch64_simd_imm_zero (operands[1], <MODE>mode)
>  	       && !aarch64_simd_special_constant_p (operands[1], <MODE>mode)
> -	       && !aarch64_simd_valid_mov_imm (operands[1]))
> +	       && !aarch64_simd_valid_mov_imm (operands[1])
> +	       && !aarch64_const_vec_fmov_p (operands[1]))
>  	{
>  	  rtx x;
>  	  /* Expand into VDUP.  */
> @@ -183,6 +184,7 @@
>       [?r, w ; neon_to_gp<q>      , *        , *] fmov\t%x0, %d1
>       [?w, r ; f_mcr              , *        , *] fmov\t%d0, %1
>       [?r, r ; mov_reg            , *        , *] mov\t%0, %1
> +     [w , Dc; fmov               , *        , *] << aarch64_output_simd_mov_imm_low (operands);
>       [w , Dn; neon_move<q>       , simd     , *] << aarch64_output_simd_mov_imm (operands[1], 64);
>       [w , Dz; f_mcr              , *        , *] fmov\t%d0, xzr
>       [w , Dx; neon_move          , simd     , 8] #
> @@ -212,6 +214,7 @@
>       [?r , w ; multiple           , *   , 8] #
>       [?w , r ; multiple           , *   , 8] #
>       [?r , r ; multiple           , *   , 8] #
> +     [w  , Dc; fmov               , *   , 4] << aarch64_output_simd_mov_imm_low (operands);
>       [w  , Dn; neon_move<q>       , simd, 4] << aarch64_output_simd_mov_imm (operands[1], 128);
>       [w  , Dz; fmov               , *   , 4] fmov\t%d0, xzr
>       [w  , Dx; neon_move          , simd, 8] #
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 3816df92b18..3359ac9a67d 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -24604,6 +24604,85 @@ aarch64_simd_valid_mov_imm (rtx op)
>    return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_MOV);
>  }
>  
> +
> +/* Return true if OP is an FP constant vector in which the low register
> +   element can be materialized using FMOV and all other elements are zero.  */
> +bool
> +aarch64_const_vec_fmov_p (rtx op)
> +{
> +  if (!CONST_VECTOR_P (op))
> +    return false;
> +
> +  machine_mode mode = GET_MODE (op);
> +  scalar_mode inner_mode = GET_MODE_INNER (mode);
> +
> +  if (inner_mode != E_HFmode
> +      && inner_mode != E_SFmode
> +      && inner_mode != E_DFmode)
> +    return false;
> +
> +  if (inner_mode == E_HFmode && !TARGET_FP_F16INST)
> +    return false;
> +
> +  unsigned int nunits = GET_MODE_NUNITS (mode).to_constant ();
> +  unsigned int const_idx = BYTES_BIG_ENDIAN ? nunits - 1 : 0;
> +
> +  rtx elt = CONST_VECTOR_ELT (op, const_idx);
> +  if (!CONST_DOUBLE_P (elt))
> +    return false;
> +
> +  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
> +  if (!aarch64_real_float_const_representable_p (r))
> +    return false;
> +
> +  for (unsigned int i = 0; i < nunits; ++i)
> +    {
> +      if (i == const_idx)
> +	continue;
> +
> +      rtx x = CONST_VECTOR_ELT (op, i);
> +      if (!rtx_equal_p (x, CONST0_RTX (inner_mode)))
> +	return false;
> +    }
> +
> +  return true;
> +}
> +
> +/* Output a move of an FP constant vector in which the low register element is
> +   materialized using FMOV and all other elements are zero.  */
> +char *
> +aarch64_output_simd_mov_imm_low (rtx *operands)
> +{
> +  machine_mode mode = GET_MODE (operands[1]);
> +  scalar_mode inner_mode = GET_MODE_INNER (mode);
> +  unsigned int nunits = GET_MODE_NUNITS (mode).to_constant ();
> +  unsigned int const_idx = BYTES_BIG_ENDIAN ? nunits - 1 : 0;
> +  rtx elt = CONST_VECTOR_ELT (operands[1], const_idx);
> +  rtx xop[2];
> +
> +  xop[0] = lowpart_subreg (inner_mode, operands[0], mode);
> +  xop[1] = elt;
> +
> +  switch (inner_mode)
> +    {
> +      case E_HFmode:
> +	output_asm_insn ("fmov\t%h0, %1", xop);
> +	break;
> +
> +      case E_SFmode:
> +	output_asm_insn ("fmov\t%s0, %1", xop);
> +	break;
> +
> +      case E_DFmode:
> +	output_asm_insn ("fmov\t%d0, %1", xop);
> +	break;
> +
> +      default:
> +	gcc_unreachable ();
> +    }
> +  return "";
> +}

This breaks bootstrap for me with:

gcc/config/aarch64/aarch64.cc: In function ‘char* aarch64_output_simd_mov_imm_low(rtx_def**)’:
gcc/config/aarch64/aarch64.cc:24683:10: error: ISO C++ forbids converting a string constant to ‘char*’ [-Werror=write-strings]
24683 |   return "";
      |          ^~

I haven't looked into it much; however, if always returning an empty
string is intended, then the return type should be `const char *`
instead of just `char *`.

Cheers,
Stefan

> +
>  /* Return true if OP is a valid SIMD orr immediate for SVE or AdvSIMD.  */
>  bool
>  aarch64_simd_valid_orr_imm (rtx op)
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index 3d166fe3a17..8760220835b 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -503,6 +503,13 @@
>   (and (match_code "const_vector")
>        (match_test "aarch64_simd_valid_xor_imm (op)")))
>  
> +(define_constraint "Dc"
> + "@internal
> +  A constraint that matches an FP constant vector in which the low register
> +  element can be materialized using FMOV and all other elements are zero."
> + (and (match_code "const_vector")
> +      (match_test "aarch64_const_vec_fmov_p (op)")))
> +
>  (define_constraint "Dn"
>    "@internal
>   A constraint that matches vector of immediates."
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr113856.c b/gcc/testsuite/gcc.target/aarch64/pr113856.c
> new file mode 100644
> index 00000000000..f0facbcba09
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr113856.c
> @@ -0,0 +1,70 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-march=armv8-a+fp16" } */
> +
> +/* Check that FP vector constants with only the low element nonzero are
> +   materialized with scalar FMOV rather than a literal pool load.
> +
> +   PR target/113856.  */
> +
> +typedef float vect64_float __attribute__((vector_size(8)));
> +typedef float vect128_float __attribute__((vector_size(16)));
> +typedef _Float16 vect64_half __attribute__((vector_size(8)));
> +typedef _Float16 vect128_half __attribute__((vector_size(16)));
> +typedef double vect128_double __attribute__((vector_size(16)));
> +
> +vect64_float
> +f1 (void)
> +{
> +  return (vect64_float) { 1.0f, 0.0f };
> +}
> +
> +/* Existing duplicated-lane case.  */
> +vect64_float
> +f2 (void)
> +{
> +  return (vect64_float) { 1.0f, 1.0f };
> +}
> +
> +vect128_float
> +f3 (void)
> +{
> +  return (vect128_float) { 1.0f, 0.0f, 0.0f, 0.0f };
> +}
> +
> +vect64_half
> +f4 (void)
> +{
> +  return (vect64_half) { (_Float16) 1.0, (_Float16) 0.0,
> +                         (_Float16) 0.0, (_Float16) 0.0 };
> +}
> +
> +vect128_half
> +f5 (void)
> +{
> +  return (vect128_half) { (_Float16) 1.0, (_Float16) 0.0,
> +                          (_Float16) 0.0, (_Float16) 0.0,
> +                          (_Float16) 0.0, (_Float16) 0.0,
> +                          (_Float16) 0.0, (_Float16) 0.0 };
> +}
> +
> +vect128_double
> +f6 (void)
> +{
> +  return (vect128_double) { 1.0, 0.0 };
> +}
> +
> +/* f1 and f3: scalar FMOV into the low SF element.  */
> +/* { dg-final { scan-assembler-times {\tfmov\ts[0-9]+, 1\.0} 2 } } */
> +
> +/* f2: existing vector duplicated-FMOV case.  */
> +/* { dg-final { scan-assembler-times {\tfmov\tv[0-9]+\.2s, 1\.0} 1 } } */
> +
> +/* f4 and f5: scalar FMOV into the low HF element.  */
> +/* { dg-final { scan-assembler-times {\tfmov\th[0-9]+, 1\.0} 2 } } */
> +
> +/* f6: scalar FMOV into the low DF element.  */
> +/* { dg-final { scan-assembler-times {\tfmov\td[0-9]+, 1\.0} 1 } } */
> +
> +/* None of them should need a literal pool load.  */
> +/* { dg-final { scan-assembler-not {\tldr\tq[0-9]+,} } } */
> -- 
> 2.34.1
>
  
Naveen May 11, 2026, 8:47 a.m. UTC | #3
Hi Stefan,

The return type should be "const char *", as you rightly pointed out,
since the function always returns an empty string.
I will send out a patch fixing the return type.

Thanks,
Naveen

On Mon, May 11, 2026 at 1:14 PM Stefan Schulze Frielinghaus <
stefansf@linux.ibm.com> wrote:

> On Mon, May 04, 2026 at 08:58:37PM -0700, Naveen wrote:
> > Extend AdvSIMD constant materialization to recognize vectors where only
> > the low element is a representable floating-point constant and all other
> > elements are zero.
> > Bootstrapped and tested on aarch64-linux-gnu.
> >
> > PR target/113856
> >
> > gcc/ChangeLog:
> >
> >         * config/aarch64/aarch64-protos.h
> >         (aarch64_output_simd_mov_imm_low): New.
> >         (aarch64_const_vec_fmov_p): New.
> >       * config/aarch64/aarch64-simd.md (mov<mode>): Do not expand
> constant
> >       vectors handled by aarch64_const_vec_fmov_p into VDUP.
> >       (*aarch64_simd_mov<VDMOV:mode>): Add Dc alternatives for FMOV based
> >       SIMD constant moves.
> >       (*aarch64_simd_mov<VQMOV:mode>): Likewise.
> >       * config/aarch64/aarch64.cc (aarch64_const_vec_fmov_p): New
> function.
> >       (aarch64_output_simd_mov_imm_low): New function.
> >       * config/aarch64/constraints.md (Dc): New constraint.
> >
> > gcc/testsuite/ChangeLog:
> >       * gcc.target/aarch64/pr113856.c: New test.
> >
> > Signed-off-by: Naveen <naveen.siddegowda@oss.qualcomm.com>
> > ---
> >  gcc/config/aarch64/aarch64-protos.h         |  2 +
> >  gcc/config/aarch64/aarch64-simd.md          |  5 +-
> >  gcc/config/aarch64/aarch64.cc               | 79 +++++++++++++++++++++
> >  gcc/config/aarch64/constraints.md           |  7 ++
> >  gcc/testsuite/gcc.target/aarch64/pr113856.c | 70 ++++++++++++++++++
> >  5 files changed, 162 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr113856.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-protos.h
> b/gcc/config/aarch64/aarch64-protos.h
> > index 16b58f39a97..0798546809d 100644
> > --- a/gcc/config/aarch64/aarch64-protos.h
> > +++ b/gcc/config/aarch64/aarch64-protos.h
> > @@ -952,8 +952,10 @@ char *aarch64_output_simd_and_imm (rtx, unsigned);
> >  char *aarch64_output_simd_xor_imm (rtx, unsigned);
> >  char *aarch64_output_fmov (rtx);
> >
> > +char *aarch64_output_simd_mov_imm_low (rtx *);
> >  char *aarch64_output_sve_mov_immediate (rtx);
> >  char *aarch64_output_sve_ptrues (rtx);
> > +bool aarch64_const_vec_fmov_p (rtx);
> >  bool aarch64_pad_reg_upward (machine_mode, const_tree, bool);
> >  bool aarch64_regno_ok_for_base_p (int, bool);
> >  bool aarch64_regno_ok_for_index_p (int, bool);
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> > index c314e85927d..2e142b1e1ee 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -79,7 +79,8 @@
> >       }
> >        else if (!aarch64_simd_imm_zero (operands[1], <MODE>mode)
> >              && !aarch64_simd_special_constant_p (operands[1],
> <MODE>mode)
> > -            && !aarch64_simd_valid_mov_imm (operands[1]))
> > +            && !aarch64_simd_valid_mov_imm (operands[1])
> > +            && !aarch64_const_vec_fmov_p (operands[1]))
> >       {
> >         rtx x;
> >         /* Expand into VDUP.  */
> > @@ -183,6 +184,7 @@
> >       [?r, w ; neon_to_gp<q>      , *        , *] fmov\t%x0, %d1
> >       [?w, r ; f_mcr              , *        , *] fmov\t%d0, %1
> >       [?r, r ; mov_reg            , *        , *] mov\t%0, %1
> > +     [w , Dc; fmov               , *        , *] <<
> aarch64_output_simd_mov_imm_low (operands);
> >       [w , Dn; neon_move<q>       , simd     , *] <<
> aarch64_output_simd_mov_imm (operands[1], 64);
> >       [w , Dz; f_mcr              , *        , *] fmov\t%d0, xzr
> >       [w , Dx; neon_move          , simd     , 8] #
> > @@ -212,6 +214,7 @@
> >       [?r , w ; multiple           , *   , 8] #
> >       [?w , r ; multiple           , *   , 8] #
> >       [?r , r ; multiple           , *   , 8] #
> > +     [w  , Dc; fmov               , *   , 4] <<
> aarch64_output_simd_mov_imm_low (operands);
> >       [w  , Dn; neon_move<q>       , simd, 4] <<
> aarch64_output_simd_mov_imm (operands[1], 128);
> >       [w  , Dz; fmov               , *   , 4] fmov\t%d0, xzr
> >       [w  , Dx; neon_move          , simd, 8] #
> > diff --git a/gcc/config/aarch64/aarch64.cc
> b/gcc/config/aarch64/aarch64.cc
> > index 3816df92b18..3359ac9a67d 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -24604,6 +24604,85 @@ aarch64_simd_valid_mov_imm (rtx op)
> >    return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_MOV);
> >  }
> >
> > +
> > +/* Return true if OP is an FP constant vector in which the low register
> > +   element can be materialized using FMOV and all other elements are
> zero.  */
> > +bool
> > +aarch64_const_vec_fmov_p (rtx op)
> > +{
> > +  if (!CONST_VECTOR_P (op))
> > +    return false;
> > +
> > +  machine_mode mode = GET_MODE (op);
> > +  scalar_mode inner_mode = GET_MODE_INNER (mode);
> > +
> > +  if (inner_mode != E_HFmode
> > +      && inner_mode != E_SFmode
> > +      && inner_mode != E_DFmode)
> > +    return false;
> > +
> > +  if (inner_mode == E_HFmode && !TARGET_FP_F16INST)
> > +    return false;
> > +
> > +  unsigned int nunits = GET_MODE_NUNITS (mode).to_constant ();
> > +  unsigned int const_idx = BYTES_BIG_ENDIAN ? nunits - 1 : 0;
> > +
> > +  rtx elt = CONST_VECTOR_ELT (op, const_idx);
> > +  if (!CONST_DOUBLE_P (elt))
> > +    return false;
> > +
> > +  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
> > +  if (!aarch64_real_float_const_representable_p (r))
> > +    return false;
> > +
> > +  for (unsigned int i = 0; i < nunits; ++i)
> > +    {
> > +      if (i == const_idx)
> > +     continue;
> > +
> > +      rtx x = CONST_VECTOR_ELT (op, i);
> > +      if (!rtx_equal_p (x, CONST0_RTX (inner_mode)))
> > +     return false;
> > +    }
> > +
> > +  return true;
> > +}
> > +
> > +/* Output a move of an FP constant vector in which the low register
> element is
> > +   materialized using FMOV and all other elements are zero.  */
> > +char *
> > +aarch64_output_simd_mov_imm_low (rtx *operands)
> > +{
> > +  machine_mode mode = GET_MODE (operands[1]);
> > +  scalar_mode inner_mode = GET_MODE_INNER (mode);
> > +  unsigned int nunits = GET_MODE_NUNITS (mode).to_constant ();
> > +  unsigned int const_idx = BYTES_BIG_ENDIAN ? nunits - 1 : 0;
> > +  rtx elt = CONST_VECTOR_ELT (operands[1], const_idx);
> > +  rtx xop[2];
> > +
> > +  xop[0] = lowpart_subreg (inner_mode, operands[0], mode);
> > +  xop[1] = elt;
> > +
> > +  switch (inner_mode)
> > +    {
> > +      case E_HFmode:
> > +     output_asm_insn ("fmov\t%h0, %1", xop);
> > +     break;
> > +
> > +      case E_SFmode:
> > +     output_asm_insn ("fmov\t%s0, %1", xop);
> > +     break;
> > +
> > +      case E_DFmode:
> > +     output_asm_insn ("fmov\t%d0, %1", xop);
> > +     break;
> > +
> > +      default:
> > +     gcc_unreachable ();
> > +    }
> > +  return "";
> > +}
>
> This breaks bootstrap for me with:
>
> gcc/config/aarch64/aarch64.cc: In function ‘char*
> aarch64_output_simd_mov_imm_low(rtx_def**)’:
> gcc/config/aarch64/aarch64.cc:24683:10: error: ISO C++ forbids converting
> a string constant to ‘char*’ [-Werror=write-strings]
> 24683 |   return "";
>       |          ^~
>
> I haven't looked much into it, however, if it is desired returning
> always an empty string, then the return type should be `const char *`
> instead of just `char *`.
>
> Cheers,
> Stefan
>
> > +
> >  /* Return true if OP is a valid SIMD orr immediate for SVE or AdvSIMD.
> */
> >  bool
> >  aarch64_simd_valid_orr_imm (rtx op)
> > diff --git a/gcc/config/aarch64/constraints.md
> b/gcc/config/aarch64/constraints.md
> > index 3d166fe3a17..8760220835b 100644
> > --- a/gcc/config/aarch64/constraints.md
> > +++ b/gcc/config/aarch64/constraints.md
> > @@ -503,6 +503,13 @@
> >   (and (match_code "const_vector")
> >        (match_test "aarch64_simd_valid_xor_imm (op)")))
> >
> > +(define_constraint "Dc"
> > + "@internal
> > +  A constraint that matches an FP constant vector in which the low
> register
> > +  element can be materialized using FMOV and all other elements are
> zero."
> > + (and (match_code "const_vector")
> > +      (match_test "aarch64_const_vec_fmov_p (op)")))
> > +
> >  (define_constraint "Dn"
> >    "@internal
> >   A constraint that matches vector of immediates."
> > diff --git a/gcc/testsuite/gcc.target/aarch64/pr113856.c
> b/gcc/testsuite/gcc.target/aarch64/pr113856.c
> > new file mode 100644
> > index 00000000000..f0facbcba09
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/pr113856.c
> > @@ -0,0 +1,70 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +/* { dg-additional-options "-march=armv8-a+fp16" } */
> > +
> > +/* Check that FP vector constants with only the low element nonzero are
> > +   materialized with scalar FMOV rather than a literal pool load.
> > +
> > +   PR target/113856.  */
> > +
> > +typedef float vect64_float __attribute__((vector_size(8)));
> > +typedef float vect128_float __attribute__((vector_size(16)));
> > +typedef _Float16 vect64_half __attribute__((vector_size(8)));
> > +typedef _Float16 vect128_half __attribute__((vector_size(16)));
> > +typedef double vect128_double __attribute__((vector_size(16)));
> > +
> > +vect64_float
> > +f1 (void)
> > +{
> > +  return (vect64_float) { 1.0f, 0.0f };
> > +}
> > +
> > +/* Existing duplicated-lane case.  */
> > +vect64_float
> > +f2 (void)
> > +{
> > +  return (vect64_float) { 1.0f, 1.0f };
> > +}
> > +
> > +vect128_float
> > +f3 (void)
> > +{
> > +  return (vect128_float) { 1.0f, 0.0f, 0.0f, 0.0f };
> > +}
> > +
> > +vect64_half
> > +f4 (void)
> > +{
> > +  return (vect64_half) { (_Float16) 1.0, (_Float16) 0.0,
> > +                         (_Float16) 0.0, (_Float16) 0.0 };
> > +}
> > +
> > +vect128_half
> > +f5 (void)
> > +{
> > +  return (vect128_half) { (_Float16) 1.0, (_Float16) 0.0,
> > +                          (_Float16) 0.0, (_Float16) 0.0,
> > +                          (_Float16) 0.0, (_Float16) 0.0,
> > +                          (_Float16) 0.0, (_Float16) 0.0 };
> > +}
> > +
> > +vect128_double
> > +f6 (void)
> > +{
> > +  return (vect128_double) { 1.0, 0.0 };
> > +}
> > +
> > +/* f1 and f3: scalar FMOV into the low SF element.  */
> > +/* { dg-final { scan-assembler-times {\tfmov\ts[0-9]+, 1\.0} 2 } } */
> > +
> > +/* f2: existing vector duplicated-FMOV case.  */
> > +/* { dg-final { scan-assembler-times {\tfmov\tv[0-9]+\.2s, 1\.0} 1 } }
> */
> > +
> > +/* f4 and f5: scalar FMOV into the low HF element.  */
> > +/* { dg-final { scan-assembler-times {\tfmov\th[0-9]+, 1\.0} 2 } } */
> > +
> > +/* f6: scalar FMOV into the low DF element.  */
> > +/* { dg-final { scan-assembler-times {\tfmov\td[0-9]+, 1\.0} 1 } } */
> > +
> > +/* None of them should need a literal pool load.  */
> > +/* { dg-final { scan-assembler-not {\tldr\tq[0-9]+,} } } */
> > --
> > 2.34.1
> >
>
  

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 16b58f39a97..0798546809d 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -952,8 +952,10 @@  char *aarch64_output_simd_and_imm (rtx, unsigned);
 char *aarch64_output_simd_xor_imm (rtx, unsigned);
 char *aarch64_output_fmov (rtx);
 
+char *aarch64_output_simd_mov_imm_low (rtx *);
 char *aarch64_output_sve_mov_immediate (rtx);
 char *aarch64_output_sve_ptrues (rtx);
+bool aarch64_const_vec_fmov_p (rtx);
 bool aarch64_pad_reg_upward (machine_mode, const_tree, bool);
 bool aarch64_regno_ok_for_base_p (int, bool);
 bool aarch64_regno_ok_for_index_p (int, bool);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c314e85927d..2e142b1e1ee 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -79,7 +79,8 @@ 
 	}
       else if (!aarch64_simd_imm_zero (operands[1], <MODE>mode)
 	       && !aarch64_simd_special_constant_p (operands[1], <MODE>mode)
-	       && !aarch64_simd_valid_mov_imm (operands[1]))
+	       && !aarch64_simd_valid_mov_imm (operands[1])
+	       && !aarch64_const_vec_fmov_p (operands[1]))
 	{
 	  rtx x;
 	  /* Expand into VDUP.  */
@@ -183,6 +184,7 @@ 
      [?r, w ; neon_to_gp<q>      , *        , *] fmov\t%x0, %d1
      [?w, r ; f_mcr              , *        , *] fmov\t%d0, %1
      [?r, r ; mov_reg            , *        , *] mov\t%0, %1
+     [w , Dc; fmov               , *        , *] << aarch64_output_simd_mov_imm_low (operands);
      [w , Dn; neon_move<q>       , simd     , *] << aarch64_output_simd_mov_imm (operands[1], 64);
      [w , Dz; f_mcr              , *        , *] fmov\t%d0, xzr
      [w , Dx; neon_move          , simd     , 8] #
@@ -212,6 +214,7 @@ 
      [?r , w ; multiple           , *   , 8] #
      [?w , r ; multiple           , *   , 8] #
      [?r , r ; multiple           , *   , 8] #
+     [w  , Dc; fmov               , *   , 4] << aarch64_output_simd_mov_imm_low (operands);
      [w  , Dn; neon_move<q>       , simd, 4] << aarch64_output_simd_mov_imm (operands[1], 128);
      [w  , Dz; fmov               , *   , 4] fmov\t%d0, xzr
      [w  , Dx; neon_move          , simd, 8] #
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 3816df92b18..3359ac9a67d 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24604,6 +24604,85 @@  aarch64_simd_valid_mov_imm (rtx op)
   return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_MOV);
 }
 
+
+/* Predicate for FMOV-based AdvSIMD constant moves.  Return true if OP is a
+   CONST_VECTOR of HF (with TARGET_FP_F16INST only), SF or DF elements whose
+   low register element is an FMOV-representable CONST_DOUBLE and whose
+   remaining elements all equal CONST0_RTX of the element mode.  */
+bool
+aarch64_const_vec_fmov_p (rtx op)
+{
+  if (!CONST_VECTOR_P (op))
+    return false;
+
+  machine_mode vec_mode = GET_MODE (op);
+  scalar_mode elt_mode = GET_MODE_INNER (vec_mode);
+  switch (elt_mode)
+    {
+    case E_HFmode:
+      if (!TARGET_FP_F16INST)
+	return false;
+      break;
+    case E_SFmode:
+    case E_DFmode:
+      break;
+    default:
+      return false;
+    }
+
+  /* The low register element is the last vector element on big-endian.  */
+  unsigned int count = GET_MODE_NUNITS (vec_mode).to_constant ();
+  unsigned int low_idx = BYTES_BIG_ENDIAN ? count - 1 : 0;
+  rtx low = CONST_VECTOR_ELT (op, low_idx);
+  if (!CONST_DOUBLE_P (low))
+    return false;
+  REAL_VALUE_TYPE low_val = *CONST_DOUBLE_REAL_VALUE (low);
+  if (!aarch64_real_float_const_representable_p (low_val))
+    return false;
+
+  rtx zero = CONST0_RTX (elt_mode);
+  for (unsigned int i = 0; i < count; ++i)
+    if (i != low_idx && !rtx_equal_p (CONST_VECTOR_ELT (op, i), zero))
+      return false;
+
+  return true;
+}
+
+/* Output a move of an FP constant vector in which the low register element is
+   materialized using FMOV and all other elements are zero.  OPERANDS[0] is the
+   destination register and OPERANDS[1] the CONST_VECTOR.  The FMOV is emitted
+   here via output_asm_insn, so the returned template is always empty.  */
+char *
+aarch64_output_simd_mov_imm_low (rtx *operands)
+{
+  /* Return a writable empty template: returning "" from a function declared
+     as returning char * breaks bootstrap with -Werror=write-strings.  */
+  static char empty_templ[1];
+  machine_mode mode = GET_MODE (operands[1]);
+  scalar_mode inner_mode = GET_MODE_INNER (mode);
+  unsigned int nunits = GET_MODE_NUNITS (mode).to_constant ();
+  unsigned int const_idx = BYTES_BIG_ENDIAN ? nunits - 1 : 0;
+  /* The %h0/%s0/%d0 modifiers print the scalar register name explicitly, so
+     the vector-mode destination can be used without taking a lowpart.  */
+  rtx xop[2] = { operands[0], CONST_VECTOR_ELT (operands[1], const_idx) };
+
+  switch (inner_mode)
+    {
+      case E_HFmode:
+	output_asm_insn ("fmov\t%h0, %1", xop);
+	break;
+      case E_SFmode:
+	output_asm_insn ("fmov\t%s0, %1", xop);
+	break;
+      case E_DFmode:
+	output_asm_insn ("fmov\t%d0, %1", xop);
+	break;
+      default:
+	gcc_unreachable ();
+    }
+  return empty_templ;
+}
+
 /* Return true if OP is a valid SIMD orr immediate for SVE or AdvSIMD.  */
 bool
 aarch64_simd_valid_orr_imm (rtx op)
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 3d166fe3a17..8760220835b 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -503,6 +503,13 @@ 
  (and (match_code "const_vector")
       (match_test "aarch64_simd_valid_xor_imm (op)")))
 
+(define_constraint "Dc"
+ "@internal
+  A constraint that matches an FP constant vector in which the low register
+  element can be materialized using FMOV and all other elements are zero."
+ (and (match_code "const_vector")
+      (match_test "aarch64_const_vec_fmov_p (op)")))
+
 (define_constraint "Dn"
   "@internal
  A constraint that matches vector of immediates."
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113856.c b/gcc/testsuite/gcc.target/aarch64/pr113856.c
new file mode 100644
index 00000000000..f0facbcba09
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113856.c
@@ -0,0 +1,70 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-march=armv8-a+fp16" } */
+
+/* Check that FP vector constants with only the low element nonzero are
+   materialized with scalar FMOV rather than a literal pool load.
+
+   PR target/113856.  */
+
+typedef float vect64_float __attribute__((vector_size(8)));
+typedef float vect128_float __attribute__((vector_size(16)));
+typedef _Float16 vect64_half __attribute__((vector_size(8)));
+typedef _Float16 vect128_half __attribute__((vector_size(16)));
+typedef double vect128_double __attribute__((vector_size(16)));
+
+vect64_float
+f1 (void)
+{
+  return (vect64_float) { 1.0f, 0.0f };
+}
+
+/* Existing duplicated-lane case.  */
+vect64_float
+f2 (void)
+{
+  return (vect64_float) { 1.0f, 1.0f };
+}
+
+vect128_float
+f3 (void)
+{
+  return (vect128_float) { 1.0f, 0.0f, 0.0f, 0.0f };
+}
+
+vect64_half
+f4 (void)
+{
+  return (vect64_half) { (_Float16) 1.0, (_Float16) 0.0,
+                         (_Float16) 0.0, (_Float16) 0.0 };
+}
+
+vect128_half
+f5 (void)
+{
+  return (vect128_half) { (_Float16) 1.0, (_Float16) 0.0,
+                          (_Float16) 0.0, (_Float16) 0.0,
+                          (_Float16) 0.0, (_Float16) 0.0,
+                          (_Float16) 0.0, (_Float16) 0.0 };
+}
+
+vect128_double
+f6 (void)
+{
+  return (vect128_double) { 1.0, 0.0 };
+}
+
+/* f1 and f3: scalar FMOV into the low SF element.  */
+/* { dg-final { scan-assembler-times {\tfmov\ts[0-9]+, 1\.0} 2 } } */
+
+/* f2: existing vector duplicated-FMOV case.  */
+/* { dg-final { scan-assembler-times {\tfmov\tv[0-9]+\.2s, 1\.0} 1 } } */
+
+/* f4 and f5: scalar FMOV into the low HF element.  */
+/* { dg-final { scan-assembler-times {\tfmov\th[0-9]+, 1\.0} 2 } } */
+
+/* f6: scalar FMOV into the low DF element.  */
+/* { dg-final { scan-assembler-times {\tfmov\td[0-9]+, 1\.0} 1 } } */
+
+/* None of them should need a literal pool load (64-bit or 128-bit).  */
+/* { dg-final { scan-assembler-not {\tldr\t[dq][0-9]+,} } } */