aarch64: Recognize vector permute patterns suitable for FMOV [PR100165]

Message ID 20241031174414.3514-1-quic_pzheng@quicinc.com
State New
Series aarch64: Recognize vector permute patterns suitable for FMOV [PR100165]

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 success Test passed

Commit Message

Pengxuan Zheng Oct. 31, 2024, 5:44 p.m. UTC
  This patch optimizes certain vector permute expansions to use the FMOV
instruction when one of the input vectors is a vector of all zeros and the
result of the permute is as if the upper lane of the non-zero input vector
were set to zero while the lower lane remains unchanged.
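
For example, on little-endian targets (this is the f_v2si case from the
new fmov.c test below):

  v2si
  f_v2si (v2si x)
  {
    return __builtin_shuffle (x, (v2si){ 0, 0 }, (v2si){ 0, 3 });
  }

was previously expanded with an INS of wzr into lane 1, but can now be a
single

  fmov	s0, s0

because writing an S register implicitly zeroes the upper bits of the
full vector register.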

Note that the patch also propagates zero_op0_p and zero_op1_p during
re-encoding now.  They will be used by aarch64_evpc_fmov to check whether
the input vectors are valid candidates.

	PR target/100165

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (aarch64_simd_vec_set_zero_fmov<mode>):
	New define_insn.
	* config/aarch64/aarch64.cc (aarch64_evpc_reencode): Copy zero_op0_p and
	zero_op1_p.
	(aarch64_evpc_fmov): New function.
	(aarch64_expand_vec_perm_const_1): Add call to aarch64_evpc_fmov.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vec-set-zero.c: Update test accordingly.
	* gcc.target/aarch64/fmov.c: New test.
	* gcc.target/aarch64/fmov-be.c: New test.

Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
---
 gcc/config/aarch64/aarch64-simd.md            |  14 +++
 gcc/config/aarch64/aarch64.cc                 |  74 +++++++++++-
 gcc/testsuite/gcc.target/aarch64/fmov-be.c    |  74 ++++++++++++
 gcc/testsuite/gcc.target/aarch64/fmov.c       | 110 ++++++++++++++++++
 .../gcc.target/aarch64/vec-set-zero.c         |   6 +-
 5 files changed, 275 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov.c
  

Comments

Richard Sandiford Oct. 31, 2024, 10:07 p.m. UTC | #1
Pengxuan Zheng <quic_pzheng@quicinc.com> writes:
> This patch optimizes certain vector permute expansions to use the FMOV
> instruction when one of the input vectors is a vector of all zeros and the
> result of the permute is as if the upper lane of the non-zero input vector
> were set to zero while the lower lane remains unchanged.
>
> Note that the patch also propagates zero_op0_p and zero_op1_p during
> re-encoding now.  They will be used by aarch64_evpc_fmov to check whether
> the input vectors are valid candidates.
>
> 	PR target/100165
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-simd.md (aarch64_simd_vec_set_zero_fmov<mode>):
> 	New define_insn.
> 	* config/aarch64/aarch64.cc (aarch64_evpc_reencode): Copy zero_op0_p and
> 	zero_op1_p.
> 	(aarch64_evpc_fmov): New function.
> 	(aarch64_expand_vec_perm_const_1): Add call to aarch64_evpc_fmov.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/vec-set-zero.c: Update test accordingly.
> 	* gcc.target/aarch64/fmov.c: New test.
> 	* gcc.target/aarch64/fmov-be.c: New test.

Nice!  Thanks for doing this.  Some comments on the patch below.
>
> Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64-simd.md            |  14 +++
>  gcc/config/aarch64/aarch64.cc                 |  74 +++++++++++-
>  gcc/testsuite/gcc.target/aarch64/fmov-be.c    |  74 ++++++++++++
>  gcc/testsuite/gcc.target/aarch64/fmov.c       | 110 ++++++++++++++++++
>  .../gcc.target/aarch64/vec-set-zero.c         |   6 +-
>  5 files changed, 275 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov.c
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index e456f693d2f..543126948e7 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1190,6 +1190,20 @@ (define_insn "aarch64_simd_vec_set<mode>"
>    [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
>  )
>  
> +(define_insn "aarch64_simd_vec_set_zero_fmov<mode>"
> +  [(set (match_operand:VP_2E 0 "register_operand" "=w")
> +	(vec_merge:VP_2E
> +	    (match_operand:VP_2E 1 "aarch64_simd_imm_zero" "Dz")
> +	    (match_operand:VP_2E 3 "register_operand" "w")
> +	    (match_operand:SI 2 "immediate_operand" "i")))]
> +  "TARGET_SIMD
> +   && (ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))) == 1)"
> +  {
> +    return "fmov\\t%<Vetype>0, %<Vetype>3";
> +  }
> +  [(set_attr "type" "fmov")]
> +)
> +

I think this shows that target-independent code is missing some
canonicalisation of vec_merge.  combine has:

  unsigned n_elts = 0;
  if (GET_CODE (x) == VEC_MERGE
      && CONST_INT_P (XEXP (x, 2))
      && GET_MODE_NUNITS (GET_MODE (x)).is_constant (&n_elts)
      && (swap_commutative_operands_p (XEXP (x, 0), XEXP (x, 1))
	  /* Two operands have same precedence, then
	     first bit of mask select first operand.  */
	  || (!swap_commutative_operands_p (XEXP (x, 1), XEXP (x, 0))
	      && !(UINTVAL (XEXP (x, 2)) & 1))))
    {
      rtx temp = XEXP (x, 0);
      unsigned HOST_WIDE_INT sel = UINTVAL (XEXP (x, 2));
      unsigned HOST_WIDE_INT mask = HOST_WIDE_INT_1U;
      if (n_elts == HOST_BITS_PER_WIDE_INT)
	mask = -1;
      else
	mask = (HOST_WIDE_INT_1U << n_elts) - 1;
      SUBST (XEXP (x, 0), XEXP (x, 1));
      SUBST (XEXP (x, 1), temp);
      SUBST (XEXP (x, 2), GEN_INT (~sel & mask));
    }

which AFAICT would prefer to put the immediate second, not first.  I think
we should be doing the same canonicalisation in simplify_ternary_operation,
and possibly elsewhere, so that the .md pattern only needs to match the
canonical form (i.e. register, immediate, mask).
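
For instance (my reading of the combine code above, using the V2SI variant
of the new pattern), that rule would rewrite

  (vec_merge (const_vector:V2SI [0 0]) (reg:V2SI x) (const_int 2))

into

  (vec_merge (reg:V2SI x) (const_vector:V2SI [0 0]) (const_int 1))

i.e. swap the operands and invert the selector, so that bit 0 of the mask
selects lane 0 from the register operand.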

On:

> +   && (ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))) == 1)"

it seems dangerous to pass exact_log2 to ENDIAN_LANE_N when we haven't
checked whether it is a power of 2.  (0b00 or 0b11 ought to get simplified,
but I don't think we can ignore the possibility.)
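
For reference, exact_log2 returns -1 when its argument is not a power of
2, so the insn condition would first need a guard along these lines (a
sketch only, untested):

   /* Reject masks such as 0b00 or 0b11, for which exact_log2
      returns -1.  */
   && exact_log2 (INTVAL (operands[2])) >= 0
   && ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))) == 1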

Rather than restrict the pattern to pairs, could we instead handle
VALL_F16 minus the QI elements, with the 16-bit elements restricted
to TARGET_F16?  E.g. we should be able to handle V4SI using an FMOV
of S registers if only the low element is nonzero.
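
E.g. on little-endian, something along these lines (an illustration, not
one of the tests below):

  v4si
  low_elt_v4si (v4si x)
  {
    /* Result is { x[0], 0, 0, 0 }: only the low element survives.  */
    return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 5, 6 });
  }

could become a single "fmov s0, s0".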

Part of me thinks that this should just be described as a plain old AND,
but I suppose that doesn't work well for FP modes.  Still, handling ANDs
might be an interesting follow-up :)
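
For integer elements the equivalence is direct, e.g. (illustration only):

  v2di
  and_form_v2di (v2di x)
  {
    /* Same result as __builtin_shuffle (x, (v2di){ 0, 0 },
       (v2di){ 0, 3 }): keep lane 0, zero lane 1.  */
    return x & (v2di){ -1, 0 };
  }

but float and double vectors have no & operator, hence the FP awkwardness.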

Thanks,
Richard


Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e456f693d2f..543126948e7 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1190,6 +1190,20 @@  (define_insn "aarch64_simd_vec_set<mode>"
   [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
 )
 
+(define_insn "aarch64_simd_vec_set_zero_fmov<mode>"
+  [(set (match_operand:VP_2E 0 "register_operand" "=w")
+	(vec_merge:VP_2E
+	    (match_operand:VP_2E 1 "aarch64_simd_imm_zero" "Dz")
+	    (match_operand:VP_2E 3 "register_operand" "w")
+	    (match_operand:SI 2 "immediate_operand" "i")))]
+  "TARGET_SIMD
+   && (ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))) == 1)"
+  {
+    return "fmov\\t%<Vetype>0, %<Vetype>3";
+  }
+  [(set_attr "type" "fmov")]
+)
+
 (define_insn "aarch64_simd_vec_set_zero<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
 	(vec_merge:VALL_F16
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a6cc00e74ab..64756920eda 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25950,6 +25950,8 @@  aarch64_evpc_reencode (struct expand_vec_perm_d *d)
   newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
   newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
   newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
+  newd.zero_op0_p = d->zero_op0_p;
+  newd.zero_op1_p = d->zero_op1_p;
   newd.testing_p = d->testing_p;
   newd.one_vector_p = d->one_vector_p;
 
@@ -26434,6 +26436,74 @@  aarch64_evpc_ins (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize patterns suitable for the FMOV instructions.  */
+static bool
+aarch64_evpc_fmov (struct expand_vec_perm_d *d)
+{
+  machine_mode mode = d->vmode;
+  unsigned HOST_WIDE_INT nelt;
+
+  if (d->vec_flags != VEC_ADVSIMD)
+    return false;
+
+  /* to_constant is safe since this routine is specific to Advanced SIMD
+     vectors.  */
+  nelt = d->perm.length ().to_constant ();
+
+  /* Either d->op0 or d->op1 should be a vector of all zeros.  */
+  if (nelt != 2 || d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p))
+    return false;
+
+  HOST_WIDE_INT elt0, elt1;
+  rtx in0 = d->op0;
+  rtx in1 = d->op1;
+
+  if (!d->perm[0].is_constant (&elt0))
+    return false;
+
+  if (!d->perm[1].is_constant (&elt1))
+    return false;
+
+  if (!BYTES_BIG_ENDIAN)
+    {
+      /* Lane 0 of the output vector should come from lane 0 of the non-zero
+	 vector.  */
+      if (elt0 != (d->zero_op0_p ? 2 : 0))
+	return false;
+
+      /* Lane 1 of the output vector should come from any lane of the zero
+	 vector.  */
+      if (elt1 != (d->zero_op0_p ? 0 : 2) && elt1 != (d->zero_op0_p ? 1 : 3))
+	return false;
+    }
+  else
+    {
+      /* Lane 0 of the output vector should come from any lane of the zero
+	 vector.  */
+      if (elt0 != (d->zero_op0_p ? 0 : 2) && elt0 != (d->zero_op0_p ? 1 : 3))
+	return false;
+
+      /* Lane 1 of the output vector should come from lane 1 of the non-zero
+	 vector.  */
+      if (elt1 != (d->zero_op0_p ? 3 : 1))
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
+  expand_operand ops[5];
+  create_output_operand (&ops[0], d->target, mode);
+  create_input_operand (&ops[1], d->zero_op0_p ? in1 : in0, mode);
+  create_integer_operand (&ops[2], BYTES_BIG_ENDIAN ? 1 : 2);
+  create_input_operand (&ops[3], d->zero_op0_p ? in0 : in1, mode);
+  create_integer_operand (&ops[4], 0);
+  expand_insn (icode, 5, ops);
+
+  return true;
+}
+
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
@@ -26457,7 +26527,9 @@  aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
     {
       if (d->vmode == d->op_mode)
 	{
-	  if (aarch64_evpc_rev_local (d))
+	  if (aarch64_evpc_fmov (d))
+	    return true;
+	  else if (aarch64_evpc_rev_local (d))
 	    return true;
 	  else if (aarch64_evpc_rev_global (d))
 	    return true;
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-be.c b/gcc/testsuite/gcc.target/aarch64/fmov-be.c
new file mode 100644
index 00000000000..f864d474a7a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-be.c
@@ -0,0 +1,74 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef long v2di __attribute__ ((vector_size (16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+
+/*
+** f_v2si:
+**	fmov	s0, s0
+**	ret
+*/
+v2si
+f_v2si (v2si x)
+{
+  return __builtin_shuffle (x, (v2si){ 0, 0 }, (v2si){ 3, 1 });
+}
+
+/*
+** g_v2si:
+**	fmov	s0, s0
+**	ret
+*/
+v2si
+g_v2si (v2si x)
+{
+  return __builtin_shuffle ((v2si){ 0, 0 }, x, (v2si){ 0, 3 });
+}
+
+/*
+** f_v2di:
+**	fmov	d0, d0
+**	ret
+*/
+v2di
+f_v2di (v2di x)
+{
+  return __builtin_shuffle (x, (v2di){ 0, 0 }, (v2di){ 2, 1 });
+}
+
+/*
+** g_v2di:
+**	fmov	d0, d0
+**	ret
+*/
+v2di
+g_v2di (v2di x)
+{
+  return __builtin_shuffle ((v2di){ 0, 0 }, x, (v2di){ 0, 3 });
+}
+
+/*
+** f_v4si:
+**	fmov	d0, d0
+**	ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 6, 7, 2, 3 });
+}
+
+/*
+** g_v4si:
+**	fmov	d0, d0
+**	ret
+*/
+v4si
+g_v4si (v4si x)
+{
+  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 2, 3, 6, 7 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov.c b/gcc/testsuite/gcc.target/aarch64/fmov.c
new file mode 100644
index 00000000000..f40ec56c5f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov.c
@@ -0,0 +1,110 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef float v2sf __attribute__ ((vector_size (8)));
+typedef long v2di __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+
+
+/*
+** f_v2si:
+**	fmov	s0, s0
+**	ret
+*/
+v2si
+f_v2si (v2si x)
+{
+  return __builtin_shuffle (x, (v2si){ 0, 0 }, (v2si){ 0, 3 });
+}
+
+/*
+** g_v2si:
+**	fmov	s0, s0
+**	ret
+*/
+v2si
+g_v2si (v2si x)
+{
+  return __builtin_shuffle ((v2si){ 0, 0 }, x, (v2si){ 2, 0 });
+}
+
+/*
+** f_v2sf:
+**	fmov	s0, s0
+**	ret
+*/
+v2sf
+f_v2sf (v2sf x)
+{
+  return __builtin_shuffle (x, (v2sf){ 0, 0 }, (v2si){ 0, 2 });
+}
+
+/*
+** f_v2di:
+**	fmov	d0, d0
+**	ret
+*/
+v2di
+f_v2di (v2di x)
+{
+  return __builtin_shuffle (x, (v2di){ 0, 0 }, (v2di){ 0, 3 });
+}
+
+/*
+** g_v2di:
+**	fmov	d0, d0
+**	ret
+*/
+v2di
+g_v2di (v2di x)
+{
+  return __builtin_shuffle ((v2di){ 0, 0 }, x, (v2di){ 2, 1 });
+}
+
+/*
+** f_v2df:
+**	fmov	d0, d0
+**	ret
+*/
+v2df
+f_v2df (v2df x)
+{
+  return __builtin_shuffle (x, (v2df){ 0, 0 }, (v2di){ 0, 2 });
+}
+
+/*
+** f_v4si:
+**	fmov	d0, d0
+**	ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 1, 4, 5 });
+}
+
+/*
+** g_v4si:
+**	fmov	d0, d0
+**	ret
+*/
+v4si
+g_v4si (v4si x)
+{
+  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 4, 5, 2, 3 });
+}
+
+/*
+** f_v4sf:
+**	fmov	d0, d0
+**	ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 1, 6, 7 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c b/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
index b34b902cf27..9040839931f 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
@@ -28,8 +28,10 @@  FOO(float64x2_t)
 
 /* { dg-final { scan-assembler-times {ins\tv[0-9]+\.b\[1\], wzr} 2 { target aarch64_little_endian } } } */
 /* { dg-final { scan-assembler-times {ins\tv[0-9]+\.h\[1\], wzr} 4 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.s\[1\], wzr} 4 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.d\[1\], xzr} 2 { target aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.s\[1\], wzr} 2 { target aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.d\[1\], xzr} 0 { target aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {fmov\ts0, s0} 2 { target aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {fmov\td0, d0} 2 { target aarch64_little_endian } } } */
 
 /* { dg-final { scan-assembler-times {ins\tv[0-9]+\.b\[6\], wzr} 1 { target aarch64_big_endian } } } */
 /* { dg-final { scan-assembler-times {ins\tv[0-9]+\.b\[14\], wzr} 1 { target aarch64_big_endian } } } */