diff mbox series

aarch64: Use SVE's RDVL instruction

Message ID	mptzgcxqtwt.fsf@arm.com
State	Deferred
Headers	DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 9A18D38438D5 To: gcc-patches@gcc.gnu.org Mail-Followup-To: gcc-patches@gcc.gnu.org, richard.sandiford@arm.com Subject: [PATCH] aarch64: Use SVE's RDVL instruction Date: Fri, 11 Nov 2022 17:32:02 +0000 Message-ID: <mptzgcxqtwt.fsf@arm.com> User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/26.3 (gnu/linux) MIME-Version: 1.0 Content-Type: text/plain Precedence: list From: Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> Reply-To: Richard Sandiford <richard.sandiford@arm.com> Errors-To: gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org Sender: "Gcc-patches" <gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org>
Series	aarch64: Use SVE's RDVL instruction

Commit Message

Richard Sandiford Nov. 11, 2022, 5:32 p.m. UTC

We didn't previously use SVE's RDVL instruction, since the CNT*
forms are preferred and provide most of the range.  However,
there are some cases that RDVL can handle and CNT* can't,
and using RDVL-like instructions becomes important for SME.

Tested on aarch64-linux-gnu.  I plan to apply this soon if there
are no comments.

Thanks,
Richard


gcc/
	* config/aarch64/aarch64-protos.h (aarch64_sve_rdvl_immediate_p)
	(aarch64_output_sve_rdvl): Declare.
	* config/aarch64/aarch64.cc (aarch64_sve_cnt_factor_p): New
	function, split out from...
	(aarch64_sve_cnt_immediate_p): ...here.
	(aarch64_sve_rdvl_factor_p): New function.
	(aarch64_sve_rdvl_immediate_p): Likewise.
	(aarch64_output_sve_rdvl): Likewise.
	(aarch64_offset_temporaries): Rewrite the SVE handling to use RDVL
	for some cases.
	(aarch64_add_offset): Likewise.
	(aarch64_expand_mov_immediate): Handle RDVL immediates.
	(aarch64_mov_operand_p): Likewise.
	* config/aarch64/constraints.md (Usr): New constraint.
	* config/aarch64/aarch64.md (*mov<SHORT:mode>_aarch64): Add an RDVL
	alternative.
	(*movsi_aarch64, *movdi_aarch64): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/sve/acle/asm/cntb.c: Tweak expected output.
	* gcc.target/aarch64/sve/acle/asm/cnth.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/cntw.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/cntd.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/prfb.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/prfh.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/prfw.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/prfd.c: Likewise.
	* gcc.target/aarch64/sve/loop_add_4.c: Expect RDVL to be used
	to calculate the -17 and 17 factors.
	* gcc.target/aarch64/sve/pcs/stack_clash_1.c: Likewise the 18 factor.
---
 gcc/config/aarch64/aarch64-protos.h           |   2 +
 gcc/config/aarch64/aarch64.cc                 | 196 ++++++++++++------
 gcc/config/aarch64/aarch64.md                 |  58 +++---
 gcc/config/aarch64/constraints.md             |   6 +
 .../gcc.target/aarch64/sve/acle/asm/cntb.c    |  71 +++++--
 .../gcc.target/aarch64/sve/acle/asm/cntd.c    |  12 +-
 .../gcc.target/aarch64/sve/acle/asm/cnth.c    |  20 +-
 .../gcc.target/aarch64/sve/acle/asm/cntw.c    |  16 +-
 .../gcc.target/aarch64/sve/acle/asm/prfb.c    |   6 +-
 .../gcc.target/aarch64/sve/acle/asm/prfd.c    |   4 +-
 .../gcc.target/aarch64/sve/acle/asm/prfh.c    |   4 +-
 .../gcc.target/aarch64/sve/acle/asm/prfw.c    |   4 +-
 .../gcc.target/aarch64/sve/loop_add_4.c       |   6 +-
 .../aarch64/sve/pcs/stack_clash_1.c           |   3 +-
 14 files changed, 260 insertions(+), 148 deletions(-)

diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 3d81c223b01..866d68ad4d7 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -801,6 +801,7 @@  bool aarch64_sve_mode_p (machine_mode);
 HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int);
 bool aarch64_sve_cnt_immediate_p (rtx);
 bool aarch64_sve_scalar_inc_dec_immediate_p (rtx);
+bool aarch64_sve_rdvl_immediate_p (rtx);
 bool aarch64_sve_addvl_addpl_immediate_p (rtx);
 bool aarch64_sve_vector_inc_dec_immediate_p (rtx);
 int aarch64_add_offset_temporaries (rtx);
@@ -813,6 +814,7 @@  char *aarch64_output_sve_prefetch (const char *, rtx, const char *);
 char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
 char *aarch64_output_sve_cnt_pat_immediate (const char *, const char *, rtx *);
 char *aarch64_output_sve_scalar_inc_dec (rtx);
+char *aarch64_output_sve_rdvl (rtx);
 char *aarch64_output_sve_addvl_addpl (rtx);
 char *aarch64_output_sve_vector_inc_dec (const char *, rtx);
 char *aarch64_output_scalar_simd_mov_immediate (rtx, scalar_int_mode);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 41a2181a7d3..a40ac6fd903 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -5266,6 +5266,18 @@  aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
   return -1;
 }
 
+/* Return true if a single CNT[BHWD] instruction can multiply FACTOR
+   by the number of 128-bit quadwords in an SVE vector.  */
+
+static bool
+aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
+{
+  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
+  return (IN_RANGE (factor, 2, 16 * 16)
+	  && (factor & 1) == 0
+	  && factor <= 16 * (factor & -factor));
+}
+
 /* Return true if we can move VALUE into a register using a single
    CNT[BHWD] instruction.  */
 
@@ -5273,11 +5285,7 @@  static bool
 aarch64_sve_cnt_immediate_p (poly_int64 value)
 {
   HOST_WIDE_INT factor = value.coeffs[0];
-  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
-  return (value.coeffs[1] == factor
-	  && IN_RANGE (factor, 2, 16 * 16)
-	  && (factor & 1) == 0
-	  && factor <= 16 * (factor & -factor));
+  return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
 }
 
 /* Likewise for rtx X.  */
@@ -5393,6 +5401,50 @@  aarch64_output_sve_scalar_inc_dec (rtx offset)
 					     -offset_value.coeffs[1], 0);
 }
 
+/* Return true if a single RDVL instruction can multiply FACTOR by the
+   number of 128-bit quadwords in an SVE vector.  */
+
+static bool
+aarch64_sve_rdvl_factor_p (HOST_WIDE_INT factor)
+{
+  return (multiple_p (factor, 16)
+	  && IN_RANGE (factor, -32 * 16, 31 * 16));
+}
+
+/* Return true if we can move VALUE into a register using a single
+   RDVL instruction.  */
+
+static bool
+aarch64_sve_rdvl_immediate_p (poly_int64 value)
+{
+  HOST_WIDE_INT factor = value.coeffs[0];
+  return value.coeffs[1] == factor && aarch64_sve_rdvl_factor_p (factor);
+}
+
+/* Likewise for rtx X.  */
+
+bool
+aarch64_sve_rdvl_immediate_p (rtx x)
+{
+  poly_int64 value;
+  return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
+}
+
+/* Return the asm string for moving RDVL immediate OFFSET into register
+   operand 0.  */
+
+char *
+aarch64_output_sve_rdvl (rtx offset)
+{
+  static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
+  poly_int64 offset_value = rtx_to_poly_int64 (offset);
+  gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
+
+  int factor = offset_value.coeffs[1];
+  snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
+  return buffer;
+}
+
 /* Return true if we can add VALUE to a register using a single ADDVL
    or ADDPL instruction.  */
 
@@ -5991,13 +6043,13 @@  aarch64_offset_temporaries (bool add_p, poly_int64 offset)
     count += 1;
   else if (factor != 0)
     {
-      factor = abs (factor);
-      if (factor > 16 * (factor & -factor))
-	/* Need one register for the CNT result and one for the multiplication
-	   factor.  If necessary, the second temporary can be reused for the
-	   constant part of the offset.  */
+      factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
+      if (!IN_RANGE (factor, -32, 31))
+	/* Need one register for the CNT or RDVL result and one for the
+	   multiplication factor.  If necessary, the second temporary
+	   can be reused for the constant part of the offset.  */
 	return 2;
-      /* Need one register for the CNT result (which might then
+      /* Need one register for the CNT or RDVL result (which might then
 	 be shifted).  */
       count += 1;
     }
@@ -6086,85 +6138,105 @@  aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
   /* Otherwise use a CNT-based sequence.  */
   else if (factor != 0)
     {
-      /* Use a subtraction if we have a negative factor.  */
-      rtx_code code = PLUS;
-      if (factor < 0)
-	{
-	  factor = -factor;
-	  code = MINUS;
-	}
+      /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
+	 with negative shifts indicating a shift right.  */
+      HOST_WIDE_INT low_bit = least_bit_hwi (factor);
+      HOST_WIDE_INT rel_factor = factor / low_bit;
+      int shift = exact_log2 (low_bit) - 4;
+      gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
 
-      /* Calculate CNTD * FACTOR / 2.  First try to fold the division
-	 into the multiplication.  */
+      /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
+	 equal to CNTB * FACTOR / 16, with CODE being the [+-].
+
+	 We can avoid a multiplication if REL_FACTOR is in the range
+	 of RDVL, although there are then various optimizations that
+	 we can try on top.  */
+      rtx_code code = PLUS;
       rtx val;
-      int shift = 0;
-      if (factor & 1)
-	/* Use a right shift by 1.  */
-	shift = -1;
-      else
-	factor /= 2;
-      HOST_WIDE_INT low_bit = factor & -factor;
-      if (factor <= 16 * low_bit)
+      if (IN_RANGE (rel_factor, -32, 31))
 	{
-	  if (factor > 16 * 8)
+	  /* Try to use an unshifted CNT[BHWD].  */
+	  if (aarch64_sve_cnt_factor_p (factor))
 	    {
-	      /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
-		 the value with the minimum multiplier and shift it into
-		 position.  */
-	      int extra_shift = exact_log2 (low_bit);
-	      shift += extra_shift;
-	      factor >>= extra_shift;
+	      val = gen_int_mode (poly_int64 (factor, factor), mode);
+	      shift = 0;
 	    }
-	  val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
+	  /* Try to use an unshifted RDVL.  */
+	  else if (aarch64_sve_rdvl_factor_p (factor))
+	    {
+	      val = gen_int_mode (poly_int64 (factor, factor), mode);
+	      shift = 0;
+	    }
+	  /* Try to subtract an unshifted CNT[BHWD].  */
+	  else if (aarch64_sve_cnt_factor_p (-factor))
+	    {
+	      code = MINUS;
+	      val = gen_int_mode (poly_int64 (-factor, -factor), mode);
+	      shift = 0;
+	    }
+	  /* If subtraction is free, prefer to load a positive constant.
+	     In the best case this will fit a shifted CNTB.  */
+	  else if (src != const0_rtx && rel_factor < 0)
+	    {
+	      code = MINUS;
+	      val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
+	    }
+	  /* Otherwise use a shifted RDVL or CNT[BHWD].  */
+	  else
+	    val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
 	}
       else
 	{
-	  /* Base the factor on LOW_BIT if we can calculate LOW_BIT
-	     directly, since that should increase the chances of being
-	     able to use a shift and add sequence.  If LOW_BIT itself
-	     is out of range, just use CNTD.  */
-	  if (low_bit <= 16 * 8)
-	    factor /= low_bit;
+	  /* If we can calculate CNTB << SHIFT directly, prefer to do that,
+	     since it should increase the chances of being able to use
+	     a shift and add sequence for the multiplication.
+	     If CNTB << SHIFT is out of range, stick with the current
+	     shift factor.  */
+	  if (IN_RANGE (low_bit, 2, 16 * 16))
+	    {
+	      val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
+	      shift = 0;
+	    }
 	  else
-	    low_bit = 1;
+	    val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
 
-	  val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
 	  val = aarch64_force_temporary (mode, temp1, val);
 
+	  /* Prefer to multiply by a positive factor and subtract rather
+	     than multiply by a negative factor and add, since positive
+	     values are usually easier to move.  */
+	  if (rel_factor < 0 && src != const0_rtx)
+	    {
+	      rel_factor = -rel_factor;
+	      code = MINUS;
+	    }
+
 	  if (can_create_pseudo_p ())
 	    {
-	      rtx coeff1 = gen_int_mode (factor, mode);
+	      rtx coeff1 = gen_int_mode (rel_factor, mode);
 	      val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
 	    }
 	  else
 	    {
-	      /* Go back to using a negative multiplication factor if we have
-		 no register from which to subtract.  */
-	      if (code == MINUS && src == const0_rtx)
-		{
-		  factor = -factor;
-		  code = PLUS;
-		}
-	      rtx coeff1 = gen_int_mode (factor, mode);
+	      rtx coeff1 = gen_int_mode (rel_factor, mode);
 	      coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
 	      val = gen_rtx_MULT (mode, val, coeff1);
 	    }
 	}
 
+      /* Multiply by 2 ** SHIFT.  */
       if (shift > 0)
 	{
-	  /* Multiply by 1 << SHIFT.  */
 	  val = aarch64_force_temporary (mode, temp1, val);
 	  val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
 	}
-      else if (shift == -1)
+      else if (shift < 0)
 	{
-	  /* Divide by 2.  */
 	  val = aarch64_force_temporary (mode, temp1, val);
-	  val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
+	  val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
 	}
 
-      /* Calculate SRC +/- CNTD * FACTOR / 2.  */
+      /* Add the result to SRC or subtract the result from SRC.  */
       if (src != const0_rtx)
 	{
 	  val = aarch64_force_temporary (mode, temp1, val);
@@ -6809,7 +6881,9 @@  aarch64_expand_mov_immediate (rtx dest, rtx imm)
 	      aarch64_report_sve_required ();
 	      return;
 	    }
-	  if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
+	  if (base == const0_rtx
+	      && (aarch64_sve_cnt_immediate_p (offset)
+		  || aarch64_sve_rdvl_immediate_p (offset)))
 	    emit_insn (gen_rtx_SET (dest, imm));
 	  else
 	    {
@@ -21434,7 +21508,9 @@  aarch64_mov_operand_p (rtx x, machine_mode mode)
   if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
     return true;
 
-  if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
+  if (TARGET_SVE
+      && (aarch64_sve_cnt_immediate_p (x)
+	  || aarch64_sve_rdvl_immediate_p (x)))
     return true;
 
   return aarch64_classify_symbolic_expression (x)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c4581cb823f..ca2e618d9b9 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1188,8 +1188,8 @@  (define_expand "mov<mode>"
 )
 
 (define_insn "*mov<mode>_aarch64"
-  [(set (match_operand:SHORT 0 "nonimmediate_operand" "=r,r,    w,r  ,r,w, m,m,r,w,w")
-	(match_operand:SHORT 1 "aarch64_mov_operand"  " r,M,D<hq>,Usv,m,m,rZ,w,w,rZ,w"))]
+  [(set (match_operand:SHORT 0 "nonimmediate_operand" "=r,r,    w,r  ,r  ,r,w, m,m,r,w,w")
+	(match_operand:SHORT 1 "aarch64_mov_operand"  " r,M,D<hq>,Usv,Usr,m,m,rZ,w,w,rZ,w"))]
   "(register_operand (operands[0], <MODE>mode)
     || aarch64_reg_or_zero (operands[1], <MODE>mode))"
 {
@@ -1205,27 +1205,30 @@  (define_insn "*mov<mode>_aarch64"
      case 3:
        return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
      case 4:
-       return "ldr<size>\t%w0, %1";
+       return aarch64_output_sve_rdvl (operands[1]);
      case 5:
-       return "ldr\t%<size>0, %1";
+       return "ldr<size>\t%w0, %1";
      case 6:
-       return "str<size>\t%w1, %0";
+       return "ldr\t%<size>0, %1";
      case 7:
-       return "str\t%<size>1, %0";
+       return "str<size>\t%w1, %0";
      case 8:
-       return TARGET_SIMD ? "umov\t%w0, %1.<v>[0]" : "fmov\t%w0, %s1";
+       return "str\t%<size>1, %0";
      case 9:
-       return TARGET_SIMD ? "dup\t%0.<Vallxd>, %w1" : "fmov\t%s0, %w1";
+       return TARGET_SIMD ? "umov\t%w0, %1.<v>[0]" : "fmov\t%w0, %s1";
      case 10:
+       return TARGET_SIMD ? "dup\t%0.<Vallxd>, %w1" : "fmov\t%s0, %w1";
+     case 11:
        return TARGET_SIMD ? "dup\t%<Vetype>0, %1.<v>[0]" : "fmov\t%s0, %s1";
      default:
        gcc_unreachable ();
      }
 }
-  ;; The "mov_imm" type for CNT is just a placeholder.
-  [(set_attr "type" "mov_reg,mov_imm,neon_move,mov_imm,load_4,load_4,store_4,
-		     store_4,neon_to_gp<q>,neon_from_gp<q>,neon_dup")
-   (set_attr "arch" "*,*,simd,sve,*,*,*,*,*,*,*")]
+  ;; The "mov_imm" type for CNT and RDVL is just a placeholder.
+  [(set_attr "type" "mov_reg,mov_imm,neon_move,mov_imm,mov_imm,
+		     load_4,load_4,store_4,store_4,
+		     neon_to_gp<q>,neon_from_gp<q>,neon_dup")
+   (set_attr "arch" "*,*,simd,sve,sve,*,*,*,*,*,*,*")]
 )
 
 (define_expand "mov<mode>"
@@ -1262,8 +1265,8 @@  (define_expand "mov<mode>"
 )
 
 (define_insn_and_split "*movsi_aarch64"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,k,r,r,r,r, r,w, m, m,  r,  r,  r, w,r,w, w")
-	(match_operand:SI 1 "aarch64_mov_operand"  " r,r,k,M,n,Usv,m,m,rZ,w,Usw,Usa,Ush,rZ,w,w,Ds"))]
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,k,r,r,r,  r,  r,r,w, m,m,  r,  r,  r, w,r,w, w")
+	(match_operand:SI 1 "aarch64_mov_operand"  " r,r,k,M,n,Usv,Usr,m,m,rZ,w,Usw,Usa,Ush,rZ,w,w,Ds"))]
   "(register_operand (operands[0], SImode)
     || aarch64_reg_or_zero (operands[1], SImode))"
   "@
@@ -1273,6 +1276,7 @@  (define_insn_and_split "*movsi_aarch64"
    mov\\t%w0, %1
    #
    * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
+   * return aarch64_output_sve_rdvl (operands[1]);
    ldr\\t%w0, %1
    ldr\\t%s0, %1
    str\\t%w1, %0
@@ -1292,16 +1296,18 @@  (define_insn_and_split "*movsi_aarch64"
        DONE;
     }"
   ;; The "mov_imm" type for CNT is just a placeholder.
-  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,load_4,
-		    load_4,store_4,store_4,load_4,adr,adr,f_mcr,f_mrc,fmov,neon_move")
-   (set_attr "arch"   "*,*,*,*,*,sve,*,fp,*,fp,*,*,*,fp,fp,fp,simd")
-   (set_attr "length" "4,4,4,4,*,  4,4, 4,4, 4,8,4,4, 4, 4, 4,   4")
+  [(set_attr "type" "mov_reg,mov_reg,mov_reg,
+		     mov_imm,mov_imm,mov_imm,mov_imm,
+		     load_4,load_4,store_4,store_4,load_4,
+		     adr,adr,f_mcr,f_mrc,fmov,neon_move")
+   (set_attr "arch"   "*,*,*,*,*,sve,sve,*,fp,*,fp,*,*,*,fp,fp,fp,simd")
+   (set_attr "length" "4,4,4,4,*,  4,  4,4, 4,4, 4,8,4,4, 4, 4, 4,   4")
 ]
 )
 
 (define_insn_and_split "*movdi_aarch64"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m,   r,  r,  r, w,r,w, w")
-	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,m,m,rZ,w,Usw,Usa,Ush,rZ,w,w,Dd"))]
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,  r,  r,r,w, m,m,  r,  r,  r, w,r,w, w")
+	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,Usr,m,m,rZ,w,Usw,Usa,Ush,rZ,w,w,Dd"))]
   "(register_operand (operands[0], DImode)
     || aarch64_reg_or_zero (operands[1], DImode))"
   "@
@@ -1312,6 +1318,7 @@  (define_insn_and_split "*movdi_aarch64"
    mov\\t%w0, %1
    #
    * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
+   * return aarch64_output_sve_rdvl (operands[1]);
    ldr\\t%x0, %1
    ldr\\t%d0, %1
    str\\t%x1, %0
@@ -1331,11 +1338,12 @@  (define_insn_and_split "*movdi_aarch64"
        DONE;
     }"
   ;; The "mov_imm" type for CNTD is just a placeholder.
-  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
-		     load_8,load_8,store_8,store_8,load_8,adr,adr,f_mcr,f_mrc,
-		     fmov,neon_move")
-   (set_attr "arch"   "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,*,fp,fp,fp,simd")
-   (set_attr "length" "4,4,4,4,4,*,  4,4, 4,4, 4,8,4,4, 4, 4, 4,   4")]
+  [(set_attr "type" "mov_reg,mov_reg,mov_reg,
+		     mov_imm,mov_imm,mov_imm,mov_imm,mov_imm,
+		     load_8,load_8,store_8,store_8,load_8,
+		     adr,adr,f_mcr,f_mrc,fmov,neon_move")
+   (set_attr "arch"   "*,*,*,*,*,*,sve,sve,*,fp,*,fp,*,*,*,fp,fp,fp,simd")
+   (set_attr "length" "4,4,4,4,4,*,  4,  4,4, 4,4, 4,8,4,4, 4, 4, 4,   4")]
 )
 
 (define_insn "insv_imm<mode>"
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index ee7587cca16..3664e4dbdd6 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -209,6 +209,12 @@  (define_constraint "Ulc"
  (and (match_code "const_int")
       (match_test "aarch64_high_bits_all_ones_p (ival)")))
 
+(define_constraint "Usr"
+  "@internal
+   A constraint that matches a value produced by RDVL."
+ (and (match_code "const_poly_int")
+      (match_test "aarch64_sve_rdvl_immediate_p (op)")))
+
 (define_constraint "Usv"
   "@internal
    A constraint that matches a VG-based constant that can be loaded by
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
index 8b8fe8e4f2b..a22d8a28d86 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
@@ -51,19 +51,24 @@  PROTO (cntb_15, uint64_t, ()) { return svcntb () * 15; }
 */
 PROTO (cntb_16, uint64_t, ()) { return svcntb () * 16; }
 
-/* Other sequences would be OK.  */
 /*
 ** cntb_17:
-**	cntb	x0, all, mul #16
-**	incb	x0
+**	rdvl	x0, #17
 **	ret
 */
 PROTO (cntb_17, uint64_t, ()) { return svcntb () * 17; }
 
+/*
+** cntb_31:
+**	rdvl	x0, #31
+**	ret
+*/
+PROTO (cntb_31, uint64_t, ()) { return svcntb () * 31; }
+
 /*
 ** cntb_32:
-**	cntd	(x[0-9]+)
-**	lsl	x0, \1, 8
+**	cntb	(x[0-9]+)
+**	lsl	x0, \1, 5
 **	ret
 */
 PROTO (cntb_32, uint64_t, ()) { return svcntb () * 32; }
@@ -80,16 +85,16 @@  PROTO (cntb_33, uint64_t, ()) { return svcntb () * 33; }
 
 /*
 ** cntb_64:
-**	cntd	(x[0-9]+)
-**	lsl	x0, \1, 9
+**	cntb	(x[0-9]+)
+**	lsl	x0, \1, 6
 **	ret
 */
 PROTO (cntb_64, uint64_t, ()) { return svcntb () * 64; }
 
 /*
 ** cntb_128:
-**	cntd	(x[0-9]+)
-**	lsl	x0, \1, 10
+**	cntb	(x[0-9]+)
+**	lsl	x0, \1, 7
 **	ret
 */
 PROTO (cntb_128, uint64_t, ()) { return svcntb () * 128; }
@@ -106,46 +111,70 @@  PROTO (cntb_129, uint64_t, ()) { return svcntb () * 129; }
 
 /*
 ** cntb_m1:
-**	cntb	(x[0-9]+)
-**	neg	x0, \1
+**	rdvl	x0, #-1
 **	ret
 */
 PROTO (cntb_m1, uint64_t, ()) { return -svcntb (); }
 
 /*
 ** cntb_m13:
-**	cntb	(x[0-9]+), all, mul #13
-**	neg	x0, \1
+**	rdvl	x0, #-13
 **	ret
 */
 PROTO (cntb_m13, uint64_t, ()) { return -svcntb () * 13; }
 
 /*
 ** cntb_m15:
-**	cntb	(x[0-9]+), all, mul #15
-**	neg	x0, \1
+**	rdvl	x0, #-15
 **	ret
 */
 PROTO (cntb_m15, uint64_t, ()) { return -svcntb () * 15; }
 
 /*
 ** cntb_m16:
-**	cntb	(x[0-9]+), all, mul #16
-**	neg	x0, \1
+**	rdvl	x0, #-16
 **	ret
 */
 PROTO (cntb_m16, uint64_t, ()) { return -svcntb () * 16; }
 
-/* Other sequences would be OK.  */
 /*
 ** cntb_m17:
-**	cntb	x0, all, mul #16
-**	incb	x0
-**	neg	x0, x0
+**	rdvl	x0, #-17
 **	ret
 */
 PROTO (cntb_m17, uint64_t, ()) { return -svcntb () * 17; }
 
+/*
+** cntb_m32:
+**	rdvl	x0, #-32
+**	ret
+*/
+PROTO (cntb_m32, uint64_t, ()) { return -svcntb () * 32; }
+
+/*
+** cntb_m33:
+**	rdvl	x0, #-32
+**	decb	x0
+**	ret
+*/
+PROTO (cntb_m33, uint64_t, ()) { return -svcntb () * 33; }
+
+/*
+** cntb_m34:
+**	rdvl	(x[0-9]+), #-17
+**	lsl	x0, \1, #?1
+**	ret
+*/
+PROTO (cntb_m34, uint64_t, ()) { return -svcntb () * 34; }
+
+/*
+** cntb_m64:
+**	rdvl	(x[0-9]+), #-1
+**	lsl	x0, \1, #?6
+**	ret
+*/
+PROTO (cntb_m64, uint64_t, ()) { return -svcntb () * 64; }
+
 /*
 ** incb_1:
 **	incb	x0
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
index 0d0ed4849f1..090a643b418 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
@@ -54,8 +54,8 @@  PROTO (cntd_16, uint64_t, ()) { return svcntd () * 16; }
 /* Other sequences would be OK.  */
 /*
 ** cntd_17:
-**	cntb	x0, all, mul #2
-**	incd	x0
+**	rdvl	(x[0-9]+), #17
+**	asr	x0, \1, 3
 **	ret
 */
 PROTO (cntd_17, uint64_t, ()) { return svcntd () * 17; }
@@ -107,8 +107,7 @@  PROTO (cntd_m15, uint64_t, ()) { return -svcntd () * 15; }
 
 /*
 ** cntd_m16:
-**	cntb	(x[0-9]+), all, mul #2
-**	neg	x0, \1
+**	rdvl	x0, #-2
 **	ret
 */
 PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; }
@@ -116,9 +115,8 @@  PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; }
 /* Other sequences would be OK.  */
 /*
 ** cntd_m17:
-**	cntb	x0, all, mul #2
-**	incd	x0
-**	neg	x0, x0
+**	rdvl	(x[0-9]+), #-17
+**	asr	x0, \1, 3
 **	ret
 */
 PROTO (cntd_m17, uint64_t, ()) { return -svcntd () * 17; }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
index c29930f1591..1a4e7dc0e01 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
@@ -54,8 +54,8 @@  PROTO (cnth_16, uint64_t, ()) { return svcnth () * 16; }
 /* Other sequences would be OK.  */
 /*
 ** cnth_17:
-**	cntb	x0, all, mul #8
-**	inch	x0
+**	rdvl	(x[0-9]+), #17
+**	asr	x0, \1, 1
 **	ret
 */
 PROTO (cnth_17, uint64_t, ()) { return svcnth () * 17; }
@@ -69,16 +69,16 @@  PROTO (cnth_32, uint64_t, ()) { return svcnth () * 32; }
 
 /*
 ** cnth_64:
-**	cntd	(x[0-9]+)
-**	lsl	x0, \1, 8
+**	cntb	(x[0-9]+)
+**	lsl	x0, \1, 5
 **	ret
 */
 PROTO (cnth_64, uint64_t, ()) { return svcnth () * 64; }
 
 /*
 ** cnth_128:
-**	cntd	(x[0-9]+)
-**	lsl	x0, \1, 9
+**	cntb	(x[0-9]+)
+**	lsl	x0, \1, 6
 **	ret
 */
 PROTO (cnth_128, uint64_t, ()) { return svcnth () * 128; }
@@ -109,8 +109,7 @@  PROTO (cnth_m15, uint64_t, ()) { return -svcnth () * 15; }
 
 /*
 ** cnth_m16:
-**	cntb	(x[0-9]+), all, mul #8
-**	neg	x0, \1
+**	rdvl	x0, #-8
 **	ret
 */
 PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; }
@@ -118,9 +117,8 @@  PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; }
 /* Other sequences would be OK.  */
 /*
 ** cnth_m17:
-**	cntb	x0, all, mul #8
-**	inch	x0
-**	neg	x0, x0
+**	rdvl	(x[0-9]+), #-17
+**	asr	x0, \1, 1
 **	ret
 */
 PROTO (cnth_m17, uint64_t, ()) { return -svcnth () * 17; }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
index e26cc67a467..9d169769094 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
@@ -54,8 +54,8 @@  PROTO (cntw_16, uint64_t, ()) { return svcntw () * 16; }
 /* Other sequences would be OK.  */
 /*
 ** cntw_17:
-**	cntb	x0, all, mul #4
-**	incw	x0
+**	rdvl	(x[0-9]+), #17
+**	asr	x0, \1, 2
 **	ret
 */
 PROTO (cntw_17, uint64_t, ()) { return svcntw () * 17; }
@@ -76,8 +76,8 @@  PROTO (cntw_64, uint64_t, ()) { return svcntw () * 64; }
 
 /*
 ** cntw_128:
-**	cntd	(x[0-9]+)
-**	lsl	x0, \1, 8
+**	cntb	(x[0-9]+)
+**	lsl	x0, \1, 5
 **	ret
 */
 PROTO (cntw_128, uint64_t, ()) { return svcntw () * 128; }
@@ -108,8 +108,7 @@  PROTO (cntw_m15, uint64_t, ()) { return -svcntw () * 15; }
 
 /*
 ** cntw_m16:
-**	cntb	(x[0-9]+), all, mul #4
-**	neg	x0, \1
+**	rdvl	x0, #-4
 **	ret
 */
 PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; }
@@ -117,9 +116,8 @@  PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; }
 /* Other sequences would be OK.  */
 /*
 ** cntw_m17:
-**	cntb	x0, all, mul #4
-**	incw	x0
-**	neg	x0, x0
+**	rdvl	(x[0-9]+), #-17
+**	asr	x0, \1, 2
 **	ret
 */
 PROTO (cntw_m17, uint64_t, ()) { return -svcntw () * 17; }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
index c90730a037c..94cd3a0662e 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
@@ -218,8 +218,8 @@  TEST_PREFETCH (prfb_vnum_31, uint16_t,
 
 /*
 ** prfb_vnum_32:
-**	cntd	(x[0-9]+)
-**	lsl	(x[0-9]+), \1, #?8
+**	cntb	(x[0-9]+)
+**	lsl	(x[0-9]+), \1, #?5
 **	add	(x[0-9]+), (\2, x0|x0, \2)
 **	prfb	pldl1keep, p0, \[\3\]
 **	ret
@@ -240,7 +240,7 @@  TEST_PREFETCH (prfb_vnum_m32, uint16_t,
 /*
 ** prfb_vnum_m33:
 **	...
-**	prfb	pldl1keep, p0, \[x[0-9]+\]
+**	prfb	pldl1keep, p0, \[x[0-9]+(, x[0-9]+)?\]
 **	ret
 */
 TEST_PREFETCH (prfb_vnum_m33, uint16_t,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
index 869ef3d3eeb..b7a116cf056 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
@@ -218,8 +218,8 @@  TEST_PREFETCH (prfd_vnum_31, uint16_t,
 
 /*
 ** prfd_vnum_32:
-**	cntd	(x[0-9]+)
-**	lsl	(x[0-9]+), \1, #?8
+**	cntb	(x[0-9]+)
+**	lsl	(x[0-9]+), \1, #?5
 **	add	(x[0-9]+), (\2, x0|x0, \2)
 **	prfd	pldl1keep, p0, \[\3\]
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
index 45a735eaea0..9d3df6bd3a8 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
@@ -218,8 +218,8 @@  TEST_PREFETCH (prfh_vnum_31, uint16_t,
 
 /*
 ** prfh_vnum_32:
-**	cntd	(x[0-9]+)
-**	lsl	(x[0-9]+), \1, #?8
+**	cntb	(x[0-9]+)
+**	lsl	(x[0-9]+), \1, #?5
 **	add	(x[0-9]+), (\2, x0|x0, \2)
 **	prfh	pldl1keep, p0, \[\3\]
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
index 444187f45d9..6962abab600 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
@@ -218,8 +218,8 @@  TEST_PREFETCH (prfw_vnum_31, uint16_t,
 
 /*
 ** prfw_vnum_32:
-**	cntd	(x[0-9]+)
-**	lsl	(x[0-9]+), \1, #?8
+**	cntb	(x[0-9]+)
+**	lsl	(x[0-9]+), \1, #?5
 **	add	(x[0-9]+), (\2, x0|x0, \2)
 **	prfw	pldl1keep, p0, \[\3\]
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
index 9ead9c21b35..7f02497e839 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
@@ -68,8 +68,7 @@  TEST_ALL (LOOP)
 /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, w[0-9]+, w[0-9]+\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
 /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
-/* 2 for the calculations of -17 and 17.  */
-/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 10 } } */
+/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 8 } } */
 
 /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #16\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #15\n} 1 } } */
@@ -86,8 +85,7 @@  TEST_ALL (LOOP)
 /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, x[0-9]+, x[0-9]+\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
 /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
-/* 2 for the calculations of -17 and 17.  */
-/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 10 } } */
+/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 8 } } */
 
 /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #16\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #15\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
index 110947a6c4a..5de34fc6163 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
@@ -6,8 +6,7 @@ 
 
 /*
 ** test_1:
-**	cntd	x12, all, mul #9
-**	lsl	x12, x12, #?4
+**	rdvl	x12, #18
 **	mov	x11, sp
 **	...
 **	sub	sp, sp, x12