ivcannon: support aggregate ranged on loop bounds

Message ID patch-20512-tamar@arm.com
State New
Headers
Series ivcannon: support aggregate ranged on loop bounds |

Commit Message

Tamar Christina May 6, 2026, 1:01 p.m. UTC
  The loop

void add(int N, int *__restrict a, int *__restrict b, int *__restrict c) {
  for (int i = 0; i < N; i++) {
    c[i] = a[i] + b[i];
    if (i > 1000) {
        break;
    }
  }
}

is an example of a loop that we have started vectorizing using early break
vectorization.  However the given loop is not actually an early break loop as
the condition on the exit introduces a MAX bound on N.

This patch teaches IVcannon to identify such cases and rewrite the latch
condition from i < N to i < MIN (N, C) where C is a constant.

It then tries to find an appropriate edge to remove from the loop iteration.

Concretely instead of

.L8:
        add     v31.4s, v30.4s, v27.4s
        add     v30.4s, v30.4s, v28.4s
        cmpeq   p15.s, p7/z, z31.s, #0
        b.any   .L41
        ldr     q31, [x9, x4]
        add     w5, w5, 4
        ldr     q29, [x8, x4]
        add     v31.4s, v31.4s, v29.4s
        str     q31, [x6, x4]
        add     x4, x4, 16
        cmp     w7, w5
        bne     .L8

we now generate:

.L4:
        ldr     q31, [x1, x0]
        ldr     q30, [x2, x0]
        add     v30.4s, v31.4s, v30.4s
        str     q30, [x3, x0]
        add     x0, x0, 16
        cmp     x4, x0
        bne     .L4

Bootstrapped Regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
-m32, -m64 and no issues.
Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR tree-optimization/113134
	* tree-ssa-loop-ivcanon.cc (redundant_with_constrained_latch_exit_p,
	(find_latch_constrained_niter): New.
	(try_unroll_loop_completely): Explicitly check exit.
	(canonicalize_loop_induction_variables): Use them.

gcc/testsuite/ChangeLog:

	PR tree-optimization/113134
	* gcc.dg/ivcannon-pr113134.c: New test.

---


--
  

Comments

Richard Biener May 8, 2026, 6:34 a.m. UTC | #1
On Wed, 6 May 2026, Tamar Christina wrote:

> The loop
> 
> void add(int N, int *__restrict a, int *__restrict b, int *__restrict c) {
>   for (int i = 0; i < N; i++) {
>     c[i] = a[i] + b[i];
>     if (i > 1000) {
>         break;
>     }
>   }
> }
> 
> is an example of a loop that we have started vectorizing using early break
> vectorization.  However the given loop is not actually an early break loop as
> the condition on the exit introduces a MAX bound on N.
> 
> This patch teaches IVcannon to identify such cases and rewrite the latch
> condition from i < N to i < MIN (N, C) where C is a constant.

The pattern-matching in IVCANON looks quite ugly.  loop header copying
(run after ifcombine...) exposes

  if (i_21 == 1001)
    goto <bb 5>; [1.00%]
  else
    goto <bb 4>; [99.00%]

  <bb 4> [local count: 1004539166]:
  i_18 = i_21 + 1;
  if (N_13(D) > i_18)
    goto <bb 3>; [94.50%]
  else
    goto <bb 5>; [5.50%]

  <bb 5> [local count: 69202658]:

which looks like a ifcombine opportunity to me.  I recently reviewed
a patch to do extra if-combining from tail merging.

That said, IVCANON doesn't look like the correct place to fuse
two exit tests?

> It then tries to find an appropriate edge to remove from the loop iteration.
> 
> Concretely instead of
> 
> .L8:
>         add     v31.4s, v30.4s, v27.4s
>         add     v30.4s, v30.4s, v28.4s
>         cmpeq   p15.s, p7/z, z31.s, #0
>         b.any   .L41
>         ldr     q31, [x9, x4]
>         add     w5, w5, 4
>         ldr     q29, [x8, x4]
>         add     v31.4s, v31.4s, v29.4s
>         str     q31, [x6, x4]
>         add     x4, x4, 16
>         cmp     w7, w5
>         bne     .L8
> 
> we now generate:
> 
> .L4:
>         ldr     q31, [x1, x0]
>         ldr     q30, [x2, x0]
>         add     v30.4s, v31.4s, v30.4s
>         str     q30, [x3, x0]
>         add     x0, x0, 16
>         cmp     x4, x0
>         bne     .L4
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> -m32, -m64 and no issues.
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	PR tree-optimization/113134
> 	* tree-ssa-loop-ivcanon.cc (redundant_with_constrained_latch_exit_p,
> 	(find_latch_constrained_niter): New.
> 	(try_unroll_loop_completely): Explicitly check exit.
> 	(canonicalize_loop_induction_variables): Use them.
> 
> gcc/testsuite/ChangeLog:
> 
> 	PR tree-optimization/113134
> 	* gcc.dg/ivcannon-pr113134.c: New test.
> 
> ---
> diff --git a/gcc/testsuite/gcc.dg/ivcannon-pr113134.c b/gcc/testsuite/gcc.dg/ivcannon-pr113134.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..49fd231cf436211ae3779c96def8a833446ca4fd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/ivcannon-pr113134.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -fdump-tree-ivcanon-details -std=c99" } */
> +
> +void add(int N, int *__restrict a, int *__restrict b, int *__restrict c) {
> +  for (int i = 0; i < N; i++) {
> +    c[i] = a[i] + b[i];
> +    if (i > 1000) {
> +        break;
> +    }
> +  }
> +}
> +
> +/* { dg-final { scan-tree-dump {exit edges updated and IV rewritten to} "ivcanon" } } */
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index fb2ef44a3d3c8504823b75747da1656ce2854bfd..beb3157db44b51bf656d8af143cc1706e322ea6b 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -129,6 +129,133 @@ create_canonical_iv (class loop *loop, edge exit, tree niter,
>    update_stmt (cond);
>  }
>  
> +/* Return true if exit E becomes redundant once LATCH_EXIT is rewritten with a
> +   constrained latch count.  This is intentionally narrow: the non-exit path
> +   must flow directly to LATCH_EXIT in the same iteration.  */
> +
> +static bool
> +redundant_with_constrained_latch_exit_p (class loop *loop, edge e,
> +					 edge latch_exit,
> +					 HOST_WIDE_INT maxiter)
> +{
> +  class tree_niter_desc desc;
> +  edge nonexit;
> +  gphi_iterator gpi;
> +
> +  /* Candidate exit must not be the latch exit and must execute at most
> +     once per iteration.  i.e. no other flow through the loop.  */
> +  if (e == latch_exit || !just_once_each_iteration_p (loop, e->src))
> +    return false;
> +
> +  /* The candidate exit must dominate the latch, otherwise it would be unsafe
> +     to elide the candidate edge and have the latch assume it's role.  */
> +  if (!dominated_by_p (CDI_DOMINATORS, latch_exit->src, e->src))
> +    return false;
> +
> +  nonexit = EDGE_SUCC (e->src, 0);
> +  if (nonexit == e)
> +    nonexit = EDGE_SUCC (e->src, 1);
> +
> +  /* The condidate edge must fall through to the latch.  This might be too
> +     strict, but it's safe for now.  The reason for this is to prevent a second
> +     branch like a diamond shape that can reach the latch from the edge.  */
> +  if (nonexit->dest != latch_exit->src)
> +    return false;
> +
> +  /* Since we want to use the latch for the candidate edge exit, they must have
> +     been going to the same destination.  */
> +  if (e->dest != latch_exit->dest)
> +    return false;
> +
> +  /* The candidate edge must have a known, constant niters and it must be
> +     smaller or equal to our maximum iteration count.  */
> +  if (!number_of_iterations_exit (loop, e, &desc, false)
> +      || !integer_zerop (desc.may_be_zero)
> +      || TREE_CODE (desc.niter) != INTEGER_CST
> +      || !wi::geu_p (wi::to_widest (desc.niter), maxiter))
> +    return false;
> +
> +  /* Check that the candidate exit and the latch exit compute the same value
> +     otherwise rewriting the candidate exit to go through the latch is not a
> +     valid thing to do.  */
> +  for (gpi = gsi_start_phis (e->dest); !gsi_end_p (gpi); gsi_next (&gpi))
> +    {
> +      gphi *phi = gpi.phi ();
> +      tree e_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
> +      tree latch_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_exit);
> +
> +      if (!operand_equal_p (e_arg, latch_arg, 0))
> +	return false;
> +    }
> +
> +  return true;
> +}
> +
> +/* Return a constrained latch bound for LATCH_EXIT, using only exits that
> +   dominate LATCH_EXIT and are executed once per iteration.  The original
> +   latch count is stored in *LATCH_NITER_OUT.  */
> +
> +static tree
> +find_latch_constrained_niter (class loop *loop, edge latch_exit,
> +			      tree *latch_niter_out)
> +{
> +  class tree_niter_desc desc;
> +
> +  *latch_niter_out = NULL_TREE;
> +  if (!number_of_iterations_exit (loop, latch_exit, &desc, false)
> +      || !integer_zerop (desc.may_be_zero))
> +    return chrec_dont_know;
> +
> +  tree niter = desc.niter;
> +  *latch_niter_out = niter;
> +
> +  for (auto e : get_loop_exit_edges (loop))
> +    {
> +      /* Ignore the latch edge, and the exit must dominate the latch.  If it
> +	 doesn't then the form is broken.  */
> +      if (e == latch_exit
> +	  || !just_once_each_iteration_p (loop, e->src)
> +	  || !dominated_by_p (CDI_DOMINATORS, latch_exit->src, e->src))
> +	continue;
> +
> +      /* The edge must have a non-zero iteration count.  */
> +      if (!number_of_iterations_exit (loop, e, &desc, false)
> +	  || !integer_zerop (desc.may_be_zero))
> +	continue;
> +
> +      tree aniter = desc.niter;
> +      tree ty1 = TREE_TYPE (niter);
> +      tree ty2 = TREE_TYPE (aniter);
> +      if (ty1 != ty2)
> +	{
> +	  if (TYPE_UNSIGNED (ty1) != TYPE_UNSIGNED (ty2))
> +	    return chrec_dont_know;
> +
> +	  if (CONSTANT_CLASS_P (niter)
> +	      && TYPE_PRECISION (ty1) <= TYPE_PRECISION (ty2))
> +	    niter = fold_convert (ty2, niter);
> +	  else if (CONSTANT_CLASS_P (aniter)
> +		   && TYPE_PRECISION (ty2) <= TYPE_PRECISION (ty1))
> +	    aniter = fold_convert (ty1, aniter);
> +	  else
> +	    return chrec_dont_know;
> +	}
> +
> +      if (TREE_CODE (aniter) != INTEGER_CST)
> +	continue;
> +
> +      if (TREE_CODE (niter) != INTEGER_CST)
> +	{
> +	  niter = fold_build2 (MIN_EXPR, TREE_TYPE (niter), niter, aniter);
> +	  continue;
> +	}
> +
> +      niter = tree_int_cst_lt (aniter, niter) ? aniter : niter;
> +    }
> +
> +  return niter;
> +}
> +
>  /* Describe size of loop as detected by tree_estimate_loop_size.  */
>  struct loop_size
>  {
> @@ -766,7 +893,7 @@ try_unroll_loop_completely (class loop *loop,
>       If the number of execution of loop is determined by standard induction
>       variable test, then EXIT and EDGE_TO_CANCEL are the two edges leaving
>       from the iv test.  */
> -  if (tree_fits_uhwi_p (niter))
> +  if (exit && tree_fits_uhwi_p (niter))
>      {
>        n_unroll = tree_to_uhwi (niter);
>        n_unroll_found = true;
> @@ -1354,7 +1481,77 @@ canonicalize_loop_induction_variables (class loop *loop,
>  				  innermost_cunrolli_p))
>      return true;
>  
> -  if ((create_iv || by_eval)
> +  edge latch_e = NULL;
> +  bool aggr_rewritten_p = false;
> +  if (create_iv && !single_exit (loop))
> +    {
> +      tree latch_niter = NULL_TREE;
> +      tree niter_aggr = chrec_dont_know;
> +
> +      latch_e = loop_latch_edge (loop);
> +      if (single_pred_p (latch_e->src))
> +	{
> +	  latch_e = single_pred_edge (latch_e->src);
> +	  auto bb_exit = latch_e->src;
> +	  if (EDGE_COUNT (bb_exit->succs) != 2)
> +	    latch_e = NULL;
> +	  else if (loop_exit_edge_p (loop, EDGE_SUCC (bb_exit, 0)))
> +	    latch_e = EDGE_SUCC (bb_exit, 0);
> +	  else if (loop_exit_edge_p (loop, EDGE_SUCC (bb_exit, 1)))
> +	    latch_e = EDGE_SUCC (bb_exit, 1);
> +	  else
> +	    latch_e = NULL;
> +	}
> +      else
> +	latch_e = NULL;
> +
> +      if (latch_e)
> +	{
> +	  niter_aggr = find_latch_constrained_niter (loop, latch_e,
> +						     &latch_niter);
> +	  if (!niter_aggr)
> +	    niter_aggr = chrec_dont_know;
> +	  if (chrec_contains_undetermined (latch_niter)
> +	      || chrec_contains_undetermined (niter_aggr)
> +	      || operand_equal_p (latch_niter, niter_aggr, 0))
> +	    latch_e = NULL;
> +	}
> +
> +      if (latch_e)
> +	{
> +	  create_canonical_iv (loop, latch_e, niter_aggr);
> +	  aggr_rewritten_p = true;
> +
> +	  for (auto e : get_loop_exit_edges (loop))
> +	    if (redundant_with_constrained_latch_exit_p (loop, e, latch_e,
> +							 maxiter))
> +	      {
> +		gcond *cond_stmt
> +		  = as_a <gcond *> (gsi_stmt (gsi_last_nondebug_bb
> +					      (e->src)));
> +		if (e->flags & EDGE_TRUE_VALUE)
> +		  gimple_cond_make_false (cond_stmt);
> +		else
> +		  gimple_cond_make_true (cond_stmt);
> +		update_stmt (cond_stmt);
> +	      }
> +
> +	  if (dump_file && (dump_flags & TDF_DETAILS))
> +	    {
> +	      fprintf (dump_file,
> +		       "Loop %d exit edges updated and IV rewritten to ",
> +		       loop->num);
> +	      print_generic_expr (dump_file, niter_aggr, TDF_SLIM);
> +	      fprintf (dump_file, ".\n");
> +	    }
> +	}
> +    }
> +
> +  /* Only accept the new MIN based bounds if we can actually simplify the loop,
> +     otherwise the MIN_EXPR just causes more instruction in loop pre-headers
> +     without any benefits.  */
> +  if (!aggr_rewritten_p
> +      && (create_iv || by_eval)
>        && niter && !chrec_contains_undetermined (niter)
>        && exit && just_once_each_iteration_p (loop, exit->src))
>      {
> @@ -1798,5 +1995,3 @@ make_pass_complete_unrolli (gcc::context *ctxt)
>  {
>    return new pass_complete_unrolli (ctxt);
>  }
> -
> -
> 
> 
>
  
Tamar Christina May 8, 2026, 6:57 a.m. UTC | #2
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: 08 May 2026 07:35
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> Subject: Re: [PATCH] ivcannon: support aggregate ranged on loop bounds
> 
> On Wed, 6 May 2026, Tamar Christina wrote:
> 
> > The loop
> >
> > void add(int N, int *__restrict a, int *__restrict b, int *__restrict c) {
> >   for (int i = 0; i < N; i++) {
> >     c[i] = a[i] + b[i];
> >     if (i > 1000) {
> >         break;
> >     }
> >   }
> > }
> >
> > is an example of a loop that we have started vectorizing using early break
> > vectorization.  However the given loop is not actually an early break loop as
> > the condition on the exit introduces a MAX bound on N.
> >
> > This patch teaches IVcannon to identify such cases and rewrite the latch
> > condition from i < N to i < MIN (N, C) where C is a constant.
> 
> The pattern-matching in IVCANON looks quite ugly.  loop header copying
> (run after ifcombine...) exposes
> 
>   if (i_21 == 1001)
>     goto <bb 5>; [1.00%]
>   else
>     goto <bb 4>; [99.00%]
> 
>   <bb 4> [local count: 1004539166]:
>   i_18 = i_21 + 1;
>   if (N_13(D) > i_18)
>     goto <bb 3>; [94.50%]
>   else
>     goto <bb 5>; [5.50%]
> 
>   <bb 5> [local count: 69202658]:
> 
> which looks like a ifcombine opportunity to me.  I recently reviewed
> a patch to do extra if-combining from tail merging.
> 
> That said, IVCANON doesn't look like the correct place to fuse
> two exit tests?

I only did so based on your own suggestion https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113134#c20

But fair enough.
Tamar

> 
> > It then tries to find an appropriate edge to remove from the loop iteration.
> >
> > Concretely instead of
> >
> > .L8:
> >         add     v31.4s, v30.4s, v27.4s
> >         add     v30.4s, v30.4s, v28.4s
> >         cmpeq   p15.s, p7/z, z31.s, #0
> >         b.any   .L41
> >         ldr     q31, [x9, x4]
> >         add     w5, w5, 4
> >         ldr     q29, [x8, x4]
> >         add     v31.4s, v31.4s, v29.4s
> >         str     q31, [x6, x4]
> >         add     x4, x4, 16
> >         cmp     w7, w5
> >         bne     .L8
> >
> > we now generate:
> >
> > .L4:
> >         ldr     q31, [x1, x0]
> >         ldr     q30, [x2, x0]
> >         add     v30.4s, v31.4s, v30.4s
> >         str     q30, [x3, x0]
> >         add     x0, x0, 16
> >         cmp     x4, x0
> >         bne     .L4
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> > -m32, -m64 and no issues.
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	PR tree-optimization/113134
> > 	* tree-ssa-loop-ivcanon.cc
> (redundant_with_constrained_latch_exit_p,
> > 	(find_latch_constrained_niter): New.
> > 	(try_unroll_loop_completely): Explicitly check exit.
> > 	(canonicalize_loop_induction_variables): Use them.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	PR tree-optimization/113134
> > 	* gcc.dg/ivcannon-pr113134.c: New test.
> >
> > ---
> > diff --git a/gcc/testsuite/gcc.dg/ivcannon-pr113134.c
> b/gcc/testsuite/gcc.dg/ivcannon-pr113134.c
> > new file mode 100644
> > index
> 0000000000000000000000000000000000000000..49fd231cf436211ae37
> 79c96def8a833446ca4fd
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/ivcannon-pr113134.c
> > @@ -0,0 +1,13 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O3 -fdump-tree-ivcanon-details -std=c99" } */
> > +
> > +void add(int N, int *__restrict a, int *__restrict b, int *__restrict c) {
> > +  for (int i = 0; i < N; i++) {
> > +    c[i] = a[i] + b[i];
> > +    if (i > 1000) {
> > +        break;
> > +    }
> > +  }
> > +}
> > +
> > +/* { dg-final { scan-tree-dump {exit edges updated and IV rewritten to}
> "ivcanon" } } */
> > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> > index
> fb2ef44a3d3c8504823b75747da1656ce2854bfd..beb3157db44b51bf656d
> 8af143cc1706e322ea6b 100644
> > --- a/gcc/tree-ssa-loop-ivcanon.cc
> > +++ b/gcc/tree-ssa-loop-ivcanon.cc
> > @@ -129,6 +129,133 @@ create_canonical_iv (class loop *loop, edge exit,
> tree niter,
> >    update_stmt (cond);
> >  }
> >
> > +/* Return true if exit E becomes redundant once LATCH_EXIT is rewritten
> with a
> > +   constrained latch count.  This is intentionally narrow: the non-exit path
> > +   must flow directly to LATCH_EXIT in the same iteration.  */
> > +
> > +static bool
> > +redundant_with_constrained_latch_exit_p (class loop *loop, edge e,
> > +					 edge latch_exit,
> > +					 HOST_WIDE_INT maxiter)
> > +{
> > +  class tree_niter_desc desc;
> > +  edge nonexit;
> > +  gphi_iterator gpi;
> > +
> > +  /* Candidate exit must not be the latch exit and must execute at most
> > +     once per iteration.  i.e. no other flow through the loop.  */
> > +  if (e == latch_exit || !just_once_each_iteration_p (loop, e->src))
> > +    return false;
> > +
> > +  /* The candidate exit must dominate the latch, otherwise it would be
> unsafe
> > +     to elide the candidate edge and have the latch assume it's role.  */
> > +  if (!dominated_by_p (CDI_DOMINATORS, latch_exit->src, e->src))
> > +    return false;
> > +
> > +  nonexit = EDGE_SUCC (e->src, 0);
> > +  if (nonexit == e)
> > +    nonexit = EDGE_SUCC (e->src, 1);
> > +
> > +  /* The condidate edge must fall through to the latch.  This might be too
> > +     strict, but it's safe for now.  The reason for this is to prevent a second
> > +     branch like a diamond shape that can reach the latch from the edge.  */
> > +  if (nonexit->dest != latch_exit->src)
> > +    return false;
> > +
> > +  /* Since we want to use the latch for the candidate edge exit, they must
> have
> > +     been going to the same destination.  */
> > +  if (e->dest != latch_exit->dest)
> > +    return false;
> > +
> > +  /* The candidate edge must have a known, constant niters and it must be
> > +     smaller or equal to our maximum iteration count.  */
> > +  if (!number_of_iterations_exit (loop, e, &desc, false)
> > +      || !integer_zerop (desc.may_be_zero)
> > +      || TREE_CODE (desc.niter) != INTEGER_CST
> > +      || !wi::geu_p (wi::to_widest (desc.niter), maxiter))
> > +    return false;
> > +
> > +  /* Check that the candidate exit and the latch exit compute the same value
> > +     otherwise rewriting the candidate exit to go through the latch is not a
> > +     valid thing to do.  */
> > +  for (gpi = gsi_start_phis (e->dest); !gsi_end_p (gpi); gsi_next (&gpi))
> > +    {
> > +      gphi *phi = gpi.phi ();
> > +      tree e_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
> > +      tree latch_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_exit);
> > +
> > +      if (!operand_equal_p (e_arg, latch_arg, 0))
> > +	return false;
> > +    }
> > +
> > +  return true;
> > +}
> > +
> > +/* Return a constrained latch bound for LATCH_EXIT, using only exits that
> > +   dominate LATCH_EXIT and are executed once per iteration.  The original
> > +   latch count is stored in *LATCH_NITER_OUT.  */
> > +
> > +static tree
> > +find_latch_constrained_niter (class loop *loop, edge latch_exit,
> > +			      tree *latch_niter_out)
> > +{
> > +  class tree_niter_desc desc;
> > +
> > +  *latch_niter_out = NULL_TREE;
> > +  if (!number_of_iterations_exit (loop, latch_exit, &desc, false)
> > +      || !integer_zerop (desc.may_be_zero))
> > +    return chrec_dont_know;
> > +
> > +  tree niter = desc.niter;
> > +  *latch_niter_out = niter;
> > +
> > +  for (auto e : get_loop_exit_edges (loop))
> > +    {
> > +      /* Ignore the latch edge, and the exit must dominate the latch.  If it
> > +	 doesn't then the form is broken.  */
> > +      if (e == latch_exit
> > +	  || !just_once_each_iteration_p (loop, e->src)
> > +	  || !dominated_by_p (CDI_DOMINATORS, latch_exit->src, e->src))
> > +	continue;
> > +
> > +      /* The edge must have a non-zero iteration count.  */
> > +      if (!number_of_iterations_exit (loop, e, &desc, false)
> > +	  || !integer_zerop (desc.may_be_zero))
> > +	continue;
> > +
> > +      tree aniter = desc.niter;
> > +      tree ty1 = TREE_TYPE (niter);
> > +      tree ty2 = TREE_TYPE (aniter);
> > +      if (ty1 != ty2)
> > +	{
> > +	  if (TYPE_UNSIGNED (ty1) != TYPE_UNSIGNED (ty2))
> > +	    return chrec_dont_know;
> > +
> > +	  if (CONSTANT_CLASS_P (niter)
> > +	      && TYPE_PRECISION (ty1) <= TYPE_PRECISION (ty2))
> > +	    niter = fold_convert (ty2, niter);
> > +	  else if (CONSTANT_CLASS_P (aniter)
> > +		   && TYPE_PRECISION (ty2) <= TYPE_PRECISION (ty1))
> > +	    aniter = fold_convert (ty1, aniter);
> > +	  else
> > +	    return chrec_dont_know;
> > +	}
> > +
> > +      if (TREE_CODE (aniter) != INTEGER_CST)
> > +	continue;
> > +
> > +      if (TREE_CODE (niter) != INTEGER_CST)
> > +	{
> > +	  niter = fold_build2 (MIN_EXPR, TREE_TYPE (niter), niter, aniter);
> > +	  continue;
> > +	}
> > +
> > +      niter = tree_int_cst_lt (aniter, niter) ? aniter : niter;
> > +    }
> > +
> > +  return niter;
> > +}
> > +
> >  /* Describe size of loop as detected by tree_estimate_loop_size.  */
> >  struct loop_size
> >  {
> > @@ -766,7 +893,7 @@ try_unroll_loop_completely (class loop *loop,
> >       If the number of execution of loop is determined by standard induction
> >       variable test, then EXIT and EDGE_TO_CANCEL are the two edges leaving
> >       from the iv test.  */
> > -  if (tree_fits_uhwi_p (niter))
> > +  if (exit && tree_fits_uhwi_p (niter))
> >      {
> >        n_unroll = tree_to_uhwi (niter);
> >        n_unroll_found = true;
> > @@ -1354,7 +1481,77 @@ canonicalize_loop_induction_variables (class
> loop *loop,
> >  				  innermost_cunrolli_p))
> >      return true;
> >
> > -  if ((create_iv || by_eval)
> > +  edge latch_e = NULL;
> > +  bool aggr_rewritten_p = false;
> > +  if (create_iv && !single_exit (loop))
> > +    {
> > +      tree latch_niter = NULL_TREE;
> > +      tree niter_aggr = chrec_dont_know;
> > +
> > +      latch_e = loop_latch_edge (loop);
> > +      if (single_pred_p (latch_e->src))
> > +	{
> > +	  latch_e = single_pred_edge (latch_e->src);
> > +	  auto bb_exit = latch_e->src;
> > +	  if (EDGE_COUNT (bb_exit->succs) != 2)
> > +	    latch_e = NULL;
> > +	  else if (loop_exit_edge_p (loop, EDGE_SUCC (bb_exit, 0)))
> > +	    latch_e = EDGE_SUCC (bb_exit, 0);
> > +	  else if (loop_exit_edge_p (loop, EDGE_SUCC (bb_exit, 1)))
> > +	    latch_e = EDGE_SUCC (bb_exit, 1);
> > +	  else
> > +	    latch_e = NULL;
> > +	}
> > +      else
> > +	latch_e = NULL;
> > +
> > +      if (latch_e)
> > +	{
> > +	  niter_aggr = find_latch_constrained_niter (loop, latch_e,
> > +						     &latch_niter);
> > +	  if (!niter_aggr)
> > +	    niter_aggr = chrec_dont_know;
> > +	  if (chrec_contains_undetermined (latch_niter)
> > +	      || chrec_contains_undetermined (niter_aggr)
> > +	      || operand_equal_p (latch_niter, niter_aggr, 0))
> > +	    latch_e = NULL;
> > +	}
> > +
> > +      if (latch_e)
> > +	{
> > +	  create_canonical_iv (loop, latch_e, niter_aggr);
> > +	  aggr_rewritten_p = true;
> > +
> > +	  for (auto e : get_loop_exit_edges (loop))
> > +	    if (redundant_with_constrained_latch_exit_p (loop, e, latch_e,
> > +							 maxiter))
> > +	      {
> > +		gcond *cond_stmt
> > +		  = as_a <gcond *> (gsi_stmt (gsi_last_nondebug_bb
> > +					      (e->src)));
> > +		if (e->flags & EDGE_TRUE_VALUE)
> > +		  gimple_cond_make_false (cond_stmt);
> > +		else
> > +		  gimple_cond_make_true (cond_stmt);
> > +		update_stmt (cond_stmt);
> > +	      }
> > +
> > +	  if (dump_file && (dump_flags & TDF_DETAILS))
> > +	    {
> > +	      fprintf (dump_file,
> > +		       "Loop %d exit edges updated and IV rewritten to ",
> > +		       loop->num);
> > +	      print_generic_expr (dump_file, niter_aggr, TDF_SLIM);
> > +	      fprintf (dump_file, ".\n");
> > +	    }
> > +	}
> > +    }
> > +
> > +  /* Only accept the new MIN based bounds if we can actually simplify the
> loop,
> > +     otherwise the MIN_EXPR just causes more instruction in loop pre-
> headers
> > +     without any benefits.  */
> > +  if (!aggr_rewritten_p
> > +      && (create_iv || by_eval)
> >        && niter && !chrec_contains_undetermined (niter)
> >        && exit && just_once_each_iteration_p (loop, exit->src))
> >      {
> > @@ -1798,5 +1995,3 @@ make_pass_complete_unrolli (gcc::context *ctxt)
> >  {
> >    return new pass_complete_unrolli (ctxt);
> >  }
> > -
> > -
> >
> >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Jochen Jaser, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
  
Richard Biener May 8, 2026, 7:14 a.m. UTC | #3
On Fri, 8 May 2026, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: 08 May 2026 07:35
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> > Subject: Re: [PATCH] ivcannon: support aggregate ranged on loop bounds
> > 
> > On Wed, 6 May 2026, Tamar Christina wrote:
> > 
> > > The loop
> > >
> > > void add(int N, int *__restrict a, int *__restrict b, int *__restrict c) {
> > >   for (int i = 0; i < N; i++) {
> > >     c[i] = a[i] + b[i];
> > >     if (i > 1000) {
> > >         break;
> > >     }
> > >   }
> > > }
> > >
> > > is an example of a loop that we have started vectorizing using early break
> > > vectorization.  However the given loop is not actually an early break loop as
> > > the condition on the exit introduces a MAX bound on N.
> > >
> > > This patch teaches IVcannon to identify such cases and rewrite the latch
> > > condition from i < N to i < MIN (N, C) where C is a constant.
> > 
> > The pattern-matching in IVCANON looks quite ugly.  loop header copying
> > (run after ifcombine...) exposes
> > 
> >   if (i_21 == 1001)
> >     goto <bb 5>; [1.00%]
> >   else
> >     goto <bb 4>; [99.00%]
> > 
> >   <bb 4> [local count: 1004539166]:
> >   i_18 = i_21 + 1;
> >   if (N_13(D) > i_18)
> >     goto <bb 3>; [94.50%]
> >   else
> >     goto <bb 5>; [5.50%]
> > 
> >   <bb 5> [local count: 69202658]:
> > 
> > which looks like a ifcombine opportunity to me.  I recently reviewed
> > a patch to do extra if-combining from tail merging.
> > 
> > That said, IVCANON doesn't look like the correct place to fuse
> > two exit tests?
> 
> I only did so based on your own suggestion 
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113134#c20
> 
> But fair enough.

I guess I thought it would be less ugly.

> Tamar
> 
> > 
> > > It then tries to find an appropriate edge to remove from the loop iteration.
> > >
> > > Concretely instead of
> > >
> > > .L8:
> > >         add     v31.4s, v30.4s, v27.4s
> > >         add     v30.4s, v30.4s, v28.4s
> > >         cmpeq   p15.s, p7/z, z31.s, #0
> > >         b.any   .L41
> > >         ldr     q31, [x9, x4]
> > >         add     w5, w5, 4
> > >         ldr     q29, [x8, x4]
> > >         add     v31.4s, v31.4s, v29.4s
> > >         str     q31, [x6, x4]
> > >         add     x4, x4, 16
> > >         cmp     w7, w5
> > >         bne     .L8
> > >
> > > we now generate:
> > >
> > > .L4:
> > >         ldr     q31, [x1, x0]
> > >         ldr     q30, [x2, x0]
> > >         add     v30.4s, v31.4s, v30.4s
> > >         str     q30, [x3, x0]
> > >         add     x0, x0, 16
> > >         cmp     x4, x0
> > >         bne     .L4
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> > > -m32, -m64 and no issues.
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	PR tree-optimization/113134
> > > 	* tree-ssa-loop-ivcanon.cc
> > (redundant_with_constrained_latch_exit_p,
> > > 	(find_latch_constrained_niter): New.
> > > 	(try_unroll_loop_completely): Explicitly check exit.
> > > 	(canonicalize_loop_induction_variables): Use them.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > 	PR tree-optimization/113134
> > > 	* gcc.dg/ivcannon-pr113134.c: New test.
> > >
> > > ---
> > > diff --git a/gcc/testsuite/gcc.dg/ivcannon-pr113134.c
> > b/gcc/testsuite/gcc.dg/ivcannon-pr113134.c
> > > new file mode 100644
> > > index
> > 0000000000000000000000000000000000000000..49fd231cf436211ae37
> > 79c96def8a833446ca4fd
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/ivcannon-pr113134.c
> > > @@ -0,0 +1,13 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-additional-options "-O3 -fdump-tree-ivcanon-details -std=c99" } */
> > > +
> > > +void add(int N, int *__restrict a, int *__restrict b, int *__restrict c) {
> > > +  for (int i = 0; i < N; i++) {
> > > +    c[i] = a[i] + b[i];
> > > +    if (i > 1000) {
> > > +        break;
> > > +    }
> > > +  }
> > > +}
> > > +
> > > +/* { dg-final { scan-tree-dump {exit edges updated and IV rewritten to}
> > "ivcanon" } } */
> > > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> > > index
> > fb2ef44a3d3c8504823b75747da1656ce2854bfd..beb3157db44b51bf656d
> > 8af143cc1706e322ea6b 100644
> > > --- a/gcc/tree-ssa-loop-ivcanon.cc
> > > +++ b/gcc/tree-ssa-loop-ivcanon.cc
> > > @@ -129,6 +129,133 @@ create_canonical_iv (class loop *loop, edge exit,
> > tree niter,
> > >    update_stmt (cond);
> > >  }
> > >
> > > +/* Return true if exit E becomes redundant once LATCH_EXIT is rewritten
> > with a
> > > +   constrained latch count.  This is intentionally narrow: the non-exit path
> > > +   must flow directly to LATCH_EXIT in the same iteration.  */
> > > +
> > > +static bool
> > > +redundant_with_constrained_latch_exit_p (class loop *loop, edge e,
> > > +					 edge latch_exit,
> > > +					 HOST_WIDE_INT maxiter)
> > > +{
> > > +  class tree_niter_desc desc;
> > > +  edge nonexit;
> > > +  gphi_iterator gpi;
> > > +
> > > +  /* Candidate exit must not be the latch exit and must execute at most
> > > +     once per iteration.  i.e. no other flow through the loop.  */
> > > +  if (e == latch_exit || !just_once_each_iteration_p (loop, e->src))
> > > +    return false;
> > > +
> > > +  /* The candidate exit must dominate the latch, otherwise it would be
> > unsafe
> > > +     to elide the candidate edge and have the latch assume it's role.  */
> > > +  if (!dominated_by_p (CDI_DOMINATORS, latch_exit->src, e->src))
> > > +    return false;
> > > +
> > > +  nonexit = EDGE_SUCC (e->src, 0);
> > > +  if (nonexit == e)
> > > +    nonexit = EDGE_SUCC (e->src, 1);
> > > +
> > > +  /* The condidate edge must fall through to the latch.  This might be too
> > > +     strict, but it's safe for now.  The reason for this is to prevent a second
> > > +     branch like a diamond shape that can reach the latch from the edge.  */
> > > +  if (nonexit->dest != latch_exit->src)
> > > +    return false;
> > > +
> > > +  /* Since we want to use the latch for the candidate edge exit, they must
> > have
> > > +     been going to the same destination.  */
> > > +  if (e->dest != latch_exit->dest)
> > > +    return false;
> > > +
> > > +  /* The candidate edge must have a known, constant niters and it must be
> > > +     smaller or equal to our maximum iteration count.  */
> > > +  if (!number_of_iterations_exit (loop, e, &desc, false)
> > > +      || !integer_zerop (desc.may_be_zero)
> > > +      || TREE_CODE (desc.niter) != INTEGER_CST
> > > +      || !wi::geu_p (wi::to_widest (desc.niter), maxiter))
> > > +    return false;
> > > +
> > > +  /* Check that the candidate exit and the latch exit compute the same value
> > > +     otherwise rewriting the candidate exit to go through the latch is not a
> > > +     valid thing to do.  */
> > > +  for (gpi = gsi_start_phis (e->dest); !gsi_end_p (gpi); gsi_next (&gpi))
> > > +    {
> > > +      gphi *phi = gpi.phi ();
> > > +      tree e_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
> > > +      tree latch_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_exit);
> > > +
> > > +      if (!operand_equal_p (e_arg, latch_arg, 0))
> > > +	return false;
> > > +    }
> > > +
> > > +  return true;
> > > +}
> > > +
> > > +/* Return a constrained latch bound for LATCH_EXIT, using only exits that
> > > +   dominate LATCH_EXIT and are executed once per iteration.  The original
> > > +   latch count is stored in *LATCH_NITER_OUT.  */
> > > +
> > > +static tree
> > > +find_latch_constrained_niter (class loop *loop, edge latch_exit,
> > > +			      tree *latch_niter_out)
> > > +{
> > > +  class tree_niter_desc desc;
> > > +
> > > +  *latch_niter_out = NULL_TREE;
> > > +  if (!number_of_iterations_exit (loop, latch_exit, &desc, false)
> > > +      || !integer_zerop (desc.may_be_zero))
> > > +    return chrec_dont_know;
> > > +
> > > +  tree niter = desc.niter;
> > > +  *latch_niter_out = niter;
> > > +
> > > +  for (auto e : get_loop_exit_edges (loop))
> > > +    {
> > > +      /* Ignore the latch edge, and the exit must dominate the latch.  If it
> > > +	 doesn't then the form is broken.  */
> > > +      if (e == latch_exit
> > > +	  || !just_once_each_iteration_p (loop, e->src)
> > > +	  || !dominated_by_p (CDI_DOMINATORS, latch_exit->src, e->src))
> > > +	continue;
> > > +
> > > +      /* The edge must have a non-zero iteration count.  */
> > > +      if (!number_of_iterations_exit (loop, e, &desc, false)
> > > +	  || !integer_zerop (desc.may_be_zero))
> > > +	continue;
> > > +
> > > +      tree aniter = desc.niter;
> > > +      tree ty1 = TREE_TYPE (niter);
> > > +      tree ty2 = TREE_TYPE (aniter);
> > > +      if (ty1 != ty2)
> > > +	{
> > > +	  if (TYPE_UNSIGNED (ty1) != TYPE_UNSIGNED (ty2))
> > > +	    return chrec_dont_know;
> > > +
> > > +	  if (CONSTANT_CLASS_P (niter)
> > > +	      && TYPE_PRECISION (ty1) <= TYPE_PRECISION (ty2))
> > > +	    niter = fold_convert (ty2, niter);
> > > +	  else if (CONSTANT_CLASS_P (aniter)
> > > +		   && TYPE_PRECISION (ty2) <= TYPE_PRECISION (ty1))
> > > +	    aniter = fold_convert (ty1, aniter);
> > > +	  else
> > > +	    return chrec_dont_know;
> > > +	}
> > > +
> > > +      if (TREE_CODE (aniter) != INTEGER_CST)
> > > +	continue;
> > > +
> > > +      if (TREE_CODE (niter) != INTEGER_CST)
> > > +	{
> > > +	  niter = fold_build2 (MIN_EXPR, TREE_TYPE (niter), niter, aniter);
> > > +	  continue;
> > > +	}
> > > +
> > > +      niter = tree_int_cst_lt (aniter, niter) ? aniter : niter;
> > > +    }
> > > +
> > > +  return niter;
> > > +}
> > > +
> > >  /* Describe size of loop as detected by tree_estimate_loop_size.  */
> > >  struct loop_size
> > >  {
> > > @@ -766,7 +893,7 @@ try_unroll_loop_completely (class loop *loop,
> > >       If the number of execution of loop is determined by standard induction
> > >       variable test, then EXIT and EDGE_TO_CANCEL are the two edges leaving
> > >       from the iv test.  */
> > > -  if (tree_fits_uhwi_p (niter))
> > > +  if (exit && tree_fits_uhwi_p (niter))
> > >      {
> > >        n_unroll = tree_to_uhwi (niter);
> > >        n_unroll_found = true;
> > > @@ -1354,7 +1481,77 @@ canonicalize_loop_induction_variables (class
> > loop *loop,
> > >  				  innermost_cunrolli_p))
> > >      return true;
> > >
> > > -  if ((create_iv || by_eval)
> > > +  edge latch_e = NULL;
> > > +  bool aggr_rewritten_p = false;
> > > +  if (create_iv && !single_exit (loop))
> > > +    {
> > > +      tree latch_niter = NULL_TREE;
> > > +      tree niter_aggr = chrec_dont_know;
> > > +
> > > +      latch_e = loop_latch_edge (loop);
> > > +      if (single_pred_p (latch_e->src))
> > > +	{
> > > +	  latch_e = single_pred_edge (latch_e->src);
> > > +	  auto bb_exit = latch_e->src;
> > > +	  if (EDGE_COUNT (bb_exit->succs) != 2)
> > > +	    latch_e = NULL;
> > > +	  else if (loop_exit_edge_p (loop, EDGE_SUCC (bb_exit, 0)))
> > > +	    latch_e = EDGE_SUCC (bb_exit, 0);
> > > +	  else if (loop_exit_edge_p (loop, EDGE_SUCC (bb_exit, 1)))
> > > +	    latch_e = EDGE_SUCC (bb_exit, 1);
> > > +	  else
> > > +	    latch_e = NULL;
> > > +	}
> > > +      else
> > > +	latch_e = NULL;
> > > +
> > > +      if (latch_e)
> > > +	{
> > > +	  niter_aggr = find_latch_constrained_niter (loop, latch_e,
> > > +						     &latch_niter);
> > > +	  if (!niter_aggr)
> > > +	    niter_aggr = chrec_dont_know;
> > > +	  if (chrec_contains_undetermined (latch_niter)
> > > +	      || chrec_contains_undetermined (niter_aggr)
> > > +	      || operand_equal_p (latch_niter, niter_aggr, 0))
> > > +	    latch_e = NULL;
> > > +	}
> > > +
> > > +      if (latch_e)
> > > +	{
> > > +	  create_canonical_iv (loop, latch_e, niter_aggr);
> > > +	  aggr_rewritten_p = true;
> > > +
> > > +	  for (auto e : get_loop_exit_edges (loop))
> > > +	    if (redundant_with_constrained_latch_exit_p (loop, e, latch_e,
> > > +							 maxiter))
> > > +	      {
> > > +		gcond *cond_stmt
> > > +		  = as_a <gcond *> (gsi_stmt (gsi_last_nondebug_bb
> > > +					      (e->src)));
> > > +		if (e->flags & EDGE_TRUE_VALUE)
> > > +		  gimple_cond_make_false (cond_stmt);
> > > +		else
> > > +		  gimple_cond_make_true (cond_stmt);
> > > +		update_stmt (cond_stmt);
> > > +	      }
> > > +
> > > +	  if (dump_file && (dump_flags & TDF_DETAILS))
> > > +	    {
> > > +	      fprintf (dump_file,
> > > +		       "Loop %d exit edges updated and IV rewritten to ",
> > > +		       loop->num);
> > > +	      print_generic_expr (dump_file, niter_aggr, TDF_SLIM);
> > > +	      fprintf (dump_file, ".\n");
> > > +	    }
> > > +	}
> > > +    }
> > > +
> > > +  /* Only accept the new MIN based bounds if we can actually simplify the
> > loop,
> > > +     otherwise the MIN_EXPR just causes more instruction in loop pre-
> > headers
> > > +     without any benefits.  */
> > > +  if (!aggr_rewritten_p
> > > +      && (create_iv || by_eval)
> > >        && niter && !chrec_contains_undetermined (niter)
> > >        && exit && just_once_each_iteration_p (loop, exit->src))
> > >      {
> > > @@ -1798,5 +1995,3 @@ make_pass_complete_unrolli (gcc::context *ctxt)
> > >  {
> > >    return new pass_complete_unrolli (ctxt);
> > >  }
> > > -
> > > -
> > >
> > >
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Jochen Jaser, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > Nuernberg)
>
  

Patch

diff --git a/gcc/testsuite/gcc.dg/ivcannon-pr113134.c b/gcc/testsuite/gcc.dg/ivcannon-pr113134.c
new file mode 100644
index 0000000000000000000000000000000000000000..49fd231cf436211ae3779c96def8a833446ca4fd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ivcannon-pr113134.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-ivcanon-details -std=c99" } */
+
+void add(int N, int *__restrict a, int *__restrict b, int *__restrict c) {
+  for (int i = 0; i < N; i++) {
+    c[i] = a[i] + b[i];
+    if (i > 1000) {
+        break;
+    }
+  }
+}
+
+/* { dg-final { scan-tree-dump {exit edges updated and IV rewritten to} "ivcanon" } } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index fb2ef44a3d3c8504823b75747da1656ce2854bfd..beb3157db44b51bf656d8af143cc1706e322ea6b 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -129,6 +129,133 @@  create_canonical_iv (class loop *loop, edge exit, tree niter,
   update_stmt (cond);
 }
 
+/* Return true if exit E becomes redundant once LATCH_EXIT is rewritten with a
+   constrained latch count.  This is intentionally narrow: the non-exit path
+   must flow directly to LATCH_EXIT in the same iteration.  */
+
+static bool
+redundant_with_constrained_latch_exit_p (class loop *loop, edge e,
+					 edge latch_exit,
+					 HOST_WIDE_INT maxiter)
+{
+  class tree_niter_desc desc;
+  edge nonexit;
+  gphi_iterator gpi;
+
+  /* Candidate exit must not be the latch exit and must execute at most
+     once per iteration.  i.e. no other flow through the loop.  */
+  if (e == latch_exit || !just_once_each_iteration_p (loop, e->src))
+    return false;
+
+  /* The candidate exit must dominate the latch, otherwise it would be unsafe
+     to elide the candidate edge and have the latch assume it's role.  */
+  if (!dominated_by_p (CDI_DOMINATORS, latch_exit->src, e->src))
+    return false;
+
+  nonexit = EDGE_SUCC (e->src, 0);
+  if (nonexit == e)
+    nonexit = EDGE_SUCC (e->src, 1);
+
+  /* The condidate edge must fall through to the latch.  This might be too
+     strict, but it's safe for now.  The reason for this is to prevent a second
+     branch like a diamond shape that can reach the latch from the edge.  */
+  if (nonexit->dest != latch_exit->src)
+    return false;
+
+  /* Since we want to use the latch for the candidate edge exit, they must have
+     been going to the same destination.  */
+  if (e->dest != latch_exit->dest)
+    return false;
+
+  /* The candidate edge must have a known, constant niters and it must be
+     smaller or equal to our maximum iteration count.  */
+  if (!number_of_iterations_exit (loop, e, &desc, false)
+      || !integer_zerop (desc.may_be_zero)
+      || TREE_CODE (desc.niter) != INTEGER_CST
+      || !wi::geu_p (wi::to_widest (desc.niter), maxiter))
+    return false;
+
+  /* Check that the candidate exit and the latch exit compute the same value
+     otherwise rewriting the candidate exit to go through the latch is not a
+     valid thing to do.  */
+  for (gpi = gsi_start_phis (e->dest); !gsi_end_p (gpi); gsi_next (&gpi))
+    {
+      gphi *phi = gpi.phi ();
+      tree e_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
+      tree latch_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_exit);
+
+      if (!operand_equal_p (e_arg, latch_arg, 0))
+	return false;
+    }
+
+  return true;
+}
+
+/* Return a constrained latch bound for LATCH_EXIT, using only exits that
+   dominate LATCH_EXIT and are executed once per iteration.  The original
+   latch count is stored in *LATCH_NITER_OUT.  */
+
+static tree
+find_latch_constrained_niter (class loop *loop, edge latch_exit,
+			      tree *latch_niter_out)
+{
+  class tree_niter_desc desc;
+
+  *latch_niter_out = NULL_TREE;
+  if (!number_of_iterations_exit (loop, latch_exit, &desc, false)
+      || !integer_zerop (desc.may_be_zero))
+    return chrec_dont_know;
+
+  tree niter = desc.niter;
+  *latch_niter_out = niter;
+
+  for (auto e : get_loop_exit_edges (loop))
+    {
+      /* Ignore the latch edge, and the exit must dominate the latch.  If it
+	 doesn't then the form is broken.  */
+      if (e == latch_exit
+	  || !just_once_each_iteration_p (loop, e->src)
+	  || !dominated_by_p (CDI_DOMINATORS, latch_exit->src, e->src))
+	continue;
+
+      /* The edge must have a non-zero iteration count.  */
+      if (!number_of_iterations_exit (loop, e, &desc, false)
+	  || !integer_zerop (desc.may_be_zero))
+	continue;
+
+      tree aniter = desc.niter;
+      tree ty1 = TREE_TYPE (niter);
+      tree ty2 = TREE_TYPE (aniter);
+      if (ty1 != ty2)
+	{
+	  if (TYPE_UNSIGNED (ty1) != TYPE_UNSIGNED (ty2))
+	    return chrec_dont_know;
+
+	  if (CONSTANT_CLASS_P (niter)
+	      && TYPE_PRECISION (ty1) <= TYPE_PRECISION (ty2))
+	    niter = fold_convert (ty2, niter);
+	  else if (CONSTANT_CLASS_P (aniter)
+		   && TYPE_PRECISION (ty2) <= TYPE_PRECISION (ty1))
+	    aniter = fold_convert (ty1, aniter);
+	  else
+	    return chrec_dont_know;
+	}
+
+      if (TREE_CODE (aniter) != INTEGER_CST)
+	continue;
+
+      if (TREE_CODE (niter) != INTEGER_CST)
+	{
+	  niter = fold_build2 (MIN_EXPR, TREE_TYPE (niter), niter, aniter);
+	  continue;
+	}
+
+      niter = tree_int_cst_lt (aniter, niter) ? aniter : niter;
+    }
+
+  return niter;
+}
+
 /* Describe size of loop as detected by tree_estimate_loop_size.  */
 struct loop_size
 {
@@ -766,7 +893,7 @@  try_unroll_loop_completely (class loop *loop,
      If the number of execution of loop is determined by standard induction
      variable test, then EXIT and EDGE_TO_CANCEL are the two edges leaving
      from the iv test.  */
-  if (tree_fits_uhwi_p (niter))
+  if (exit && tree_fits_uhwi_p (niter))
     {
       n_unroll = tree_to_uhwi (niter);
       n_unroll_found = true;
@@ -1354,7 +1481,77 @@  canonicalize_loop_induction_variables (class loop *loop,
 				  innermost_cunrolli_p))
     return true;
 
-  if ((create_iv || by_eval)
+  edge latch_e = NULL;
+  bool aggr_rewritten_p = false;
+  if (create_iv && !single_exit (loop))
+    {
+      tree latch_niter = NULL_TREE;
+      tree niter_aggr = chrec_dont_know;
+
+      latch_e = loop_latch_edge (loop);
+      if (single_pred_p (latch_e->src))
+	{
+	  latch_e = single_pred_edge (latch_e->src);
+	  auto bb_exit = latch_e->src;
+	  if (EDGE_COUNT (bb_exit->succs) != 2)
+	    latch_e = NULL;
+	  else if (loop_exit_edge_p (loop, EDGE_SUCC (bb_exit, 0)))
+	    latch_e = EDGE_SUCC (bb_exit, 0);
+	  else if (loop_exit_edge_p (loop, EDGE_SUCC (bb_exit, 1)))
+	    latch_e = EDGE_SUCC (bb_exit, 1);
+	  else
+	    latch_e = NULL;
+	}
+      else
+	latch_e = NULL;
+
+      if (latch_e)
+	{
+	  niter_aggr = find_latch_constrained_niter (loop, latch_e,
+						     &latch_niter);
+	  if (!niter_aggr)
+	    niter_aggr = chrec_dont_know;
+	  if (chrec_contains_undetermined (latch_niter)
+	      || chrec_contains_undetermined (niter_aggr)
+	      || operand_equal_p (latch_niter, niter_aggr, 0))
+	    latch_e = NULL;
+	}
+
+      if (latch_e)
+	{
+	  create_canonical_iv (loop, latch_e, niter_aggr);
+	  aggr_rewritten_p = true;
+
+	  for (auto e : get_loop_exit_edges (loop))
+	    if (redundant_with_constrained_latch_exit_p (loop, e, latch_e,
+							 maxiter))
+	      {
+		gcond *cond_stmt
+		  = as_a <gcond *> (gsi_stmt (gsi_last_nondebug_bb
+					      (e->src)));
+		if (e->flags & EDGE_TRUE_VALUE)
+		  gimple_cond_make_false (cond_stmt);
+		else
+		  gimple_cond_make_true (cond_stmt);
+		update_stmt (cond_stmt);
+	      }
+
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    {
+	      fprintf (dump_file,
+		       "Loop %d exit edges updated and IV rewritten to ",
+		       loop->num);
+	      print_generic_expr (dump_file, niter_aggr, TDF_SLIM);
+	      fprintf (dump_file, ".\n");
+	    }
+	}
+    }
+
+  /* Only accept the new MIN based bounds if we can actually simplify the loop,
+     otherwise the MIN_EXPR just causes more instruction in loop pre-headers
+     without any benefits.  */
+  if (!aggr_rewritten_p
+      && (create_iv || by_eval)
       && niter && !chrec_contains_undetermined (niter)
       && exit && just_once_each_iteration_p (loop, exit->src))
     {
@@ -1798,5 +1995,3 @@  make_pass_complete_unrolli (gcc::context *ctxt)
 {
   return new pass_complete_unrolli (ctxt);
 }
-
-