Loop unswitching: support gswitch statements.

Message ID f1eb9f80-fdc5-8f57-327a-d35db67c1a6c@suse.cz
State New
Headers
Series Loop unswitching: support gswitch statements. |

Commit Message

Martin Liška Sept. 15, 2021, 8:46 a.m. UTC
  Hello.

The patch extends the loop unswitching pass so that gswitch
statements are supported. The pass now uses ranger which marks
switch edges that are known to be unreachable in a versioned loop.

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Thanks,
Martin

gcc/ChangeLog:

	* tree-cfg.c (gimple_lv_add_condition_to_bb): Support non-gimple
	expressions that needs to be gimplified.
	* tree-ssa-loop-unswitch.c (tree_unswitch_loop): Add new
	cond_edge parameter.
	(tree_may_unswitch_on): Support gswitch statements.
	(clean_up_switches): New function.
	(tree_ssa_unswitch_loops): Call clean_up_switches.
	(simplify_using_entry_checks): Removed and replaced with ranger.
	(tree_unswitch_single_loop): Change assumptions.

gcc/testsuite/ChangeLog:

	* gcc.dg/loop-unswitch-6.c: New test.
	* gcc.dg/loop-unswitch-7.c: New test.
	* gcc.dg/loop-unswitch-8.c: New test.
	* gcc.dg/loop-unswitch-9.c: New test.

Co-Authored-By: Richard Biener <rguenther@suse.de>
---
  gcc/testsuite/gcc.dg/loop-unswitch-6.c |  56 +++++
  gcc/testsuite/gcc.dg/loop-unswitch-7.c |  45 ++++
  gcc/testsuite/gcc.dg/loop-unswitch-8.c |  28 +++
  gcc/testsuite/gcc.dg/loop-unswitch-9.c |  34 +++
  gcc/tree-cfg.c                         |   7 +-
  gcc/tree-ssa-loop-unswitch.c           | 284 ++++++++++++++++++-------
  6 files changed, 374 insertions(+), 80 deletions(-)
  create mode 100644 gcc/testsuite/gcc.dg/loop-unswitch-6.c
  create mode 100644 gcc/testsuite/gcc.dg/loop-unswitch-7.c
  create mode 100644 gcc/testsuite/gcc.dg/loop-unswitch-8.c
  create mode 100644 gcc/testsuite/gcc.dg/loop-unswitch-9.c
  

Comments

Jeff Law Sept. 19, 2021, 4:50 p.m. UTC | #1
On 9/15/2021 2:46 AM, Martin Liška wrote:
> Hello.
>
> The patch extends the loop unswitching pass so that gswitch
> statements are supported. The pass now uses ranger which marks
> switch edges that are known to be unreachable in a versioned loop.
>
> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>
> Ready to be installed?
> Thanks,
> Martin
>
> gcc/ChangeLog:
>
>     * tree-cfg.c (gimple_lv_add_condition_to_bb): Support non-gimple
>     expressions that needs to be gimplified.
>     * tree-ssa-loop-unswitch.c (tree_unswitch_loop): Add new
>     cond_edge parameter.
>     (tree_may_unswitch_on): Support gswitch statements.
>     (clean_up_switches): New function.
>     (tree_ssa_unswitch_loops): Call clean_up_switches.
>     (simplify_using_entry_checks): Removed and replaced with ranger.
>     (tree_unswitch_single_loop): Change assumptions.
>
> gcc/testsuite/ChangeLog:
>
>     * gcc.dg/loop-unswitch-6.c: New test.
>     * gcc.dg/loop-unswitch-7.c: New test.
>     * gcc.dg/loop-unswitch-8.c: New test.
>     * gcc.dg/loop-unswitch-9.c: New test.
LGTM.  OK.
Jeff
  
Richard Biener Sept. 28, 2021, 11:50 a.m. UTC | #2
On Wed, Sep 15, 2021 at 10:46 AM Martin Liška <mliska@suse.cz> wrote:
>
> Hello.
>
> The patch extends the loop unswitching pass so that gswitch
> statements are supported. The pass now uses ranger which marks
> switch edges that are known to be unreachable in a versioned loop.
>
> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>
> Ready to be installed?
> Thanks,
> Martin
>
> gcc/ChangeLog:
>
>         * tree-cfg.c (gimple_lv_add_condition_to_bb): Support non-gimple
>         expressions that needs to be gimplified.
>         * tree-ssa-loop-unswitch.c (tree_unswitch_loop): Add new
>         cond_edge parameter.
>         (tree_may_unswitch_on): Support gswitch statements.
>         (clean_up_switches): New function.
>         (tree_ssa_unswitch_loops): Call clean_up_switches.
>         (simplify_using_entry_checks): Removed and replaced with ranger.
>         (tree_unswitch_single_loop): Change assumptions.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/loop-unswitch-6.c: New test.
>         * gcc.dg/loop-unswitch-7.c: New test.
>         * gcc.dg/loop-unswitch-8.c: New test.
>         * gcc.dg/loop-unswitch-9.c: New test.
>
> Co-Authored-By: Richard Biener <rguenther@suse.de>
> ---
>   gcc/testsuite/gcc.dg/loop-unswitch-6.c |  56 +++++
>   gcc/testsuite/gcc.dg/loop-unswitch-7.c |  45 ++++
>   gcc/testsuite/gcc.dg/loop-unswitch-8.c |  28 +++
>   gcc/testsuite/gcc.dg/loop-unswitch-9.c |  34 +++
>   gcc/tree-cfg.c                         |   7 +-
>   gcc/tree-ssa-loop-unswitch.c           | 284 ++++++++++++++++++-------
>   6 files changed, 374 insertions(+), 80 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.dg/loop-unswitch-6.c
>   create mode 100644 gcc/testsuite/gcc.dg/loop-unswitch-7.c
>   create mode 100644 gcc/testsuite/gcc.dg/loop-unswitch-8.c
>   create mode 100644 gcc/testsuite/gcc.dg/loop-unswitch-9.c
>
> diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-6.c b/gcc/testsuite/gcc.dg/loop-unswitch-6.c
> new file mode 100644
> index 00000000000..8a022e0f200
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-unswitch-6.c
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details --param=max-unswitch-insns=1000 --param=max-unswitch-level=10" } */
> +
> +int
> +__attribute__((noipa))
> +foo(double *a, double *b, double *c, double *d, double *r, int size, int order)
> +{
> +  for (int i = 0; i < size; i++)
> +  {
> +    double tmp, tmp2;
> +
> +    switch(order)
> +    {
> +      case 0:
> +        tmp = -8 * a[i];
> +        tmp2 = 2 * b[i];
> +        break;
> +      case 1:
> +        tmp = 3 * a[i] -  2 * b[i];
> +        tmp2 = 5 * b[i] - 2 * c[i];
> +        break;
> +      case 2:
> +        tmp = 9 * a[i] +  2 * b[i] + c[i];
> +        tmp2 = 4 * b[i] + 2 * c[i] + 8 * d[i];
> +        break;
> +      case 3:
> +        tmp = 3 * a[i] +  2 * b[i] - c[i];
> +        tmp2 = b[i] - 2 * c[i] + 8 * d[i];
> +        break;
> +      defaut:
> +        __builtin_unreachable ();
> +    }
> +
> +    double x = 3 * tmp + d[i] + tmp;
> +    double y = 3.4f * tmp + d[i] + tmp2;
> +    r[i] = x + y;
> +  }
> +
> +  return 0;
> +}
> +
> +#define N 16 * 1024
> +double aa[N], bb[N], cc[N], dd[N], rr[N];
> +
> +int main()
> +{
> +  for (int i = 0; i < 100 * 1000; i++)
> +    foo (aa, bb, cc, dd, rr, N, i % 4);
> +}
> +
> +
> +/* Test that we actually unswitched something.  */
> +/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 0" "unswitch" } } */
> +/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 1" "unswitch" } } */
> +/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 2" "unswitch" } } */
> +/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 3" "unswitch" } } */
> diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-7.c b/gcc/testsuite/gcc.dg/loop-unswitch-7.c
> new file mode 100644
> index 00000000000..00f2fcff64b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-unswitch-7.c
> @@ -0,0 +1,45 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details --param=max-unswitch-insns=1000 --param=max-unswitch-level=10" } */
> +
> +int
> +foo(double *a, double *b, double *c, double *d, double *r, int size, int order)
> +{
> +  for (int i = 0; i < size; i++)
> +  {
> +    double tmp, tmp2;
> +
> +    switch(order)
> +    {
> +      case 5 ... 6:
> +      case 9:
> +        tmp = -8 * a[i];
> +        tmp2 = 2 * b[i];
> +        break;
> +      case 11:
> +        tmp = 3 * a[i] -  2 * b[i];
> +        tmp2 = 5 * b[i] - 2 * c[i];
> +        break;
> +      case 22:
> +        tmp = 9 * a[i] +  2 * b[i] + c[i];
> +        tmp2 = 4 * b[i] + 2 * c[i] + 8 * d[i];
> +        break;
> +      case 33:
> +        tmp = 3 * a[i] +  2 * b[i] - c[i];
> +        tmp2 = b[i] - 2 * c[i] + 8 * d[i];
> +        break;
> +      defaut:
> +        __builtin_unreachable ();
> +    }
> +
> +    double x = 3 * tmp + d[i] + tmp;
> +    double y = 3.4f * tmp + d[i] + tmp2;
> +    r[i] = x + y;
> +  }
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order_.* >= 5 & order_.* <= 6 | order_.* == 9" "unswitch" } } */
> +/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 1" "unswitch" } } */
> +/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 2" "unswitch" } } */
> +/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 3" "unswitch" } } */
> diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-8.c b/gcc/testsuite/gcc.dg/loop-unswitch-8.c
> new file mode 100644
> index 00000000000..4cec1f53bcc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-unswitch-8.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
> +
> +int
> +foo(double *a, double *b, double *c, double *d, double *r, int size, int order)
> +{
> +  for (int i = 0; i < size; i++)
> +  {
> +    double tmp;
> +
> +    if (order == 1)
> +      tmp = -8 * a[i];
> +    else
> +      tmp = -4 * b[i];
> +
> +    double x = 3 * tmp + d[i] + tmp;
> +
> +    if (order == 1)
> +      x += 2;
> +
> +    double y = 3.4f * tmp + d[i];
> +    r[i] = x + y;
> +  }
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times ";; Unswitching loop with condition: order.* == 1" 1 "unswitch" } } */
> diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-9.c b/gcc/testsuite/gcc.dg/loop-unswitch-9.c
> new file mode 100644
> index 00000000000..3058d4a5b38
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-unswitch-9.c
> @@ -0,0 +1,34 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
> +
> +int
> +foo(double *a, double *b, double *c, double *d, double *r, int size, unsigned order)
> +{
> +  for (int i = 0; i < size; i++)
> +  {
> +    double tmp;
> +
> +    switch (order)
> +      {
> +      case 0 ... 4:
> +       tmp = -8 * a[i];
> +       break;
> +      default:
> +       tmp = -4 * b[i];
> +       break;
> +      }
> +
> +    double x = 3 * tmp + d[i] + tmp;
> +
> +    /* This should not be unswitched as it's mutually excluded with case 0 ... 4.  */
> +    if (order >= 5)
> +      x += 2;
> +
> +    double y = 3.4f * tmp + d[i];
> +    r[i] = x + y;
> +  }
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times ";; Unswitching loop with condition: order.* <= 4" 1 "unswitch" } } */
> diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
> index 367dcfa20bf..159e5d83c06 100644
> --- a/gcc/tree-cfg.c
> +++ b/gcc/tree-cfg.c
> @@ -9053,11 +9053,16 @@ gimple_lv_add_condition_to_bb (basic_block first_head ATTRIBUTE_UNUSED,
>     edge e0;
>
>     /* Build new conditional expr */
> +  gsi = gsi_last_bb (cond_bb);
> +
> +  cond_expr = force_gimple_operand_gsi_1 (&gsi, cond_expr,
> +                                         is_gimple_condexpr_for_cond,
> +                                         NULL_TREE, false,
> +                                         GSI_CONTINUE_LINKING);
>     new_cond_expr = gimple_build_cond_from_tree (cond_expr,
>                                                NULL_TREE, NULL_TREE);
>
>     /* Add new cond in cond_bb.  */
> -  gsi = gsi_last_bb (cond_bb);
>     gsi_insert_after (&gsi, new_cond_expr, GSI_NEW_STMT);
>
>     /* Adjust edges appropriately to connect new head with first head
> diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
> index fe4dacc0833..a509938c7d0 100644
> --- a/gcc/tree-ssa-loop-unswitch.c
> +++ b/gcc/tree-ssa-loop-unswitch.c
> @@ -37,6 +37,10 @@ along with GCC; see the file COPYING3.  If not see
>   #include "gimple-iterator.h"
>   #include "cfghooks.h"
>   #include "tree-ssa-loop-manip.h"
> +#include "cfganal.h"
> +#include "tree-cfgcleanup.h"
> +#include "tree-pretty-print.h"
> +#include "gimple-range.h"
>
>   /* This file implements the loop unswitching, i.e. transformation of loops like
>
> @@ -74,9 +78,9 @@ along with GCC; see the file COPYING3.  If not see
>      tree-ssa-loop-im.c ensures that all the suitable conditions are in this
>      shape.  */
>
> -static class loop *tree_unswitch_loop (class loop *, basic_block, tree);
> +static class loop *tree_unswitch_loop (class loop *, basic_block, tree, edge);
>   static bool tree_unswitch_single_loop (class loop *, int);
> -static tree tree_may_unswitch_on (basic_block, class loop *);
> +static tree tree_may_unswitch_on (basic_block, class loop *, edge *);
>   static bool tree_unswitch_outer_loop (class loop *);
>   static edge find_loop_guard (class loop *);
>   static bool empty_bb_without_guard_p (class loop *, basic_block);
> @@ -84,6 +88,7 @@ static bool used_outside_loop_p (class loop *, tree);
>   static void hoist_guard (class loop *, edge);
>   static bool check_exit_phi (class loop *);
>   static tree get_vop_from_header (class loop *);
> +static void clean_up_switches (void);
>
>   /* Main entry point.  Perform loop unswitching on all suitable loops.  */
>
> @@ -103,7 +108,10 @@ tree_ssa_unswitch_loops (void)
>       }
>
>     if (changed)
> -    return TODO_cleanup_cfg;
> +    {
> +      clean_up_switches ();
> +      return TODO_cleanup_cfg;
> +    }
>     return 0;
>   }
>
> @@ -182,80 +190,114 @@ is_maybe_undefined (const tree name, gimple *stmt, class loop *loop)
>   }
>
>   /* Checks whether we can unswitch LOOP on condition at end of BB -- one of its
> -   basic blocks (for what it means see comments below).  */
> +   basic blocks (for what it means see comments below).
> +   When a gswitch case is returned, fill up COND_EDGE for it.  */
>
>   static tree
> -tree_may_unswitch_on (basic_block bb, class loop *loop)
> +tree_may_unswitch_on (basic_block bb, class loop *loop, edge *cond_edge)
>   {
>     gimple *last, *def;
> -  gcond *stmt;
> -  tree cond, use;
> +  tree use;
>     basic_block def_bb;
>     ssa_op_iter iter;
>
>     /* BB must end in a simple conditional jump.  */
>     last = last_stmt (bb);
> -  if (!last || gimple_code (last) != GIMPLE_COND)
> -    return NULL_TREE;
> -  stmt = as_a <gcond *> (last);
> -
> -  /* To keep the things simple, we do not directly remove the conditions,
> -     but just replace tests with 0 != 0 resp. 1 != 0.  Prevent the infinite
> -     loop where we would unswitch again on such a condition.  */
> -  if (gimple_cond_true_p (stmt) || gimple_cond_false_p (stmt))
> -    return NULL_TREE;
> -
> -  /* Condition must be invariant.  */
> -  FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
> +  if (gcond *stmt = safe_dyn_cast <gcond *> (last))
>       {
> -      def = SSA_NAME_DEF_STMT (use);
> +      /* To keep the things simple, we do not directly remove the conditions,
> +        but just replace tests with 0 != 0 resp. 1 != 0.  Prevent the infinite
> +        loop where we would unswitch again on such a condition.  */
> +      if (gimple_cond_true_p (stmt) || gimple_cond_false_p (stmt))
> +       return NULL_TREE;
> +
> +      /* Condition must be invariant.  */
> +      FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
> +       {
> +         def = SSA_NAME_DEF_STMT (use);
> +         def_bb = gimple_bb (def);
> +         if (def_bb
> +             && flow_bb_inside_loop_p (loop, def_bb))
> +           return NULL_TREE;
> +         /* Unswitching on undefined values would introduce undefined
> +            behavior that the original program might never exercise.  */
> +         if (is_maybe_undefined (use, stmt, loop))
> +           return NULL_TREE;
> +       }
> +
> +      edge e1 = EDGE_SUCC (bb, 0);
> +      edge e2 = EDGE_SUCC (bb, 1);
> +      *cond_edge = e1->flags & EDGE_TRUE_VALUE ? e1 : e2;
> +      return build2 (gimple_cond_code (stmt), boolean_type_node,
> +                    gimple_cond_lhs (stmt), gimple_cond_rhs (stmt));
> +    }
> +  else if (gswitch *stmt = safe_dyn_cast <gswitch *> (last))
> +    {
> +      unsigned nlabels = gimple_switch_num_labels (stmt);
> +      tree idx = gimple_switch_index (stmt);
> +      if (TREE_CODE (idx) != SSA_NAME
> +         || nlabels < 1)
> +       return NULL_TREE;
> +      /* Index must be invariant.  */
> +      def = SSA_NAME_DEF_STMT (idx);
>         def_bb = gimple_bb (def);
>         if (def_bb
>           && flow_bb_inside_loop_p (loop, def_bb))
>         return NULL_TREE;
>         /* Unswitching on undefined values would introduce undefined
>          behavior that the original program might never exercise.  */
> -      if (is_maybe_undefined (use, stmt, loop))
> +      if (is_maybe_undefined (idx, stmt, loop))
>         return NULL_TREE;
> -    }
> -
> -  cond = build2 (gimple_cond_code (stmt), boolean_type_node,
> -                gimple_cond_lhs (stmt), gimple_cond_rhs (stmt));
>
> -  return cond;
> -}
> -
> -/* Simplifies COND using checks in front of the entry of the LOOP.  Just very
> -   simplish (sufficient to prevent us from duplicating loop in unswitching
> -   unnecessarily).  */
> -
> -static tree
> -simplify_using_entry_checks (class loop *loop, tree cond)
> -{
> -  edge e = loop_preheader_edge (loop);
> -  gimple *stmt;
> +      edge e;
> +      edge_iterator ei;
> +      FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->succs)
> +       {
> +         if (!(e->flags & EDGE_IGNORE))
> +           {
> +             /* Build compound expression for all cases leading
> +                to this edge.  */
> +             tree expr = NULL_TREE;
> +             for (unsigned i = 1; i < gimple_switch_num_labels (stmt); ++i)
> +               {
> +                 tree lab = gimple_switch_label (stmt, i);
> +                 basic_block dest = label_to_block (cfun, CASE_LABEL (lab));
> +                 edge e2 = find_edge (gimple_bb (stmt), dest);
> +                 if (e == e2)
> +                   {
> +                     tree cmp;
> +                     if (CASE_HIGH (lab) != NULL_TREE)
> +                       {
> +                         tree cmp1 = build2 (GE_EXPR, boolean_type_node, idx,
> +                                             CASE_LOW (lab));
> +                         tree cmp2 = build2 (LE_EXPR, boolean_type_node, idx,
> +                                             CASE_HIGH (lab));
> +                         cmp = build2 (BIT_AND_EXPR, boolean_type_node, cmp1,
> +                                       cmp2);
> +                       }
> +                     else
> +                       cmp = build2 (EQ_EXPR, boolean_type_node, idx,
> +                                     CASE_LOW (lab));
> +
> +                     /* Combine the expression with the existing one.  */
> +                     if (expr == NULL_TREE)
> +                       expr = cmp;
> +                     else
> +                       expr = build2 (BIT_IOR_EXPR, boolean_type_node, expr,
> +                                      cmp);
> +                   }
> +               }
>
> -  while (1)
> -    {
> -      stmt = last_stmt (e->src);
> -      if (stmt
> -         && gimple_code (stmt) == GIMPLE_COND
> -         && gimple_cond_code (stmt) == TREE_CODE (cond)
> -         && operand_equal_p (gimple_cond_lhs (stmt),
> -                             TREE_OPERAND (cond, 0), 0)
> -         && operand_equal_p (gimple_cond_rhs (stmt),
> -                             TREE_OPERAND (cond, 1), 0))
> -       return (e->flags & EDGE_TRUE_VALUE
> -               ? boolean_true_node
> -               : boolean_false_node);
> -
> -      if (!single_pred_p (e->src))
> -       return cond;
> -
> -      e = single_pred_edge (e->src);
> -      if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
> -       return cond;
> +             if (expr != NULL_TREE)
> +               {
> +                 *cond_edge = e;
> +                 return expr;
> +               }
> +           }
> +       }
>       }
> +
> +  return NULL_TREE;
>   }
>
>   /* Unswitch single LOOP.  NUM is number of unswitchings done; we do not allow
> @@ -269,6 +311,7 @@ tree_unswitch_single_loop (class loop *loop, int num)
>     class loop *nloop;
>     unsigned i, found;
>     tree cond = NULL_TREE;
> +  edge cond_edge = NULL;
>     gimple *stmt;
>     bool changed = false;
>     HOST_WIDE_INT iterations;
> @@ -311,11 +354,12 @@ tree_unswitch_single_loop (class loop *loop, int num)
>     bbs = get_loop_body (loop);
>     found = loop->num_nodes;
>
> +  gimple_ranger ranger;

ISTR constructing/destructing ranger has a non-negligible overhead -
is it possible
to keep it live for a longer time (note we're heavily modifying the CFG)?

>     while (1)
>       {
>         /* Find a bb to unswitch on.  */
>         for (; i < loop->num_nodes; i++)
> -       if ((cond = tree_may_unswitch_on (bbs[i], loop)))
> +       if ((cond = tree_may_unswitch_on (bbs[i], loop, &cond_edge)))
>           break;
>
>         if (i == loop->num_nodes)
> @@ -333,24 +377,70 @@ tree_unswitch_single_loop (class loop *loop, int num)
>           break;
>         }
>
> -      cond = simplify_using_entry_checks (loop, cond);

I also fear we're losing simplification of unswitching on float compares or even
run into endlessly unswitching on the same condition?

In the testcases I failed to see some verifying we're _not_ repeatedly
processing
ifs, like scan for a definitive number of unswitchings for, say,

  for (..)
    {
         if (a)
          ...;
         xyz;
         if (a)
           ...;
    }

where we want to unswitch on if (a) only once (but of course simplify the second
if (a) ideally from within unswitching so CFG cleanup removes the dead paths).
The old code guaranteed this even for float compares IIRC.

At least also add scan-tree-dump-times overall expected unswitch count scans
to the new testcases.

Btw, I was hoping to use the relation stuff here, not so much use range
queries, but see below ...

>         stmt = last_stmt (bbs[i]);
> -      if (integer_nonzerop (cond))
> +      gcond *condition = dyn_cast<gcond *> (stmt);
> +      gswitch *swtch = dyn_cast<gswitch *> (stmt);
> +
> +      if (condition != NULL)
>         {
> -         /* Remove false path.  */
> -         gimple_cond_set_condition_from_tree (as_a <gcond *> (stmt),
> -                                              boolean_true_node);
> -         changed = true;
> +         int_range_max r;
> +         edge edge_true, edge_false;
> +         extract_true_false_edges_from_block (bbs[i], &edge_true, &edge_false);
> +         tree cond = gimple_cond_lhs (stmt);
> +
> +         if (r.supports_type_p (TREE_TYPE (cond)))
> +           {
> +             if (ranger.range_on_edge (r, edge_true, cond)
> +                 && r.undefined_p ())

Can you really use ranger this way to tell whether the edge is not executed?
I think you instead want to somehow evaluate the gcond or gswitch, looking
for a known taken edge?

> +               {
> +                 gimple_cond_set_condition_from_tree (condition,
> +                                                      boolean_false_node);
> +                 update_stmt (condition);
> +                 changed = true;
> +               }
> +             else if(ranger.range_on_edge (r, edge_false, gimple_cond_lhs (stmt))
> +                     && r.undefined_p ())
> +               {
> +                 gimple_cond_set_condition_from_tree (condition,
> +                                                      boolean_true_node);
> +                 update_stmt (condition);
> +                 changed = true;
> +               }
> +           }
>         }
> -      else if (integer_zerop (cond))
> +      else if (swtch != NULL)
>         {
> -         /* Remove true path.  */
> -         gimple_cond_set_condition_from_tree (as_a <gcond *> (stmt),
> -                                              boolean_false_node);
> -         changed = true;
> +         hash_set<edge> ignored_edges;
> +         unsigned nlabels = gimple_switch_num_labels (swtch);
> +         tree index_candidate = NULL_TREE;
> +
> +         for (unsigned i = 0; i < nlabels; ++i)
> +           {
> +             tree lab = gimple_switch_label (swtch, i);
> +             basic_block dest = label_to_block (cfun, CASE_LABEL (lab));
> +             edge e = find_edge (gimple_bb (stmt), dest);
> +
> +             int_range_max r;
> +             if (ranger.range_on_edge (r, e, gimple_switch_index (swtch))
> +                 && r.undefined_p ())
> +               {
> +                 e->flags |= EDGE_IGNORE;
> +                 ignored_edges.add (e);
> +               }
> +             else
> +               index_candidate = CASE_LOW (lab);
> +           }
> +
> +         if (index_candidate != NULL_TREE
> +             && ignored_edges.elements () == EDGE_COUNT (bbs[i]->succs) - 1)
> +           {
> +             gimple_switch_set_index (swtch, index_candidate);
> +             update_stmt (swtch);
> +           }
>         }
> +
>         /* Do not unswitch too much.  */
> -      else if (num > param_max_unswitch_level)
> +      if (num > param_max_unswitch_level)
>         {
>           i++;
>           continue;
> @@ -371,7 +461,6 @@ tree_unswitch_single_loop (class loop *loop, int num)
>           break;
>         }
>
> -      update_stmt (stmt);
>         i++;
>       }
>
> @@ -435,7 +524,7 @@ tree_unswitch_single_loop (class loop *loop, int num)
>         /* Find a bb to unswitch on.  */
>         for (; found < loop->num_nodes; found++)
>         if ((bbs[found]->flags & BB_REACHABLE)
> -           && (cond = tree_may_unswitch_on (bbs[found], loop)))
> +           && (cond = tree_may_unswitch_on (bbs[found], loop, &cond_edge)))
>           break;
>
>         if (found == loop->num_nodes)
> @@ -446,11 +535,15 @@ tree_unswitch_single_loop (class loop *loop, int num)
>       }
>
>     if (dump_file && (dump_flags & TDF_DETAILS))
> -    fprintf (dump_file, ";; Unswitching loop\n");
> +    {
> +      fprintf (dump_file, ";; Unswitching loop with condition: ");
> +      print_generic_expr (dump_file, cond);
> +      fprintf (dump_file, "\n");
> +    }
>
>     initialize_original_copy_tables ();
>     /* Unswitch the loop on this condition.  */
> -  nloop = tree_unswitch_loop (loop, bbs[found], cond);
> +  nloop = tree_unswitch_loop (loop, bbs[found], cond, cond_edge);
>     if (!nloop)
>       {
>         free_original_copy_tables ();
> @@ -476,18 +569,15 @@ tree_unswitch_single_loop (class loop *loop, int num)
>
>   static class loop *
>   tree_unswitch_loop (class loop *loop,
> -                   basic_block unswitch_on, tree cond)
> +                   basic_block unswitch_on, tree cond, edge cond_edge)
>   {
>     profile_probability prob_true;
> -  edge edge_true, edge_false;
>
>     /* Some sanity checking.  */
>     gcc_assert (flow_bb_inside_loop_p (loop, unswitch_on));
> -  gcc_assert (EDGE_COUNT (unswitch_on->succs) == 2);
>     gcc_assert (loop->inner == NULL);
>
> -  extract_true_false_edges_from_block (unswitch_on, &edge_true, &edge_false);
> -  prob_true = edge_true->probability;
> +  prob_true = cond_edge->probability;
>     return loop_version (loop, unshare_expr (cond),
>                        NULL, prob_true,
>                        prob_true.invert (),
> @@ -965,6 +1055,42 @@ check_exit_phi (class loop *loop)
>     return true;
>   }
>
> +/* Remove all dead cases from switches that are unswitched.  */
> +
> +static void
> +clean_up_switches (void)
> +{
> +  basic_block bb;
> +
> +  FOR_EACH_BB_FN (bb, cfun)
> +    {
> +      gimple *last = last_stmt (bb);
> +      if (gswitch *stmt = safe_dyn_cast <gswitch *> (last))
> +       {
> +         unsigned nlabels = gimple_switch_num_labels (stmt);
> +         unsigned index = 1;
> +         for (unsigned i = 1; i < nlabels; ++i)
> +           {
> +             tree lab = gimple_switch_label (stmt, i);
> +             basic_block dest = label_to_block (cfun, CASE_LABEL (lab));
> +             edge e = find_edge (gimple_bb (stmt), dest);
> +             if (e == NULL)
> +               ; /* The edge is already removed.  */
> +             else if (e->flags & EDGE_IGNORE)

btw, a bit better would be to use a pass-local edge flag via

  auto_edge_flag edge_unswitched (cfun);

and use that.  EDGE_IGNORED seems to be unused and should
probably be removed.

> +               remove_edge (e);
> +             else
> +               {
> +                 gimple_switch_set_label (stmt, index, lab);
> +                 ++index;
> +               }
> +           }
> +
> +         if (index != nlabels)
> +           gimple_switch_set_num_labels (stmt, index);
> +       }
> +    }
> +}
> +
>   /* Loop unswitching pass.  */
>
>   namespace {
> --
> 2.33.0
>
  
Andrew MacLeod Sept. 28, 2021, 8:39 p.m. UTC | #3
On 9/28/21 7:50 AM, Richard Biener wrote:
> On Wed, Sep 15, 2021 at 10:46 AM Martin Liška <mliska@suse.cz> wrote:
>>    /* Unswitch single LOOP.  NUM is number of unswitchings done; we do not allow
>> @@ -269,6 +311,7 @@ tree_unswitch_single_loop (class loop *loop, int num)
>>      class loop *nloop;
>>      unsigned i, found;
>>      tree cond = NULL_TREE;
>> +  edge cond_edge = NULL;
>>      gimple *stmt;
>>      bool changed = false;
>>      HOST_WIDE_INT iterations;
>> @@ -311,11 +354,12 @@ tree_unswitch_single_loop (class loop *loop, int num)
>>      bbs = get_loop_body (loop);
>>      found = loop->num_nodes;
>>
>> +  gimple_ranger ranger;
> ISTR constructing/destructing ranger has a non-negligible overhead -
> is it possible
> to keep it live for a longer time (note we're heavily modifying the CFG)?


There is some overhead.. right now we determine all the imports and 
exports for each block ahead of time, but thats about it. We can make 
adjustments for true on demand clients like this so that even that 
doesnt happen. we only do that so we know ahead of time which ssa-names 
are never used in outgoing edges, and never even have to check those.  
Thats mostly an optimization for heavy users like EVRP.  If you want, I 
can make that an option  so there virtually no overhead

More importantly, the longer it remains alive, the more "reuse" of 
ranges you will get..   If there is not a pattern of using variables 
from earlier in the program it wouldnt really matter much.

In Theory, modifying the IL should be fine, it happens already in 
places, but its not extensively tested under those conditions yet.

>>      while (1)
>>        {
>>          /* Find a bb to unswitch on.  */
>>          for (; i < loop->num_nodes; i++)
>> -       if ((cond = tree_may_unswitch_on (bbs[i], loop)))
>> +       if ((cond = tree_may_unswitch_on (bbs[i], loop, &cond_edge)))
>>            break;
>>
>>          if (i == loop->num_nodes)
>> @@ -333,24 +377,70 @@ tree_unswitch_single_loop (class loop *loop, int num)
>>            break;
>>          }
>>
>> -      cond = simplify_using_entry_checks (loop, cond);
> I also fear we're losing simplification of unswitching on float compares or even
> run into endlessly unswitching on the same condition?
Ranger will only do integra/pointer stuff right now.  floats are on the 
list for GCC13, fwiw.
>
> In the testcases I failed to see some verifying we're _not_ repeatedly
> processing
> ifs, like scan for a definitive number of unswitchings for, say,
>
>    for (..)
>      {
>           if (a)
>            ...;
>           xyz;
>           if (a)
>             ...;
>      }
>
> where we want to unswitch on if (a) only once (but of course simplify the second
> if (a) ideally from within unswitching so CFG cleanup removes the dead paths).
> The old code guaranteed this even for float compares IIRC.
>
> At least also add scan-tree-dump-times overall expected unswitch count scans
> to the new testcases.
>
> Btw, I was hoping to use the relation stuff here, not so much use range
> queries, but see below ...
I seem to recall a discussion about using predication and how thats 
really what we want here?  . I had some thoughts on a predication engine 
we could add utilizing the relations oracle, but haven't had a time to 
look into it yet
>
>>          stmt = last_stmt (bbs[i]);
>> -      if (integer_nonzerop (cond))
>> +      gcond *condition = dyn_cast<gcond *> (stmt);
>> +      gswitch *swtch = dyn_cast<gswitch *> (stmt);
>> +
>> +      if (condition != NULL)
>>          {
>> -         /* Remove false path.  */
>> -         gimple_cond_set_condition_from_tree (as_a <gcond *> (stmt),
>> -                                              boolean_true_node);
>> -         changed = true;
>> +         int_range_max r;
>> +         edge edge_true, edge_false;
>> +         extract_true_false_edges_from_block (bbs[i], &edge_true, &edge_false);
>> +         tree cond = gimple_cond_lhs (stmt);
>> +
>> +         if (r.supports_type_p (TREE_TYPE (cond)))
>> +           {
>> +             if (ranger.range_on_edge (r, edge_true, cond)
>> +                 && r.undefined_p ())
> Can you really use ranger this way to tell whether the edge is not executed?
> I think you instead want to somehow evaluate the gcond or gswitch, looking
> for a known taken edge?

Yes and no  :-)  I use to do that, but now that we allow uninitialized 
values to be treated as UNDEFINED,  it may also mean that its 
uninitialized on that edge.

Evaluating
if (c_3 == 0)       when we know c_3 = [1,1]

What you suggest is fundamentally what ranger does... It evaluates what 
the full set of possible ranges are on the edge you ask about, then 
intersects it with the known range of c_3.  .   If the condition cannot 
ever be true,and is thus unexecutable,  the result will be UNDEFINED .  
ie above,  c_3 would have to have a range of [0,0] on the true edge, and 
its real range is [1,1].. intersecting the 2 values results in UNDEFINED...

So it can mean the edge is unexecutable.   It can also mean the value is 
actually undefined.. if this was a use-before-def case, the range of c_3 
in the block would be UNDEFINED.  and c_3 will be UNDEFINED on BOTH 
edges due ot the intersection.  the UNDEFINED state is viral.

I guess you can argue you can arbitrarily choose an edge to process in 
this case, but if you want to avoid that situation completely, I think 
you could also check that cond is not UNDEFINED  in the stmt first.. 
then if you get UNDEFINED on and edge you are 100% sure its 
unexectuable.. ie

+
+         if (ranger.range_of_expr (r, cond, stmt) && !r.undefined_p ())
+           {
+             if (ranger.range_on_edge (r, edge_true, cond) && r.undefined_p ())

Note the call to range_of_expr () will do the supported_type check 
anyway and return false if it isnt supported.


>> +               {
>> +/* Remove all dead cases from switches that are unswitched.  */
>> +
>> +static void
>> +clean_up_switches (void)
>> +{
>> +  basic_block bb;
>> +
>> +  FOR_EACH_BB_FN (bb, cfun)
>> +    {
>> +      gimple *last = last_stmt (bb);
>> +      if (gswitch *stmt = safe_dyn_cast <gswitch *> (last))
>> +       {
>> +         unsigned nlabels = gimple_switch_num_labels (stmt);
>> +         unsigned index = 1;
>> +         for (unsigned i = 1; i < nlabels; ++i)
>> +           {
>> +             tree lab = gimple_switch_label (stmt, i);
>> +             basic_block dest = label_to_block (cfun, CASE_LABEL (lab));
>> +             edge e = find_edge (gimple_bb (stmt), dest);
>> +             if (e == NULL)
>> +               ; /* The edge is already removed.  */
>> +             else if (e->flags & EDGE_IGNORE)
> btw, a bit better would be to use a pass-local edge flag via
>
>    auto_edge_flag edge_unswitched (cfun);
>
> and use that.  EDGE_IGNORED seems to be unused and should
> probably be removed.
side note.. I love that auto_edge_flag thing :-)  Thanks again for 
pointing it out.

Andrew
  
Richard Biener Sept. 29, 2021, 8:43 a.m. UTC | #4
On Tue, Sep 28, 2021 at 10:39 PM Andrew MacLeod <amacleod@redhat.com> wrote:
>
> On 9/28/21 7:50 AM, Richard Biener wrote:
> > On Wed, Sep 15, 2021 at 10:46 AM Martin Liška <mliska@suse.cz> wrote:
> >>    /* Unswitch single LOOP.  NUM is number of unswitchings done; we do not allow
> >> @@ -269,6 +311,7 @@ tree_unswitch_single_loop (class loop *loop, int num)
> >>      class loop *nloop;
> >>      unsigned i, found;
> >>      tree cond = NULL_TREE;
> >> +  edge cond_edge = NULL;
> >>      gimple *stmt;
> >>      bool changed = false;
> >>      HOST_WIDE_INT iterations;
> >> @@ -311,11 +354,12 @@ tree_unswitch_single_loop (class loop *loop, int num)
> >>      bbs = get_loop_body (loop);
> >>      found = loop->num_nodes;
> >>
> >> +  gimple_ranger ranger;
> > ISTR constructing/destructing ranger has a non-negligible overhead -
> > is it possible
> > to keep it live for a longer time (note we're heavily modifying the CFG)?
>
>
> There is some overhead.. right now we determine all the imports and
> exports for each block ahead of time, but thats about it. We can make
> adjustments for true on demand clients like this so that even that
> doesnt happen. we only do that so we know ahead of time which ssa-names
> are never used in outgoing edges, and never even have to check those.
> Thats mostly an optimization for heavy users like EVRP.  If you want, I
> can make that an option  so there virtually no overhead
>
> More importantly, the longer it remains alive, the more "reuse" of
> ranges you will get..   If there is not a pattern of using variables
> from earlier in the program it wouldnt really matter much.
>
> In Theory, modifying the IL should be fine, it happens already in
> places, but its not extensively tested under those conditions yet.

Note it's modifying the CFG as well.

My issue is that the current place is one construction per loop
(and even when we do _not_ end up doing anything), so if the
ranger use isn't O(loop-size) (not to mention nest depth...) then
we'll quickly run into complexity issues for functions with a lot of
loops.

> >>      while (1)
> >>        {
> >>          /* Find a bb to unswitch on.  */
> >>          for (; i < loop->num_nodes; i++)
> >> -       if ((cond = tree_may_unswitch_on (bbs[i], loop)))
> >> +       if ((cond = tree_may_unswitch_on (bbs[i], loop, &cond_edge)))
> >>            break;
> >>
> >>          if (i == loop->num_nodes)
> >> @@ -333,24 +377,70 @@ tree_unswitch_single_loop (class loop *loop, int num)
> >>            break;
> >>          }
> >>
> >> -      cond = simplify_using_entry_checks (loop, cond);
> > I also fear we're losing simplification of unswitching on float compares or even
> > run into endlessly unswitching on the same condition?
> Ranger will only do integra/pointer stuff right now.  floats are on the
> list for GCC13, fwiw.
> >
> > In the testcases I failed to see some verifying we're _not_ repeatedly
> > processing
> > ifs, like scan for a definitive number of unswitchings for, say,
> >
> >    for (..)
> >      {
> >           if (a)
> >            ...;
> >           xyz;
> >           if (a)
> >             ...;
> >      }
> >
> > where we want to unswitch on if (a) only once (but of course simplify the second
> > if (a) ideally from within unswitching so CFG cleanup removes the dead paths).
> > The old code guaranteed this even for float compares IIRC.
> >
> > At least also add scan-tree-dump-times overall expected unswitch count scans
> > to the new testcases.
> >
> > Btw, I was hoping to use the relation stuff here, not so much use range
> > queries, but see below ...
> I seem to recall a discussion about using predication and how thats
> really what we want here?  . I had some thoughts on a predication engine
> we could add utilizing the relations oracle, but haven't had a time to
> look into it yet
> >
> >>          stmt = last_stmt (bbs[i]);
> >> -      if (integer_nonzerop (cond))
> >> +      gcond *condition = dyn_cast<gcond *> (stmt);
> >> +      gswitch *swtch = dyn_cast<gswitch *> (stmt);
> >> +
> >> +      if (condition != NULL)
> >>          {
> >> -         /* Remove false path.  */
> >> -         gimple_cond_set_condition_from_tree (as_a <gcond *> (stmt),
> >> -                                              boolean_true_node);
> >> -         changed = true;
> >> +         int_range_max r;
> >> +         edge edge_true, edge_false;
> >> +         extract_true_false_edges_from_block (bbs[i], &edge_true, &edge_false);
> >> +         tree cond = gimple_cond_lhs (stmt);
> >> +
> >> +         if (r.supports_type_p (TREE_TYPE (cond)))
> >> +           {
> >> +             if (ranger.range_on_edge (r, edge_true, cond)
> >> +                 && r.undefined_p ())
> > Can you really use ranger this way to tell whether the edge is not executed?
> > I think you instead want to somehow evaluate the gcond or gswitch, looking
> > for a known taken edge?
>
> Yes and no  :-)  I use to do that, but now that we allow uninitialized
> values to be treated as UNDEFINED,  it may also mean that its
> uninitialized on that edge.
>
> Evaluating
> if (c_3 == 0)       when we know c_3 = [1,1]
>
> What you suggest is fundamentally what ranger does... It evaluates what
> the full set of possible ranges are on the edge you ask about, then
> intersects it with the known range of c_3.  .   If the condition cannot
> ever be true,and is thus unexecutable,  the result will be UNDEFINED .
> ie above,  c_3 would have to have a range of [0,0] on the true edge, and
> its real range is [1,1].. intersecting the 2 values results in UNDEFINED...
>
> So it can mean the edge is unexecutable.   It can also mean the value is
> actually undefined.. if this was a use-before-def case, the range of c_3
> in the block would be UNDEFINED.  and c_3 will be UNDEFINED on BOTH
> edges due ot the intersection.  the UNDEFINED state is viral.
>
> I guess you can argue you can arbitrarily choose an edge to process in
> this case, but if you want to avoid that situation completely, I think
> you could also check that cond is not UNDEFINED  in the stmt first..
> then if you get UNDEFINED on and edge you are 100% sure its
> unexectuable.. ie
>
> +
> +         if (ranger.range_of_expr (r, cond, stmt) && !r.undefined_p ())
> +           {
> +             if (ranger.range_on_edge (r, edge_true, cond) && r.undefined_p ())
>
> Note the call to range_of_expr () will do the supported_type check
> anyway and return false if it isnt supported.

So my question was probably more like if there's a way to evaluate
a condition with ranger to either true, false or unknown that looks less "odd"?

In principle it would be asking for the range of the c_3 == 0 expression
which is embedded in the GIMPLE_COND.  I really don't like the other
proposed options as they look like error-prone (wrt wrong-code).

>
> >> +               {
> >> +/* Remove all dead cases from switches that are unswitched.  */
> >> +
> >> +static void
> >> +clean_up_switches (void)
> >> +{
> >> +  basic_block bb;
> >> +
> >> +  FOR_EACH_BB_FN (bb, cfun)
> >> +    {
> >> +      gimple *last = last_stmt (bb);
> >> +      if (gswitch *stmt = safe_dyn_cast <gswitch *> (last))
> >> +       {
> >> +         unsigned nlabels = gimple_switch_num_labels (stmt);
> >> +         unsigned index = 1;
> >> +         for (unsigned i = 1; i < nlabels; ++i)
> >> +           {
> >> +             tree lab = gimple_switch_label (stmt, i);
> >> +             basic_block dest = label_to_block (cfun, CASE_LABEL (lab));
> >> +             edge e = find_edge (gimple_bb (stmt), dest);
> >> +             if (e == NULL)
> >> +               ; /* The edge is already removed.  */
> >> +             else if (e->flags & EDGE_IGNORE)
> > btw, a bit better would be to use a pass-local edge flag via
> >
> >    auto_edge_flag edge_unswitched (cfun);
> >
> > and use that.  EDGE_IGNORED seems to be unused and should
> > probably be removed.
> side note.. I love that auto_edge_flag thing :-)  Thanks again for
> pointing it out.

Yeah, it was exactly invented to allow O(region-size) use of temporary flags on
edges/bbs.

Richard.

> Andrew
>
  
Andrew MacLeod Sept. 29, 2021, 3:20 p.m. UTC | #5
On 9/29/21 4:43 AM, Richard Biener wrote:
> On Tue, Sep 28, 2021 at 10:39 PM Andrew MacLeod <amacleod@redhat.com> wrote:
>> On 9/28/21 7:50 AM, Richard Biener wrote:
>>> On Wed, Sep 15, 2021 at 10:46 AM Martin Liška <mliska@suse.cz> wrote:
>>>>     /* Unswitch single LOOP.  NUM is number of unswitchings done; we do not allow
>>>> @@ -269,6 +311,7 @@ tree_unswitch_single_loop (class loop *loop, int num)
>>>>       class loop *nloop;
>>>>       unsigned i, found;
>>>>       tree cond = NULL_TREE;
>>>> +  edge cond_edge = NULL;
>>>>       gimple *stmt;
>>>>       bool changed = false;
>>>>       HOST_WIDE_INT iterations;
>>>> @@ -311,11 +354,12 @@ tree_unswitch_single_loop (class loop *loop, int num)
>>>>       bbs = get_loop_body (loop);
>>>>       found = loop->num_nodes;
>>>>
>>>> +  gimple_ranger ranger;
>>> ISTR constructing/destructing ranger has a non-negligible overhead -
>>> is it possible
>>> to keep it live for a longer time (note we're heavily modifying the CFG)?
>>
>> There is some overhead.. right now we determine all the imports and
>> exports for each block ahead of time, but thats about it. We can make
>> adjustments for true on demand clients like this so that even that
>> doesnt happen. we only do that so we know ahead of time which ssa-names
>> are never used in outgoing edges, and never even have to check those.
>> Thats mostly an optimization for heavy users like EVRP.  If you want, I
>> can make that an option  so there virtually no overhead
>>
>> More importantly, the longer it remains alive, the more "reuse" of
>> ranges you will get..   If there is not a pattern of using variables
>> from earlier in the program it wouldnt really matter much.
>>
>> In Theory, modifying the IL should be fine, it happens already in
>> places, but its not extensively tested under those conditions yet.
> Note it's modifying the CFG as well.
bah, thats what I meant.   as long as the IL is changed and CFG updated 
to match, it should in theory work.  And as long as existing SSA_NAMEs 
dont have their meaning changes..  ie reusing an SSA_NAME to have a  
different definition is likely to cause problems without telling ranger 
that an SSA_NAME is now different.
>
> My issue is that the current place is one construction per loop
> (and even when we do _not_ end up doing anything), so if the
> ranger use isn't O(loop-size) (not to mention nest depth...) then
> we'll quickly run into complexity issues for functions with a lot of
> loops.

It should, again in theory, be possible to simple call 
enable_ranger(cfun) at the beginning of the pass,  and then just use 
"get_range_query(cfun)"  throughout.. and call disable_ranger at the 
end.  or if you want to keep using the gimple_ranger pointer:

gimple_ranger *ranger = enable_ranger (cfun);

disable_ranger(cfun)


I *think* it will work thru the CFG changes.. but one would have to try 
it and see if the same results happen.  If they don't loop me in because 
it might be something simple.   we may need to force a request for a 
range of the stmts which are changed under some circumstances.  Nothing 
occurs to me off the top of my head that would be needed.

>
>>
>> Yes and no  :-)  I use to do that, but now that we allow uninitialized
>> values to be treated as UNDEFINED,  it may also mean that its
>> uninitialized on that edge.
>>
>> Evaluating
>> if (c_3 == 0)       when we know c_3 = [1,1]
>>
>> What you suggest is fundamentally what ranger does... It evaluates what
>> the full set of possible ranges are on the edge you ask about, then
>> intersects it with the known range of c_3.  .   If the condition cannot
>> ever be true,and is thus unexecutable,  the result will be UNDEFINED .
>> ie above,  c_3 would have to have a range of [0,0] on the true edge, and
>> its real range is [1,1].. intersecting the 2 values results in UNDEFINED...
>>
>> So it can mean the edge is unexecutable.   It can also mean the value is
>> actually undefined.. if this was a use-before-def case, the range of c_3
>> in the block would be UNDEFINED.  and c_3 will be UNDEFINED on BOTH
>> edges due ot the intersection.  the UNDEFINED state is viral.
>>
>> I guess you can argue you can arbitrarily choose an edge to process in
>> this case, but if you want to avoid that situation completely, I think
>> you could also check that cond is not UNDEFINED  in the stmt first..
>> then if you get UNDEFINED on and edge you are 100% sure its
>> unexectuable.. ie
>>
>> +
>> +         if (ranger.range_of_expr (r, cond, stmt) && !r.undefined_p ())
>> +           {
>> +             if (ranger.range_on_edge (r, edge_true, cond) && r.undefined_p ())
>>
>> Note the call to range_of_expr () will do the supported_type check
>> anyway and return false if it isnt supported.
> So my question was probably more like if there's a way to evaluate
> a condition with ranger to either true, false or unknown that looks less "odd"?

well, you can directly try to fold the conditional stmt and see what the 
result is.   You can simply pass a range_query in which is used to 
resolve the operands on the stmt.   So where 'stmt' is :    if (c_3 == 0)

   if (fold_range (r, stmt, ranger) && r.singleton_p ())

will return [0,1] if we don't know which edge is taken, or [0,0] if the 
false edge is known to be taken, or [1,1] if the true edge is known to 
be taken...

Or, if we switch to the enable_ranger() mechanism for the entire pass:

   if (fold_range (r, stmt, get_range_query (cfun)) && r.singleton_p ())

btw this is virtually the same as

   if (get_range_query (cfun)->range_of_stmt (r, stmt) && r.singleton_p ())


>
> In principle it would be asking for the range of the c_3 == 0 expression
> which is embedded in the GIMPLE_COND.  I really don't like the other
> proposed options as they look like error-prone (wrt wrong-code).
>
yes, so its exactly like that


Andrew
  
Jeff Law Sept. 29, 2021, 3:28 p.m. UTC | #6
On 9/29/2021 9:20 AM, Andrew MacLeod via Gcc-patches wrote:
> On 9/29/21 4:43 AM, Richard Biener wrote:
>> On Tue, Sep 28, 2021 at 10:39 PM Andrew MacLeod <amacleod@redhat.com> 
>> wrote:
>>> On 9/28/21 7:50 AM, Richard Biener wrote:
>>>> On Wed, Sep 15, 2021 at 10:46 AM Martin Liška <mliska@suse.cz> wrote:
>>>>>     /* Unswitch single LOOP.  NUM is number of unswitchings done; 
>>>>> we do not allow
>>>>> @@ -269,6 +311,7 @@ tree_unswitch_single_loop (class loop *loop, 
>>>>> int num)
>>>>>       class loop *nloop;
>>>>>       unsigned i, found;
>>>>>       tree cond = NULL_TREE;
>>>>> +  edge cond_edge = NULL;
>>>>>       gimple *stmt;
>>>>>       bool changed = false;
>>>>>       HOST_WIDE_INT iterations;
>>>>> @@ -311,11 +354,12 @@ tree_unswitch_single_loop (class loop *loop, 
>>>>> int num)
>>>>>       bbs = get_loop_body (loop);
>>>>>       found = loop->num_nodes;
>>>>>
>>>>> +  gimple_ranger ranger;
>>>> ISTR constructing/destructing ranger has a non-negligible overhead -
>>>> is it possible
>>>> to keep it live for a longer time (note we're heavily modifying the 
>>>> CFG)?
>>>
>>> There is some overhead.. right now we determine all the imports and
>>> exports for each block ahead of time, but thats about it. We can make
>>> adjustments for true on demand clients like this so that even that
>>> doesnt happen. we only do that so we know ahead of time which ssa-names
>>> are never used in outgoing edges, and never even have to check those.
>>> Thats mostly an optimization for heavy users like EVRP.  If you want, I
>>> can make that an option  so there virtually no overhead
>>>
>>> More importantly, the longer it remains alive, the more "reuse" of
>>> ranges you will get..   If there is not a pattern of using variables
>>> from earlier in the program it wouldnt really matter much.
>>>
>>> In Theory, modifying the IL should be fine, it happens already in
>>> places, but its not extensively tested under those conditions yet.
>> Note it's modifying the CFG as well.
> bah, thats what I meant.   as long as the IL is changed and CFG 
> updated to match, it should in theory work.  And as long as existing 
> SSA_NAMEs dont have their meaning changes..  ie reusing an SSA_NAME to 
> have a  different definition is likely to cause problems without 
> telling ranger that an SSA_NAME is now different.
There is an API somewhere which resets the global information that we 
call for these scenarios.   But that assumes we know when an name is 
going to be reused or moved to a point where the global information 
isn't necessary accurate anymore.  It's not heavily used as these cases 
are relatively rare.

The nastier case is when we release an SSA_NAME because it was dead 
code, then later re-cycle it.  If we're going to have Ranger instances 
live across passes, then that becomes a much bigger concern.

Jeff
  
Andrew MacLeod Sept. 29, 2021, 3:59 p.m. UTC | #7
On 9/29/21 11:28 AM, Jeff Law wrote:
>
>
> On 9/29/2021 9:20 AM, Andrew MacLeod via Gcc-patches wrote:
>> On 9/29/21 4:43 AM, Richard Biener wrote:
>>> On Tue, Sep 28, 2021 at 10:39 PM Andrew MacLeod 
>>> <amacleod@redhat.com> wrote:
>>>> On 9/28/21 7:50 AM, Richard Biener wrote:
>>>>> On Wed, Sep 15, 2021 at 10:46 AM Martin Liška <mliska@suse.cz> wrote:
>>>>>>     /* Unswitch single LOOP.  NUM is number of unswitchings done; 
>>>>>> we do not allow
>>>>>> @@ -269,6 +311,7 @@ tree_unswitch_single_loop (class loop *loop, 
>>>>>> int num)
>>>>>>       class loop *nloop;
>>>>>>       unsigned i, found;
>>>>>>       tree cond = NULL_TREE;
>>>>>> +  edge cond_edge = NULL;
>>>>>>       gimple *stmt;
>>>>>>       bool changed = false;
>>>>>>       HOST_WIDE_INT iterations;
>>>>>> @@ -311,11 +354,12 @@ tree_unswitch_single_loop (class loop 
>>>>>> *loop, int num)
>>>>>>       bbs = get_loop_body (loop);
>>>>>>       found = loop->num_nodes;
>>>>>>
>>>>>> +  gimple_ranger ranger;
>>>>> ISTR constructing/destructing ranger has a non-negligible overhead -
>>>>> is it possible
>>>>> to keep it live for a longer time (note we're heavily modifying 
>>>>> the CFG)?
>>>>
>>>> There is some overhead.. right now we determine all the imports and
>>>> exports for each block ahead of time, but thats about it. We can make
>>>> adjustments for true on demand clients like this so that even that
>>>> doesnt happen. we only do that so we know ahead of time which 
>>>> ssa-names
>>>> are never used in outgoing edges, and never even have to check those.
>>>> Thats mostly an optimization for heavy users like EVRP.  If you 
>>>> want, I
>>>> can make that an option  so there virtually no overhead
>>>>
>>>> More importantly, the longer it remains alive, the more "reuse" of
>>>> ranges you will get..   If there is not a pattern of using variables
>>>> from earlier in the program it wouldnt really matter much.
>>>>
>>>> In Theory, modifying the IL should be fine, it happens already in
>>>> places, but its not extensively tested under those conditions yet.
>>> Note it's modifying the CFG as well.
>> bah, thats what I meant.   as long as the IL is changed and CFG 
>> updated to match, it should in theory work.  And as long as existing 
>> SSA_NAMEs dont have their meaning changes..  ie reusing an SSA_NAME 
>> to have a  different definition is likely to cause problems without 
>> telling ranger that an SSA_NAME is now different.
> There is an API somewhere which resets the global information that we 
> call for these scenarios.   But that assumes we know when an name is 
> going to be reused or moved to a point where the global information 
> isn't necessary accurate anymore.  It's not heavily used as these 
> cases are relatively rare.
>
> The nastier case is when we release an SSA_NAME because it was dead 
> code, then later re-cycle it.  If we're going to have Ranger instances 
> live across passes, then that becomes a much bigger concern. 

we'd mostly need to add an API call when we de-register ssa-names and 
add them back into the free list to the get_range_query() instance to 
let it know an SSA_NAME is now dead.   if no ranger is active, it just 
wouldnt do anything. If ranger is alive, it would flush its global cache 
of that ssa_name and move it back to not-processed.

I think thats all that would be required.. at least for that scenario.  
My hope was to make it live across passes eventually...  at least until 
we do something major.  For short gaps it would be beneficial, so that 
if a threader runs right after a VRP pass, its fully populated already.

Andrew
  
Richard Biener Sept. 30, 2021, 7:33 a.m. UTC | #8
On Wed, Sep 29, 2021 at 5:28 PM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 9/29/2021 9:20 AM, Andrew MacLeod via Gcc-patches wrote:
> > On 9/29/21 4:43 AM, Richard Biener wrote:
> >> On Tue, Sep 28, 2021 at 10:39 PM Andrew MacLeod <amacleod@redhat.com>
> >> wrote:
> >>> On 9/28/21 7:50 AM, Richard Biener wrote:
> >>>> On Wed, Sep 15, 2021 at 10:46 AM Martin Liška <mliska@suse.cz> wrote:
> >>>>>     /* Unswitch single LOOP.  NUM is number of unswitchings done;
> >>>>> we do not allow
> >>>>> @@ -269,6 +311,7 @@ tree_unswitch_single_loop (class loop *loop,
> >>>>> int num)
> >>>>>       class loop *nloop;
> >>>>>       unsigned i, found;
> >>>>>       tree cond = NULL_TREE;
> >>>>> +  edge cond_edge = NULL;
> >>>>>       gimple *stmt;
> >>>>>       bool changed = false;
> >>>>>       HOST_WIDE_INT iterations;
> >>>>> @@ -311,11 +354,12 @@ tree_unswitch_single_loop (class loop *loop,
> >>>>> int num)
> >>>>>       bbs = get_loop_body (loop);
> >>>>>       found = loop->num_nodes;
> >>>>>
> >>>>> +  gimple_ranger ranger;
> >>>> ISTR constructing/destructing ranger has a non-negligible overhead -
> >>>> is it possible
> >>>> to keep it live for a longer time (note we're heavily modifying the
> >>>> CFG)?
> >>>
> >>> There is some overhead.. right now we determine all the imports and
> >>> exports for each block ahead of time, but thats about it. We can make
> >>> adjustments for true on demand clients like this so that even that
> >>> doesnt happen. we only do that so we know ahead of time which ssa-names
> >>> are never used in outgoing edges, and never even have to check those.
> >>> Thats mostly an optimization for heavy users like EVRP.  If you want, I
> >>> can make that an option  so there virtually no overhead
> >>>
> >>> More importantly, the longer it remains alive, the more "reuse" of
> >>> ranges you will get..   If there is not a pattern of using variables
> >>> from earlier in the program it wouldnt really matter much.
> >>>
> >>> In Theory, modifying the IL should be fine, it happens already in
> >>> places, but its not extensively tested under those conditions yet.
> >> Note it's modifying the CFG as well.
> > bah, thats what I meant.   as long as the IL is changed and CFG
> > updated to match, it should in theory work.  And as long as existing
> > SSA_NAMEs dont have their meaning changes..  ie reusing an SSA_NAME to
> > have a  different definition is likely to cause problems without
> > telling ranger that an SSA_NAME is now different.
> There is an API somewhere which resets the global information that we
> call for these scenarios.   But that assumes we know when an name is
> going to be reused or moved to a point where the global information
> isn't necessary accurate anymore.  It's not heavily used as these cases
> are relatively rare.
>
> The nastier case is when we release an SSA_NAME because it was dead
> code, then later re-cycle it.  If we're going to have Ranger instances
> live across passes, then that becomes a much bigger concern.

For the case at hand there shouldn't be any transforms invalidating ranges
on existing SSA names - we possibly place additional conditions and thus
refine existing ranges though, and we rely on that info to be used. Since
we repeatedly transform a single loop re-setting ranger like Martin did might
be necessary for those to be picked up.

Richard.

> Jeff
>
  
Andrew MacLeod Oct. 5, 2021, 5:08 p.m. UTC | #9
On 9/28/21 7:50 AM, Richard Biener wrote:
> On Wed, Sep 15, 2021 at 10:46 AM Martin Liška <mliska@suse.cz> wrote:
>> Hello.
>>
>> The patch extends the loop unswitching pass so that gswitch
>> statements are supported. The pass now uses ranger which marks
>> switch edges that are known to be unreachable in a versioned loop.
>>
>> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>>
>> Ready to be installed?
>> Thanks,
>> Martin
>>

Do you have any switch size restrictions or limits?
I'm about to add --param which will limit ranger to switches of a specified size, defaulting to 50 outgoing edges.
It turns out a number of the pathological cases we see are very large switches, and we spend a lot of time processing and unioning ranges on the various edges and at meet points.

Anyway, the limit will make ranger not "see" switches above the threshold, so it will not generate ranges for them. I hope to eliminate this for next release, but for now it serves a purpose.
The patch currently makes this apply to all rangers, but if this is problematic for you, I will adjust it to make sure its only when invoked via EVRP.

Andrew
  
Martin Liška Nov. 8, 2021, 3:05 p.m. UTC | #10
On 9/28/21 22:39, Andrew MacLeod wrote:
> In Theory, modifying the IL should be fine, it happens already in places, but its not extensively tested under those conditions yet.

Hello Andrew.

I've just tried using a global gimple_ranger and it crashes when loop unswitching duplicates
some BBs.

Please try the attached patch for:

$ ./xgcc -B. /home/marxin/Programming/gcc/gcc/testsuite/gcc.dg/loop-unswitch-1.c -O3 -c
during GIMPLE pass: unswitch
/home/marxin/Programming/gcc/gcc/testsuite/gcc.dg/loop-unswitch-1.c: In function ‘xml_colorize_line’:
/home/marxin/Programming/gcc/gcc/testsuite/gcc.dg/loop-unswitch-1.c:6:6: internal compiler error: in get_bb_range, at gimple-range-cache.cc:255
     6 | void xml_colorize_line(unsigned int *p, int state)
       |      ^~~~~~~~~~~~~~~~~
0x871fcf sbr_vector::get_bb_range(irange&, basic_block_def const*)
	/home/marxin/Programming/gcc/gcc/gimple-range-cache.cc:255
0x871fcf sbr_vector::get_bb_range(irange&, basic_block_def const*)
	/home/marxin/Programming/gcc/gcc/gimple-range-cache.cc:253
0x1b99800 ranger_cache::fill_block_cache(tree_node*, basic_block_def*, basic_block_def*)
	/home/marxin/Programming/gcc/gcc/gimple-range-cache.cc:1332
0x1b99bc4 ranger_cache::block_range(irange&, basic_block_def*, tree_node*, bool)
	/home/marxin/Programming/gcc/gcc/gimple-range-cache.cc:1107
0x1b9461c gimple_ranger::range_on_entry(irange&, basic_block_def*, tree_node*)
	/home/marxin/Programming/gcc/gcc/gimple-range.cc:144
0x1b95140 gimple_ranger::range_of_expr(irange&, tree_node*, gimple*)
	/home/marxin/Programming/gcc/gcc/gimple-range.cc:118
0x1b9ebef fold_using_range::range_of_range_op(irange&, gimple*, fur_source&)
	/home/marxin/Programming/gcc/gcc/gimple-range-fold.cc:600
0x1ba0221 fold_using_range::fold_stmt(irange&, gimple*, fur_source&, tree_node*)
	/home/marxin/Programming/gcc/gcc/gimple-range-fold.cc:552
0x1b94a16 gimple_ranger::fold_range_internal(irange&, gimple*, tree_node*)
	/home/marxin/Programming/gcc/gcc/gimple-range.cc:243
0x1b94bab gimple_ranger::range_of_stmt(irange&, gimple*, tree_node*)
	/home/marxin/Programming/gcc/gcc/gimple-range.cc:273
0x10a5b5f tree_unswitch_single_loop
	/home/marxin/Programming/gcc/gcc/tree-ssa-loop-unswitch.c:390
0x10a5d62 tree_unswitch_single_loop
	/home/marxin/Programming/gcc/gcc/tree-ssa-loop-unswitch.c:546
0x10a68fe tree_ssa_unswitch_loops()
	/home/marxin/Programming/gcc/gcc/tree-ssa-loop-unswitch.c:106

Martin
  
Andrew MacLeod Nov. 8, 2021, 6:34 p.m. UTC | #11
On 11/8/21 10:05 AM, Martin Liška wrote:
> On 9/28/21 22:39, Andrew MacLeod wrote:
>> In Theory, modifying the IL should be fine, it happens already in 
>> places, but its not extensively tested under those conditions yet.
>
> Hello Andrew.
>
> I've just tried using a global gimple_ranger and it crashes when loop 
> unswitching duplicates
> some BBs.
>
ah, ok, so the default on-entry cache for ranger doesn't expect to see 
the number of BBs to increase.  .  I can change this to grow, but I want 
to avoid too many grows.  This test case looks like it grows the number 
of BBs from 24 to somewhere north of 90..  Do you have any idea in 
advance how many BBs you will be adding?  Although Im not sure how to 
make such a suggestion anyway .  Ill work something out.  The sparse 
cache has no such issue, but you will lose precision so we don't want to 
trigger on that anyway.

As work around for the moment to keep you going, heres a patch which 
simply starts with 256 extra spaces, so that should allow you to 
continue while I fix this properly to grow.  and you can see if things 
continue to work as expected.  You can increase that number as you see fit

I'll put in a proper fix in a bit.
  
Andrew MacLeod Nov. 8, 2021, 7:45 p.m. UTC | #12
On 11/8/21 10:05 AM, Martin Liška wrote:
> On 9/28/21 22:39, Andrew MacLeod wrote:
>> In Theory, modifying the IL should be fine, it happens already in 
>> places, but its not extensively tested under those conditions yet.
>
> Hello Andrew.
>
> I've just tried using a global gimple_ranger and it crashes when loop 
> unswitching duplicates
> some BBs.
>
> Please try the attached patch for: 

hey Martin,

try using this in your tree.  Since nothing else is using a growing BB 
right now, I'll let you work with it and see if everything works as 
expected before checking it in, just in case we need more tweaking.   
With this,

make RUNTESTFLAGS=dg.exp=loop-unswitch*.c check-gcc

runs clean.


basically, I tried to grow it by either a factor of 10% for the current 
BB size when the grow is requested, or some double the needed extra 
size, or 128... whichever value is "maximum"    That means it shoudnt be 
asking for tooo much each time, but also not a minimum amount.

Im certainly open to suggestion on how much to grow it each time.    
Note the vector being grown is ONLY fo the SSA_NAme being asked for.. so 
it really an on-demand thing just for specific names, in your case, 
mostly just the switch index.

Let me know how this works for you, and if you have any other issues.

Andrew
  
Richard Biener Nov. 9, 2021, 1:37 p.m. UTC | #13
On Mon, Nov 8, 2021 at 8:45 PM Andrew MacLeod <amacleod@redhat.com> wrote:
>
> On 11/8/21 10:05 AM, Martin Liška wrote:
> > On 9/28/21 22:39, Andrew MacLeod wrote:
> >> In Theory, modifying the IL should be fine, it happens already in
> >> places, but its not extensively tested under those conditions yet.
> >
> > Hello Andrew.
> >
> > I've just tried using a global gimple_ranger and it crashes when loop
> > unswitching duplicates
> > some BBs.
> >
> > Please try the attached patch for:
>
> hey Martin,
>
> try using this in your tree.  Since nothing else is using a growing BB
> right now, I'll let you work with it and see if everything works as
> expected before checking it in, just in case we need more tweaking.
> With this,
>
> make RUNTESTFLAGS=dg.exp=loop-unswitch*.c check-gcc
>
> runs clean.
>
>
> basically, I tried to grow it by either a factor of 10% for the current
> BB size when the grow is requested, or some double the needed extra
> size, or 128... whichever value is "maximum"    That means it shoudnt be
> asking for tooo much each time, but also not a minimum amount.
>
> Im certainly open to suggestion on how much to grow it each time.
> Note the vector being grown is ONLY fo the SSA_NAme being asked for.. so
> it really an on-demand thing just for specific names, in your case,
> mostly just the switch index.
>
> Let me know how this works for you, and if you have any other issues.

So I think in the end we shouldn't need the growing.  Ideally we'd do all
the analysis before the first transform, but for that we'd need ranger to
be able to "simplify" conditions based on a known true/false predicate
that's not yet in the IL.  Consider

 for (;;)
   {
        if (invariant < 3) // A
          {
...
          }
        if (invariant < 5) // B
          {
...
          }
   }

unswitch analysis will run into the condition 'A' and determine the loop
can be unswitched with the condition 'invariant < 3'.  To be able to
perform cost assessment and to avoid redundant unswitching we
want to determine that if we unswitch with 'invariant < 3' being
true then the condition at 'B' is true as well before actually inserting
the if (invariant < 3) outside of the loop.

So I'm thinking of assigning a gimple_uid to each condition we want to
unswitch on and have an array indexed by the uid with meta-data on
the unswitch opportunity, the "related" conditions could be marked with
the same uid (or some other), and the folding result recorded so that
at transform time we can just do the appropriate replacement without
invoking ranger again.

Now, but how do we arrange for the ranger analysis here?

We might also somehow want to remember that on the
'invariant < 3' == false copy of the loop there's still the
unswitching opportunity on 'invariant < 5', but not on the
'invariant < 5' == true copy.

Currently unswitching uses a custom simplify_using_entry_checks
which tries to do simplification only after the fact (and so costing
also is far from costing the true cost and ordering of the opportunities
to do the best first is not implemented either).

Richard.

> Andrew
>
  
Andrew MacLeod Nov. 9, 2021, 4:41 p.m. UTC | #14
On 11/9/21 8:37 AM, Richard Biener wrote:
> On Mon, Nov 8, 2021 at 8:45 PM Andrew MacLeod <amacleod@redhat.com> wrote:
>> On 11/8/21 10:05 AM, Martin Liška wrote:
>>> On 9/28/21 22:39, Andrew MacLeod wrote:
>>>> In Theory, modifying the IL should be fine, it happens already in
>>>> places, but its not extensively tested under those conditions yet.
>>> Hello Andrew.
>>>
>>> I've just tried using a global gimple_ranger and it crashes when loop
>>> unswitching duplicates
>>> some BBs.
>>>
>>> Please try the attached patch for:
>> hey Martin,
>>
>> try using this in your tree.  Since nothing else is using a growing BB
>> right now, I'll let you work with it and see if everything works as
>> expected before checking it in, just in case we need more tweaking.
>> With this,
>>
>> make RUNTESTFLAGS=dg.exp=loop-unswitch*.c check-gcc
>>
>> runs clean.
>>
>>
>> basically, I tried to grow it by either a factor of 10% for the current
>> BB size when the grow is requested, or some double the needed extra
>> size, or 128... whichever value is "maximum"    That means it shoudnt be
>> asking for tooo much each time, but also not a minimum amount.
>>
>> Im certainly open to suggestion on how much to grow it each time.
>> Note the vector being grown is ONLY fo the SSA_NAme being asked for.. so
>> it really an on-demand thing just for specific names, in your case,
>> mostly just the switch index.
>>
>> Let me know how this works for you, and if you have any other issues.
> So I think in the end we shouldn't need the growing.  Ideally we'd do all
> the analysis before the first transform, but for that we'd need ranger to
> be able to "simplify" conditions based on a known true/false predicate
> that's not yet in the IL.  Consider
>
>   for (;;)
>     {
>          if (invariant < 3) // A
>            {
> ...
>            }
>          if (invariant < 5) // B
>            {
> ...
>            }
>     }
>
> unswitch analysis will run into the condition 'A' and determine the loop
> can be unswitched with the condition 'invariant < 3'.  To be able to
> perform cost assessment and to avoid redundant unswitching we
> want to determine that if we unswitch with 'invariant < 3' being
> true then the condition at 'B' is true as well before actually inserting
> the if (invariant < 3) outside of the loop.
>
> So I'm thinking of assigning a gimple_uid to each condition we want to
> unswitch on and have an array indexed by the uid with meta-data on
> the unswitch opportunity, the "related" conditions could be marked with
> the same uid (or some other), and the folding result recorded so that
> at transform time we can just do the appropriate replacement without
> invoking ranger again.
>
> Now, but how do we arrange for the ranger analysis here?

well, i think there are multiple ways we could do this.    are you 
always doing this on

   if (invariant < constant) or might it be another ssa-name?

because if its always a  constant, you could ask for the range of 
invariant at  if (invariant < 3) when you unswitch it and it will 
provide you with [MIN, 2].

when you look at  if (invariant < 5), you can try folding that stmt 
using the range you know already from above..   theres an API to 
fold_stmt() independent from ranger (in gimple-range-fold.h) which lets 
you supply ranges for ssa_names in the order they are found in the stmt,

          bool fold_range (irange &r, gimple *s, irange &r1);

so putting it together, you can do something like:

// decide to unswitch this, as for the range of invariant on the TRUE edge:
s1 = first_stmt   :  if (invariant < 3)
range_of_expr (&ivrange, invariant, TRUE_EDGE)  //  This will set 
ivrange to [MIN, 2].. its value on the TRUE edge

// Now we come to the second if, we try to fold it using the range from 
the first stmt.   if fold_stmt returns true, it mean stmt2_range will 
have the result of folding the stmt. only one range is supplied, so it 
will apply ivrange [MIN, 2] to the first ssa-name it encounters in the 
stmt, and [MIN, 2] < 5  so it will return bool [1,1] for the range of 
the stmt.

s2 = second_stmt  : if (invariant < 5)
if (fold_range (&stmt2_range, second_stmt, ivrange) && 
stmt2_range.singleton_p ()
   {
       if (!stmt2_range.zero_p ())
              // result is not zero, so that means this stmt will always 
be true given the range in ivrange substituted for "invariant",

There is a fair amount of flexibility on exactly what you can do, 
depending on how complex you want to get.

   In some ways, you might be able to utilize Aldy's path-ranger that he 
uses in the threader.. and then it wouldn't matter how complex the 
conditions are, if you decide to unswitch the first condition, then 
you'd create a path thru the loop using the true edge and it would 
automatically pick up *all* the ranges that are true on that edge and 
apply them to the other conditions in the path you create.   In theory, 
without doing anything else, the the second condition should 
automatically fold to [1,1]  .  My guess is its too late in the cycle to 
be fooling around with that, because Im not sure if the path ranger is 
dynamically adjustable enough to build paths on the fly like that, or 
whether its more focused on its thread specific bits.. It may also work 
bottom to top, Im not sure.  But it also includes relations that turn 
out to be true and false as well as it goes.

Regardless, we could work on enhancements that would function in this way.

>
> We might also somehow want to remember that on the
> 'invariant < 3' == false copy of the loop there's still the
> unswitching opportunity on 'invariant < 5', but not on the
> 'invariant < 5' == true copy.

tracking the same thing for the false edge would give you a range of [3, 
MAX]   when thats applied to invariant < 5, it'll return a range of bool 
[0,1], and singleton_p() will be false so you would know it doesnt 
resolve in that case

Instead of tracking the ranges, if you are stashing the meta data which 
includes stmt, you can always pick up the ranges you want as you need 
them by making the range_of_expr() query. on the appropriate outgoing 
edge from the block with the condition

ie, to get the range from the first stmt, you can simply ask for the 
range-of-expr() on  whichever edge you happen to care about (true or 
false) just when you want it.  You could stash it in the meta data or 
just re-ask when you need it depending on which decision was made.

So you would just need to be able to key off the ssa-name in the 
condition that you unswitched I guess?

Andrew
  
Martin Liška Nov. 9, 2021, 4:44 p.m. UTC | #15
On 11/9/21 14:37, Richard Biener wrote:
> On Mon, Nov 8, 2021 at 8:45 PM Andrew MacLeod <amacleod@redhat.com> wrote:
>>
>> On 11/8/21 10:05 AM, Martin Liška wrote:
>>> On 9/28/21 22:39, Andrew MacLeod wrote:
>>>> In Theory, modifying the IL should be fine, it happens already in
>>>> places, but its not extensively tested under those conditions yet.
>>>
>>> Hello Andrew.
>>>
>>> I've just tried using a global gimple_ranger and it crashes when loop
>>> unswitching duplicates
>>> some BBs.
>>>
>>> Please try the attached patch for:
>>
>> hey Martin,
>>
>> try using this in your tree.  Since nothing else is using a growing BB
>> right now, I'll let you work with it and see if everything works as
>> expected before checking it in, just in case we need more tweaking.
>> With this,
>>
>> make RUNTESTFLAGS=dg.exp=loop-unswitch*.c check-gcc
>>
>> runs clean.
>>
>>
>> basically, I tried to grow it by either a factor of 10% for the current
>> BB size when the grow is requested, or some double the needed extra
>> size, or 128... whichever value is "maximum"    That means it shoudnt be
>> asking for tooo much each time, but also not a minimum amount.
>>
>> Im certainly open to suggestion on how much to grow it each time.
>> Note the vector being grown is ONLY fo the SSA_NAme being asked for.. so
>> it really an on-demand thing just for specific names, in your case,
>> mostly just the switch index.
>>
>> Let me know how this works for you, and if you have any other issues.
> 
> So I think in the end we shouldn't need the growing.  Ideally we'd do all
> the analysis before the first transform, but for that we'd need ranger to
> be able to "simplify" conditions based on a known true/false predicate
> that's not yet in the IL.  Consider
> 
>   for (;;)
>     {
>          if (invariant < 3) // A
>            {
> ...
>            }
>          if (invariant < 5) // B
>            {
> ...
>            }
>     }
> 
> unswitch analysis will run into the condition 'A' and determine the loop
> can be unswitched with the condition 'invariant < 3'.  To be able to
> perform cost assessment and to avoid redundant unswitching we
> want to determine that if we unswitch with 'invariant < 3' being
> true then the condition at 'B' is true as well before actually inserting
> the if (invariant < 3) outside of the loop.
> 
> So I'm thinking of assigning a gimple_uid to each condition we want to
> unswitch on and have an array indexed by the uid with meta-data on
> the unswitch opportunity, the "related" conditions could be marked with
> the same uid (or some other), and the folding result recorded so that
> at transform time we can just do the appropriate replacement without
> invoking ranger again.

Calculating all this before transformation is quite ambitious based on the code
we have now.

Note one can have in a loop:

if (a > 100)
    ...

switch (a)
    case 1000:
      ...
    case 20:
      ...
    case 200:
      ...

which means the first predicate effectively makes some cases unreachable. Moreover
one can have

if (a > 100 && b < 300)
    ...

and more complex conditions.

> 
> Now, but how do we arrange for the ranger analysis here?

That's likely something we need support from ranger, yes.

> 
> We might also somehow want to remember that on the
> 'invariant < 3' == false copy of the loop there's still the
> unswitching opportunity on 'invariant < 5', but not on the
> 'invariant < 5' == true copy.
> 
> Currently unswitching uses a custom simplify_using_entry_checks
> which tries to do simplification only after the fact (and so costing
> also is far from costing the true cost and ordering of the opportunities
> to do the best first is not implemented either).

I'm sending updated version of the patch where I changed:
- simplify_using_entry_checks is put back for the floating point expressions
- all scans utilize scan-tree-dump-times
- some new tests were added
- global ranger is used (I rely on the growing patch provided by Andrew)
- better ranger API is used for gcond expression: ranger.range_of_stmt (r, stmt) && r.singleton_p (&result))
- auto_edge_flag is used now

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Thoughts?
Martin

> 
> Richard.
> 
>> Andrew
>>
  
Aldy Hernandez Nov. 10, 2021, 7:52 a.m. UTC | #16
On Tue, Nov 9, 2021 at 5:41 PM Andrew MacLeod <amacleod@redhat.com> wrote:
>
> On 11/9/21 8:37 AM, Richard Biener wrote:
> > On Mon, Nov 8, 2021 at 8:45 PM Andrew MacLeod <amacleod@redhat.com> wrote:
> >> On 11/8/21 10:05 AM, Martin Liška wrote:
> >>> On 9/28/21 22:39, Andrew MacLeod wrote:
> >>>> In Theory, modifying the IL should be fine, it happens already in
> >>>> places, but its not extensively tested under those conditions yet.
> >>> Hello Andrew.
> >>>
> >>> I've just tried using a global gimple_ranger and it crashes when loop
> >>> unswitching duplicates
> >>> some BBs.
> >>>
> >>> Please try the attached patch for:
> >> hey Martin,
> >>
> >> try using this in your tree.  Since nothing else is using a growing BB
> >> right now, I'll let you work with it and see if everything works as
> >> expected before checking it in, just in case we need more tweaking.
> >> With this,
> >>
> >> make RUNTESTFLAGS=dg.exp=loop-unswitch*.c check-gcc
> >>
> >> runs clean.
> >>
> >>
> >> basically, I tried to grow it by either a factor of 10% for the current
> >> BB size when the grow is requested, or some double the needed extra
> >> size, or 128... whichever value is "maximum"    That means it shoudnt be
> >> asking for tooo much each time, but also not a minimum amount.
> >>
> >> Im certainly open to suggestion on how much to grow it each time.
> >> Note the vector being grown is ONLY fo the SSA_NAme being asked for.. so
> >> it really an on-demand thing just for specific names, in your case,
> >> mostly just the switch index.
> >>
> >> Let me know how this works for you, and if you have any other issues.
> > So I think in the end we shouldn't need the growing.  Ideally we'd do all
> > the analysis before the first transform, but for that we'd need ranger to
> > be able to "simplify" conditions based on a known true/false predicate
> > that's not yet in the IL.  Consider
> >
> >   for (;;)
> >     {
> >          if (invariant < 3) // A
> >            {
> > ...
> >            }
> >          if (invariant < 5) // B
> >            {
> > ...
> >            }
> >     }
> >
> > unswitch analysis will run into the condition 'A' and determine the loop
> > can be unswitched with the condition 'invariant < 3'.  To be able to
> > perform cost assessment and to avoid redundant unswitching we
> > want to determine that if we unswitch with 'invariant < 3' being
> > true then the condition at 'B' is true as well before actually inserting
> > the if (invariant < 3) outside of the loop.
> >
> > So I'm thinking of assigning a gimple_uid to each condition we want to
> > unswitch on and have an array indexed by the uid with meta-data on
> > the unswitch opportunity, the "related" conditions could be marked with
> > the same uid (or some other), and the folding result recorded so that
> > at transform time we can just do the appropriate replacement without
> > invoking ranger again.
> >
> > Now, but how do we arrange for the ranger analysis here?
>
> well, i think there are multiple ways we could do this.    are you
> always doing this on
>
>    if (invariant < constant) or might it be another ssa-name?
>
> because if its always a  constant, you could ask for the range of
> invariant at  if (invariant < 3) when you unswitch it and it will
> provide you with [MIN, 2].
>
> when you look at  if (invariant < 5), you can try folding that stmt
> using the range you know already from above..   theres an API to
> fold_stmt() independent from ranger (in gimple-range-fold.h) which lets
> you supply ranges for ssa_names in the order they are found in the stmt,
>
>           bool fold_range (irange &r, gimple *s, irange &r1);
>
> so putting it together, you can do something like:
>
> // decide to unswitch this, as for the range of invariant on the TRUE edge:
> s1 = first_stmt   :  if (invariant < 3)
> range_of_expr (&ivrange, invariant, TRUE_EDGE)  //  This will set
> ivrange to [MIN, 2].. its value on the TRUE edge
>
> // Now we come to the second if, we try to fold it using the range from
> the first stmt.   if fold_stmt returns true, it mean stmt2_range will
> have the result of folding the stmt. only one range is supplied, so it
> will apply ivrange [MIN, 2] to the first ssa-name it encounters in the
> stmt, and [MIN, 2] < 5  so it will return bool [1,1] for the range of
> the stmt.
>
> s2 = second_stmt  : if (invariant < 5)
> if (fold_range (&stmt2_range, second_stmt, ivrange) &&
> stmt2_range.singleton_p ()
>    {
>        if (!stmt2_range.zero_p ())
>               // result is not zero, so that means this stmt will always
> be true given the range in ivrange substituted for "invariant",
>
> There is a fair amount of flexibility on exactly what you can do,
> depending on how complex you want to get.
>
>    In some ways, you might be able to utilize Aldy's path-ranger that he
> uses in the threader.. and then it wouldn't matter how complex the
> conditions are, if you decide to unswitch the first condition, then
> you'd create a path thru the loop using the true edge and it would
> automatically pick up *all* the ranges that are true on that edge and
> apply them to the other conditions in the path you create.   In theory,
> without doing anything else, the the second condition should
> automatically fold to [1,1]  .  My guess is its too late in the cycle to
> be fooling around with that, because Im not sure if the path ranger is
> dynamically adjustable enough to build paths on the fly like that, or
> whether its more focused on its thread specific bits.. It may also work
> bottom to top, Im not sure.  But it also includes relations that turn
> out to be true and false as well as it goes.

If I understand correctly, this could be easily done by the solver.
The API is simple, you feed it a sequence of blocks and a bitmap of
SSAs you're interested in and just ask it things with range_of_stmt /
range_of_expr.  The returned values would be the ranges on exit from
the last block.

For example, if you have a sequence of BB2->BB3->BB4->?? and want to
know the answer to the final conditional in BB4, you would do

gimple_ranger ranger;
path_range_query path (ranger, /*resolve=*/true);
// Populate blocks along the path in reverse order.
auto_vec<basic_block> bbs;
bbs.safe_push(bb4);
bbs.safe_push(bb3);
bss.safe_push (bb2);
// Precompute ranges along the path.
path.compute_ranges (bbs, ranger.gori ().imports (bb4));
// Ask for ranges at the end of the path.
path.range_of_stmt (r, last_stmt (bb4));

or in case the last statement is a switch:

tree index = gimple_switch_index (last_stmt (bb4));
path.range_of_expr (r, index, last_stmt (bb4));

We may have to clean up some things for you:

a) The call to precompute_ranges starts with the list of SSAs that are
of interest to the BB4 (call the gor().imports).  We should probably
hide all that inside the path solver.  For that matter, there are some
other items that get put in that list by the backward threader.  We
could move that as well to the solver.

b) The blocks are in reverse order.  That's just a quirk of its use
from the backward threader.

c) Right now, you'd have to call compute_ranges() with a new set of
blocks if they change.  Since the solver works top to bottom, we could
probably add blocks to the bottom of the sequence without having to
recalculate things.

The functionality is all there.  It just requires a little tweaking to
separate it a bit more from its only user, the backward threader.

As Andrew says, it may be too late to start playing with the solver,
especially if the ranger can do what you want.  If you it's too
cumbersome, I could clean up the above 3 things.

Aldy

>
> Regardless, we could work on enhancements that would function in this way.
>
> >
> > We might also somehow want to remember that on the
> > 'invariant < 3' == false copy of the loop there's still the
> > unswitching opportunity on 'invariant < 5', but not on the
> > 'invariant < 5' == true copy.
>
> tracking the same thing for the false edge would give you a range of [3,
> MAX]   when thats applied to invariant < 5, it'll return a range of bool
> [0,1], and singleton_p() will be false so you would know it doesnt
> resolve in that case
>
> Instead of tracking the ranges, if you are stashing the meta data which
> includes stmt, you can always pick up the ranges you want as you need
> them by making the range_of_expr() query. on the appropriate outgoing
> edge from the block with the condition
>
> ie, to get the range from the first stmt, you can simply ask for the
> range-of-expr() on  whichever edge you happen to care about (true or
> false) just when you want it.  You could stash it in the meta data or
> just re-ask when you need it depending on which decision was made.
>
> So you would just need to be able to key off the ssa-name in the
> condition that you unswitched I guess?
>
> Andrew
>
  
Richard Biener Nov. 10, 2021, 8:50 a.m. UTC | #17
On Tue, Nov 9, 2021 at 5:41 PM Andrew MacLeod <amacleod@redhat.com> wrote:
>
> On 11/9/21 8:37 AM, Richard Biener wrote:
> > On Mon, Nov 8, 2021 at 8:45 PM Andrew MacLeod <amacleod@redhat.com> wrote:
> >> On 11/8/21 10:05 AM, Martin Liška wrote:
> >>> On 9/28/21 22:39, Andrew MacLeod wrote:
> >>>> In Theory, modifying the IL should be fine, it happens already in
> >>>> places, but its not extensively tested under those conditions yet.
> >>> Hello Andrew.
> >>>
> >>> I've just tried using a global gimple_ranger and it crashes when loop
> >>> unswitching duplicates
> >>> some BBs.
> >>>
> >>> Please try the attached patch for:
> >> hey Martin,
> >>
> >> try using this in your tree.  Since nothing else is using a growing BB
> >> right now, I'll let you work with it and see if everything works as
> >> expected before checking it in, just in case we need more tweaking.
> >> With this,
> >>
> >> make RUNTESTFLAGS=dg.exp=loop-unswitch*.c check-gcc
> >>
> >> runs clean.
> >>
> >>
> >> basically, I tried to grow it by either a factor of 10% for the current
> >> BB size when the grow is requested, or some double the needed extra
> >> size, or 128... whichever value is "maximum"    That means it shoudnt be
> >> asking for tooo much each time, but also not a minimum amount.
> >>
> >> Im certainly open to suggestion on how much to grow it each time.
> >> Note the vector being grown is ONLY fo the SSA_NAme being asked for.. so
> >> it really an on-demand thing just for specific names, in your case,
> >> mostly just the switch index.
> >>
> >> Let me know how this works for you, and if you have any other issues.
> > So I think in the end we shouldn't need the growing.  Ideally we'd do all
> > the analysis before the first transform, but for that we'd need ranger to
> > be able to "simplify" conditions based on a known true/false predicate
> > that's not yet in the IL.  Consider
> >
> >   for (;;)
> >     {
> >          if (invariant < 3) // A
> >            {
> > ...
> >            }
> >          if (invariant < 5) // B
> >            {
> > ...
> >            }
> >     }
> >
> > unswitch analysis will run into the condition 'A' and determine the loop
> > can be unswitched with the condition 'invariant < 3'.  To be able to
> > perform cost assessment and to avoid redundant unswitching we
> > want to determine that if we unswitch with 'invariant < 3' being
> > true then the condition at 'B' is true as well before actually inserting
> > the if (invariant < 3) outside of the loop.
> >
> > So I'm thinking of assigning a gimple_uid to each condition we want to
> > unswitch on and have an array indexed by the uid with meta-data on
> > the unswitch opportunity, the "related" conditions could be marked with
> > the same uid (or some other), and the folding result recorded so that
> > at transform time we can just do the appropriate replacement without
> > invoking ranger again.
> >
> > Now, but how do we arrange for the ranger analysis here?
>
> well, i think there are multiple ways we could do this.    are you
> always doing this on
>
>    if (invariant < constant) or might it be another ssa-name?

It can be any other SSA name, including local (but also invariant)
defs that need to be moved.

> because if its always a  constant, you could ask for the range of
> invariant at  if (invariant < 3) when you unswitch it and it will
> provide you with [MIN, 2].
>
> when you look at  if (invariant < 5), you can try folding that stmt
> using the range you know already from above..   theres an API to
> fold_stmt() independent from ranger (in gimple-range-fold.h) which lets
> you supply ranges for ssa_names in the order they are found in the stmt,
>
>           bool fold_range (irange &r, gimple *s, irange &r1);
>
> so putting it together, you can do something like:
>
> // decide to unswitch this, as for the range of invariant on the TRUE edge:
> s1 = first_stmt   :  if (invariant < 3)
> range_of_expr (&ivrange, invariant, TRUE_EDGE)  //  This will set
> ivrange to [MIN, 2].. its value on the TRUE edge
>
> // Now we come to the second if, we try to fold it using the range from
> the first stmt.   if fold_stmt returns true, it mean stmt2_range will
> have the result of folding the stmt. only one range is supplied, so it
> will apply ivrange [MIN, 2] to the first ssa-name it encounters in the
> stmt, and [MIN, 2] < 5  so it will return bool [1,1] for the range of
> the stmt.
>
> s2 = second_stmt  : if (invariant < 5)
> if (fold_range (&stmt2_range, second_stmt, ivrange) &&
> stmt2_range.singleton_p ()
>    {
>        if (!stmt2_range.zero_p ())
>               // result is not zero, so that means this stmt will always
> be true given the range in ivrange substituted for "invariant",
>
> There is a fair amount of flexibility on exactly what you can do,
> depending on how complex you want to get.

So in some sense I want to evaluate general predicates, a more
complex example might be that we unswitch on

   if (pred1)

and later we see

   tem = pred1 & pred2;
   if (tem)

where for example pred2 might not be invariant and ideally we'd like to see that
by unswitching on 'pred1' on the false path the if (tem) part is dead
(so cost-wise it's free).  The current code doesn't get this - it simply
compares a gcond literally against the gcond of the current unswitching,
so in principle we can preserve exactly this capability but I thought that
the relation oracle could do better here - of course with the twist that
half of the IL is not there.  Basically I'd like to know whether there's a
way to "seed" the relation oracle with known true/false predicates on some
edge (or BB) and then do followup queries and then also roll back
the seeds and any possible effect the followup queries had on the cache.

Going further it would be nice to discover the unswitch predicate that
allows to resolve the most other predicates in the loop ;)

>    In some ways, you might be able to utilize Aldy's path-ranger that he
> uses in the threader.. and then it wouldn't matter how complex the
> conditions are, if you decide to unswitch the first condition, then
> you'd create a path thru the loop using the true edge and it would
> automatically pick up *all* the ranges that are true on that edge and
> apply them to the other conditions in the path you create.   In theory,
> without doing anything else, the the second condition should
> automatically fold to [1,1]  .  My guess is its too late in the cycle to
> be fooling around with that, because Im not sure if the path ranger is
> dynamically adjustable enough to build paths on the fly like that, or
> whether its more focused on its thread specific bits.. It may also work
> bottom to top, Im not sure.  But it also includes relations that turn
> out to be true and false as well as it goes.
>
> Regardless, we could work on enhancements that would function in this way.
>
> >
> > We might also somehow want to remember that on the
> > 'invariant < 3' == false copy of the loop there's still the
> > unswitching opportunity on 'invariant < 5', but not on the
> > 'invariant < 5' == true copy.
>
> tracking the same thing for the false edge would give you a range of [3,
> MAX]   when thats applied to invariant < 5, it'll return a range of bool
> [0,1], and singleton_p() will be false so you would know it doesnt
> resolve in that case
>
> Instead of tracking the ranges, if you are stashing the meta data which
> includes stmt, you can always pick up the ranges you want as you need
> them by making the range_of_expr() query. on the appropriate outgoing
> edge from the block with the condition
>
> ie, to get the range from the first stmt, you can simply ask for the
> range-of-expr() on  whichever edge you happen to care about (true or
> false) just when you want it.  You could stash it in the meta data or
> just re-ask when you need it depending on which decision was made.
>
> So you would just need to be able to key off the ssa-name in the
> condition that you unswitched I guess?

I agree that with conditions that resolve to constant ranges we could work
on those.  But I guess that's not general enough.

Anyway, I'll tell Martin to not use ranger for now but instead just replicate
the existing functionality.  The difficulty was originally in the fact that
we want to deal with switches, but that's not going to be simpler with
ranger - apart from the fact that for switches we _do_ have constant
ranges.  So there your suggestions above might indeed help.

Richard.

> Andrew
>
  
Richard Biener Nov. 10, 2021, 8:59 a.m. UTC | #18
On Tue, Nov 9, 2021 at 5:44 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 11/9/21 14:37, Richard Biener wrote:
> > On Mon, Nov 8, 2021 at 8:45 PM Andrew MacLeod <amacleod@redhat.com> wrote:
> >>
> >> On 11/8/21 10:05 AM, Martin Liška wrote:
> >>> On 9/28/21 22:39, Andrew MacLeod wrote:
> >>>> In Theory, modifying the IL should be fine, it happens already in
> >>>> places, but its not extensively tested under those conditions yet.
> >>>
> >>> Hello Andrew.
> >>>
> >>> I've just tried using a global gimple_ranger and it crashes when loop
> >>> unswitching duplicates
> >>> some BBs.
> >>>
> >>> Please try the attached patch for:
> >>
> >> hey Martin,
> >>
> >> try using this in your tree.  Since nothing else is using a growing BB
> >> right now, I'll let you work with it and see if everything works as
> >> expected before checking it in, just in case we need more tweaking.
> >> With this,
> >>
> >> make RUNTESTFLAGS=dg.exp=loop-unswitch*.c check-gcc
> >>
> >> runs clean.
> >>
> >>
> >> basically, I tried to grow it by either a factor of 10% for the current
> >> BB size when the grow is requested, or some double the needed extra
> >> size, or 128... whichever value is "maximum"    That means it shoudnt be
> >> asking for tooo much each time, but also not a minimum amount.
> >>
> >> Im certainly open to suggestion on how much to grow it each time.
> >> Note the vector being grown is ONLY fo the SSA_NAme being asked for.. so
> >> it really an on-demand thing just for specific names, in your case,
> >> mostly just the switch index.
> >>
> >> Let me know how this works for you, and if you have any other issues.
> >
> > So I think in the end we shouldn't need the growing.  Ideally we'd do all
> > the analysis before the first transform, but for that we'd need ranger to
> > be able to "simplify" conditions based on a known true/false predicate
> > that's not yet in the IL.  Consider
> >
> >   for (;;)
> >     {
> >          if (invariant < 3) // A
> >            {
> > ...
> >            }
> >          if (invariant < 5) // B
> >            {
> > ...
> >            }
> >     }
> >
> > unswitch analysis will run into the condition 'A' and determine the loop
> > can be unswitched with the condition 'invariant < 3'.  To be able to
> > perform cost assessment and to avoid redundant unswitching we
> > want to determine that if we unswitch with 'invariant < 3' being
> > true then the condition at 'B' is true as well before actually inserting
> > the if (invariant < 3) outside of the loop.
> >
> > So I'm thinking of assigning a gimple_uid to each condition we want to
> > unswitch on and have an array indexed by the uid with meta-data on
> > the unswitch opportunity, the "related" conditions could be marked with
> > the same uid (or some other), and the folding result recorded so that
> > at transform time we can just do the appropriate replacement without
> > invoking ranger again.
>
> Calculating all this before transformation is quite ambitious based on the code
> we have now.
>
> Note one can have in a loop:
>
> if (a > 100)
>     ...
>
> switch (a)
>     case 1000:
>       ...
>     case 20:
>       ...
>     case 200:
>       ...
>
> which means the first predicate effectively makes some cases unreachable. Moreover
> one can have
>
> if (a > 100 && b < 300)
>     ...
>
> and more complex conditions.

True - I guess we should do two things.

 1) keep simplify_using_entry_checks like code for symbolic conditions
 2) add integer ranges for unswitch conditions producing them, that
     includes all unswitching of switch stmts - we might be able to use
     the ranger queries (with global ranges) to simplify stmts with the
     known ranges as noted by Andrew

I do think that pre-computing the simplifications is what we should do
to be able to make the cost modeling sane.  What we can avoid
trying is evaluating multiple unswitch possibilities to pick the "best".

I think changing the code do to the analysis first should be done
before wiring in gcond support, even adding the additional 'range'
capability will be useful without that since the current code
wont figure out a > 5 is true when we unswitch on a > 3.

> >
> > Now, but how do we arrange for the ranger analysis here?
>
> That's likely something we need support from ranger, yes.
>
> >
> > We might also somehow want to remember that on the
> > 'invariant < 3' == false copy of the loop there's still the
> > unswitching opportunity on 'invariant < 5', but not on the
> > 'invariant < 5' == true copy.
> >
> > Currently unswitching uses a custom simplify_using_entry_checks
> > which tries to do simplification only after the fact (and so costing
> > also is far from costing the true cost and ordering of the opportunities
> > to do the best first is not implemented either).
>
> I'm sending updated version of the patch where I changed:
> - simplify_using_entry_checks is put back for the floating point expressions
> - all scans utilize scan-tree-dump-times
> - some new tests were added
> - global ranger is used (I rely on the growing patch provided by Andrew)
> - better ranger API is used for gcond expression: ranger.range_of_stmt (r, stmt) && r.singleton_p (&result))
> - auto_edge_flag is used now
>
> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>
> Thoughts?
> Martin
>
> >
> > Richard.
> >
> >> Andrew
> >>
  
Martin Liška Nov. 10, 2021, 1:29 p.m. UTC | #19
On 11/10/21 09:59, Richard Biener wrote:
> On Tue, Nov 9, 2021 at 5:44 PM Martin Liška <mliska@suse.cz> wrote:
>>
>> On 11/9/21 14:37, Richard Biener wrote:
>>> On Mon, Nov 8, 2021 at 8:45 PM Andrew MacLeod <amacleod@redhat.com> wrote:
>>>>
>>>> On 11/8/21 10:05 AM, Martin Liška wrote:
>>>>> On 9/28/21 22:39, Andrew MacLeod wrote:
>>>>>> In Theory, modifying the IL should be fine, it happens already in
>>>>>> places, but its not extensively tested under those conditions yet.
>>>>>
>>>>> Hello Andrew.
>>>>>
>>>>> I've just tried using a global gimple_ranger and it crashes when loop
>>>>> unswitching duplicates
>>>>> some BBs.
>>>>>
>>>>> Please try the attached patch for:
>>>>
>>>> hey Martin,
>>>>
>>>> try using this in your tree.  Since nothing else is using a growing BB
>>>> right now, I'll let you work with it and see if everything works as
>>>> expected before checking it in, just in case we need more tweaking.
>>>> With this,
>>>>
>>>> make RUNTESTFLAGS=dg.exp=loop-unswitch*.c check-gcc
>>>>
>>>> runs clean.
>>>>
>>>>
>>>> basically, I tried to grow it by either a factor of 10% for the current
>>>> BB size when the grow is requested, or some double the needed extra
>>>> size, or 128... whichever value is "maximum"    That means it shoudnt be
>>>> asking for tooo much each time, but also not a minimum amount.
>>>>
>>>> Im certainly open to suggestion on how much to grow it each time.
>>>> Note the vector being grown is ONLY fo the SSA_NAme being asked for.. so
>>>> it really an on-demand thing just for specific names, in your case,
>>>> mostly just the switch index.
>>>>
>>>> Let me know how this works for you, and if you have any other issues.
>>>
>>> So I think in the end we shouldn't need the growing.  Ideally we'd do all
>>> the analysis before the first transform, but for that we'd need ranger to
>>> be able to "simplify" conditions based on a known true/false predicate
>>> that's not yet in the IL.  Consider
>>>
>>>    for (;;)
>>>      {
>>>           if (invariant < 3) // A
>>>             {
>>> ...
>>>             }
>>>           if (invariant < 5) // B
>>>             {
>>> ...
>>>             }
>>>      }
>>>
>>> unswitch analysis will run into the condition 'A' and determine the loop
>>> can be unswitched with the condition 'invariant < 3'.  To be able to
>>> perform cost assessment and to avoid redundant unswitching we
>>> want to determine that if we unswitch with 'invariant < 3' being
>>> true then the condition at 'B' is true as well before actually inserting
>>> the if (invariant < 3) outside of the loop.
>>>
>>> So I'm thinking of assigning a gimple_uid to each condition we want to
>>> unswitch on and have an array indexed by the uid with meta-data on
>>> the unswitch opportunity, the "related" conditions could be marked with
>>> the same uid (or some other), and the folding result recorded so that
>>> at transform time we can just do the appropriate replacement without
>>> invoking ranger again.
>>
>> Calculating all this before transformation is quite ambitious based on the code
>> we have now.
>>
>> Note one can have in a loop:
>>
>> if (a > 100)
>>      ...
>>
>> switch (a)
>>      case 1000:
>>        ...
>>      case 20:
>>        ...
>>      case 200:
>>        ...
>>
>> which means the first predicate effectively makes some cases unreachable. Moreover
>> one can have
>>
>> if (a > 100 && b < 300)
>>      ...
>>
>> and more complex conditions.
> 
> True - I guess we should do two things.

All right.

> 
>   1) keep simplify_using_entry_checks like code for symbolic conditions
>   2) add integer ranges for unswitch conditions producing them, that
>       includes all unswitching of switch stmts - we might be able to use
>       the ranger queries (with global ranges) to simplify stmts with the
>       known ranges as noted by Andrew
> 
> I do think that pre-computing the simplifications is what we should do
> to be able to make the cost modeling sane.  What we can avoid
> trying is evaluating multiple unswitch possibilities to pick the "best".

So the first step would be taking all unswitching candidates (gconds basically)
and grouping them (all items in a group would fold to true edge in the unswitched loop).
Is it something we want to do combining simplify_using_entry_checks and fold_range ranger
capability?

> 
> I think changing the code do to the analysis first should be done
> before wiring in gcond support, even adding the additional 'range'

s/gcond/switch, right?

> capability will be useful without that since the current code
> wont figure out a > 5 is true when we unswitch on a > 3.

Agree that gswitch support should be added later.

Martin

> 
>>>
>>> Now, but how do we arrange for the ranger analysis here?
>>
>> That's likely something we need support from ranger, yes.
>>
>>>
>>> We might also somehow want to remember that on the
>>> 'invariant < 3' == false copy of the loop there's still the
>>> unswitching opportunity on 'invariant < 5', but not on the
>>> 'invariant < 5' == true copy.
>>>
>>> Currently unswitching uses a custom simplify_using_entry_checks
>>> which tries to do simplification only after the fact (and so costing
>>> also is far from costing the true cost and ordering of the opportunities
>>> to do the best first is not implemented either).
>>
>> I'm sending updated version of the patch where I changed:
>> - simplify_using_entry_checks is put back for the floating point expressions
>> - all scans utilize scan-tree-dump-times
>> - some new tests were added
>> - global ranger is used (I rely on the growing patch provided by Andrew)
>> - better ranger API is used for gcond expression: ranger.range_of_stmt (r, stmt) && r.singleton_p (&result))
>> - auto_edge_flag is used now
>>
>> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>>
>> Thoughts?
>> Martin
>>
>>>
>>> Richard.
>>>
>>>> Andrew
>>>>
  
Richard Biener Nov. 11, 2021, 7:15 a.m. UTC | #20
On Wed, Nov 10, 2021 at 2:29 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 11/10/21 09:59, Richard Biener wrote:
> > On Tue, Nov 9, 2021 at 5:44 PM Martin Liška <mliska@suse.cz> wrote:
> >>
> >> On 11/9/21 14:37, Richard Biener wrote:
> >>> On Mon, Nov 8, 2021 at 8:45 PM Andrew MacLeod <amacleod@redhat.com> wrote:
> >>>>
> >>>> On 11/8/21 10:05 AM, Martin Liška wrote:
> >>>>> On 9/28/21 22:39, Andrew MacLeod wrote:
> >>>>>> In Theory, modifying the IL should be fine, it happens already in
> >>>>>> places, but its not extensively tested under those conditions yet.
> >>>>>
> >>>>> Hello Andrew.
> >>>>>
> >>>>> I've just tried using a global gimple_ranger and it crashes when loop
> >>>>> unswitching duplicates
> >>>>> some BBs.
> >>>>>
> >>>>> Please try the attached patch for:
> >>>>
> >>>> hey Martin,
> >>>>
> >>>> try using this in your tree.  Since nothing else is using a growing BB
> >>>> right now, I'll let you work with it and see if everything works as
> >>>> expected before checking it in, just in case we need more tweaking.
> >>>> With this,
> >>>>
> >>>> make RUNTESTFLAGS=dg.exp=loop-unswitch*.c check-gcc
> >>>>
> >>>> runs clean.
> >>>>
> >>>>
> >>>> basically, I tried to grow it by either a factor of 10% for the current
> >>>> BB size when the grow is requested, or some double the needed extra
> >>>> size, or 128... whichever value is "maximum"    That means it shoudnt be
> >>>> asking for tooo much each time, but also not a minimum amount.
> >>>>
> >>>> Im certainly open to suggestion on how much to grow it each time.
> >>>> Note the vector being grown is ONLY fo the SSA_NAme being asked for.. so
> >>>> it really an on-demand thing just for specific names, in your case,
> >>>> mostly just the switch index.
> >>>>
> >>>> Let me know how this works for you, and if you have any other issues.
> >>>
> >>> So I think in the end we shouldn't need the growing.  Ideally we'd do all
> >>> the analysis before the first transform, but for that we'd need ranger to
> >>> be able to "simplify" conditions based on a known true/false predicate
> >>> that's not yet in the IL.  Consider
> >>>
> >>>    for (;;)
> >>>      {
> >>>           if (invariant < 3) // A
> >>>             {
> >>> ...
> >>>             }
> >>>           if (invariant < 5) // B
> >>>             {
> >>> ...
> >>>             }
> >>>      }
> >>>
> >>> unswitch analysis will run into the condition 'A' and determine the loop
> >>> can be unswitched with the condition 'invariant < 3'.  To be able to
> >>> perform cost assessment and to avoid redundant unswitching we
> >>> want to determine that if we unswitch with 'invariant < 3' being
> >>> true then the condition at 'B' is true as well before actually inserting
> >>> the if (invariant < 3) outside of the loop.
> >>>
> >>> So I'm thinking of assigning a gimple_uid to each condition we want to
> >>> unswitch on and have an array indexed by the uid with meta-data on
> >>> the unswitch opportunity, the "related" conditions could be marked with
> >>> the same uid (or some other), and the folding result recorded so that
> >>> at transform time we can just do the appropriate replacement without
> >>> invoking ranger again.
> >>
> >> Calculating all this before transformation is quite ambitious based on the code
> >> we have now.
> >>
> >> Note one can have in a loop:
> >>
> >> if (a > 100)
> >>      ...
> >>
> >> switch (a)
> >>      case 1000:
> >>        ...
> >>      case 20:
> >>        ...
> >>      case 200:
> >>        ...
> >>
> >> which means the first predicate effectively makes some cases unreachable. Moreover
> >> one can have
> >>
> >> if (a > 100 && b < 300)
> >>      ...
> >>
> >> and more complex conditions.
> >
> > True - I guess we should do two things.
>
> All right.
>
> >
> >   1) keep simplify_using_entry_checks like code for symbolic conditions
> >   2) add integer ranges for unswitch conditions producing them, that
> >       includes all unswitching of switch stmts - we might be able to use
> >       the ranger queries (with global ranges) to simplify stmts with the
> >       known ranges as noted by Andrew
> >
> > I do think that pre-computing the simplifications is what we should do
> > to be able to make the cost modeling sane.  What we can avoid
> > trying is evaluating multiple unswitch possibilities to pick the "best".
>
> So the first step would be taking all unswitching candidates (gconds basically)
> and grouping them (all items in a group would fold to true edge in the unswitched loop).
> Is it something we want to do combining simplify_using_entry_checks and fold_range ranger
> capability?

The first step is to, after finding the first condition to unswitch on, continue
scanning the loop body for related conditions that will simplify to be able to
assess the true cost of the unswitching more readily and to prepare the
simplification step after the loop copying.

That is, the first step is to "fix" the cost modeling.

I wouldn't go as far as trying to discover all unswitching opportunities at once
yet - even though the far goal would be to maybe do that to discover the most
profitable / cheapest.

If you look at simplify_using_entry_checks then this is really really simple,
so I'd try to abstract this, recording sth like a unswitch_predicate where
we store the condition we unswitch on plus maybe cache the constant
range of a VAR cmp CST variable condition on the true/false edge.  We
can then try to simplify each gcond/gswitch based on such an unswitch_predicate
(when we ever scan the loop once to discover all opportunities we'd have a
set of unswitch_predicates to try simplifying against).  As said the integer
range thing would be an improvement over the current state so even that
can be done as followup but I guess for gswitch support that's going to be
the thing to use.

So I'd try to do no functional change first, improving the costing and
setting up the transform to simply pick up the stmts to "fold" as discovered
during analysis (as I hinted you possibly can use gimple_uid to mark
the stmts that simplify, IIRC gimple_uid is preserved during copying.
gimple_uid would also scale better than gimple_plf in case we do
the analysis for all candidates at once).

As for costing in the end we should probably assign a budget based on
the original loop and eat from that for all unswitchings done on it.

> >
> > I think changing the code do to the analysis first should be done
> > before wiring in gcond support, even adding the additional 'range'
>
> s/gcond/switch, right?

Eh, yes - obviously ;)

>
> > capability will be useful without that since the current code
> > wont figure out a > 5 is true when we unswitch on a > 3.
>
> Agree that gswitch support should be added later.
>
> Martin
>
> >
> >>>
> >>> Now, but how do we arrange for the ranger analysis here?
> >>
> >> That's likely something we need support from ranger, yes.
> >>
> >>>
> >>> We might also somehow want to remember that on the
> >>> 'invariant < 3' == false copy of the loop there's still the
> >>> unswitching opportunity on 'invariant < 5', but not on the
> >>> 'invariant < 5' == true copy.
> >>>
> >>> Currently unswitching uses a custom simplify_using_entry_checks
> >>> which tries to do simplification only after the fact (and so costing
> >>> also is far from costing the true cost and ordering of the opportunities
> >>> to do the best first is not implemented either).
> >>
> >> I'm sending updated version of the patch where I changed:
> >> - simplify_using_entry_checks is put back for the floating point expressions
> >> - all scans utilize scan-tree-dump-times
> >> - some new tests were added
> >> - global ranger is used (I rely on the growing patch provided by Andrew)
> >> - better ranger API is used for gcond expression: ranger.range_of_stmt (r, stmt) && r.singleton_p (&result))
> >> - auto_edge_flag is used now
> >>
> >> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
> >>
> >> Thoughts?
> >> Martin
> >>
> >>>
> >>> Richard.
> >>>
> >>>> Andrew
> >>>>
>
  
Martin Liška Nov. 16, 2021, 1:53 p.m. UTC | #21
On 11/11/21 08:15, Richard Biener wrote:
> If you look at simplify_using_entry_checks then this is really really simple,
> so I'd try to abstract this, recording sth like a unswitch_predicate where
> we store the condition we unswitch on plus maybe cache the constant
> range of a VAR cmp CST variable condition on the true/false edge.  We
> can then try to simplify each gcond/gswitch based on such an unswitch_predicate
> (when we ever scan the loop once to discover all opportunities we'd have a
> set of unswitch_predicates to try simplifying against).  As said the integer
> range thing would be an improvement over the current state so even that
> can be done as followup but I guess for gswitch support that's going to be
> the thing to use.

I started working on the unswitch_predicate where I recond also true/false-edge irange
of an expression we unswitch on.

I noticed one significant problem, let's consider:

   for (int i = 0; i < size; i++)
   {
     double tmp;

     if (order == 1)
       tmp = -8 * a[i];
     else
       {
	if (order == 2)
	  tmp = -4 * b[i];
	else
	  tmp = a[i];
       }

     r[i] = 3.4f * tmp + d[i];
   }

We can end up with first unswitching candidate being 'if (order == 2)' (I have a real benchmark where it happens).
So I collect ranges and they are [2,2] for true edge and [-INF, 0], [3, INF] (because we came to the condition through order != 1 cond).
Then the loop is cloned and we have

if (order == 2)
    loop_version_1
else
    loop_version_2

but in loop_version_2 we wrongly fold 'if (order == 1)' to false because it's reflected in the range.

So the question is, can one iterate get_loop_body stmts in some dominator order?

Thanks,
Martin
  
Martin Liška Nov. 16, 2021, 2:40 p.m. UTC | #22
On 11/11/21 08:15, Richard Biener wrote:
> So I'd try to do no functional change first, improving the costing and
> setting up the transform to simply pick up the stmts to "fold" as discovered
> during analysis (as I hinted you possibly can use gimple_uid to mark
> the stmts that simplify, IIRC gimple_uid is preserved during copying.
> gimple_uid would also scale better than gimple_plf in case we do
> the analysis for all candidates at once).

Thinking about the analysis. Am I correct that we want to properly calculate
loop size for true and false edge of a potential gcond before the actually unswitching?

We can do that by finding a first gcond candidate, evaluate (symbolic + irange approache)
all other gcond in the loop body and use BB_REACHABLE discovery. Similarly to what we do now
at lines 378-446. Then tree_num_loop_insns can be adjusted for only these reachable blocks.
Having that, we can calculate # of insns that will live in true/false loops.

Then we can call tree_unswitch_loop and make the gcond folding as we do in the versioned loops.

Is it a step in good direction? Having that we can then extend it to gswitch statements.

Cheers,
Martin
  
Richard Biener Nov. 19, 2021, 9:49 a.m. UTC | #23
On Tue, Nov 16, 2021 at 2:53 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 11/11/21 08:15, Richard Biener wrote:
> > If you look at simplify_using_entry_checks then this is really really simple,
> > so I'd try to abstract this, recording sth like a unswitch_predicate where
> > we store the condition we unswitch on plus maybe cache the constant
> > range of a VAR cmp CST variable condition on the true/false edge.  We
> > can then try to simplify each gcond/gswitch based on such an unswitch_predicate
> > (when we ever scan the loop once to discover all opportunities we'd have a
> > set of unswitch_predicates to try simplifying against).  As said the integer
> > range thing would be an improvement over the current state so even that
> > can be done as followup but I guess for gswitch support that's going to be
> > the thing to use.
>
> I started working on the unswitch_predicate where I recond also true/false-edge irange
> of an expression we unswitch on.
>
> I noticed one significant problem, let's consider:
>
>    for (int i = 0; i < size; i++)
>    {
>      double tmp;
>
>      if (order == 1)
>        tmp = -8 * a[i];
>      else
>        {
>         if (order == 2)
>           tmp = -4 * b[i];
>         else
>           tmp = a[i];
>        }
>
>      r[i] = 3.4f * tmp + d[i];
>    }
>
> We can end up with first unswitching candidate being 'if (order == 2)' (I have a real benchmark where it happens).
> So I collect ranges and they are [2,2] for true edge and [-INF, 0], [3, INF] (because we came to the condition through order != 1 cond).

You can only record a range from the unswitch condition itself, not
from the path you came, so your false edge range
looks wrong.

> Then the loop is cloned and we have

> if (order == 2)
>     loop_version_1
> else
>     loop_version_2
>
> but in loop_version_2 we wrongly fold 'if (order == 1)' to false because it's reflected in the range.

Of course for loop_version_1 the range is [2, 2] and for
loop_version_2 it is ~[2, 2] - you
do have to invert the range when folding the "false" version.

> So the question is, can one iterate get_loop_body stmts in some dominator order?

I don't think this would fix anything, but yes, in the end we'd like
to unswitch on "outer" conditions
first.  That would mean iterating blocks in program order, the easiest
way is to use
get_loop_body_in_dom_order which should fix the above testcase but
likely not arbitrary
nested ifs.  For those you could use
rev_post_order_and_mark_dfs_back_seme, but as said
using get_loop_body_in_dom_order should be a strict improvement even
without any other
change (you could propose a patch if you have a testcase that shows a
difference with
otherwise unchanged unswitching, for example choosing the outer condition
instead of the inner with --param max-unswitch-level==0).

Richard.

>
> Thanks,
> Martin
>
>
  
Richard Biener Nov. 19, 2021, 10 a.m. UTC | #24
On Tue, Nov 16, 2021 at 3:40 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 11/11/21 08:15, Richard Biener wrote:
> > So I'd try to do no functional change first, improving the costing and
> > setting up the transform to simply pick up the stmts to "fold" as discovered
> > during analysis (as I hinted you possibly can use gimple_uid to mark
> > the stmts that simplify, IIRC gimple_uid is preserved during copying.
> > gimple_uid would also scale better than gimple_plf in case we do
> > the analysis for all candidates at once).
>
> Thinking about the analysis. Am I correct that we want to properly calculate
> loop size for true and false edge of a potential gcond before the actually unswitching?

Yes.

> We can do that by finding a first gcond candidate, evaluate (symbolic + irange approache)
> all other gcond in the loop body and use BB_REACHABLE discovery. Similarly to what we do now
> at lines 378-446. Then tree_num_loop_insns can be adjusted for only these reachable blocks.
> Having that, we can calculate # of insns that will live in true/false loops.

So whatever we do here we should record as "this control stmt folds to
{true,false}" (or {true,unknown},
or in future, "this control stmt will lead to edge {e,unknown}"),
recording the simplification
on the true/false loop version in a way we can apply it after the transform.

> Then we can call tree_unswitch_loop and make the gcond folding as we do in the versioned loops.
>
> Is it a step in good direction? Having that we can then extend it to gswitch statements.

One issue I see is that BB_REACHABLE is there only once but you could use
auto_bb_flag reachable_true, reachable_false to distinguish the
true/false loop version
copies.

So yes, I think that sounds reasonable.  At the point we want to
evaluate different
(first) unswitching opportunities against each other storing this only
as BB flag is
likely to hit limits.  When we want to evaluate multiple levels of
unswitching before
doing any transforms even more so (if there are 3 opportunities there'd be
many cases to be considered when going to level 3 ;)).  I _think_ that a sparse
lattice of stmt UID -> edge might do the trick if we change tree_num_loop_insns
do to a DFS walk from the loop header, ignoring not taken edges by
consulting the
lattice.  Or, for speed reason, pre-compute tree_num_loop_insns for each BB
so we just have to sum a different set of BBs rather than walking all
stmts again.

That said, the second step would definitely be to choose the "best" opportunity
on the current level.

Richard.

> Cheers,
> Martin
  
Martin Liška Nov. 22, 2021, 3:06 p.m. UTC | #25
On 11/19/21 11:00, Richard Biener wrote:
> On Tue, Nov 16, 2021 at 3:40 PM Martin Liška <mliska@suse.cz> wrote:
>>
>> On 11/11/21 08:15, Richard Biener wrote:
>>> So I'd try to do no functional change first, improving the costing and
>>> setting up the transform to simply pick up the stmts to "fold" as discovered
>>> during analysis (as I hinted you possibly can use gimple_uid to mark
>>> the stmts that simplify, IIRC gimple_uid is preserved during copying.
>>> gimple_uid would also scale better than gimple_plf in case we do
>>> the analysis for all candidates at once).
>>
>> Thinking about the analysis. Am I correct that we want to properly calculate
>> loop size for true and false edge of a potential gcond before the actually unswitching?
> 
> Yes.
> 
>> We can do that by finding a first gcond candidate, evaluate (symbolic + irange approache)
>> all other gcond in the loop body and use BB_REACHABLE discovery. Similarly to what we do now
>> at lines 378-446. Then tree_num_loop_insns can be adjusted for only these reachable blocks.
>> Having that, we can calculate # of insns that will live in true/false loops.
> 
> So whatever we do here we should record as "this control stmt folds to
> {true,false}" (or {true,unknown},
> or in future, "this control stmt will lead to edge {e,unknown}"),
> recording the simplification
> on the true/false loop version in a way we can apply it after the transform.
> 
>> Then we can call tree_unswitch_loop and make the gcond folding as we do in the versioned loops.
>>
>> Is it a step in good direction? Having that we can then extend it to gswitch statements.
> 
> One issue I see is that BB_REACHABLE is there only once but you could use
> auto_bb_flag reachable_true, reachable_false to distinguish the
> true/false loop version
> copies.
> 
> So yes, I think that sounds reasonable.  At the point we want to
> evaluate different
> (first) unswitching opportunities against each other storing this only
> as BB flag is
> likely to hit limits.  When we want to evaluate multiple levels of
> unswitching before
> doing any transforms even more so (if there are 3 opportunities there'd be
> many cases to be considered when going to level 3 ;)).  I _think_ that a sparse
> lattice of stmt UID -> edge might do the trick if we change tree_num_loop_insns
> do to a DFS walk from the loop header, ignoring not taken edges by
> consulting the
> lattice.  Or, for speed reason, pre-compute tree_num_loop_insns for each BB
> so we just have to sum a different set of BBs rather than walking all
> stmts again.
> 
> That said, the second step would definitely be to choose the "best" opportunity
> on the current level.
> 
> Richard.
> 
>> Cheers,
>> Martin

Hello.

I'm sending a new version where I changed:
1) all unswitch_predicates are find for a loop
2) context sensitive costing happens based on an unswitch_predicate and BB reachability
    is implemented
3) folding happens in recursive invocation once we decide to unswitch
4) the patch folds both symbolic gcond predicates and irange provided by ranger
5) debug counter was added

Patch can bootstrap on x86_64-linux-gnu and survives regression tests. Plus, I tested it
on SPEC2006 and SPEC2017 with -size=ref.

Thoughts?
Thanks,
Martin
  
Richard Biener Nov. 23, 2021, 1:58 p.m. UTC | #26
On Mon, Nov 22, 2021 at 4:07 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 11/19/21 11:00, Richard Biener wrote:
> > On Tue, Nov 16, 2021 at 3:40 PM Martin Liška <mliska@suse.cz> wrote:
> >>
> >> On 11/11/21 08:15, Richard Biener wrote:
> >>> So I'd try to do no functional change first, improving the costing and
> >>> setting up the transform to simply pick up the stmts to "fold" as discovered
> >>> during analysis (as I hinted you possibly can use gimple_uid to mark
> >>> the stmts that simplify, IIRC gimple_uid is preserved during copying.
> >>> gimple_uid would also scale better than gimple_plf in case we do
> >>> the analysis for all candidates at once).
> >>
> >> Thinking about the analysis. Am I correct that we want to properly calculate
> >> loop size for true and false edge of a potential gcond before the actually unswitching?
> >
> > Yes.
> >
> >> We can do that by finding a first gcond candidate, evaluate (symbolic + irange approache)
> >> all other gcond in the loop body and use BB_REACHABLE discovery. Similarly to what we do now
> >> at lines 378-446. Then tree_num_loop_insns can be adjusted for only these reachable blocks.
> >> Having that, we can calculate # of insns that will live in true/false loops.
> >
> > So whatever we do here we should record as "this control stmt folds to
> > {true,false}" (or {true,unknown},
> > or in future, "this control stmt will lead to edge {e,unknown}"),
> > recording the simplification
> > on the true/false loop version in a way we can apply it after the transform.
> >
> >> Then we can call tree_unswitch_loop and make the gcond folding as we do in the versioned loops.
> >>
> >> Is it a step in good direction? Having that we can then extend it to gswitch statements.
> >
> > One issue I see is that BB_REACHABLE is there only once but you could use
> > auto_bb_flag reachable_true, reachable_false to distinguish the
> > true/false loop version
> > copies.
> >
> > So yes, I think that sounds reasonable.  At the point we want to
> > evaluate different
> > (first) unswitching opportunities against each other storing this only
> > as BB flag is
> > likely to hit limits.  When we want to evaluate multiple levels of
> > unswitching before
> > doing any transforms even more so (if there are 3 opportunities there'd be
> > many cases to be considered when going to level 3 ;)).  I _think_ that a sparse
> > lattice of stmt UID -> edge might do the trick if we change tree_num_loop_insns
> > do to a DFS walk from the loop header, ignoring not taken edges by
> > consulting the
> > lattice.  Or, for speed reason, pre-compute tree_num_loop_insns for each BB
> > so we just have to sum a different set of BBs rather than walking all
> > stmts again.
> >
> > That said, the second step would definitely be to choose the "best" opportunity
> > on the current level.
> >
> > Richard.
> >
> >> Cheers,
> >> Martin
>
> Hello.
>
> I'm sending a new version where I changed:
> 1) all unswitch_predicates are find for a loop
> 2) context sensitive costing happens based on an unswitch_predicate and BB reachability
>     is implemented
> 3) folding happens in recursive invocation once we decide to unswitch
> 4) the patch folds both symbolic gcond predicates and irange provided by ranger
> 5) debug counter was added
>
> Patch can bootstrap on x86_64-linux-gnu and survives regression tests. Plus, I tested it
> on SPEC2006 and SPEC2017 with -size=ref.

Meh, diff made a mess out if this ;)  Random comments, I'm walking
myself the optimizations
flow.

tree_unswitch_single_loop:

+  unswitch_predicate *predicate = NULL;
+  if (num > param_max_unswitch_level)
+    {
+      if (dump_file
+         && (dump_flags & TDF_DETAILS))
+       fprintf (dump_file, ";; Not unswitching anymore, hit max level\n");
+      goto exit;
+    }

this looks like we can do this check before find_all_unswitching_predicates?

+  for (auto pred: candidates)
+    {
+      unsigned cost
+       = evaluate_loop_insns_for_predicate (loop, bbs, ranger, pred);
...

so this searches for the first candidate that fits in
param_max_unswitch_insns, it doesn't
yet try to find the cheapest / best one.  Please add a comment to say
that.  After we
found one candidate we apply unswitching to such one candidate (and throw the
others away).  I guess that's OK - it's what the old code did - what
you did for this
intermediate step is actually gather all unswitching predicates
upfront.  Hopefully
we'll be able to share some of the work done for the recursive invocations.

+         fprintf (dump_file, ";; Unswitching loop with condition: ");

"on condition"

+         fprintf (dump_file, ";; Not unswitching condition, loop too big "
+                  "(%d insns): ", cost);

"cost too big"?  I assume 'cost' is the number of stmts we'll add,
loop-size - true-eliminated - false-eliminated?

+exit:
+  for (auto predicate: candidates)
+    delete predicate;

Some refactoring should get rid of the goto ...

+static unsigned
+evaluate_insns (class loop *loop,  basic_block *bbs,
+               unswitch_predicate *candidate, bool true_edge,
+               gimple_ranger *ranger, auto_bb_flag &reachable_flag)
+{
...
+                 unswitch_predicate *predicate
+                   = tree_may_unswitch_on (bb, loop, ranger);
+                 if (predicate != NULL)
+                   {
+                     tree folded
+                       = simplify_using_entry_checks (predicate, candidate,
+                                                      true_edge);

so just looking at the implementation it seems that if you assign UIDs
to all last_stmt() in a loop you can keep tree_may_unswitch_on ()
in a dense array and you do not have to re-compute it?  UIDs should
survive copying so even for recursive unswitchings the predicates should
be shareable (pointing to the wrong stmt then, but then just do not record
the stmt in there - it should be available from the caller somehow).

But then I wonder how this translates to gswitch handling where the
result is a taken edge / a set of known not taken edges.  I suggest
to try wiring in gcond support in a separate commit on top of this to
see whether some refactoring is needed.

Looking at evaluate_insns it also seems that 'reachable_flag'
is unused - computing the sizes could be done once you process
a block, no?  Previously I suggested to compute estimate_num_insns
for each BB of the original loop once so you can re-use that and
just need to sum up BB costs.


+static bool
+find_all_unswitching_predicates (class loop *loop, basic_block *bbs,
+                                bool true_edge,
+                                unswitch_predicate *parent_predicate,
+                                gimple_ranger *ranger,
+                                vec<unswitch_predicate *> &candidates)
+{
...

so you re-do the simplification step here, in fact this function seems
to not only find unswitching predicates but apply simplifications
from the previous unswitching step.  (yes, that's what the old code did ...)
The costing step already did all the work though, it should be
possible to keep a vector of stmts to simplify and the simplification
from that analysis time.

You can get at the "other loop copy" stmt via

  last_stmt (get_bb_copy (gimple_bb (cond)))

if you do that before free_original_copy_tables.  I think it would be
nice to have the simplification step explicit before recursing
even if that means some duplicate walking for now
(I do hope we can share the find_all_unswitching_predicates
analysis step)

You are applying param_max_unswitch_insns with

+      unsigned cost
+       = evaluate_loop_insns_for_predicate (loop, bbs, ranger, pred);

-      cond = simplify_using_entry_checks (loop, cond);
-      stmt = last_stmt (bbs[i]);
-      if (integer_nonzerop (cond))
+      if (cost <= (unsigned)param_max_unswitch_insns)

where cost is the sum of the simplified sizes of both loop copies after
unswitching.  By its very nature this will match for all recursive invocations
(loops only will get smaller) which is why we need param_max_unswitch_level.
The idea was to have one param_max_unswitch_insns budget for each
_original_ loop the recursive invocations will eat from, removing the need
for param_max_unswitch_level.  I'd re-interpret param_max_unswitch_insns
as the number of insns we may add, so with cost calculated as above we'd
have

    if (cost <= budget)
...

and

    budget -= cost;

before the recursion (and pass by reference so that the two recursions
share the budget).

Initialize by

  budget = original_loop_size + param_max_unswitch_insns;

Hope this wasn't too many comments ;)

Thanks,
Richard.

> Thoughts?
> Thanks,
> Martin
>
  
Martin Liška Nov. 23, 2021, 3:20 p.m. UTC | #27
On 11/23/21 14:58, Richard Biener wrote:
> On Mon, Nov 22, 2021 at 4:07 PM Martin Liška <mliska@suse.cz> wrote:
>>
>> On 11/19/21 11:00, Richard Biener wrote:
>>> On Tue, Nov 16, 2021 at 3:40 PM Martin Liška <mliska@suse.cz> wrote:
>>>>
>>>> On 11/11/21 08:15, Richard Biener wrote:
>>>>> So I'd try to do no functional change first, improving the costing and
>>>>> setting up the transform to simply pick up the stmts to "fold" as discovered
>>>>> during analysis (as I hinted you possibly can use gimple_uid to mark
>>>>> the stmts that simplify, IIRC gimple_uid is preserved during copying.
>>>>> gimple_uid would also scale better than gimple_plf in case we do
>>>>> the analysis for all candidates at once).
>>>>
>>>> Thinking about the analysis. Am I correct that we want to properly calculate
>>>> loop size for true and false edge of a potential gcond before the actually unswitching?
>>>
>>> Yes.
>>>
>>>> We can do that by finding a first gcond candidate, evaluate (symbolic + irange approache)
>>>> all other gcond in the loop body and use BB_REACHABLE discovery. Similarly to what we do now
>>>> at lines 378-446. Then tree_num_loop_insns can be adjusted for only these reachable blocks.
>>>> Having that, we can calculate # of insns that will live in true/false loops.
>>>
>>> So whatever we do here we should record as "this control stmt folds to
>>> {true,false}" (or {true,unknown},
>>> or in future, "this control stmt will lead to edge {e,unknown}"),
>>> recording the simplification
>>> on the true/false loop version in a way we can apply it after the transform.
>>>
>>>> Then we can call tree_unswitch_loop and make the gcond folding as we do in the versioned loops.
>>>>
>>>> Is it a step in good direction? Having that we can then extend it to gswitch statements.
>>>
>>> One issue I see is that BB_REACHABLE is there only once but you could use
>>> auto_bb_flag reachable_true, reachable_false to distinguish the
>>> true/false loop version
>>> copies.
>>>
>>> So yes, I think that sounds reasonable.  At the point we want to
>>> evaluate different
>>> (first) unswitching opportunities against each other storing this only
>>> as BB flag is
>>> likely to hit limits.  When we want to evaluate multiple levels of
>>> unswitching before
>>> doing any transforms even more so (if there are 3 opportunities there'd be
>>> many cases to be considered when going to level 3 ;)).  I _think_ that a sparse
>>> lattice of stmt UID -> edge might do the trick if we change tree_num_loop_insns
>>> do to a DFS walk from the loop header, ignoring not taken edges by
>>> consulting the
>>> lattice.  Or, for speed reason, pre-compute tree_num_loop_insns for each BB
>>> so we just have to sum a different set of BBs rather than walking all
>>> stmts again.
>>>
>>> That said, the second step would definitely be to choose the "best" opportunity
>>> on the current level.
>>>
>>> Richard.
>>>
>>>> Cheers,
>>>> Martin
>>
>> Hello.
>>
>> I'm sending a new version where I changed:
>> 1) all unswitch_predicates are find for a loop
>> 2) context sensitive costing happens based on an unswitch_predicate and BB reachability
>>      is implemented
>> 3) folding happens in recursive invocation once we decide to unswitch
>> 4) the patch folds both symbolic gcond predicates and irange provided by ranger
>> 5) debug counter was added
>>
>> Patch can bootstrap on x86_64-linux-gnu and survives regression tests. Plus, I tested it
>> on SPEC2006 and SPEC2017 with -size=ref.
> 
> Meh, diff made a mess out if this ;)  Random comments, I'm walking
> myself the optimizations
> flow.

Sure.

> 
> tree_unswitch_single_loop:
> 
> +  unswitch_predicate *predicate = NULL;
> +  if (num > param_max_unswitch_level)
> +    {
> +      if (dump_file
> +         && (dump_flags & TDF_DETAILS))
> +       fprintf (dump_file, ";; Not unswitching anymore, hit max level\n");
> +      goto exit;
> +    }
> 
> this looks like we can do this check before find_all_unswitching_predicates?

Makes sense.

> 
> +  for (auto pred: candidates)
> +    {
> +      unsigned cost
> +       = evaluate_loop_insns_for_predicate (loop, bbs, ranger, pred);
> ...
> 
> so this searches for the first candidate that fits in
> param_max_unswitch_insns, it doesn't
> yet try to find the cheapest / best one.  Please add a comment to say
> that.  After we
> found one candidate we apply unswitching to such one candidate (and throw the
> others away).  I guess that's OK - it's what the old code did - what
> you did for this
> intermediate step is actually gather all unswitching predicates
> upfront.  Hopefully
> we'll be able to share some of the work done for the recursive invocations.
> 
> +         fprintf (dump_file, ";; Unswitching loop with condition: ");
> 
> "on condition"
> 
> +         fprintf (dump_file, ";; Not unswitching condition, loop too big "
> +                  "(%d insns): ", cost);
> 
> "cost too big"?  I assume 'cost' is the number of stmts we'll add,
> loop-size - true-eliminated - false-eliminated?

I'm going to adjust this.

> 
> +exit:
> +  for (auto predicate: candidates)
> +    delete predicate;
> 
> Some refactoring should get rid of the goto ...

You don't like it? It seems to me quite logical as one does not have to repeat
a clean up code before each return statement.

> 
> +static unsigned
> +evaluate_insns (class loop *loop,  basic_block *bbs,
> +               unswitch_predicate *candidate, bool true_edge,
> +               gimple_ranger *ranger, auto_bb_flag &reachable_flag)
> +{
> ...
> +                 unswitch_predicate *predicate
> +                   = tree_may_unswitch_on (bb, loop, ranger);
> +                 if (predicate != NULL)
> +                   {
> +                     tree folded
> +                       = simplify_using_entry_checks (predicate, candidate,
> +                                                      true_edge);
> 
> so just looking at the implementation it seems that if you assign UIDs
> to all last_stmt() in a loop you can keep tree_may_unswitch_on ()
> in a dense array and you do not have to re-compute it?  UIDs should
> survive copying so even for recursive unswitchings the predicates should
> be shareable (pointing to the wrong stmt then, but then just do not record
> the stmt in there - it should be available from the caller somehow).

I like the idea, lemme implement it.

> 
> But then I wonder how this translates to gswitch handling where the
> result is a taken edge / a set of known not taken edges.  I suggest
> to try wiring in gcond support in a separate commit on top of this to

s/gcond/gswitch, right?

> see whether some refactoring is needed.

Sure, so for e.g. case 1 ... 5 we would need to create a new unswitch_predicate
with 1 <= index && index <= 5 tree predicate (and the corresponding irange range).
Later once we unswitch on it, we should use a special unreachable_flag that will
be used for marking of dead edges (similarly how we fold gconds to boolean_{false/true}_node.
Does it make sense?

> 
> Looking at evaluate_insns it also seems that 'reachable_flag'
> is unused - computing the sizes could be done once you process

It is used:

	  if (dest->loop_father == loop
	      && !(dest->flags & reachable_flag)
	      && !(e->flags & flags))
	    {
	      worklist.safe_push (dest);
	      dest->flags |= reachable_flag;
	    }

Where's problem?

> a block, no?  Previously I suggested to compute estimate_num_insns
> for each BB of the original loop once so you can re-use that and
> just need to sum up BB costs.

Yep, makes sense.

> 
> 
> +static bool
> +find_all_unswitching_predicates (class loop *loop, basic_block *bbs,
> +                                bool true_edge,
> +                                unswitch_predicate *parent_predicate,
> +                                gimple_ranger *ranger,
> +                                vec<unswitch_predicate *> &candidates)
> +{
> ...
> 
> so you re-do the simplification step here, in fact this function seems
> to not only find unswitching predicates but apply simplifications
> from the previous unswitching step.  (yes, that's what the old code did ...)
> The costing step already did all the work though, it should be
> possible to keep a vector of stmts to simplify and the simplification
> from that analysis time.

Makes sense.

> 
> You can get at the "other loop copy" stmt via
> 
>    last_stmt (get_bb_copy (gimple_bb (cond)))
> 
> if you do that before free_original_copy_tables.  I think it would be
> nice to have the simplification step explicit before recursing
> even if that means some duplicate walking for now
> (I do hope we can share the find_all_unswitching_predicates
> analysis step)

Nods.

> 
> You are applying param_max_unswitch_insns with
> 
> +      unsigned cost
> +       = evaluate_loop_insns_for_predicate (loop, bbs, ranger, pred);
> 
> -      cond = simplify_using_entry_checks (loop, cond);
> -      stmt = last_stmt (bbs[i]);
> -      if (integer_nonzerop (cond))
> +      if (cost <= (unsigned)param_max_unswitch_insns)
> 
> where cost is the sum of the simplified sizes of both loop copies after
> unswitching.  By its very nature this will match for all recursive invocations
> (loops only will get smaller) which is why we need param_max_unswitch_level.
> The idea was to have one param_max_unswitch_insns budget for each
> _original_ loop the recursive invocations will eat from, removing the need
> for param_max_unswitch_level.  I'd re-interpret param_max_unswitch_insns
> as the number of insns we may add, so with cost calculated as above we'd
> have
> 
>      if (cost <= budget)
> ...
> 
> and
> 
>      budget -= cost;
> 
> before the recursion (and pass by reference so that the two recursions
> share the budget).
> 
> Initialize by
> 
>    budget = original_loop_size + param_max_unswitch_insns;

Ah, ok, lemme change that.

> 
> Hope this wasn't too many comments ;)

It was, but they are all very useful.

Thanks,
Martin

> 
> Thanks,
> Richard.
> 
>> Thoughts?
>> Thanks,
>> Martin
>>
  
Martin Liška Nov. 23, 2021, 4:36 p.m. UTC | #28
On 11/23/21 16:20, Martin Liška wrote:
> Sure, so for e.g. case 1 ... 5 we would need to create a new unswitch_predicate
> with 1 <= index && index <= 5 tree predicate (and the corresponding irange range).
> Later once we unswitch on it, we should use a special unreachable_flag that will
> be used for marking of dead edges (similarly how we fold gconds to boolean_{false/true}_node.
> Does it make sense?

I have thought about it more and it's not enough. What we really want is having a irange
for *each edge* (2 for gconds and multiple for gswitchs). Once we select a unswitch_predicate,
then we need to fold_range in true/false loop all these iranges. Doing that we can handle situations like:

if (index < 1)
    do_something1

if (index > 2)
    do_something2

switch (index)
    case 1 ... 2:
      do_something;
...

as seen the once we unswitch on 'index < 1' and 'index > 2', then the first case will be taken in the false_edge
of 'index > 2' loop unswitching.

Martin
  
Richard Biener Nov. 24, 2021, 7:46 a.m. UTC | #29
On Tue, Nov 23, 2021 at 4:20 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 11/23/21 14:58, Richard Biener wrote:
> > On Mon, Nov 22, 2021 at 4:07 PM Martin Liška <mliska@suse.cz> wrote:
> >>
> >> On 11/19/21 11:00, Richard Biener wrote:
> >>> On Tue, Nov 16, 2021 at 3:40 PM Martin Liška <mliska@suse.cz> wrote:
> >>>>
> >>>> On 11/11/21 08:15, Richard Biener wrote:
> >>>>> So I'd try to do no functional change first, improving the costing and
> >>>>> setting up the transform to simply pick up the stmts to "fold" as discovered
> >>>>> during analysis (as I hinted you possibly can use gimple_uid to mark
> >>>>> the stmts that simplify, IIRC gimple_uid is preserved during copying.
> >>>>> gimple_uid would also scale better than gimple_plf in case we do
> >>>>> the analysis for all candidates at once).
> >>>>
> >>>> Thinking about the analysis. Am I correct that we want to properly calculate
> >>>> loop size for true and false edge of a potential gcond before the actually unswitching?
> >>>
> >>> Yes.
> >>>
> >>>> We can do that by finding a first gcond candidate, evaluate (symbolic + irange approache)
> >>>> all other gcond in the loop body and use BB_REACHABLE discovery. Similarly to what we do now
> >>>> at lines 378-446. Then tree_num_loop_insns can be adjusted for only these reachable blocks.
> >>>> Having that, we can calculate # of insns that will live in true/false loops.
> >>>
> >>> So whatever we do here we should record as "this control stmt folds to
> >>> {true,false}" (or {true,unknown},
> >>> or in future, "this control stmt will lead to edge {e,unknown}"),
> >>> recording the simplification
> >>> on the true/false loop version in a way we can apply it after the transform.
> >>>
> >>>> Then we can call tree_unswitch_loop and make the gcond folding as we do in the versioned loops.
> >>>>
> >>>> Is it a step in good direction? Having that we can then extend it to gswitch statements.
> >>>
> >>> One issue I see is that BB_REACHABLE is there only once but you could use
> >>> auto_bb_flag reachable_true, reachable_false to distinguish the
> >>> true/false loop version
> >>> copies.
> >>>
> >>> So yes, I think that sounds reasonable.  At the point we want to
> >>> evaluate different
> >>> (first) unswitching opportunities against each other storing this only
> >>> as BB flag is
> >>> likely to hit limits.  When we want to evaluate multiple levels of
> >>> unswitching before
> >>> doing any transforms even more so (if there are 3 opportunities there'd be
> >>> many cases to be considered when going to level 3 ;)).  I _think_ that a sparse
> >>> lattice of stmt UID -> edge might do the trick if we change tree_num_loop_insns
> >>> do to a DFS walk from the loop header, ignoring not taken edges by
> >>> consulting the
> >>> lattice.  Or, for speed reason, pre-compute tree_num_loop_insns for each BB
> >>> so we just have to sum a different set of BBs rather than walking all
> >>> stmts again.
> >>>
> >>> That said, the second step would definitely be to choose the "best" opportunity
> >>> on the current level.
> >>>
> >>> Richard.
> >>>
> >>>> Cheers,
> >>>> Martin
> >>
> >> Hello.
> >>
> >> I'm sending a new version where I changed:
> >> 1) all unswitch_predicates are find for a loop
> >> 2) context sensitive costing happens based on an unswitch_predicate and BB reachability
> >>      is implemented
> >> 3) folding happens in recursive invocation once we decide to unswitch
> >> 4) the patch folds both symbolic gcond predicates and irange provided by ranger
> >> 5) debug counter was added
> >>
> >> Patch can bootstrap on x86_64-linux-gnu and survives regression tests. Plus, I tested it
> >> on SPEC2006 and SPEC2017 with -size=ref.
> >
> > Meh, diff made a mess out if this ;)  Random comments, I'm walking
> > myself the optimizations
> > flow.
>
> Sure.
>
> >
> > tree_unswitch_single_loop:
> >
> > +  unswitch_predicate *predicate = NULL;
> > +  if (num > param_max_unswitch_level)
> > +    {
> > +      if (dump_file
> > +         && (dump_flags & TDF_DETAILS))
> > +       fprintf (dump_file, ";; Not unswitching anymore, hit max level\n");
> > +      goto exit;
> > +    }
> >
> > this looks like we can do this check before find_all_unswitching_predicates?
>
> Makes sense.
>
> >
> > +  for (auto pred: candidates)
> > +    {
> > +      unsigned cost
> > +       = evaluate_loop_insns_for_predicate (loop, bbs, ranger, pred);
> > ...
> >
> > so this searches for the first candidate that fits in
> > param_max_unswitch_insns, it doesn't
> > yet try to find the cheapest / best one.  Please add a comment to say
> > that.  After we
> > found one candidate we apply unswitching to such one candidate (and throw the
> > others away).  I guess that's OK - it's what the old code did - what
> > you did for this
> > intermediate step is actually gather all unswitching predicates
> > upfront.  Hopefully
> > we'll be able to share some of the work done for the recursive invocations.
> >
> > +         fprintf (dump_file, ";; Unswitching loop with condition: ");
> >
> > "on condition"
> >
> > +         fprintf (dump_file, ";; Not unswitching condition, loop too big "
> > +                  "(%d insns): ", cost);
> >
> > "cost too big"?  I assume 'cost' is the number of stmts we'll add,
> > loop-size - true-eliminated - false-eliminated?
>
> I'm going to adjust this.
>
> >
> > +exit:
> > +  for (auto predicate: candidates)
> > +    delete predicate;
> >
> > Some refactoring should get rid of the goto ...
>
> You don't like it? It seems to me quite logical as one does not have to repeat
> a clean up code before each return statement.
>
> >
> > +static unsigned
> > +evaluate_insns (class loop *loop,  basic_block *bbs,
> > +               unswitch_predicate *candidate, bool true_edge,
> > +               gimple_ranger *ranger, auto_bb_flag &reachable_flag)
> > +{
> > ...
> > +                 unswitch_predicate *predicate
> > +                   = tree_may_unswitch_on (bb, loop, ranger);
> > +                 if (predicate != NULL)
> > +                   {
> > +                     tree folded
> > +                       = simplify_using_entry_checks (predicate, candidate,
> > +                                                      true_edge);
> >
> > so just looking at the implementation it seems that if you assign UIDs
> > to all last_stmt() in a loop you can keep tree_may_unswitch_on ()
> > in a dense array and you do not have to re-compute it?  UIDs should
> > survive copying so even for recursive unswitchings the predicates should
> > be shareable (pointing to the wrong stmt then, but then just do not record
> > the stmt in there - it should be available from the caller somehow).
>
> I like the idea, lemme implement it.
>
> >
> > But then I wonder how this translates to gswitch handling where the
> > result is a taken edge / a set of known not taken edges.  I suggest
> > to try wiring in gcond support in a separate commit on top of this to
>
> s/gcond/gswitch, right?
>
> > see whether some refactoring is needed.
>
> Sure, so for e.g. case 1 ... 5 we would need to create a new unswitch_predicate
> with 1 <= index && index <= 5 tree predicate (and the corresponding irange range).
> Later once we unswitch on it, we should use a special unreachable_flag that will
> be used for marking of dead edges (similarly how we fold gconds to boolean_{false/true}_node.
> Does it make sense?
>
> >
> > Looking at evaluate_insns it also seems that 'reachable_flag'
> > is unused - computing the sizes could be done once you process
>
> It is used:
>
>           if (dest->loop_father == loop
>               && !(dest->flags & reachable_flag)
>               && !(e->flags & flags))
>             {
>               worklist.safe_push (dest);
>               dest->flags |= reachable_flag;
>             }
>
> Where's problem?

Ah, OK - it's used as 'visited' as well.  Fine then.

>
> > a block, no?  Previously I suggested to compute estimate_num_insns
> > for each BB of the original loop once so you can re-use that and
> > just need to sum up BB costs.
>
> Yep, makes sense.

I could suggest to use bb->aux ...

(maybe that's even copied by the versioning so that recursions do not need
to recompute stuff - no, it doesn't seem so, we only copy edge->aux it seems
looking at duplicate_block!?)

Anyway - just compute it for each "original" loop then, in the very far end
we hopefully evaluate the whole recursion before doing the transforms.

> >
> >
> > +static bool
> > +find_all_unswitching_predicates (class loop *loop, basic_block *bbs,
> > +                                bool true_edge,
> > +                                unswitch_predicate *parent_predicate,
> > +                                gimple_ranger *ranger,
> > +                                vec<unswitch_predicate *> &candidates)
> > +{
> > ...
> >
> > so you re-do the simplification step here, in fact this function seems
> > to not only find unswitching predicates but apply simplifications
> > from the previous unswitching step.  (yes, that's what the old code did ...)
> > The costing step already did all the work though, it should be
> > possible to keep a vector of stmts to simplify and the simplification
> > from that analysis time.
>
> Makes sense.
>
> >
> > You can get at the "other loop copy" stmt via
> >
> >    last_stmt (get_bb_copy (gimple_bb (cond)))
> >
> > if you do that before free_original_copy_tables.  I think it would be
> > nice to have the simplification step explicit before recursing
> > even if that means some duplicate walking for now
> > (I do hope we can share the find_all_unswitching_predicates
> > analysis step)
>
> Nods.
>
> >
> > You are applying param_max_unswitch_insns with
> >
> > +      unsigned cost
> > +       = evaluate_loop_insns_for_predicate (loop, bbs, ranger, pred);
> >
> > -      cond = simplify_using_entry_checks (loop, cond);
> > -      stmt = last_stmt (bbs[i]);
> > -      if (integer_nonzerop (cond))
> > +      if (cost <= (unsigned)param_max_unswitch_insns)
> >
> > where cost is the sum of the simplified sizes of both loop copies after
> > unswitching.  By its very nature this will match for all recursive invocations
> > (loops only will get smaller) which is why we need param_max_unswitch_level.
> > The idea was to have one param_max_unswitch_insns budget for each
> > _original_ loop the recursive invocations will eat from, removing the need
> > for param_max_unswitch_level.  I'd re-interpret param_max_unswitch_insns
> > as the number of insns we may add, so with cost calculated as above we'd
> > have
> >
> >      if (cost <= budget)
> > ...
> >
> > and
> >
> >      budget -= cost;
> >
> > before the recursion (and pass by reference so that the two recursions
> > share the budget).
> >
> > Initialize by
> >
> >    budget = original_loop_size + param_max_unswitch_insns;
>
> Ah, ok, lemme change that.
>
> >
> > Hope this wasn't too many comments ;)
>
> It was, but they are all very useful.
>
> Thanks,
> Martin
>
> >
> > Thanks,
> > Richard.
> >
> >> Thoughts?
> >> Thanks,
> >> Martin
> >>
>
  
Richard Biener Nov. 24, 2021, 8 a.m. UTC | #30
On Tue, Nov 23, 2021 at 5:36 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 11/23/21 16:20, Martin Liška wrote:
> > Sure, so for e.g. case 1 ... 5 we would need to create a new unswitch_predicate
> > with 1 <= index && index <= 5 tree predicate (and the corresponding irange range).
> > Later once we unswitch on it, we should use a special unreachable_flag that will
> > be used for marking of dead edges (similarly how we fold gconds to boolean_{false/true}_node.
> > Does it make sense?
>
> I have thought about it more and it's not enough. What we really want is having a irange
> for *each edge* (2 for gconds and multiple for gswitchs). Once we select a unswitch_predicate,
> then we need to fold_range in true/false loop all these iranges. Doing that we can handle situations like:
>
> if (index < 1)
>     do_something1
>
> if (index > 2)
>     do_something2
>
> switch (index)
>     case 1 ... 2:
>       do_something;
> ...
>
> as seen the once we unswitch on 'index < 1' and 'index > 2', then the first case will be taken in the false_edge
> of 'index > 2' loop unswitching.

Hmm.  I'm not sure it needs to be this complicated.  We're basically
evaluating ranges/predicates based
on a fixed set of versioning predicates.  Your implementation created
"predicates" for the to be simplified
conditions but in the end we like to evaluate the actual stmt to
figure the taken/not taken edges.  IIRC
elsewhere Andrew showed a snipped on how to evaluate a stmt with a
given range - not sure if that
was useful enough.  So what I think would be nice if we could somehow
use rangers path query
without an actual CFG.  So we virtuall have

  if (versioning-predicate1)
    if (versioning-predicate2)
       ;
   else
      for (;;) // out current loop
        {
          ...
              if (condition)
                ;
         ...
              switch (var)
                 {
...
                  }
        }

and versioning-predicate1 and versioning-predicate2 are not in the IL.
What we'd like
to do is seed the path query with a "virtual" path through the two
predicates to the
entry of the loop and compute_ranges based on those.  Then we like to
use range_of_stmt on 'if (condition)' and 'switch (var)' to determine
not taken edges.
Looking somewhat at the sources it seems like we "simply" need to do what
compute_outgoing_relations does - unfortunately the code lacks comments
so I have no idea what jt_fur_source src (...).register_outgoing_edges does ...

Anyway, for now manually simplifying things is fine but I probably would still
stick to a basic interface that marks not taken outgoing edges of a stmt based
on the set of versioning predicates.

Richard.

>
> Martin
  
Martin Liška Nov. 24, 2021, 10:48 a.m. UTC | #31
On 11/24/21 09:00, Richard Biener wrote:
> On Tue, Nov 23, 2021 at 5:36 PM Martin Liška <mliska@suse.cz> wrote:
>>
>> On 11/23/21 16:20, Martin Liška wrote:
>>> Sure, so for e.g. case 1 ... 5 we would need to create a new unswitch_predicate
>>> with 1 <= index && index <= 5 tree predicate (and the corresponding irange range).
>>> Later once we unswitch on it, we should use a special unreachable_flag that will
>>> be used for marking of dead edges (similarly how we fold gconds to boolean_{false/true}_node.
>>> Does it make sense?
>>
>> I have thought about it more and it's not enough. What we really want is having a irange
>> for *each edge* (2 for gconds and multiple for gswitchs). Once we select a unswitch_predicate,
>> then we need to fold_range in true/false loop all these iranges. Doing that we can handle situations like:
>>
>> if (index < 1)
>>      do_something1
>>
>> if (index > 2)
>>      do_something2
>>
>> switch (index)
>>      case 1 ... 2:
>>        do_something;
>> ...
>>
>> as seen the once we unswitch on 'index < 1' and 'index > 2', then the first case will be taken in the false_edge
>> of 'index > 2' loop unswitching.
> 
> Hmm.  I'm not sure it needs to be this complicated.  We're basically
> evaluating ranges/predicates based
> on a fixed set of versioning predicates.  Your implementation created
> "predicates" for the to be simplified
> conditions but in the end we like to evaluate the actual stmt to
> figure the taken/not taken edges.

Yes.

>  IIRC
> elsewhere Andrew showed a snipped on how to evaluate a stmt with a
> given range - not sure if that

I'm using that. First I isolate a irange from a versioning-predicate with
ranger->range_on_edge and I later combine it with:
fold_range (r, stmt, parent_range).


> was useful enough.  So what I think would be nice if we could somehow
> use rangers path query
> without an actual CFG.  So we virtuall have
> 
>    if (versioning-predicate1)
>      if (versioning-predicate2)
>         ;
>     else
>        for (;;) // out current loop
>          {
>            ...
>                if (condition)
>                  ;
>           ...
>                switch (var)
>                   {
> ...
>                    }
>          }
> 
> and versioning-predicate1 and versioning-predicate2 are not in the IL.
> What we'd like
> to do is seed the path query with a "virtual" path through the two
> predicates to the
> entry of the loop and compute_ranges based on those.

What I can do that via building of a vector of tuple<unswitch_predicate,bool>
that would be passed to recursive calls of tree_unswitch_single_loop.
That basically describes which true/false edges are taken for the so far created
versioning-predicates. Right? That should be usable.

> Then we like to
> use range_of_stmt on 'if (condition)' and 'switch (var)' to determine
> not taken edges.

Works for me and we would mark unreachable case BBs with a unreachable_flag
(we can't fold it away as shown in the original patch attempt).

> Looking somewhat at the sources it seems like we "simply" need to do what
> compute_outgoing_relations does - unfortunately the code lacks comments
> so I have no idea what jt_fur_source src (...).register_outgoing_edges does ...
> 
> Anyway, for now manually simplifying things is fine but I probably would still
> stick to a basic interface that marks not taken outgoing edges of a stmt based
> on the set of versioning predicates.

Lemme try working on another version of the patch.

Martin

> 
> Richard.
> 
>>
>> Martin
  
Richard Biener Nov. 24, 2021, 12:48 p.m. UTC | #32
On Wed, Nov 24, 2021 at 11:48 AM Martin Liška <mliska@suse.cz> wrote:
>
> On 11/24/21 09:00, Richard Biener wrote:
> > On Tue, Nov 23, 2021 at 5:36 PM Martin Liška <mliska@suse.cz> wrote:
> >>
> >> On 11/23/21 16:20, Martin Liška wrote:
> >>> Sure, so for e.g. case 1 ... 5 we would need to create a new unswitch_predicate
> >>> with 1 <= index && index <= 5 tree predicate (and the corresponding irange range).
> >>> Later once we unswitch on it, we should use a special unreachable_flag that will
> >>> be used for marking of dead edges (similarly how we fold gconds to boolean_{false/true}_node.
> >>> Does it make sense?
> >>
> >> I have thought about it more and it's not enough. What we really want is having a irange
> >> for *each edge* (2 for gconds and multiple for gswitchs). Once we select a unswitch_predicate,
> >> then we need to fold_range in true/false loop all these iranges. Doing that we can handle situations like:
> >>
> >> if (index < 1)
> >>      do_something1
> >>
> >> if (index > 2)
> >>      do_something2
> >>
> >> switch (index)
> >>      case 1 ... 2:
> >>        do_something;
> >> ...
> >>
> >> as seen the once we unswitch on 'index < 1' and 'index > 2', then the first case will be taken in the false_edge
> >> of 'index > 2' loop unswitching.
> >
> > Hmm.  I'm not sure it needs to be this complicated.  We're basically
> > evaluating ranges/predicates based
> > on a fixed set of versioning predicates.  Your implementation created
> > "predicates" for the to be simplified
> > conditions but in the end we like to evaluate the actual stmt to
> > figure the taken/not taken edges.
>
> Yes.
>
> >  IIRC
> > elsewhere Andrew showed a snipped on how to evaluate a stmt with a
> > given range - not sure if that
>
> I'm using that. First I isolate a irange from a versioning-predicate with
> ranger->range_on_edge and I later combine it with:
> fold_range (r, stmt, parent_range).
>
>
> > was useful enough.  So what I think would be nice if we could somehow
> > use rangers path query
> > without an actual CFG.  So we virtuall have
> >
> >    if (versioning-predicate1)
> >      if (versioning-predicate2)
> >         ;
> >     else
> >        for (;;) // out current loop
> >          {
> >            ...
> >                if (condition)
> >                  ;
> >           ...
> >                switch (var)
> >                   {
> > ...
> >                    }
> >          }
> >
> > and versioning-predicate1 and versioning-predicate2 are not in the IL.
> > What we'd like
> > to do is seed the path query with a "virtual" path through the two
> > predicates to the
> > entry of the loop and compute_ranges based on those.
>
> What I can do that via building of a vector of tuple<unswitch_predicate,bool>
> that would be passed to recursive calls of tree_unswitch_single_loop.
> That basically describes which true/false edges are taken for the so far created
> versioning-predicates. Right? That should be usable.

Yeah.  Not sure how much incremental re-use we can have here.  I'd keep
things simple at this point and shoot for something that works on a single
recursion level only.

> > Then we like to
> > use range_of_stmt on 'if (condition)' and 'switch (var)' to determine
> > not taken edges.
>
> Works for me and we would mark unreachable case BBs with a unreachable_flag
> (we can't fold it away as shown in the original patch attempt).

As said we probably want to mark edges, unreachable edges from
upthread recursions
should get their flags copied even.

> > Looking somewhat at the sources it seems like we "simply" need to do what
> > compute_outgoing_relations does - unfortunately the code lacks comments
> > so I have no idea what jt_fur_source src (...).register_outgoing_edges does ...
> >
> > Anyway, for now manually simplifying things is fine but I probably would still
> > stick to a basic interface that marks not taken outgoing edges of a stmt based
> > on the set of versioning predicates.
>
> Lemme try working on another version of the patch.

Yup.  You did have a branch, right?  Maybe I'll poke at it a bit as well.

Richard.

> Martin
>
> >
> > Richard.
> >
> >>
> >> Martin
>
  
Martin Liška Nov. 24, 2021, 2:14 p.m. UTC | #33
On 11/24/21 13:48, Richard Biener wrote:
> Yup.  You did have a branch, right?  Maybe I'll poke at it a bit as well.

Well, I rebase quite a bit as it's under heavy development now. Do you want me
creating a devel/* branch?

Anyway, I've got a proof-of-concept patch that does:

- unswitch_predicates are first discovered before top-level tree_unswitch_single_loop
   is called and they live in a shared cache based on gimple::uid.
- finding candidates in a loop is easy -> uses unswitch_predicates from the previous step
- note I allow multiple unswitch_predicates for a BB, it's because gswitch that can emit > 2
   for a switch
- evaluate_loop_insns_for_predicate does not fold any statements
- folding happens right before a recursive tree_unswitch_single_loop happens
- costing is not resolved yet, but should be easy
- combine_range can intersect all iranges for a given index variable (gimple_cond_lhs for now).

It likely miscompiles gcc.dg/loop-unswitch-5.c, working on that..

Thoughts?
Cheers,
Martin
  
Martin Liška Nov. 24, 2021, 2:32 p.m. UTC | #34
On 11/24/21 15:14, Martin Liška wrote:
> It likely miscompiles gcc.dg/loop-unswitch-5.c, working on that..

Fixed that in the updated version.

Martin
  
Aldy Hernandez Nov. 25, 2021, 10:38 a.m. UTC | #35
On Wed, Nov 24, 2021 at 9:00 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Tue, Nov 23, 2021 at 5:36 PM Martin Liška <mliska@suse.cz> wrote:
> >
> > On 11/23/21 16:20, Martin Liška wrote:
> > > Sure, so for e.g. case 1 ... 5 we would need to create a new unswitch_predicate
> > > with 1 <= index && index <= 5 tree predicate (and the corresponding irange range).
> > > Later once we unswitch on it, we should use a special unreachable_flag that will
> > > be used for marking of dead edges (similarly how we fold gconds to boolean_{false/true}_node.
> > > Does it make sense?
> >
> > I have thought about it more and it's not enough. What we really want is having a irange
> > for *each edge* (2 for gconds and multiple for gswitchs). Once we select a unswitch_predicate,
> > then we need to fold_range in true/false loop all these iranges. Doing that we can handle situations like:
> >
> > if (index < 1)
> >     do_something1
> >
> > if (index > 2)
> >     do_something2
> >
> > switch (index)
> >     case 1 ... 2:
> >       do_something;
> > ...
> >
> > as seen the once we unswitch on 'index < 1' and 'index > 2', then the first case will be taken in the false_edge
> > of 'index > 2' loop unswitching.
>
> Hmm.  I'm not sure it needs to be this complicated.  We're basically
> evaluating ranges/predicates based
> on a fixed set of versioning predicates.  Your implementation created
> "predicates" for the to be simplified
> conditions but in the end we like to evaluate the actual stmt to
> figure the taken/not taken edges.  IIRC
> elsewhere Andrew showed a snipped on how to evaluate a stmt with a
> given range - not sure if that
> was useful enough.  So what I think would be nice if we could somehow
> use rangers path query
> without an actual CFG.  So we virtuall have
>
>   if (versioning-predicate1)
>     if (versioning-predicate2)
>        ;
>    else
>       for (;;) // out current loop
>         {
>           ...
>               if (condition)
>                 ;
>          ...
>               switch (var)
>                  {
> ...
>                   }
>         }
>
> and versioning-predicate1 and versioning-predicate2 are not in the IL.
> What we'd like
> to do is seed the path query with a "virtual" path through the two
> predicates to the
> entry of the loop and compute_ranges based on those.  Then we like to
> use range_of_stmt on 'if (condition)' and 'switch (var)' to determine
> not taken edges.

Huh, that's an interesting idea.  We could definitely adapt
path_range_query to work with an artificial sequence of blocks, but it
would need some surgery.  Off the top of my head:

a) The phi handling code looks for specific edges in the path (both
for intra path ranges and for relations inherent in PHIs).
b) The exported ranges between blocks in the path, probably needs some
massaging.
c) compute_outgoing_relations would need some work as you mention below...

> Looking somewhat at the sources it seems like we "simply" need to do what
> compute_outgoing_relations does - unfortunately the code lacks comments
> so I have no idea what jt_fur_source src (...).register_outgoing_edges does ...

fur_source is an abstraction for operands to the folding mechanism:

// Source of all operands for fold_using_range and gori_compute.
// It abstracts out the source of an operand so it can come from a stmt or
// and edge or anywhere a derived class of fur_source wants.
// The default simply picks up ranges from the current range_query.

class fur_source
{
}

When passed to register_outgoing_edges, it registers outgoing
relations out of a conditional.  I pass it the known outgoing edge out
of the conditional, so only the relational on that edge is recorded.
I have overloaded fur_source into a path specific jt_fur_source that
uses a path_oracle to register relations as they would occur along a
path.  Once register_outgoing_edges is called on each outgoing edge
between blocks in a path, the relations will have been set, and can be
seen by the range_of_stmt:

path_range_query::range_of_stmt (irange &r, gimple *stmt, tree)
{
...
  // If resolving unknowns, fold the statement making use of any
  // relations along the path.
  if (m_resolve)
    {
      fold_using_range f;
      jt_fur_source src (stmt, this, &m_ranger->gori (), m_path);
      if (!f.fold_stmt (r, stmt, src))
    r.set_varying (type);
    }
...
}

register_outgoing_edges would probably have to be adjusted for your
CFGless paths, and maybe the path_oracle (Andrew??).

My apologies.  The jt_fur_source is not only  wrongly named
"jump_thread", but is the least obvious part of the solver.  There are
some comments for jt_fur_source, but its use could benefit from better
comments throughout.  Let's see if I have some time before my leave to
document things better.

Aldy

>
> Anyway, for now manually simplifying things is fine but I probably would still
> stick to a basic interface that marks not taken outgoing edges of a stmt based
> on the set of versioning predicates.
>
> Richard.
>
> >
> > Martin
>
  
Richard Biener Nov. 26, 2021, 7:45 a.m. UTC | #36
On Thu, Nov 25, 2021 at 11:38 AM Aldy Hernandez <aldyh@redhat.com> wrote:
>
> On Wed, Nov 24, 2021 at 9:00 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Tue, Nov 23, 2021 at 5:36 PM Martin Liška <mliska@suse.cz> wrote:
> > >
> > > On 11/23/21 16:20, Martin Liška wrote:
> > > > Sure, so for e.g. case 1 ... 5 we would need to create a new unswitch_predicate
> > > > with 1 <= index && index <= 5 tree predicate (and the corresponding irange range).
> > > > Later once we unswitch on it, we should use a special unreachable_flag that will
> > > > be used for marking of dead edges (similarly how we fold gconds to boolean_{false/true}_node.
> > > > Does it make sense?
> > >
> > > I have thought about it more and it's not enough. What we really want is having a irange
> > > for *each edge* (2 for gconds and multiple for gswitchs). Once we select a unswitch_predicate,
> > > then we need to fold_range in true/false loop all these iranges. Doing that we can handle situations like:
> > >
> > > if (index < 1)
> > >     do_something1
> > >
> > > if (index > 2)
> > >     do_something2
> > >
> > > switch (index)
> > >     case 1 ... 2:
> > >       do_something;
> > > ...
> > >
> > > as seen the once we unswitch on 'index < 1' and 'index > 2', then the first case will be taken in the false_edge
> > > of 'index > 2' loop unswitching.
> >
> > Hmm.  I'm not sure it needs to be this complicated.  We're basically
> > evaluating ranges/predicates based
> > on a fixed set of versioning predicates.  Your implementation created
> > "predicates" for the to be simplified
> > conditions but in the end we like to evaluate the actual stmt to
> > figure the taken/not taken edges.  IIRC
> > elsewhere Andrew showed a snipped on how to evaluate a stmt with a
> > given range - not sure if that
> > was useful enough.  So what I think would be nice if we could somehow
> > use rangers path query
> > without an actual CFG.  So we virtuall have
> >
> >   if (versioning-predicate1)
> >     if (versioning-predicate2)
> >        ;
> >    else
> >       for (;;) // out current loop
> >         {
> >           ...
> >               if (condition)
> >                 ;
> >          ...
> >               switch (var)
> >                  {
> > ...
> >                   }
> >         }
> >
> > and versioning-predicate1 and versioning-predicate2 are not in the IL.
> > What we'd like
> > to do is seed the path query with a "virtual" path through the two
> > predicates to the
> > entry of the loop and compute_ranges based on those.  Then we like to
> > use range_of_stmt on 'if (condition)' and 'switch (var)' to determine
> > not taken edges.
>
> Huh, that's an interesting idea.  We could definitely adapt
> path_range_query to work with an artificial sequence of blocks, but it
> would need some surgery.  Off the top of my head:
>
> a) The phi handling code looks for specific edges in the path (both
> for intra path ranges and for relations inherent in PHIs).
> b) The exported ranges between blocks in the path, probably needs some
> massaging.
> c) compute_outgoing_relations would need some work as you mention below...
>
> > Looking somewhat at the sources it seems like we "simply" need to do what
> > compute_outgoing_relations does - unfortunately the code lacks comments
> > so I have no idea what jt_fur_source src (...).register_outgoing_edges does ...
>
> fur_source is an abstraction for operands to the folding mechanism:
>
> // Source of all operands for fold_using_range and gori_compute.
> // It abstracts out the source of an operand so it can come from a stmt or
> // and edge or anywhere a derived class of fur_source wants.
> // The default simply picks up ranges from the current range_query.
>
> class fur_source
> {
> }
>
> When passed to register_outgoing_edges, it registers outgoing
> relations out of a conditional.  I pass it the known outgoing edge out
> of the conditional, so only the relational on that edge is recorded.
> I have overloaded fur_source into a path specific jt_fur_source that
> uses a path_oracle to register relations as they would occur along a
> path.  Once register_outgoing_edges is called on each outgoing edge
> between blocks in a path, the relations will have been set, and can be
> seen by the range_of_stmt:
>
> path_range_query::range_of_stmt (irange &r, gimple *stmt, tree)
> {
> ...
>   // If resolving unknowns, fold the statement making use of any
>   // relations along the path.
>   if (m_resolve)
>     {
>       fold_using_range f;
>       jt_fur_source src (stmt, this, &m_ranger->gori (), m_path);
>       if (!f.fold_stmt (r, stmt, src))
>     r.set_varying (type);
>     }
> ...
> }
>
> register_outgoing_edges would probably have to be adjusted for your
> CFGless paths, and maybe the path_oracle (Andrew??).

So conceptually we'd attach extra predicates to an edge (a single one,
and even the "entry" edge of the path in this specific case).  That is,
instead of explicit

   \
   if (p1)
       \
     if (p2)
        |
       { stmts }

we see

      \ p1 && p2
     { stmts }

it might be easier to think of it in this way.  p1 and p2 could be
represented as gimple_seq (but off IL) or as GENERIC tree
and my idea was that compute_outgoing_relations could somehow
process those.

There is edge->aux one could use to more easily experiment
with a solution (to avoid massaging too many APIs), possibly
also edge->insns.g, or one could use a ranger global hash-map.

That said, it's an idea on how to make use of ranger for this more
easily in the future, it's not a strict requirement for the ongoing work.

Richard.

> My apologies.  The jt_fur_source is not only  wrongly named
> "jump_thread", but is the least obvious part of the solver.  There are
> some comments for jt_fur_source, but its use could benefit from better
> comments throughout.  Let's see if I have some time before my leave to
> document things better.
>
> Aldy
>
> >
> > Anyway, for now manually simplifying things is fine but I probably would still
> > stick to a basic interface that marks not taken outgoing edges of a stmt based
> > on the set of versioning predicates.
> >
> > Richard.
> >
> > >
> > > Martin
> >
>
  
Richard Biener Nov. 26, 2021, 8:12 a.m. UTC | #37
On Wed, Nov 24, 2021 at 3:32 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 11/24/21 15:14, Martin Liška wrote:
> > It likely miscompiles gcc.dg/loop-unswitch-5.c, working on that..
>
> Fixed that in the updated version.

Function level comments need updating it seems.

+static unsigned
+evaluate_insns (class loop *loop,  basic_block *bbs,
+               predicate_vector &predicate_path,
+               auto_bb_flag &reachable_flag)
+{
+  auto_vec<basic_block> worklist (loop->num_nodes);
+  worklist.quick_push (bbs[0]);
...

so when adding gswitch support the easiest way to make

+      FOR_EACH_EDGE (e, ei, bb->succs)
+       {
...
+           {
+             worklist.safe_push (dest);
+             dest->flags |= reachable_flag;

work is when the gcond/gswitch simplification would mark
outgoing edges as (non-)executable.  For gswitch this
could be achieved by iterating over the case labels and
intersecting that with the range while for gcond it's a
matter of setting an edge flag instead of returning true/false.
I'd call the common function evaluate_control_stmt_using_entry_checks
or so and invoke it on the last stmt of a block with >= 2 outgoing
edges.

We still seem to do the simplification work twice, once for costing
and once for transform, but that's OK for now I guess.

I think you want to clear_aux_for_blocks at the end of the pass.

Otherwise I like it - it seems you have some TODO around cost
modeling.  Did you try to do gswitch support ontop as I suggested
to see if the general structure keeps working?

Thanks,
Richard.

>
> Martin
  
Martin Liška Nov. 29, 2021, 12:45 p.m. UTC | #38
On 11/26/21 09:12, Richard Biener wrote:
> On Wed, Nov 24, 2021 at 3:32 PM Martin Liška <mliska@suse.cz> wrote:
>>
>> On 11/24/21 15:14, Martin Liška wrote:
>>> It likely miscompiles gcc.dg/loop-unswitch-5.c, working on that..
>>
>> Fixed that in the updated version.
> 
> Function level comments need updating it seems.

I've done that.

> 
> +static unsigned
> +evaluate_insns (class loop *loop,  basic_block *bbs,
> +               predicate_vector &predicate_path,
> +               auto_bb_flag &reachable_flag)
> +{
> +  auto_vec<basic_block> worklist (loop->num_nodes);
> +  worklist.quick_push (bbs[0]);
> ...
> 
> so when adding gswitch support the easiest way to make
> 
> +      FOR_EACH_EDGE (e, ei, bb->succs)
> +       {
> ...
> +           {
> +             worklist.safe_push (dest);
> +             dest->flags |= reachable_flag;
> 
> work is when the gcond/gswitch simplification would mark
> outgoing edges as (non-)executable.  For gswitch this
> could be achieved by iterating over the case labels and
> intersecting that with the range while for gcond it's a
> matter of setting an edge flag instead of returning true/false.

Exactly, it can be quite naturally added to the current patch.

> I'd call the common function evaluate_control_stmt_using_entry_checks
> or so and invoke it on the last stmt of a block with >= 2 outgoing
> edges.

Yes, I'll do it for the gswitch support patch.

> 
> We still seem to do the simplification work twice, once for costing
> and once for transform, but that's OK for now I guess.
> 
> I think you want to clear_aux_for_blocks at the end of the pass.

Called that.

> 
> Otherwise I like it - it seems you have some TODO around cost
> modeling.  Did you try to do gswitch support ontop as I suggested
> to see if the general structure keeps working?

I vanished and tested the patch. No, I don't have the gswitch support patch
as the current patch was reworked a few times.

Can we please progress and have installed the suggested patch?

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Thanks,
Martin

> 
> Thanks,
> Richard.
> 
>>
>> Martin
  
Richard Biener Nov. 30, 2021, 11:17 a.m. UTC | #39
On Mon, Nov 29, 2021 at 1:45 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 11/26/21 09:12, Richard Biener wrote:
> > On Wed, Nov 24, 2021 at 3:32 PM Martin Liška <mliska@suse.cz> wrote:
> >>
> >> On 11/24/21 15:14, Martin Liška wrote:
> >>> It likely miscompiles gcc.dg/loop-unswitch-5.c, working on that..
> >>
> >> Fixed that in the updated version.
> >
> > Function level comments need updating it seems.
>
> I've done that.
>
> >
> > +static unsigned
> > +evaluate_insns (class loop *loop,  basic_block *bbs,
> > +               predicate_vector &predicate_path,
> > +               auto_bb_flag &reachable_flag)
> > +{
> > +  auto_vec<basic_block> worklist (loop->num_nodes);
> > +  worklist.quick_push (bbs[0]);
> > ...
> >
> > so when adding gswitch support the easiest way to make
> >
> > +      FOR_EACH_EDGE (e, ei, bb->succs)
> > +       {
> > ...
> > +           {
> > +             worklist.safe_push (dest);
> > +             dest->flags |= reachable_flag;
> >
> > work is when the gcond/gswitch simplification would mark
> > outgoing edges as (non-)executable.  For gswitch this
> > could be achieved by iterating over the case labels and
> > intersecting that with the range while for gcond it's a
> > matter of setting an edge flag instead of returning true/false.
>
> Exactly, it can be quite naturally added to the current patch.
>
> > I'd call the common function evaluate_control_stmt_using_entry_checks
> > or so and invoke it on the last stmt of a block with >= 2 outgoing
> > edges.
>
> Yes, I'll do it for the gswitch support patch.
>
> >
> > We still seem to do the simplification work twice, once for costing
> > and once for transform, but that's OK for now I guess.
> >
> > I think you want to clear_aux_for_blocks at the end of the pass.
>
> Called that.
>
> >
> > Otherwise I like it - it seems you have some TODO around cost
> > modeling.  Did you try to do gswitch support ontop as I suggested
> > to see if the general structure keeps working?
>
> I vanished and tested the patch. No, I don't have the gswitch support patch
> as the current patch was reworked a few times.
>
> Can we please progress and have installed the suggested patch?

I'd like to see the gswitch support - that's what was posted before stage3
close, this patch on its own doesn't seem worth pushing for.  That said,
I have some comments below (and the already raised ones about how
things might need to change with gswitch support).  Is it so difficult to
develop gswitch support as a separate change ontop of this?

> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

+#include <utility>

that's included unconditionally by system.h

+/* The type represents a predicate path leading to a basic block.  */
+typedef auto_vec<std::pair<unswitch_predicate *, bool>> predicate_vector;

+static bool tree_unswitch_single_loop (class loop *, int,
+                                      predicate_vector &predicate_path,

I think we don't want to pass auto_vec by reference, instead auto_vec should
decay to vec<> when passed around.

+  unswitch_predicate *predicate = new unswitch_predicate (cond, lhs);
+  if (irange::supports_type_p (TREE_TYPE (lhs)) && CONSTANT_CLASS_P (rhs))
+    {
+      ranger->range_on_edge (predicate->true_range, edge_true, lhs);
+      predicate->false_range = predicate->true_range;

-  return cond;
+      if (!predicate->false_range.varying_p ()
+         && !predicate->false_range.undefined_p ())
+       predicate->false_range.invert ();
+    }

is that correct?  I would guess range_on_edge, for

   if (a > 10)
     if (a < 15)
        /* true */
     else
        /* false */

figures [11, 14] on the true edge of if (a < 15) (considered the
unswitch predicate),
inverting that yields [0, 10] u [15, +INF] but that's at least
sub-optimal for the
else range.  I think we want to call range_on_edge again to determine the range
on the else branch?

 }

-/* Simplifies COND using checks in front of the entry of the LOOP.  Just very
-   simplish (sufficient to prevent us from duplicating loop in unswitching
-   unnecessarily).  */
+static void
+combine_range (predicate_vector &predicate_path, tree index, irange
&path_range)
+{

unless I misread the patch combine_range misses a comment.

+evaluate_control_stmt_using_entry_checks (gimple *stmt,
+                                         predicate_vector &predicate_path)
 {

so this function for ranger does combine all predicates on the predicate_path
but for the symbolic evaluation it looks at the last predicate only?  I guess
that's because other predicate simplification opportunities are applied already,
correct?  But doesn't that mean that the combine_range could be done once
when we build the predicate vector instead of for each stmt?  I'm just
looking at the difference in treating both cases - if we first analyze the whole
unswitching path (including all recursions) then we'd have to simplify all
opportunities at once, so iterating over all predicates would make sense.
Still merging ranges when pushing the to the predicate vector rather than
for each stmt would make sense?  We'd then have at most one predicate
supported by irange for a specific lhs on the vector?

+      if (EDGE_COUNT (bb->succs) == 2)
+       {
+         gcond *cond = dyn_cast<gcond *> (last_stmt (bb));
+         if (cond != NULL)
+           {

since gcond always has two edges the edge count test is redundant
and removing it allows to indent less ;)

Richard.


> Ready to be installed?
> Thanks,
> Martin
>
> >
> > Thanks,
> > Richard.
> >
> >>
> >> Martin
  
Martin Liška Dec. 1, 2021, 2:10 p.m. UTC | #40
On 11/30/21 12:17, Richard Biener wrote:
> On Mon, Nov 29, 2021 at 1:45 PM Martin Liška <mliska@suse.cz> wrote:
>>
>> On 11/26/21 09:12, Richard Biener wrote:
>>> On Wed, Nov 24, 2021 at 3:32 PM Martin Liška <mliska@suse.cz> wrote:
>>>>
>>>> On 11/24/21 15:14, Martin Liška wrote:
>>>>> It likely miscompiles gcc.dg/loop-unswitch-5.c, working on that..
>>>>
>>>> Fixed that in the updated version.
>>>
>>> Function level comments need updating it seems.
>>
>> I've done that.
>>
>>>
>>> +static unsigned
>>> +evaluate_insns (class loop *loop,  basic_block *bbs,
>>> +               predicate_vector &predicate_path,
>>> +               auto_bb_flag &reachable_flag)
>>> +{
>>> +  auto_vec<basic_block> worklist (loop->num_nodes);
>>> +  worklist.quick_push (bbs[0]);
>>> ...
>>>
>>> so when adding gswitch support the easiest way to make
>>>
>>> +      FOR_EACH_EDGE (e, ei, bb->succs)
>>> +       {
>>> ...
>>> +           {
>>> +             worklist.safe_push (dest);
>>> +             dest->flags |= reachable_flag;
>>>
>>> work is when the gcond/gswitch simplification would mark
>>> outgoing edges as (non-)executable.  For gswitch this
>>> could be achieved by iterating over the case labels and
>>> intersecting that with the range while for gcond it's a
>>> matter of setting an edge flag instead of returning true/false.
>>
>> Exactly, it can be quite naturally added to the current patch.
>>
>>> I'd call the common function evaluate_control_stmt_using_entry_checks
>>> or so and invoke it on the last stmt of a block with >= 2 outgoing
>>> edges.
>>
>> Yes, I'll do it for the gswitch support patch.
>>
>>>
>>> We still seem to do the simplification work twice, once for costing
>>> and once for transform, but that's OK for now I guess.
>>>
>>> I think you want to clear_aux_for_blocks at the end of the pass.
>>
>> Called that.
>>
>>>
>>> Otherwise I like it - it seems you have some TODO around cost
>>> modeling.  Did you try to do gswitch support ontop as I suggested
>>> to see if the general structure keeps working?
>>
>> I vanished and tested the patch. No, I don't have the gswitch support patch
>> as the current patch was reworked a few times.
>>
>> Can we please progress and have installed the suggested patch?
> 
> I'd like to see the gswitch support - that's what was posted before stage3
> close, this patch on its own doesn't seem worth pushing for.  That said,
> I have some comments below (and the already raised ones about how
> things might need to change with gswitch support).  Is it so difficult to
> develop gswitch support as a separate change ontop of this?

I'm going to work on that in the upcoming days. When are you leaving for the Christmas
holidays :P ?

> 
>> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
> 
> +#include <utility>
> 
> that's included unconditionally by system.h

Good.

> 
> +/* The type represents a predicate path leading to a basic block.  */
> +typedef auto_vec<std::pair<unswitch_predicate *, bool>> predicate_vector;
> 
> +static bool tree_unswitch_single_loop (class loop *, int,
> +                                      predicate_vector &predicate_path,
> 
> I think we don't want to pass auto_vec by reference, instead auto_vec should
> decay to vec<> when passed around.

Ok.

> 
> +  unswitch_predicate *predicate = new unswitch_predicate (cond, lhs);
> +  if (irange::supports_type_p (TREE_TYPE (lhs)) && CONSTANT_CLASS_P (rhs))
> +    {
> +      ranger->range_on_edge (predicate->true_range, edge_true, lhs);
> +      predicate->false_range = predicate->true_range;
> 
> -  return cond;
> +      if (!predicate->false_range.varying_p ()
> +         && !predicate->false_range.undefined_p ())
> +       predicate->false_range.invert ();
> +    }
> 
> is that correct?  I would guess range_on_edge, for
> 
>     if (a > 10)
>       if (a < 15)
>          /* true */
>       else
>          /* false */
> 
> figures [11, 14] on the true edge of if (a < 15) (considered the
> unswitch predicate),
> inverting that yields [0, 10] u [15, +INF] but that's at least
> sub-optimal for the
> else range.  I think we want to call range_on_edge again to determine the range
> on the else branch?

No, as shown in the previous emails, Ranger is CFG sensitive and we should rely
on our irange merging.

> 
>   }
> 
> -/* Simplifies COND using checks in front of the entry of the LOOP.  Just very
> -   simplish (sufficient to prevent us from duplicating loop in unswitching
> -   unnecessarily).  */
> +static void
> +combine_range (predicate_vector &predicate_path, tree index, irange
> &path_range)
> +{
> 
> unless I misread the patch combine_range misses a comment.
> 
> +evaluate_control_stmt_using_entry_checks (gimple *stmt,
> +                                         predicate_vector &predicate_path)
>   {
> 
> so this function for ranger does combine all predicates on the predicate_path
> but for the symbolic evaluation it looks at the last predicate only?

Yes.

>  I guess
> that's because other predicate simplification opportunities are applied already,
> correct?

Exactly.

>  But doesn't that mean that the combine_range could be done once
> when we build the predicate vector instead of for each stmt?  I'm just
> looking at the difference in treating both cases - if we first analyze the whole
> unswitching path (including all recursions) then we'd have to simplify all
> opportunities at once, so iterating over all predicates would make sense.
> Still merging ranges when pushing the to the predicate vector rather than
> for each stmt would make sense?  We'd then have at most one predicate
> supported by irange for a specific lhs on the vector?

Yes, we can do better. I have a patch that appends to the path, and merges
with the previous irange that has equal LHS. Doing that, we can only first the
last predicate for a LHS and this one would have precise range (merged with all previous).

> 
> +      if (EDGE_COUNT (bb->succs) == 2)
> +       {
> +         gcond *cond = dyn_cast<gcond *> (last_stmt (bb));
> +         if (cond != NULL)
> +           {
> 
> since gcond always has two edges the edge count test is redundant
> and removing it allows to indent less ;)

Lemme work on the gswitch support now.

Martin

> 
> Richard.
> 
> 
>> Ready to be installed?
>> Thanks,
>> Martin
>>
>>>
>>> Thanks,
>>> Richard.
>>>
>>>>
>>>> Martin
  
Richard Biener Dec. 1, 2021, 2:19 p.m. UTC | #41
On Wed, Dec 1, 2021 at 3:10 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 11/30/21 12:17, Richard Biener wrote:
> > On Mon, Nov 29, 2021 at 1:45 PM Martin Liška <mliska@suse.cz> wrote:
> >>
> >> On 11/26/21 09:12, Richard Biener wrote:
> >>> On Wed, Nov 24, 2021 at 3:32 PM Martin Liška <mliska@suse.cz> wrote:
> >>>>
> >>>> On 11/24/21 15:14, Martin Liška wrote:
> >>>>> It likely miscompiles gcc.dg/loop-unswitch-5.c, working on that..
> >>>>
> >>>> Fixed that in the updated version.
> >>>
> >>> Function level comments need updating it seems.
> >>
> >> I've done that.
> >>
> >>>
> >>> +static unsigned
> >>> +evaluate_insns (class loop *loop,  basic_block *bbs,
> >>> +               predicate_vector &predicate_path,
> >>> +               auto_bb_flag &reachable_flag)
> >>> +{
> >>> +  auto_vec<basic_block> worklist (loop->num_nodes);
> >>> +  worklist.quick_push (bbs[0]);
> >>> ...
> >>>
> >>> so when adding gswitch support the easiest way to make
> >>>
> >>> +      FOR_EACH_EDGE (e, ei, bb->succs)
> >>> +       {
> >>> ...
> >>> +           {
> >>> +             worklist.safe_push (dest);
> >>> +             dest->flags |= reachable_flag;
> >>>
> >>> work is when the gcond/gswitch simplification would mark
> >>> outgoing edges as (non-)executable.  For gswitch this
> >>> could be achieved by iterating over the case labels and
> >>> intersecting that with the range while for gcond it's a
> >>> matter of setting an edge flag instead of returning true/false.
> >>
> >> Exactly, it can be quite naturally added to the current patch.
> >>
> >>> I'd call the common function evaluate_control_stmt_using_entry_checks
> >>> or so and invoke it on the last stmt of a block with >= 2 outgoing
> >>> edges.
> >>
> >> Yes, I'll do it for the gswitch support patch.
> >>
> >>>
> >>> We still seem to do the simplification work twice, once for costing
> >>> and once for transform, but that's OK for now I guess.
> >>>
> >>> I think you want to clear_aux_for_blocks at the end of the pass.
> >>
> >> Called that.
> >>
> >>>
> >>> Otherwise I like it - it seems you have some TODO around cost
> >>> modeling.  Did you try to do gswitch support ontop as I suggested
> >>> to see if the general structure keeps working?
> >>
> >> I vanished and tested the patch. No, I don't have the gswitch support patch
> >> as the current patch was reworked a few times.
> >>
> >> Can we please progress and have installed the suggested patch?
> >
> > I'd like to see the gswitch support - that's what was posted before stage3
> > close, this patch on its own doesn't seem worth pushing for.  That said,
> > I have some comments below (and the already raised ones about how
> > things might need to change with gswitch support).  Is it so difficult to
> > develop gswitch support as a separate change ontop of this?
>
> I'm going to work on that in the upcoming days. When are you leaving for the Christmas
> holidays :P ?

Wednesday next week is my first day off, but I might peek into mails
now and then.

> >
> >> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
> >
> > +#include <utility>
> >
> > that's included unconditionally by system.h
>
> Good.
>
> >
> > +/* The type represents a predicate path leading to a basic block.  */
> > +typedef auto_vec<std::pair<unswitch_predicate *, bool>> predicate_vector;
> >
> > +static bool tree_unswitch_single_loop (class loop *, int,
> > +                                      predicate_vector &predicate_path,
> >
> > I think we don't want to pass auto_vec by reference, instead auto_vec should
> > decay to vec<> when passed around.
>
> Ok.
>
> >
> > +  unswitch_predicate *predicate = new unswitch_predicate (cond, lhs);
> > +  if (irange::supports_type_p (TREE_TYPE (lhs)) && CONSTANT_CLASS_P (rhs))
> > +    {
> > +      ranger->range_on_edge (predicate->true_range, edge_true, lhs);
> > +      predicate->false_range = predicate->true_range;
> >
> > -  return cond;
> > +      if (!predicate->false_range.varying_p ()
> > +         && !predicate->false_range.undefined_p ())
> > +       predicate->false_range.invert ();
> > +    }
> >
> > is that correct?  I would guess range_on_edge, for
> >
> >     if (a > 10)
> >       if (a < 15)
> >          /* true */
> >       else
> >          /* false */
> >
> > figures [11, 14] on the true edge of if (a < 15) (considered the
> > unswitch predicate),
> > inverting that yields [0, 10] u [15, +INF] but that's at least
> > sub-optimal for the
> > else range.  I think we want to call range_on_edge again to determine the range
> > on the else branch?
>
> No, as shown in the previous emails, Ranger is CFG sensitive and we should rely
> on our irange merging.

I don't understand.  You do

> > +  unswitch_predicate *predicate = new unswitch_predicate (cond, lhs);
> > +  if (irange::supports_type_p (TREE_TYPE (lhs)) && CONSTANT_CLASS_P (rhs))
> > +    {
> > +      ranger->range_on_edge (predicate->true_range, edge_true, lhs);
> > +      predicate->false_range = predicate->true_range;
> >
> > -  return cond;
> > +      if (!predicate->false_range.varying_p ()
> > +         && !predicate->false_range.undefined_p ())
> > +       predicate->false_range.invert ();
> > +    }

which is compute the range of 'lhs' on edge_true into predicate->true_range,
assign that same range to ->false_range and then invert it to get the
range on the false_edge.  What I am saying is that for better precision
you should do

     ranger->range_on_edge (predicate->false_range, edge_false, lhs);

rather than prematurely optimize this to the inversion of the true range
since yes, ranger is CFG sensitive and only the _last_ predicate on a
long CFG path is actually inverted.

What am I missing?

>
> >
> >   }
> >
> > -/* Simplifies COND using checks in front of the entry of the LOOP.  Just very
> > -   simplish (sufficient to prevent us from duplicating loop in unswitching
> > -   unnecessarily).  */
> > +static void
> > +combine_range (predicate_vector &predicate_path, tree index, irange
> > &path_range)
> > +{
> >
> > unless I misread the patch combine_range misses a comment.
> >
> > +evaluate_control_stmt_using_entry_checks (gimple *stmt,
> > +                                         predicate_vector &predicate_path)
> >   {
> >
> > so this function for ranger does combine all predicates on the predicate_path
> > but for the symbolic evaluation it looks at the last predicate only?
>
> Yes.
>
> >  I guess
> > that's because other predicate simplification opportunities are applied already,
> > correct?
>
> Exactly.
>
> >  But doesn't that mean that the combine_range could be done once
> > when we build the predicate vector instead of for each stmt?  I'm just
> > looking at the difference in treating both cases - if we first analyze the whole
> > unswitching path (including all recursions) then we'd have to simplify all
> > opportunities at once, so iterating over all predicates would make sense.
> > Still merging ranges when pushing the to the predicate vector rather than
> > for each stmt would make sense?  We'd then have at most one predicate
> > supported by irange for a specific lhs on the vector?
>
> Yes, we can do better. I have a patch that appends to the path, and merges
> with the previous irange that has equal LHS. Doing that, we can only first the
> last predicate for a LHS and this one would have precise range (merged with all previous).
>
> >
> > +      if (EDGE_COUNT (bb->succs) == 2)
> > +       {
> > +         gcond *cond = dyn_cast<gcond *> (last_stmt (bb));
> > +         if (cond != NULL)
> > +           {
> >
> > since gcond always has two edges the edge count test is redundant
> > and removing it allows to indent less ;)
>
> Lemme work on the gswitch support now.
>
> Martin
>
> >
> > Richard.
> >
> >
> >> Ready to be installed?
> >> Thanks,
> >> Martin
> >>
> >>>
> >>> Thanks,
> >>> Richard.
> >>>
> >>>>
> >>>> Martin
>
  
Martin Liška Dec. 1, 2021, 2:25 p.m. UTC | #42
On 12/1/21 15:19, Richard Biener wrote:
> which is compute the range of 'lhs' on edge_true into predicate->true_range,
> assign that same range to ->false_range and then invert it to get the
> range on the false_edge.  What I am saying is that for better precision
> you should do
> 
>       ranger->range_on_edge (predicate->false_range, edge_false, lhs);
> 
> rather than prematurely optimize this to the inversion of the true range
> since yes, ranger is CFG sensitive and only the_last_  predicate on a
> long CFG path is actually inverted.
> 
> What am I missing?

I might be misunderstood, but I think it's the problem defined here:
https://gcc.gnu.org/pipermail/gcc-patches/2021-November/584605.html

where I used the ranger->range_on_edge on the false_edge.

Martin
  
Richard Biener Dec. 1, 2021, 2:34 p.m. UTC | #43
On Wed, Dec 1, 2021 at 3:25 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 12/1/21 15:19, Richard Biener wrote:
> > which is compute the range of 'lhs' on edge_true into predicate->true_range,
> > assign that same range to ->false_range and then invert it to get the
> > range on the false_edge.  What I am saying is that for better precision
> > you should do
> >
> >       ranger->range_on_edge (predicate->false_range, edge_false, lhs);
> >
> > rather than prematurely optimize this to the inversion of the true range
> > since yes, ranger is CFG sensitive and only the_last_  predicate on a
> > long CFG path is actually inverted.
> >
> > What am I missing?
>
> I might be misunderstood, but I think it's the problem defined here:
> https://gcc.gnu.org/pipermail/gcc-patches/2021-November/584605.html
>
> where I used the ranger->range_on_edge on the false_edge.

Ah, OK.  But then even the true_edge range is possibly wrong, no?
Consider

  for (;;)
     {
         if (a < 100)
           if (a > 50)  // unswitch on this
             /* .. */
         if (a < 120)
             /* ... */
     }

then you record [51, 99] for true_range of the a > 50 predicate and thus
simplification will simplify the if (a < 120) check, no?

You can only record the range from the (CFG independent) a > 50 check,
thus [51, +INF] but of course at simplification time you can also use
the CFG context at each simplification location.

Richard.

> Martin
  
Martin Liška Dec. 1, 2021, 2:48 p.m. UTC | #44
On 12/1/21 15:34, Richard Biener wrote:
> On Wed, Dec 1, 2021 at 3:25 PM Martin Liška <mliska@suse.cz> wrote:
>>
>> On 12/1/21 15:19, Richard Biener wrote:
>>> which is compute the range of 'lhs' on edge_true into predicate->true_range,
>>> assign that same range to ->false_range and then invert it to get the
>>> range on the false_edge.  What I am saying is that for better precision
>>> you should do
>>>
>>>        ranger->range_on_edge (predicate->false_range, edge_false, lhs);
>>>
>>> rather than prematurely optimize this to the inversion of the true range
>>> since yes, ranger is CFG sensitive and only the_last_  predicate on a
>>> long CFG path is actually inverted.
>>>
>>> What am I missing?
>>
>> I might be misunderstood, but I think it's the problem defined here:
>> https://gcc.gnu.org/pipermail/gcc-patches/2021-November/584605.html
>>
>> where I used the ranger->range_on_edge on the false_edge.
> 
> Ah, OK.  But then even the true_edge range is possibly wrong, no?

You are of course correct, I've just proved that in debugger ://

> Consider
> 
>    for (;;)
>       {
>           if (a < 100)
>             if (a > 50)  // unswitch on this
>               /* .. */
>           if (a < 120)
>               /* ... */
>       }
> 
> then you record [51, 99] for true_range of the a > 50 predicate and thus
> simplification will simplify the if (a < 120) check, no?

Yep.

> 
> You can only record the range from the (CFG independent) a > 50 check,
> thus [51, +INF] but of course at simplification time you can also use
> the CFG context at each simplification location.

@Andrew: How can I easily get irange based just on a stmt? Something like fold_range
with int_range_max as the 3rd argument?

Thanks,
Martin

> 
> Richard.
> 
>> Martin
  
Andrew MacLeod Dec. 1, 2021, 6:21 p.m. UTC | #45
On 12/1/21 09:48, Martin Liška wrote:
> On 12/1/21 15:34, Richard Biener wrote:
>> On Wed, Dec 1, 2021 at 3:25 PM Martin Liška <mliska@suse.cz> wrote:
>>>
>>> On 12/1/21 15:19, Richard Biener wrote:
>>>> which is compute the range of 'lhs' on edge_true into 
>>>> predicate->true_range,
>>>> assign that same range to ->false_range and then invert it to get the
>>>> range on the false_edge.  What I am saying is that for better 
>>>> precision
>>>> you should do
>>>>
>>>>        ranger->range_on_edge (predicate->false_range, edge_false, 
>>>> lhs);
>>>>
>>>> rather than prematurely optimize this to the inversion of the true 
>>>> range
>>>> since yes, ranger is CFG sensitive and only the_last_ predicate on a
>>>> long CFG path is actually inverted.
>>>>
>>>> What am I missing?
>>>
>>> I might be misunderstood, but I think it's the problem defined here:
>>> https://gcc.gnu.org/pipermail/gcc-patches/2021-November/584605.html
>>>
>>> where I used the ranger->range_on_edge on the false_edge.
>>
>> Ah, OK.  But then even the true_edge range is possibly wrong, no?
>
> You are of course correct, I've just proved that in debugger ://
>
>> Consider
>>
>>    for (;;)
>>       {
>>           if (a < 100)
>>             if (a > 50)  // unswitch on this
>>               /* .. */
>>           if (a < 120)
>>               /* ... */
>>       }
>>
>> then you record [51, 99] for true_range of the a > 50 predicate and thus
>> simplification will simplify the if (a < 120) check, no?
>
> Yep.
>
>>
>> You can only record the range from the (CFG independent) a > 50 check,
>> thus [51, +INF] but of course at simplification time you can also use
>> the CFG context at each simplification location.
>
> @Andrew: How can I easily get irange based just on a stmt? Something 
> like fold_range
> with int_range_max as the 3rd argument?
>
Sorry, I miss these things if I'm not directly CC'd a lot :-)

So you just want to know the basic range the stmt generates without 
context?    Sure, what you say would be fine, but your want to 
initialize it to the type range:

int_range_max range (TREE_TYPE (name));

you can also simply trigger it using the current SSA_NAME_RANGE_INFO 
global  values query instead of the default current contextual one... 
which , if there isnt a global range, will automatically use the range 
of the type of the argument.. so maybe just try

fold_range (r, stmt, get_global_range_query ())

Andrew
  
Martin Liška Dec. 2, 2021, 11:45 a.m. UTC | #46
On 12/1/21 19:21, Andrew MacLeod wrote:
> On 12/1/21 09:48, Martin Liška wrote:
>> On 12/1/21 15:34, Richard Biener wrote:
>>> On Wed, Dec 1, 2021 at 3:25 PM Martin Liška <mliska@suse.cz> wrote:
>>>>
>>>> On 12/1/21 15:19, Richard Biener wrote:
>>>>> which is compute the range of 'lhs' on edge_true into predicate->true_range,
>>>>> assign that same range to ->false_range and then invert it to get the
>>>>> range on the false_edge.  What I am saying is that for better precision
>>>>> you should do
>>>>>
>>>>>        ranger->range_on_edge (predicate->false_range, edge_false, lhs);
>>>>>
>>>>> rather than prematurely optimize this to the inversion of the true range
>>>>> since yes, ranger is CFG sensitive and only the_last_ predicate on a
>>>>> long CFG path is actually inverted.
>>>>>
>>>>> What am I missing?
>>>>
>>>> I might be misunderstood, but I think it's the problem defined here:
>>>> https://gcc.gnu.org/pipermail/gcc-patches/2021-November/584605.html
>>>>
>>>> where I used the ranger->range_on_edge on the false_edge.
>>>
>>> Ah, OK.  But then even the true_edge range is possibly wrong, no?
>>
>> You are of course correct, I've just proved that in debugger ://
>>
>>> Consider
>>>
>>>    for (;;)
>>>       {
>>>           if (a < 100)
>>>             if (a > 50)  // unswitch on this
>>>               /* .. */
>>>           if (a < 120)
>>>               /* ... */
>>>       }
>>>
>>> then you record [51, 99] for true_range of the a > 50 predicate and thus
>>> simplification will simplify the if (a < 120) check, no?
>>
>> Yep.
>>
>>>
>>> You can only record the range from the (CFG independent) a > 50 check,
>>> thus [51, +INF] but of course at simplification time you can also use
>>> the CFG context at each simplification location.
>>
>> @Andrew: How can I easily get irange based just on a stmt? Something like fold_range
>> with int_range_max as the 3rd argument?
>>
> Sorry, I miss these things if I'm not directly CC'd a lot :-)
> 
> So you just want to know the basic range the stmt generates without context?    Sure, what you say would be fine, but your want to initialize it to the type range:

Yes, I want to know range of LHS in a gcond statement and the same for cases in a gswitch statement.

> 
> int_range_max range (TREE_TYPE (name));
> 
> you can also simply trigger it using the current SSA_NAME_RANGE_INFO global  values query instead of the default current contextual one... which , if there isnt a global range, will automatically use the range of the type of the argument.. so maybe just try
> 
> fold_range (r, stmt, get_global_range_query ())

Doing

       predicate->true_range = int_range_max (TREE_TYPE (lhs));
       fold_range (predicate->true_range, stmt, get_global_range_query ());
       predicate->true_range.debug();

gives me _Bool VARYING.

Martin

> 
> Andrew
> 
> 
> 
>
  
Richard Biener Dec. 2, 2021, 12:01 p.m. UTC | #47
On Thu, Dec 2, 2021 at 12:45 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 12/1/21 19:21, Andrew MacLeod wrote:
> > On 12/1/21 09:48, Martin Liška wrote:
> >> On 12/1/21 15:34, Richard Biener wrote:
> >>> On Wed, Dec 1, 2021 at 3:25 PM Martin Liška <mliska@suse.cz> wrote:
> >>>>
> >>>> On 12/1/21 15:19, Richard Biener wrote:
> >>>>> which is compute the range of 'lhs' on edge_true into predicate->true_range,
> >>>>> assign that same range to ->false_range and then invert it to get the
> >>>>> range on the false_edge.  What I am saying is that for better precision
> >>>>> you should do
> >>>>>
> >>>>>        ranger->range_on_edge (predicate->false_range, edge_false, lhs);
> >>>>>
> >>>>> rather than prematurely optimize this to the inversion of the true range
> >>>>> since yes, ranger is CFG sensitive and only the_last_ predicate on a
> >>>>> long CFG path is actually inverted.
> >>>>>
> >>>>> What am I missing?
> >>>>
> >>>> I might be misunderstood, but I think it's the problem defined here:
> >>>> https://gcc.gnu.org/pipermail/gcc-patches/2021-November/584605.html
> >>>>
> >>>> where I used the ranger->range_on_edge on the false_edge.
> >>>
> >>> Ah, OK.  But then even the true_edge range is possibly wrong, no?
> >>
> >> You are of course correct, I've just proved that in debugger ://
> >>
> >>> Consider
> >>>
> >>>    for (;;)
> >>>       {
> >>>           if (a < 100)
> >>>             if (a > 50)  // unswitch on this
> >>>               /* .. */
> >>>           if (a < 120)
> >>>               /* ... */
> >>>       }
> >>>
> >>> then you record [51, 99] for true_range of the a > 50 predicate and thus
> >>> simplification will simplify the if (a < 120) check, no?
> >>
> >> Yep.
> >>
> >>>
> >>> You can only record the range from the (CFG independent) a > 50 check,
> >>> thus [51, +INF] but of course at simplification time you can also use
> >>> the CFG context at each simplification location.
> >>
> >> @Andrew: How can I easily get irange based just on a stmt? Something like fold_range
> >> with int_range_max as the 3rd argument?
> >>
> > Sorry, I miss these things if I'm not directly CC'd a lot :-)
> >
> > So you just want to know the basic range the stmt generates without context?    Sure, what you say would be fine, but your want to initialize it to the type range:
>
> Yes, I want to know range of LHS in a gcond statement and the same for cases in a gswitch statement.
>
> >
> > int_range_max range (TREE_TYPE (name));
> >
> > you can also simply trigger it using the current SSA_NAME_RANGE_INFO global  values query instead of the default current contextual one... which , if there isnt a global range, will automatically use the range of the type of the argument.. so maybe just try
> >
> > fold_range (r, stmt, get_global_range_query ())
>
> Doing
>
>        predicate->true_range = int_range_max (TREE_TYPE (lhs));
>        fold_range (predicate->true_range, stmt, get_global_range_query ());
>        predicate->true_range.debug();
>
> gives me _Bool VARYING.

Likely because that gives a range for the bool result rather than
a range for the LHS of a LHS op RHS on the true or false edge.
I would guess no stmt level API gives that.  In previous VRP
incarnation assert expr discovery would yield the asserts
that hold for the LHS on the respective edge and from the asserts
ranges could be determined.

Richard.

>
> Martin
>
> >
> > Andrew
> >
> >
> >
> >
>
  
Martin Liška Dec. 2, 2021, 1:10 p.m. UTC | #48
On 12/2/21 13:01, Richard Biener wrote:
> On Thu, Dec 2, 2021 at 12:45 PM Martin Liška <mliska@suse.cz> wrote:
>>
>> On 12/1/21 19:21, Andrew MacLeod wrote:
>>> On 12/1/21 09:48, Martin Liška wrote:
>>>> On 12/1/21 15:34, Richard Biener wrote:
>>>>> On Wed, Dec 1, 2021 at 3:25 PM Martin Liška <mliska@suse.cz> wrote:
>>>>>>
>>>>>> On 12/1/21 15:19, Richard Biener wrote:
>>>>>>> which is compute the range of 'lhs' on edge_true into predicate->true_range,
>>>>>>> assign that same range to ->false_range and then invert it to get the
>>>>>>> range on the false_edge.  What I am saying is that for better precision
>>>>>>> you should do
>>>>>>>
>>>>>>>         ranger->range_on_edge (predicate->false_range, edge_false, lhs);
>>>>>>>
>>>>>>> rather than prematurely optimize this to the inversion of the true range
>>>>>>> since yes, ranger is CFG sensitive and only the_last_ predicate on a
>>>>>>> long CFG path is actually inverted.
>>>>>>>
>>>>>>> What am I missing?
>>>>>>
>>>>>> I might be misunderstood, but I think it's the problem defined here:
>>>>>> https://gcc.gnu.org/pipermail/gcc-patches/2021-November/584605.html
>>>>>>
>>>>>> where I used the ranger->range_on_edge on the false_edge.
>>>>>
>>>>> Ah, OK.  But then even the true_edge range is possibly wrong, no?
>>>>
>>>> You are of course correct, I've just proved that in debugger ://
>>>>
>>>>> Consider
>>>>>
>>>>>     for (;;)
>>>>>        {
>>>>>            if (a < 100)
>>>>>              if (a > 50)  // unswitch on this
>>>>>                /* .. */
>>>>>            if (a < 120)
>>>>>                /* ... */
>>>>>        }
>>>>>
>>>>> then you record [51, 99] for true_range of the a > 50 predicate and thus
>>>>> simplification will simplify the if (a < 120) check, no?
>>>>
>>>> Yep.
>>>>
>>>>>
>>>>> You can only record the range from the (CFG independent) a > 50 check,
>>>>> thus [51, +INF] but of course at simplification time you can also use
>>>>> the CFG context at each simplification location.
>>>>
>>>> @Andrew: How can I easily get irange based just on a stmt? Something like fold_range
>>>> with int_range_max as the 3rd argument?
>>>>
>>> Sorry, I miss these things if I'm not directly CC'd a lot :-)
>>>
>>> So you just want to know the basic range the stmt generates without context?    Sure, what you say would be fine, but your want to initialize it to the type range:
>>
>> Yes, I want to know range of LHS in a gcond statement and the same for cases in a gswitch statement.
>>
>>>
>>> int_range_max range (TREE_TYPE (name));
>>>
>>> you can also simply trigger it using the current SSA_NAME_RANGE_INFO global  values query instead of the default current contextual one... which , if there isnt a global range, will automatically use the range of the type of the argument.. so maybe just try
>>>
>>> fold_range (r, stmt, get_global_range_query ())
>>
>> Doing
>>
>>         predicate->true_range = int_range_max (TREE_TYPE (lhs));
>>         fold_range (predicate->true_range, stmt, get_global_range_query ());
>>         predicate->true_range.debug();
>>
>> gives me _Bool VARYING.
> 
> Likely because that gives a range for the bool result rather than
> a range for the LHS of a LHS op RHS on the true or false edge.

Yes :) I guess Andrew can help us.

> I would guess no stmt level API gives that.  In previous VRP
> incarnation assert expr discovery would yield the asserts
> that hold for the LHS on the respective edge and from the asserts
> ranges could be determined.

About the gswitch statements, I need similarly irange for a switch statement
on an edge e.

Thanks for help,
Martin

> 
> Richard.
> 
>>
>> Martin
>>
>>>
>>> Andrew
>>>
>>>
>>>
>>>
>>
  
Richard Biener Dec. 2, 2021, 1:46 p.m. UTC | #49
On Thu, Dec 2, 2021 at 2:10 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 12/2/21 13:01, Richard Biener wrote:
> > On Thu, Dec 2, 2021 at 12:45 PM Martin Liška <mliska@suse.cz> wrote:
> >>
> >> On 12/1/21 19:21, Andrew MacLeod wrote:
> >>> On 12/1/21 09:48, Martin Liška wrote:
> >>>> On 12/1/21 15:34, Richard Biener wrote:
> >>>>> On Wed, Dec 1, 2021 at 3:25 PM Martin Liška <mliska@suse.cz> wrote:
> >>>>>>
> >>>>>> On 12/1/21 15:19, Richard Biener wrote:
> >>>>>>> which is compute the range of 'lhs' on edge_true into predicate->true_range,
> >>>>>>> assign that same range to ->false_range and then invert it to get the
> >>>>>>> range on the false_edge.  What I am saying is that for better precision
> >>>>>>> you should do
> >>>>>>>
> >>>>>>>         ranger->range_on_edge (predicate->false_range, edge_false, lhs);
> >>>>>>>
> >>>>>>> rather than prematurely optimize this to the inversion of the true range
> >>>>>>> since yes, ranger is CFG sensitive and only the_last_ predicate on a
> >>>>>>> long CFG path is actually inverted.
> >>>>>>>
> >>>>>>> What am I missing?
> >>>>>>
> >>>>>> I might be misunderstood, but I think it's the problem defined here:
> >>>>>> https://gcc.gnu.org/pipermail/gcc-patches/2021-November/584605.html
> >>>>>>
> >>>>>> where I used the ranger->range_on_edge on the false_edge.
> >>>>>
> >>>>> Ah, OK.  But then even the true_edge range is possibly wrong, no?
> >>>>
> >>>> You are of course correct, I've just proved that in debugger ://
> >>>>
> >>>>> Consider
> >>>>>
> >>>>>     for (;;)
> >>>>>        {
> >>>>>            if (a < 100)
> >>>>>              if (a > 50)  // unswitch on this
> >>>>>                /* .. */
> >>>>>            if (a < 120)
> >>>>>                /* ... */
> >>>>>        }
> >>>>>
> >>>>> then you record [51, 99] for true_range of the a > 50 predicate and thus
> >>>>> simplification will simplify the if (a < 120) check, no?
> >>>>
> >>>> Yep.
> >>>>
> >>>>>
> >>>>> You can only record the range from the (CFG independent) a > 50 check,
> >>>>> thus [51, +INF] but of course at simplification time you can also use
> >>>>> the CFG context at each simplification location.
> >>>>
> >>>> @Andrew: How can I easily get irange based just on a stmt? Something like fold_range
> >>>> with int_range_max as the 3rd argument?
> >>>>
> >>> Sorry, I miss these things if I'm not directly CC'd a lot :-)
> >>>
> >>> So you just want to know the basic range the stmt generates without context?    Sure, what you say would be fine, but your want to initialize it to the type range:
> >>
> >> Yes, I want to know range of LHS in a gcond statement and the same for cases in a gswitch statement.
> >>
> >>>
> >>> int_range_max range (TREE_TYPE (name));
> >>>
> >>> you can also simply trigger it using the current SSA_NAME_RANGE_INFO global  values query instead of the default current contextual one... which , if there isnt a global range, will automatically use the range of the type of the argument.. so maybe just try
> >>>
> >>> fold_range (r, stmt, get_global_range_query ())
> >>
> >> Doing
> >>
> >>         predicate->true_range = int_range_max (TREE_TYPE (lhs));
> >>         fold_range (predicate->true_range, stmt, get_global_range_query ());
> >>         predicate->true_range.debug();
> >>
> >> gives me _Bool VARYING.
> >
> > Likely because that gives a range for the bool result rather than
> > a range for the LHS of a LHS op RHS on the true or false edge.
>
> Yes :) I guess Andrew can help us.

some grepping and poking found be gimple_range_calc_op1 which should
eventually do the trick (for constant gimple_cond_rhs at least).

> > I would guess no stmt level API gives that.  In previous VRP
> > incarnation assert expr discovery would yield the asserts
> > that hold for the LHS on the respective edge and from the asserts
> > ranges could be determined.
>
> About the gswitch statements, I need similarly irange for a switch statement
> on an edge e.

Well, you simply want a range (and condition to generate the if from)
from a CASE_LABEL_EXPR.  Given that's either a single integer constant
or a RANGE_EXPR it should be easy to use some of the irange CTORs here.

Richard.

> Thanks for help,
> Martin
>
> >
> > Richard.
> >
> >>
> >> Martin
> >>
> >>>
> >>> Andrew
> >>>
> >>>
> >>>
> >>>
> >>
>
  
Andrew MacLeod Dec. 2, 2021, 2:27 p.m. UTC | #50
On 12/2/21 06:45, Martin Liška wrote:
> On 12/1/21 19:21, Andrew MacLeod wrote:
>> On 12/1/21 09:48, Martin Liška wrote:
>>> On 12/1/21 15:34, Richard Biener wrote:
>>>> On Wed, Dec 1, 2021 at 3:25 PM Martin Liška <mliska@suse.cz> wrote:
>>>>>
>>>>> On 12/1/21 15:19, Richard Biener wrote:
>>>>>> which is compute the range of 'lhs' on edge_true into 
>>>>>> predicate->true_range,
>>>>>> assign that same range to ->false_range and then invert it to get 
>>>>>> the
>>>>>> range on the false_edge.  What I am saying is that for better 
>>>>>> precision
>>>>>> you should do
>>>>>>
>>>>>>        ranger->range_on_edge (predicate->false_range, edge_false, 
>>>>>> lhs);
>>>>>>
>>>>>> rather than prematurely optimize this to the inversion of the 
>>>>>> true range
>>>>>> since yes, ranger is CFG sensitive and only the_last_ predicate on a
>>>>>> long CFG path is actually inverted.
>>>>>>
>>>>>> What am I missing?
>>>>>
>>>>> I might be misunderstood, but I think it's the problem defined here:
>>>>> https://gcc.gnu.org/pipermail/gcc-patches/2021-November/584605.html
>>>>>
>>>>> where I used the ranger->range_on_edge on the false_edge.
>>>>
>>>> Ah, OK.  But then even the true_edge range is possibly wrong, no?
>>>
>>> You are of course correct, I've just proved that in debugger ://
>>>
>>>> Consider
>>>>
>>>>    for (;;)
>>>>       {
>>>>           if (a < 100)
>>>>             if (a > 50)  // unswitch on this
>>>>               /* .. */
>>>>           if (a < 120)
>>>>               /* ... */
>>>>       }
>>>>
>>>> then you record [51, 99] for true_range of the a > 50 predicate and 
>>>> thus
>>>> simplification will simplify the if (a < 120) check, no?
>>>
>>> Yep.
>>>
>>>>
>>>> You can only record the range from the (CFG independent) a > 50 check,
>>>> thus [51, +INF] but of course at simplification time you can also use
>>>> the CFG context at each simplification location.
>>>
>>> @Andrew: How can I easily get irange based just on a stmt? Something 
>>> like fold_range
>>> with int_range_max as the 3rd argument?
>>>
>> Sorry, I miss these things if I'm not directly CC'd a lot :-)
>>
>> So you just want to know the basic range the stmt generates without 
>> context?    Sure, what you say would be fine, but your want to 
>> initialize it to the type range:
>
> Yes, I want to know range of LHS in a gcond statement and the same for 
> cases in a gswitch statement.
>
>>
>> int_range_max range (TREE_TYPE (name));
>>
>> you can also simply trigger it using the current SSA_NAME_RANGE_INFO 
>> global  values query instead of the default current contextual one... 
>> which , if there isnt a global range, will automatically use the 
>> range of the type of the argument.. so maybe just try
>>
>> fold_range (r, stmt, get_global_range_query ())
>
> Doing
>
>       predicate->true_range = int_range_max (TREE_TYPE (lhs));
>       fold_range (predicate->true_range, stmt, get_global_range_query 
> ());
>       predicate->true_range.debug();
>
> gives me _Bool VARYING.

wait, what  stmt are you asking for?  is this on something like:

if (a < 120)

?  Then if it doesnt now anything about 'a', you would expect to get 
bool varying because the stmt is a true/false

if you want to know the range of A on this, the instead, pick your edge, 
and ask ranger for the range of 'a' on the outgoing edge

Now, I guess what you are looking for is the range of a without any 
context?  Then you'll want to access the GORI engine directly.. try

ranger->gori().outgoing_edge_range_p (irange &r, edge e, tree name, 
*get_global_range_query ());

if you ask for 'a' on the true edge, it should give you [0,119] false 
edge should give you [120, +INF]


same thing works for switches... pick and edge and it'll give you the 
range of NAME on that edge, without any contextual information.

Anderw

PS  if you DO want contextual,  skip the final argument and it'll go 
directly to what ranger knows at this time, without any additional lookups.


Andrew
  
Martin Liška Dec. 2, 2021, 4:02 p.m. UTC | #51
On 12/2/21 15:27, Andrew MacLeod wrote:
> ranger->gori().outgoing_edge_range_p (irange &r, edge e, tree name, *get_global_range_query ());

Thank you! It works for me!

Martin
  
Andrew MacLeod Dec. 3, 2021, 2:09 p.m. UTC | #52
On 12/2/21 11:02, Martin Liška wrote:
> On 12/2/21 15:27, Andrew MacLeod wrote:
>> ranger->gori().outgoing_edge_range_p (irange &r, edge e, tree name, 
>> *get_global_range_query ());
>
> Thank you! It works for me!
>
> Martin
>
btw, this applies to names not  on the stmt as well.   The function 
returns TRUE if there is an outgoing range calculable, and false if 
not.  so:

a = b + 20
if (a > 40)        <- returns TRUE for outgoing range of 'a' or 'b'
     {
        c = foo()
        if (c > 60)   <- returns false for 'a' or 'b'
            {
                if (b < 100)   <- Also returns TRUE for 'a' or 'b'

The final block, from the EVRP dump:

     <bb 4> :
     if (b_4(D) <= 99)
       goto <bb 5>; [INV]
     else
       goto <bb 6>; [INV]

4->5  (T) b_4(D) :      unsigned int [21, 99]
4->5  (T) a_5 :         unsigned int [41, 119]
4->6  (F) b_4(D) :      unsigned int [100, 4294967275]
4->6  (F) a_5 :         unsigned int [120, +INF]

Shows that it has a range for both 'a' and 'b'.

Just to point out that you can use this query on any conditional edge, 
even if it isnt directly mentioned on the stmt.  so if you are keying of 
'a', you could simply ask if outgoing_edge_range_p ( , a,...)  rather 
than parsing the condition to see if 'a' is on it.   if there is no 
chance that 'a' is affected by the block, is just returns false.

When its called directly like this, it picks up no ranges from outside 
the basic block you are querying, except via that range query you 
provide. The listing shows the defaultwhic is whatever ranger knows.   
So if you provided get_global_range_query, those ranges would instead be 
something like:

4->5  (T) b_4(D) :      unsigned int [0, 99]
4->5  (T) a_5 :         unsigned int [21, 119]
4->6  (F) b_4(D) :      unsigned int [100, +INF]
4->6  (F) a_5 :         unsigned int [0,20] [120, +INF-20 ]

It may simplify things a little if you are unswitching on 'a', you can 
just ask each block with a condition whether 'a's range can be modified....

Andrew.
  
Andrew MacLeod Dec. 8, 2021, 9:06 p.m. UTC | #53
On 12/2/21 08:46, Richard Biener wrote:
> On Thu, Dec 2, 2021 at 2:10 PM Martin Liška <mliska@suse.cz> wrote:
>> On 12/2/21 13:01, Richard Biener wrote:
>>> On Thu, Dec 2, 2021 at 12:45 PM Martin Liška <mliska@suse.cz> wrote:
>>>> On 12/1/21 19:21, Andrew MacLeod wrote:
>>>>> On 12/1/21 09:48, Martin Liška wrote:
>>>>>> On 12/1/21 15:34, Richard Biener wrote:
>>>>>>> On Wed, Dec 1, 2021 at 3:25 PM Martin Liška <mliska@suse.cz> wrote:
>>>>>>>> On 12/1/21 15:19, Richard Biener wrote:
>>>>>>>>> which is compute the range of 'lhs' on edge_true into predicate->true_range,
>>>>>>>>> assign that same range to ->false_range and then invert it to get the
>>>>>>>>> range on the false_edge.  What I am saying is that for better precision
>>>>>>>>> you should do
>>>>>>>>>
>>>>>>>>>          ranger->range_on_edge (predicate->false_range, edge_false, lhs);
>>>>>>>>>
>>>>>>>>> rather than prematurely optimize this to the inversion of the true range
>>>>>>>>> since yes, ranger is CFG sensitive and only the_last_ predicate on a
>>>>>>>>> long CFG path is actually inverted.
>>>>>>>>>
>>>>>>>>> What am I missing?
>>>>>>>> I might be misunderstood, but I think it's the problem defined here:
>>>>>>>> https://gcc.gnu.org/pipermail/gcc-patches/2021-November/584605.html
>>>>>>>>
>>>>>>>> where I used the ranger->range_on_edge on the false_edge.
>>>>>>> Ah, OK.  But then even the true_edge range is possibly wrong, no?
>>>>>> You are of course correct, I've just proved that in debugger ://
>>>>>>
>>>>>>> Consider
>>>>>>>
>>>>>>>      for (;;)
>>>>>>>         {
>>>>>>>             if (a < 100)
>>>>>>>               if (a > 50)  // unswitch on this
>>>>>>>                 /* .. */
>>>>>>>             if (a < 120)
>>>>>>>                 /* ... */
>>>>>>>         }
>>>>>>>
>>>>>>> then you record [51, 99] for true_range of the a > 50 predicate and thus
>>>>>>> simplification will simplify the if (a < 120) check, no?
>>>>>> Yep.
>>>>>>
>>>>>>> You can only record the range from the (CFG independent) a > 50 check,
>>>>>>> thus [51, +INF] but of course at simplification time you can also use
>>>>>>> the CFG context at each simplification location.
>>>>>> @Andrew: How can I easily get irange based just on a stmt? Something like fold_range
>>>>>> with int_range_max as the 3rd argument?
>>>>>>
>>>>> Sorry, I miss these things if I'm not directly CC'd a lot :-)
>>>>>
>>>>> So you just want to know the basic range the stmt generates without context?    Sure, what you say would be fine, but your want to initialize it to the type range:
>>>> Yes, I want to know range of LHS in a gcond statement and the same for cases in a gswitch statement.
>>>>
>>>>> int_range_max range (TREE_TYPE (name));
>>>>>
>>>>> you can also simply trigger it using the current SSA_NAME_RANGE_INFO global  values query instead of the default current contextual one... which , if there isnt a global range, will automatically use the range of the type of the argument.. so maybe just try
>>>>>
>>>>> fold_range (r, stmt, get_global_range_query ())
>>>> Doing
>>>>
>>>>          predicate->true_range = int_range_max (TREE_TYPE (lhs));
>>>>          fold_range (predicate->true_range, stmt, get_global_range_query ());
>>>>          predicate->true_range.debug();
>>>>
>>>> gives me _Bool VARYING.
>>> Likely because that gives a range for the bool result rather than
>>> a range for the LHS of a LHS op RHS on the true or false edge.
>> Yes :) I guess Andrew can help us.
> some grepping and poking found be gimple_range_calc_op1 which should
> eventually do the trick (for constant gimple_cond_rhs at least).
>
Wait, didn't the  gori()->outgoing_edge_range_p()  call do this for you? 
Its a more general API that invokes gimple_range_calc_op1() under the 
covers.

Im going to add myself to the cc so I don't miss any more of these :-)

Or am I just way behind in my non-addressed reading? and that did 
address your issue? :-)

Andrew
  
Martin Liška Dec. 9, 2021, 12:59 p.m. UTC | #54
On 12/3/21 15:09, Andrew MacLeod wrote:
> On 12/2/21 11:02, Martin Liška wrote:
>> On 12/2/21 15:27, Andrew MacLeod wrote:
>>> ranger->gori().outgoing_edge_range_p (irange &r, edge e, tree name, *get_global_range_query ());
>>
>> Thank you! It works for me!
>>
>> Martin
>>
> btw, this applies to names not  on the stmt as well.   The function returns TRUE if there is an outgoing range calculable, and false if not.  so:
> 
> a = b + 20
> if (a > 40)        <- returns TRUE for outgoing range of 'a' or 'b'
>      {
>         c = foo()
>         if (c > 60)   <- returns false for 'a' or 'b'
>             {
>                 if (b < 100)   <- Also returns TRUE for 'a' or 'b'

Hello.

That's quite interesting support. However, for the bug mentioned earlier in this email conversion,
I only want local range for a gcond/gswitch, ignoring all path leading to the statement.

Martin

> 
> The final block, from the EVRP dump:
> 
>      <bb 4> :
>      if (b_4(D) <= 99)
>        goto <bb 5>; [INV]
>      else
>        goto <bb 6>; [INV]
> 
> 4->5  (T) b_4(D) :      unsigned int [21, 99]
> 4->5  (T) a_5 :         unsigned int [41, 119]
> 4->6  (F) b_4(D) :      unsigned int [100, 4294967275]
> 4->6  (F) a_5 :         unsigned int [120, +INF]
> 
> Shows that it has a range for both 'a' and 'b'.
> 
> Just to point out that you can use this query on any conditional edge, even if it isnt directly mentioned on the stmt.  so if you are keying of 'a', you could simply ask if outgoing_edge_range_p ( , a,...)  rather than parsing the condition to see if 'a' is on it.   if there is no chance that 'a' is affected by the block, is just returns false.
> 
> When its called directly like this, it picks up no ranges from outside the basic block you are querying, except via that range query you provide. The listing shows the defaultwhic is whatever ranger knows. So if you provided get_global_range_query, those ranges would instead be something like:
> 
> 4->5  (T) b_4(D) :      unsigned int [0, 99]
> 4->5  (T) a_5 :         unsigned int [21, 119]
> 4->6  (F) b_4(D) :      unsigned int [100, +INF]
> 4->6  (F) a_5 :         unsigned int [0,20] [120, +INF-20 ]
> 
> It may simplify things a little if you are unswitching on 'a', you can just ask each block with a condition whether 'a's range can be modified....
> 
> Andrew.
> 
> 
>
  
Martin Liška Dec. 9, 2021, 1:02 p.m. UTC | #55
On 11/30/21 12:17, Richard Biener wrote:
> I'd like to see the gswitch support - that's what was posted before stage3
> close, this patch on its own doesn't seem worth pushing for.  That said,
> I have some comments below (and the already raised ones about how
> things might need to change with gswitch support).  Is it so difficult to
> develop gswitch support as a separate change ontop of this?

Hello.

Took me some time, but I have a working version of the patch that makes both
refactoring of the costing model and adds support for gswitch. For quite some
time, I maintained 2 patches, but as commonly happens, I was forced doing quite
some rework. If we really want a separation for bisection purpose, I suggest simple
disabling of gswitch support?

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
SPEC 2006 and 2017 survive running -O3 -march=native. There are 2090 optimization notes
for SPEC 2006 (compared to 1945 before the patch).

Thoughts?
Thanks,
Martin
  
Andrew MacLeod Dec. 9, 2021, 2:44 p.m. UTC | #56
On 12/9/21 07:59, Martin Liška wrote:
> On 12/3/21 15:09, Andrew MacLeod wrote:
>> On 12/2/21 11:02, Martin Liška wrote:
>>> On 12/2/21 15:27, Andrew MacLeod wrote:
>>>> ranger->gori().outgoing_edge_range_p (irange &r, edge e, tree name, 
>>>> *get_global_range_query ());
>>>
>>> Thank you! It works for me!
>>>
>>> Martin
>>>
>> btw, this applies to names not  on the stmt as well.   The function 
>> returns TRUE if there is an outgoing range calculable, and false if 
>> not.  so:
>>
>> a = b + 20
>> if (a > 40)        <- returns TRUE for outgoing range of 'a' or 'b'
>>      {
>>         c = foo()
>>         if (c > 60)   <- returns false for 'a' or 'b'
>>             {
>>                 if (b < 100)   <- Also returns TRUE for 'a' or 'b'
>
> Hello.
>
> That's quite interesting support. However, for the bug mentioned 
> earlier in this email conversion,
> I only want local range for a gcond/gswitch, ignoring all path leading 
> to the statement.
>
> Martin 
In which case just directly invoking gori with the global range query 
should be fine.   You could go directly to the lower level routines like 
gimple_range_calc_op1, but this saves you from doing a little footwork 
to call them.  Ultimately, it works out the same for you on the true 
side of
if (a > 40)
as  setting up and invoking:

    if (gimple_range_calc_op1  (r, stmt, int_range<2> 
(boolean_true_node, boolean_true_node), [40,40])

Its just less work to do it thru gori()->outgoing_edge_range_p ()   :-)

Andrew
  
Richard Biener Jan. 5, 2022, 12:34 p.m. UTC | #57
On Thu, Dec 9, 2021 at 2:02 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 11/30/21 12:17, Richard Biener wrote:
> > I'd like to see the gswitch support - that's what was posted before stage3
> > close, this patch on its own doesn't seem worth pushing for.  That said,
> > I have some comments below (and the already raised ones about how
> > things might need to change with gswitch support).  Is it so difficult to
> > develop gswitch support as a separate change ontop of this?
>
> Hello.
>
> Took me some time, but I have a working version of the patch that makes both
> refactoring of the costing model and adds support for gswitch. For quite some
> time, I maintained 2 patches, but as commonly happens, I was forced doing quite
> some rework. If we really want a separation for bisection purpose, I suggest simple
> disabling of gswitch support?

It was really meant to ease review.  I'm now looking at the combined
patch, comments will
follow.

+static void
+clean_up_after_unswitching (const auto_edge_flag &ignored_edge_flag)
+{
+  basic_block bb;
+
...
+  /* Clean up the ignored_edge_flag from edges.  */
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      edge e;

you can probably clean up outgoing edge flags in the loop above?

(I'm randomly jumping)

+             /* Build compound expression for all cases leading
+                to this edge.  */
+             tree expr = NULL_TREE;
+             for (unsigned i = 1; i < gimple_switch_num_labels (stmt); ++i)
+               {
+                 tree lab = gimple_switch_label (stmt, i);
+                 basic_block dest = label_to_block (cfun, CASE_LABEL (lab));
+                 edge e2 = find_edge (gimple_bb (stmt), dest);
+                 if (e == e2)

just as a style note I prefer if (e != e2) continue; so the following code
needs less indentation

+                         tree cmp1 = build2 (GE_EXPR, boolean_type_node, idx,
+                                             CASE_LOW (lab));

is there a reason you are not using fold_build2?  Do we want to somehow account
for the built expression size or maybe have a separate limit for the number of
cases we want to combine this way?

+                 unswitch_predicate *predicate
+                   = new unswitch_predicate (expr, idx, edge_index);
+                 ranger->gori ().outgoing_edge_range_p
(predicate->true_range, e,
+                                                        idx,
*get_global_range_query ());
+                 /* Huge switches are not supported by Ranger.  */
+                 if (predicate->true_range.undefined_p ())

I hope ranger will set the range to varying_p () in that case, not
undefined?  But even
then, is that a reason for punting?  I guess we fail to prune cases in
that case but
the cost modeling should then account for those and thus we are at
least consistent?

+                   {
+                     delete predicate;
+                     return;

In this context, when we do

+      if (operand_equal_p (predicate->lhs, last_predicate->lhs, 0))
+       {
+         irange &other = (true_edge ? predicate->merged_true_range
+                          : predicate->merged_false_range);
+         last_predicate->merged_true_range.intersect (other);
+         last_predicate->merged_false_range.intersect (other);
+         return;

ranger may be conservative when intersecting (and hopefully not end up
with undefined_p - heh).  I also am confused about intersecting both
merged_true_range and merged_false_range with 'other'?  I would
have expected to merge true edge info with true edge info and thus
only "swap" things somehow?  OTOH "path" suggests we're dealing
with more than one edge and associated ranges?  Maybe expanding
the comment on the predicate_vector would be useful.  AFAIR we
there store the sequence of unswitchings done with pairs of
the predicate unswitched on and a bool indicating whether we're
dealing with the copy that had the predicate true or the one that
had it false.

+  unswitch_predicate *predicate = NULL;
+  basic_block bb = NULL;
+  if (num > param_max_unswitch_level)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, loc,
+                        "Not unswitching anymore, hit max level\n");
+      goto exit;
+    }

I'll notice that given we have the overall size budget limiting the number
of unswitchings itself is probably unnecessary (as noted above we might
need to account for the size of the unswitch condition).

+  for (unsigned i = 0; i != loop->num_nodes; i++)
+    {
+      vec<unswitch_predicate *> &predicates = get_predicates_for_bb (bbs[i]);
+      if (!predicates.is_empty ())
+       {

same comment about indenting

I wonder whether evaluate_control_stmt_using_entry_checks could set
ignored_edge_flag itself instead of communicating via a hash_set?

It's not exactly clear what we use pred->handled for, do we re-discover
and re-try predicates when unswitching another level otherwise?  But
we _do_ want to unswitch the same switch stmt again, no?  And since
the BB predicate is keyed on the switch stmt that wouldn't work?
I wonder whether on recursive invocations of tree_unswitch_single_loop
we'd want to skip over unreachable BBs in the loop when looking
for candidates (when we compute BB predicates we skip over them
but that's done before all cases).  Ah, the BB predicates are a vector,
I wonder why we not simply remove the predicate from the BB predicate
vector when we decide to unswitch on it and thus append it to a predicate
path?

Can you please amend the toplevel file comment as to the new flow of
things?

For the switch unswitching testcases I wonder if we can add dump scanning
to verify we do not end up with duplicate cases across the loop versions?

Overall it looks reasonable but I hope for some simplification still.  I also
think it's something for stage1 now :/

After addressing comments above can you put the patch on a branch
so I can play & fixup things a bit when I run into them?  Often it's easier
to just do a change instead of writing up a review comment and expecting
that to be point-on ;)

Thanks,
Richard.

>
> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
> SPEC 2006 and 2017 survive running -O3 -march=native. There are 2090 optimization notes
> for SPEC 2006 (compared to 1945 before the patch).
>
> Thoughts?
> Thanks,
> Martin
  
Andrew MacLeod Jan. 6, 2022, 3:11 p.m. UTC | #58
On 1/5/22 07:34, Richard Biener wrote:
> On Thu, Dec 9, 2021 at 2:02 PM Martin Liška <mliska@suse.cz> wrote:
>> On 11/30/21 12:17, Richard Biener wrote:
>>
> +                 unswitch_predicate *predicate
> +                   = new unswitch_predicate (expr, idx, edge_index);
> +                 ranger->gori ().outgoing_edge_range_p
> (predicate->true_range, e,
> +                                                        idx,
> *get_global_range_query ());
> +                 /* Huge switches are not supported by Ranger.  */
> +                 if (predicate->true_range.undefined_p ())
>
> I hope ranger will set the range to varying_p () in that case, not
> undefined?  But even
> then, is that a reason for punting?  I guess we fail to prune cases in
> that case but
> the cost modeling should then account for those and thus we are at
> least consistent?

huge switches not supported?  I don't know what you mean, either that or 
I forget :-)  If there are more edges than multi-ranges support, then 
things will start getting merged because they cant be represented.. and 
the default case may then also contain some values that also have 
cases..  but all inconsistencies will move towards varying.. not undefined.

Andrew
  
Martin Liška Jan. 6, 2022, 4:02 p.m. UTC | #59
On 1/6/22 16:11, Andrew MacLeod wrote:
> On 1/5/22 07:34, Richard Biener wrote:
>> On Thu, Dec 9, 2021 at 2:02 PM Martin Liška <mliska@suse.cz> wrote:
>>> On 11/30/21 12:17, Richard Biener wrote:
>>>
>> +                 unswitch_predicate *predicate
>> +                   = new unswitch_predicate (expr, idx, edge_index);
>> +                 ranger->gori ().outgoing_edge_range_p
>> (predicate->true_range, e,
>> +                                                        idx,
>> *get_global_range_query ());
>> +                 /* Huge switches are not supported by Ranger.  */
>> +                 if (predicate->true_range.undefined_p ())
>>
>> I hope ranger will set the range to varying_p () in that case, not
>> undefined?  But even
>> then, is that a reason for punting?  I guess we fail to prune cases in
>> that case but
>> the cost modeling should then account for those and thus we are at
>> least consistent?
> 
> huge switches not supported?  I don't know what you mean, either that or I forget :-)  If there are more edges than multi-ranges support, then things will start getting merged because they cant be represented.. and the default case may then also contain some values that also have cases..  but all inconsistencies will move towards varying.. not undefined.
> 
> Andrew
> 

Hello.

If you consider the attached test-case, then I get for:

(gdb) p debug_bb(e->src)
<bb 4> [local count: 955630225]:
# i_489 = PHI <i_485(99), 0(98)>
# tmp_490 = PHI <tmp_379(99), tmp_382(D)(98)>
_1329 = (long unsigned int) i_489;
_1330 = _1329 * 8;
switch (order_385(D)) <default: <L94> [1.08%], case 0: <L1> [1.08%], case 1: <L2> [1.08%], case 2: <L3> [1.08%], case 3: <L4> [1.08%], case 4: <L5> [1.08%], case 5: <L6> [1.08%], case 6: <L7> [1.08%], case 7: <L8> [1.08%], case 8: <L9> [1.08%], case 9: <L10> [1.08%], case 10: <L11> [1.08%], case 11: <L12> [1.08%], case 12: <L13> [1.08%], case 13: <L14> [1.08%], case 14: <L15> [1.08%], case 15: <L16> [1.08%], case 16: <L17> [1.08%], case 17: <L18> [1.08%], case 18: <L19> [1.08%], case 19: <L20> [1.08%], case 20: <L21> [1.08%], case 21: <L22> [1.08%], case 22: <L23> [1.08%], case 23: <L24> [1.08%], case 24: <L25> [1.08%], case 25: <L26> [1.08%], case 26: <L27> [1.08%], case 27: <L28> [1.08%], case 28: <L29> [1.08%], case 29: <L30> [1.08%], case 30: <L31> [1.08%], case 31: <L32> [1.08%], case 32: <L33> [1.08%], case 33: <L34> [1.08%], case 34: <L35> [1.08%], case 35: <L36> [1.08%], case 36: <L37> [1.08%], case 37: <L38> [1.08%], case 38: <L39> [1.08%], case 39: <L40> [1.08%], case 40: <L41> [1.08%], case 41: <L42> [1.08%], case 42: <L43> [1.08%], case 43: <L44> [1.08%], case 44: <L45> [1.08%], case 45: <L46> [1.08%], case 46: <L47> [1.08%], case 47: <L48> [1.08%], case 48: <L49> [1.08%], case 49: <L50> [1.08%], case 50: <L51> [1.08%], case 51: <L52> [1.08%], case 52: <L53> [1.08%], case 53: <L54> [1.08%], case 54: <L55> [1.08%], case 55: <L56> [1.08%], case 56: <L57> [1.08%], case 57: <L58> [1.08%], case 58: <L59> [1.08%], case 59: <L60> [1.08%], case 60: <L61> [1.08%], case 61: <L62> [1.08%], case 62: <L63> [1.08%], case 63: <L64> [1.08%], case 64: <L65> [1.08%], case 65: <L66> [1.08%], case 66: <L67> [1.08%], case 67: <L68> [1.08%], case 68: <L69> [1.08%], case 69: <L70> [1.08%], case 70: <L71> [1.08%], case 71: <L72> [1.08%], case 72: <L73> [1.08%], case 73: <L74> [1.08%], case 74: <L75> [1.08%], case 75: <L76> [1.08%], case 76: <L77> [1.08%], case 77: <L78> [1.08%], case 78: <L79> [1.08%], case 79: <L80> [1.08%], case 80: <L81> [1.08%], case 81: <L82> [1.08%], case 82: <L83> [1.08%], case 83: <L84> [1.08%], case 84: <L85> [1.08%], case 85: <L86> [1.08%], case 86: <L87> [1.08%], case 87: <L88> [1.08%], case 88: <L89> [1.08%], case 89: <L90> [1.08%], case 90: <L91> [1.08%], case 91: <L92> [1.08%]>

$7 = void
(gdb) p debug_bb(e->dest)
<bb 5> [local count: 10275591]:
<L1>:
_3 = a_386(D) + _1330;
_4 = *_3;
tmp_478 = _4 * 0.0;
goto <bb 97>; [100.00%]

│      480                    ranger->gori ().outgoing_edge_range_p (predicate->true_range, e,
│      481                                                           idx, *get_global_range_query ());

Note the return value is false. But I think the comment is not precise:

│     1233  // Calculate a range on edge E and return it in R.  Try to evaluate a
│     1234  // range for NAME on this edge.  Return FALSE if this is either not a
│     1235  // control edge or NAME is not defined by this edge.

and

(gdb) p predicate->true_range.debug()
UNDEFINED

So is the behavior correct Andrew?
Thanks,
Martin
  
Andrew MacLeod Jan. 6, 2022, 4:20 p.m. UTC | #60
On 1/6/22 11:02, Martin Liška wrote:
> On 1/6/22 16:11, Andrew MacLeod wrote:
>> On 1/5/22 07:34, Richard Biener wrote:
>>> On Thu, Dec 9, 2021 at 2:02 PM Martin Liška <mliska@suse.cz> wrote:
>>>> On 11/30/21 12:17, Richard Biener wrote:
>>>>
>>> +                 unswitch_predicate *predicate
>>> +                   = new unswitch_predicate (expr, idx, edge_index);
>>> +                 ranger->gori ().outgoing_edge_range_p
>>> (predicate->true_range, e,
>>> +                                                        idx,
>>> *get_global_range_query ());
>>> +                 /* Huge switches are not supported by Ranger.  */
>>> +                 if (predicate->true_range.undefined_p ())
>>>
>>> I hope ranger will set the range to varying_p () in that case, not
>>> undefined?  But even
>>> then, is that a reason for punting?  I guess we fail to prune cases in
>>> that case but
>>> the cost modeling should then account for those and thus we are at
>>> least consistent?
>>
>> huge switches not supported?  I don't know what you mean, either that 
>> or I forget :-)  If there are more edges than multi-ranges support, 
>> then things will start getting merged because they cant be 
>> represented.. and the default case may then also contain some values 
>> that also have cases..  but all inconsistencies will move towards 
>> varying.. not undefined.
>>
>> Andrew
>>
>
> Hello.
>
> If you consider the attached test-case, then I get for:
>
> (gdb) p debug_bb(e->src)
> <bb 4> [local count: 955630225]:
> # i_489 = PHI <i_485(99), 0(98)>
> # tmp_490 = PHI <tmp_379(99), tmp_382(D)(98)>
> _1329 = (long unsigned int) i_489;
> _1330 = _1329 * 8;
> switch (order_385(D)) <default: <L94> [1.08%], case 0: <L1> [1.08%], 
> case 1: <L2> [1.08%], case 2: <L3> [1.08%], case 3: <L4> [1.08%], case 
> 4: <L5> [1.08%], case 5: <L6> [1.08%], case 6: <L7> [1.08%], case 7: 
> <L8> [1.08%], case 8: <L9> [1.08%], case 9: <L10> [1.08%], case 10: 
> <L11> [1.08%], case 11: <L12> [1.08%], case 12: <L13> [1.08%], case 
> 13: <L14> [1.08%], case 14: <L15> [1.08%], case 15: <L16> [1.08%], 
> case 16: <L17> [1.08%], case 17: <L18> [1.08%], case 18: <L19> 
> [1.08%], case 19: <L20> [1.08%], case 20: <L21> [1.08%], case 21: 
> <L22> [1.08%], case 22: <L23> [1.08%], case 23: <L24> [1.08%], case 
> 24: <L25> [1.08%], case 25: <L26> [1.08%], case 26: <L27> [1.08%], 
> case 27: <L28> [1.08%], case 28: <L29> [1.08%], case 29: <L30> 
> [1.08%], case 30: <L31> [1.08%], case 31: <L32> [1.08%], case 32: 
> <L33> [1.08%], case 33: <L34> [1.08%], case 34: <L35> [1.08%], case 
> 35: <L36> [1.08%], case 36: <L37> [1.08%], case 37: <L38> [1.08%], 
> case 38: <L39> [1.08%], case 39: <L40> [1.08%], case 40: <L41> 
> [1.08%], case 41: <L42> [1.08%], case 42: <L43> [1.08%], case 43: 
> <L44> [1.08%], case 44: <L45> [1.08%], case 45: <L46> [1.08%], case 
> 46: <L47> [1.08%], case 47: <L48> [1.08%], case 48: <L49> [1.08%], 
> case 49: <L50> [1.08%], case 50: <L51> [1.08%], case 51: <L52> 
> [1.08%], case 52: <L53> [1.08%], case 53: <L54> [1.08%], case 54: 
> <L55> [1.08%], case 55: <L56> [1.08%], case 56: <L57> [1.08%], case 
> 57: <L58> [1.08%], case 58: <L59> [1.08%], case 59: <L60> [1.08%], 
> case 60: <L61> [1.08%], case 61: <L62> [1.08%], case 62: <L63> 
> [1.08%], case 63: <L64> [1.08%], case 64: <L65> [1.08%], case 65: 
> <L66> [1.08%], case 66: <L67> [1.08%], case 67: <L68> [1.08%], case 
> 68: <L69> [1.08%], case 69: <L70> [1.08%], case 70: <L71> [1.08%], 
> case 71: <L72> [1.08%], case 72: <L73> [1.08%], case 73: <L74> 
> [1.08%], case 74: <L75> [1.08%], case 75: <L76> [1.08%], case 76: 
> <L77> [1.08%], case 77: <L78> [1.08%], case 78: <L79> [1.08%], case 
> 79: <L80> [1.08%], case 80: <L81> [1.08%], case 81: <L82> [1.08%], 
> case 82: <L83> [1.08%], case 83: <L84> [1.08%], case 84: <L85> 
> [1.08%], case 85: <L86> [1.08%], case 86: <L87> [1.08%], case 87: 
> <L88> [1.08%], case 88: <L89> [1.08%], case 89: <L90> [1.08%], case 
> 90: <L91> [1.08%], case 91: <L92> [1.08%]>
>
> $7 = void
> (gdb) p debug_bb(e->dest)
> <bb 5> [local count: 10275591]:
> <L1>:
> _3 = a_386(D) + _1330;
> _4 = *_3;
> tmp_478 = _4 * 0.0;
> goto <bb 97>; [100.00%]
>
> │      480                    ranger->gori ().outgoing_edge_range_p 
> (predicate->true_range, e,
> │ 481                                                           idx, 
> *get_global_range_query ());
>
> Note the return value is false. But I think the comment is not precise:


So if you get a FALSE back, you cannot use any results because GORI is 
claiming that it cant figure something out... or there is nothing to 
figure out.   Most of rangers routines are implemented so that if they 
return FALSE, the result is meaningless.


what is IDX you are passing?  order_385?

As a side note, theres a typo in the testcase:  .. Im not sure how that 
affects things, but :

    defaut:
             __builtin_unreachable ();


default is misspelled...  maybe it thinks that some kind of runtime 
value?   I am surprised it even compiles.  maybe that is mucking up what 
GORI thiunks it can calculate?



>
> │     1233  // Calculate a range on edge E and return it in R. Try to 
> evaluate a
> │     1234  // range for NAME on this edge.  Return FALSE if this is 
> either not a
> │     1235  // control edge or NAME is not defined by this edge.
>
> and
>
> (gdb) p predicate->true_range.debug()
> UNDEFINED
>
> So is the behavior correct Andrew?
> Thanks,
> Martin
>
  
Martin Liška Jan. 6, 2022, 4:30 p.m. UTC | #61
On 1/5/22 13:34, Richard Biener wrote:
> On Thu, Dec 9, 2021 at 2:02 PM Martin Liška <mliska@suse.cz> wrote:
>>
>> On 11/30/21 12:17, Richard Biener wrote:
>>> I'd like to see the gswitch support - that's what was posted before stage3
>>> close, this patch on its own doesn't seem worth pushing for.  That said,
>>> I have some comments below (and the already raised ones about how
>>> things might need to change with gswitch support).  Is it so difficult to
>>> develop gswitch support as a separate change ontop of this?
>>
>> Hello.
>>
>> Took me some time, but I have a working version of the patch that makes both
>> refactoring of the costing model and adds support for gswitch. For quite some
>> time, I maintained 2 patches, but as commonly happens, I was forced doing quite
>> some rework. If we really want a separation for bisection purpose, I suggest simple
>> disabling of gswitch support?
> 
> It was really meant to ease review.

Sure, but keeping a separation if the final shape is changing is difficult :P

> I'm now looking at the combined
> patch, comments will
> follow.
> 
> +static void
> +clean_up_after_unswitching (const auto_edge_flag &ignored_edge_flag)
> +{
> +  basic_block bb;
> +
> ...
> +  /* Clean up the ignored_edge_flag from edges.  */
> +  FOR_EACH_BB_FN (bb, cfun)
> +    {
> +      edge e;
> 
> you can probably clean up outgoing edge flags in the loop above?

Done.

> 
> (I'm randomly jumping)
> 
> +             /* Build compound expression for all cases leading
> +                to this edge.  */
> +             tree expr = NULL_TREE;
> +             for (unsigned i = 1; i < gimple_switch_num_labels (stmt); ++i)
> +               {
> +                 tree lab = gimple_switch_label (stmt, i);
> +                 basic_block dest = label_to_block (cfun, CASE_LABEL (lab));
> +                 edge e2 = find_edge (gimple_bb (stmt), dest);
> +                 if (e == e2)
> 
> just as a style note I prefer if (e != e2) continue; so the following code
> needs less indentation

Likewise.

> 
> +                         tree cmp1 = build2 (GE_EXPR, boolean_type_node, idx,
> +                                             CASE_LOW (lab));
> 
> is there a reason you are not using fold_build2?

Changed.

> Do we want to somehow account
> for the built expression size or maybe have a separate limit for the number of
> cases we want to combine this way?

Huh, I would ignore it for now, but yes, can be accounted as well.

> 
> +                 unswitch_predicate *predicate
> +                   = new unswitch_predicate (expr, idx, edge_index);
> +                 ranger->gori ().outgoing_edge_range_p
> (predicate->true_range, e,
> +                                                        idx,
> *get_global_range_query ());
> +                 /* Huge switches are not supported by Ranger.  */
> +                 if (predicate->true_range.undefined_p ())
> 
> I hope ranger will set the range to varying_p () in that case, not
> undefined?

... it's been discussed in a different sub-thread.

> But even
> then, is that a reason for punting?  I guess we fail to prune cases in
> that case but
> the cost modeling should then account for those and thus we are at
> least consistent?

Yes, but I think the situation (huge switches) will be rarely an interesting
optimization opportunity. Plus we would have to find a hot case in such switch.
That's currently ignored by the pass.

> 
> +                   {
> +                     delete predicate;
> +                     return;
> 
> In this context, when we do
> 
> +      if (operand_equal_p (predicate->lhs, last_predicate->lhs, 0))
> +       {
> +         irange &other = (true_edge ? predicate->merged_true_range
> +                          : predicate->merged_false_range);
> +         last_predicate->merged_true_range.intersect (other);
> +         last_predicate->merged_false_range.intersect (other);
> +         return;
> 
> ranger may be conservative when intersecting (and hopefully not end up
> with undefined_p - heh).  I also am confused about intersecting both
> merged_true_range and merged_false_range with 'other'?

other is a predicate for a SSA_NAME _x_ that we already switched on. And thus
we can improve both predicates for true/false edge that are also related to
the same SSA_NAME _x_.


> I would
> have expected to merge true edge info with true edge info and thus
> only "swap" things somehow?  OTOH "path" suggests we're dealing
> with more than one edge and associated ranges?

Yep, path means the we already did a series of unswitching like:

- if (a > 10) TRUE
- if (b == 10) FALSE
- if (c > 0) TRUE

> Maybe expanding
> the comment on the predicate_vector would be useful.  AFAIR we

Sure, will do that.

> there store the sequence of unswitchings done with pairs of
> the predicate unswitched on and a bool indicating whether we're
> dealing with the copy that had the predicate true or the one that
> had it false.

Exactly.

> 
> +  unswitch_predicate *predicate = NULL;
> +  basic_block bb = NULL;
> +  if (num > param_max_unswitch_level)
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, loc,
> +                        "Not unswitching anymore, hit max level\n");
> +      goto exit;
> +    }
> 
> I'll notice that given we have the overall size budget limiting the number
> of unswitchings itself is probably unnecessary (as noted above we might
> need to account for the size of the unswitch condition).

Removed.

> 
> +  for (unsigned i = 0; i != loop->num_nodes; i++)
> +    {
> +      vec<unswitch_predicate *> &predicates = get_predicates_for_bb (bbs[i]);
> +      if (!predicates.is_empty ())
> +       {
> 
> same comment about indenting

Done.

> 
> I wonder whether evaluate_control_stmt_using_entry_checks could set
> ignored_edge_flag itself instead of communicating via a hash_set?

Note the function is used in 2 contexts (costing and simplification) and for
the costing, we modify edges as we want to evaluate all opportunities in parallel.

> 
> It's not exactly clear what we use pred->handled for, do we re-discover
> and re-try predicates when unswitching another level otherwise?

We discover them only once and reuse them via gimple_uid in loop versions.
What's tricky is that switch predicates can be unswitched multiple times
and that's why we can't remove them.

>  But
> we _do_ want to unswitch the same switch stmt again, no?

Yes, we do.

> And since
> the BB predicate is keyed on the switch stmt that wouldn't work?
> I wonder whether on recursive invocations of tree_unswitch_single_loop
> we'd want to skip over unreachable BBs in the loop when looking
> for candidates (when we compute BB predicates we skip over them
> but that's done before all cases).

As mentioned, they are discivered only once before we start unswitching.

> Ah, the BB predicates are a vector,
> I wonder why we not simply remove the predicate from the BB predicate
> vector when we decide to unswitch on it and thus append it to a predicate
> path?

As explained above, it's because switches can have >2 edges.

> 
> Can you please amend the toplevel file comment as to the new flow of
> things?

Sure, will do that and that should really help.

> 
> For the switch unswitching testcases I wonder if we can add dump scanning
> to verify we do not end up with duplicate cases across the loop versions?

Yep, will do that.

> 
> Overall it looks reasonable but I hope for some simplification still.  I also
> think it's something for stage1 now :/

Fully agree, that's stage1 material.

> 
> After addressing comments above can you put the patch on a branch
> so I can play & fixup things a bit when I run into them?  Often it's easier
> to just do a change instead of writing up a review comment and expecting
> that to be point-on ;)

I really welcome that, I've pushed devel/loop-unswitch-support-switches
branch with first changes you pointed out. Feel free playing with the branch.

Thanks,
Martin

> 
> Thanks,
> Richard.
> 
>>
>> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>> SPEC 2006 and 2017 survive running -O3 -march=native. There are 2090 optimization notes
>> for SPEC 2006 (compared to 1945 before the patch).
>>
>> Thoughts?
>> Thanks,
>> Martin
  
Andrew MacLeod Jan. 6, 2022, 4:32 p.m. UTC | #62
On 1/6/22 11:02, Martin Liška wrote:
> On 1/6/22 16:11, Andrew MacLeod wrote:
>> On 1/5/22 07:34, Richard Biener wrote:
>>> On Thu, Dec 9, 2021 at 2:02 PM Martin Liška <mliska@suse.cz> wrote:
>>>> On 11/30/21 12:17, Richard Biener wrote:
>>>>
>>> +                 unswitch_predicate *predicate
>>> +                   = new unswitch_predicate (expr, idx, edge_index);
>>> +                 ranger->gori ().outgoing_edge_range_p
>>> (predicate->true_range, e,
>>> +                                                        idx,
>>> *get_global_range_query ());
>>> +                 /* Huge switches are not supported by Ranger.  */
>>> +                 if (predicate->true_range.undefined_p ())
>>>
>>> I hope ranger will set the range to varying_p () in that case, not
>>> undefined?  But even
>>> then, is that a reason for punting?  I guess we fail to prune cases in
>>> that case but
>>> the cost modeling should then account for those and thus we are at
>>> least consistent?
>>
>> huge switches not supported?  I don't know what you mean, either that 
>> or I forget :-)  If there are more edges than multi-ranges support, 
>> then things will start getting merged because they cant be 
>> represented.. and the default case may then also contain some values 
>> that also have cases..  but all inconsistencies will move towards 
>> varying.. not undefined.
>>
>> Andrew
>>
oh oh oh.   EVRp was spending too much time in very large switches, so 
we added a param to say "don't look at switches too big"   You are 
probably tripping over that

-param=evrp-switch-limit=
Common Joined UInteger Var(param_evrp_switch_limit) Init(50) 
Optimization Param
Maximum number of outgoing edges in a switch before EVRP will not 
process it.

and in GORI we check:

       // Do not process switches if they are too large.
       if (EDGE_COUNT (bb->succs) > (unsigned)param_evrp_switch_limit)
         return;

so maybe you want to temporarily override param_evrp_switch_limit to 
some big number for your purposes...  just set it back when you are done :-)


Andrew
  
Martin Liška Jan. 6, 2022, 4:35 p.m. UTC | #63
On 1/6/22 17:20, Andrew MacLeod wrote:
> So if you get a FALSE back, you cannot use any results because GORI is claiming that it cant figure something out... or there is nothing to figure out.   Most of rangers routines are implemented so that if they return FALSE, the result is meaningless.

All right, then it's my bad, sorry for the noise.

> 
> 
> what is IDX you are passing?  order_385?

Yep.

(gdb) p idx
$1 = <ssa_name 0x7ffff7733e10 383>


> 
> As a side note, theres a typo in the testcase:  .. Im not sure how that affects things, but :

Oh, yeah, that's typo :)

> 
>     defaut:
>              __builtin_unreachable ();
> 
> 
> default is misspelled...  maybe it thinks that some kind of runtime value?   I am surprised it even compiles.  maybe that is mucking up what GORI thiunks it can calculate?

But it does not affect anything. The bailout happens due to:

│      199    // Only process switches if it within the size limit.
│      200    if (EDGE_COUNT (e->src->succs) > (unsigned)m_max_edges)
│  >   201      return NULL;

Cheers,
Martin
  
Andrew MacLeod Jan. 6, 2022, 4:42 p.m. UTC | #64
On 1/6/22 11:35, Martin Liška wrote:
> On 1/6/22 17:20, Andrew MacLeod wrote:
>> So if you get a FALSE back, you cannot use any results because GORI 
>> is claiming that it cant figure something out... or there is nothing 
>> to figure out.   Most of rangers routines are implemented so that if 
>> they return FALSE, the result is meaningless.
>
> All right, then it's my bad, sorry for the noise.
>
>>
>>
>> what is IDX you are passing?  order_385?
>
> Yep.
>
> (gdb) p idx
> $1 = <ssa_name 0x7ffff7733e10 383>
>
>
>>
>> As a side note, theres a typo in the testcase:  .. Im not sure how 
>> that affects things, but :
>
> Oh, yeah, that's typo :)
>
>>
>>     defaut:
>>              __builtin_unreachable ();
>>
>>
>> default is misspelled...  maybe it thinks that some kind of runtime 
>> value?   I am surprised it even compiles.  maybe that is mucking up 
>> what GORI thiunks it can calculate?
>
> But it does not affect anything. The bailout happens due to:
>
> │      199    // Only process switches if it within the size limit.
> │      200    if (EDGE_COUNT (e->src->succs) > (unsigned)m_max_edges)
> │  >   201      return NULL;
>
> Cheers,
> Martin
>
yeah its created by:

gori_compute::gori_compute (int not_executable_flag)
                       : outgoing (param_evrp_switch_limit), tracer 
("GORI ")


so as long as you change it to whatever you want before you create a 
ranger,  you should get whatever limit you want. anything above 255 may 
produce imprecise default values, if there are big holes in the switch 
because multi ranges only currently support up to 255 ranges.. so if 
there are more than 255 distinct subranges, it wont be exact.

Andrew
  
Martin Liška Jan. 13, 2022, 4:01 p.m. UTC | #65
On 1/6/22 17:30, Martin Liška wrote:
> I really welcome that, I've pushed devel/loop-unswitch-support-switches
> branch with first changes you pointed out. Feel free playing with the branch.

Hello.

I've just pushed a revision to the branch that introduced top-level comment.
Feel free to play with the branch once you have spare cycles and we can
return to it next stage1.

Cheers,
Martin
  
Richard Biener Jan. 14, 2022, 7:23 a.m. UTC | #66
On Thu, Jan 13, 2022 at 5:01 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 1/6/22 17:30, Martin Liška wrote:
> > I really welcome that, I've pushed devel/loop-unswitch-support-switches
> > branch with first changes you pointed out. Feel free playing with the branch.
>
> Hello.
>
> I've just pushed a revision to the branch that introduced top-level comment.
> Feel free to play with the branch once you have spare cycles and we can
> return to it next stage1.

Thanks, will do!

Richard.

>
> Cheers,
> Martin
  

Patch

diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-6.c b/gcc/testsuite/gcc.dg/loop-unswitch-6.c
new file mode 100644
index 00000000000..8a022e0f200
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-6.c
@@ -0,0 +1,56 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details --param=max-unswitch-insns=1000 --param=max-unswitch-level=10" } */
+
+int
+__attribute__((noipa))
+foo(double *a, double *b, double *c, double *d, double *r, int size, int order)
+{
+  for (int i = 0; i < size; i++)
+  {
+    double tmp, tmp2;
+
+    switch(order)
+    {
+      case 0:
+        tmp = -8 * a[i];
+        tmp2 = 2 * b[i];
+        break;
+      case 1:
+        tmp = 3 * a[i] -  2 * b[i];
+        tmp2 = 5 * b[i] - 2 * c[i];
+        break;
+      case 2:
+        tmp = 9 * a[i] +  2 * b[i] + c[i];
+        tmp2 = 4 * b[i] + 2 * c[i] + 8 * d[i];
+        break;
+      case 3:
+        tmp = 3 * a[i] +  2 * b[i] - c[i];
+        tmp2 = b[i] - 2 * c[i] + 8 * d[i];
+        break;
+      defaut:
+        __builtin_unreachable ();
+    }
+
+    double x = 3 * tmp + d[i] + tmp;
+    double y = 3.4f * tmp + d[i] + tmp2;
+    r[i] = x + y;
+  }
+
+  return 0;
+}
+
+#define N 16 * 1024
+double aa[N], bb[N], cc[N], dd[N], rr[N];
+
+int main()
+{
+  for (int i = 0; i < 100 * 1000; i++)
+    foo (aa, bb, cc, dd, rr, N, i % 4);
+}
+
+
+/* Test that we actually unswitched something.  */
+/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 0" "unswitch" } } */
+/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 1" "unswitch" } } */
+/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 2" "unswitch" } } */
+/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 3" "unswitch" } } */
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-7.c b/gcc/testsuite/gcc.dg/loop-unswitch-7.c
new file mode 100644
index 00000000000..00f2fcff64b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-7.c
@@ -0,0 +1,45 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details --param=max-unswitch-insns=1000 --param=max-unswitch-level=10" } */
+
+int
+foo(double *a, double *b, double *c, double *d, double *r, int size, int order)
+{
+  for (int i = 0; i < size; i++)
+  {
+    double tmp, tmp2;
+
+    switch(order)
+    {
+      case 5 ... 6:
+      case 9:
+        tmp = -8 * a[i];
+        tmp2 = 2 * b[i];
+        break;
+      case 11:
+        tmp = 3 * a[i] -  2 * b[i];
+        tmp2 = 5 * b[i] - 2 * c[i];
+        break;
+      case 22:
+        tmp = 9 * a[i] +  2 * b[i] + c[i];
+        tmp2 = 4 * b[i] + 2 * c[i] + 8 * d[i];
+        break;
+      case 33:
+        tmp = 3 * a[i] +  2 * b[i] - c[i];
+        tmp2 = b[i] - 2 * c[i] + 8 * d[i];
+        break;
+      defaut:
+        __builtin_unreachable ();
+    }
+
+    double x = 3 * tmp + d[i] + tmp;
+    double y = 3.4f * tmp + d[i] + tmp2;
+    r[i] = x + y;
+  }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order_.* >= 5 & order_.* <= 6 | order_.* == 9" "unswitch" } } */
+/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 1" "unswitch" } } */
+/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 2" "unswitch" } } */
+/* { dg-final { scan-tree-dump ";; Unswitching loop with condition: order.* == 3" "unswitch" } } */
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-8.c b/gcc/testsuite/gcc.dg/loop-unswitch-8.c
new file mode 100644
index 00000000000..4cec1f53bcc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-8.c
@@ -0,0 +1,28 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
+
+int
+foo(double *a, double *b, double *c, double *d, double *r, int size, int order)
+{
+  for (int i = 0; i < size; i++)
+  {
+    double tmp;
+
+    if (order == 1)
+      tmp = -8 * a[i];
+    else
+      tmp = -4 * b[i];
+
+    double x = 3 * tmp + d[i] + tmp;
+
+    if (order == 1)
+      x += 2;
+
+    double y = 3.4f * tmp + d[i];
+    r[i] = x + y;
+  }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times ";; Unswitching loop with condition: order.* == 1" 1 "unswitch" } } */
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-9.c b/gcc/testsuite/gcc.dg/loop-unswitch-9.c
new file mode 100644
index 00000000000..3058d4a5b38
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-9.c
@@ -0,0 +1,34 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
+
+int
+foo(double *a, double *b, double *c, double *d, double *r, int size, unsigned order)
+{
+  for (int i = 0; i < size; i++)
+  {
+    double tmp;
+
+    switch (order)
+      {
+      case 0 ... 4:
+	tmp = -8 * a[i];
+	break;
+      default:
+	tmp = -4 * b[i];
+	break;
+      }
+
+    double x = 3 * tmp + d[i] + tmp;
+
+    /* This should not be unswitched as it's mutually excluded with case 0 ... 4.  */
+    if (order >= 5)
+      x += 2;
+
+    double y = 3.4f * tmp + d[i];
+    r[i] = x + y;
+  }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times ";; Unswitching loop with condition: order.* <= 4" 1 "unswitch" } } */
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index 367dcfa20bf..159e5d83c06 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -9053,11 +9053,16 @@  gimple_lv_add_condition_to_bb (basic_block first_head ATTRIBUTE_UNUSED,
    edge e0;
  
    /* Build new conditional expr */
+  gsi = gsi_last_bb (cond_bb);
+
+  cond_expr = force_gimple_operand_gsi_1 (&gsi, cond_expr,
+					  is_gimple_condexpr_for_cond,
+					  NULL_TREE, false,
+					  GSI_CONTINUE_LINKING);
    new_cond_expr = gimple_build_cond_from_tree (cond_expr,
  					       NULL_TREE, NULL_TREE);
  
    /* Add new cond in cond_bb.  */
-  gsi = gsi_last_bb (cond_bb);
    gsi_insert_after (&gsi, new_cond_expr, GSI_NEW_STMT);
  
    /* Adjust edges appropriately to connect new head with first head
diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
index fe4dacc0833..a509938c7d0 100644
--- a/gcc/tree-ssa-loop-unswitch.c
+++ b/gcc/tree-ssa-loop-unswitch.c
@@ -37,6 +37,10 @@  along with GCC; see the file COPYING3.  If not see
  #include "gimple-iterator.h"
  #include "cfghooks.h"
  #include "tree-ssa-loop-manip.h"
+#include "cfganal.h"
+#include "tree-cfgcleanup.h"
+#include "tree-pretty-print.h"
+#include "gimple-range.h"
  
  /* This file implements the loop unswitching, i.e. transformation of loops like
  
@@ -74,9 +78,9 @@  along with GCC; see the file COPYING3.  If not see
     tree-ssa-loop-im.c ensures that all the suitable conditions are in this
     shape.  */
  
-static class loop *tree_unswitch_loop (class loop *, basic_block, tree);
+static class loop *tree_unswitch_loop (class loop *, basic_block, tree, edge);
  static bool tree_unswitch_single_loop (class loop *, int);
-static tree tree_may_unswitch_on (basic_block, class loop *);
+static tree tree_may_unswitch_on (basic_block, class loop *, edge *);
  static bool tree_unswitch_outer_loop (class loop *);
  static edge find_loop_guard (class loop *);
  static bool empty_bb_without_guard_p (class loop *, basic_block);
@@ -84,6 +88,7 @@  static bool used_outside_loop_p (class loop *, tree);
  static void hoist_guard (class loop *, edge);
  static bool check_exit_phi (class loop *);
  static tree get_vop_from_header (class loop *);
+static void clean_up_switches (void);
  
  /* Main entry point.  Perform loop unswitching on all suitable loops.  */
  
@@ -103,7 +108,10 @@  tree_ssa_unswitch_loops (void)
      }
  
    if (changed)
-    return TODO_cleanup_cfg;
+    {
+      clean_up_switches ();
+      return TODO_cleanup_cfg;
+    }
    return 0;
  }
  
@@ -182,80 +190,114 @@  is_maybe_undefined (const tree name, gimple *stmt, class loop *loop)
  }
  
  /* Checks whether we can unswitch LOOP on condition at end of BB -- one of its
-   basic blocks (for what it means see comments below).  */
+   basic blocks (for what it means see comments below).
+   When a gswitch case is returned, fill up COND_EDGE for it.  */
  
  static tree
-tree_may_unswitch_on (basic_block bb, class loop *loop)
+tree_may_unswitch_on (basic_block bb, class loop *loop, edge *cond_edge)
  {
    gimple *last, *def;
-  gcond *stmt;
-  tree cond, use;
+  tree use;
    basic_block def_bb;
    ssa_op_iter iter;
  
    /* BB must end in a simple conditional jump.  */
    last = last_stmt (bb);
-  if (!last || gimple_code (last) != GIMPLE_COND)
-    return NULL_TREE;
-  stmt = as_a <gcond *> (last);
-
-  /* To keep the things simple, we do not directly remove the conditions,
-     but just replace tests with 0 != 0 resp. 1 != 0.  Prevent the infinite
-     loop where we would unswitch again on such a condition.  */
-  if (gimple_cond_true_p (stmt) || gimple_cond_false_p (stmt))
-    return NULL_TREE;
-
-  /* Condition must be invariant.  */
-  FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
+  if (gcond *stmt = safe_dyn_cast <gcond *> (last))
      {
-      def = SSA_NAME_DEF_STMT (use);
+      /* To keep the things simple, we do not directly remove the conditions,
+	 but just replace tests with 0 != 0 resp. 1 != 0.  Prevent the infinite
+	 loop where we would unswitch again on such a condition.  */
+      if (gimple_cond_true_p (stmt) || gimple_cond_false_p (stmt))
+	return NULL_TREE;
+
+      /* Condition must be invariant.  */
+      FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
+	{
+	  def = SSA_NAME_DEF_STMT (use);
+	  def_bb = gimple_bb (def);
+	  if (def_bb
+	      && flow_bb_inside_loop_p (loop, def_bb))
+	    return NULL_TREE;
+	  /* Unswitching on undefined values would introduce undefined
+	     behavior that the original program might never exercise.  */
+	  if (is_maybe_undefined (use, stmt, loop))
+	    return NULL_TREE;
+	}
+
+      edge e1 = EDGE_SUCC (bb, 0);
+      edge e2 = EDGE_SUCC (bb, 1);
+      *cond_edge = e1->flags & EDGE_TRUE_VALUE ? e1 : e2;
+      return build2 (gimple_cond_code (stmt), boolean_type_node,
+		     gimple_cond_lhs (stmt), gimple_cond_rhs (stmt));
+    }
+  else if (gswitch *stmt = safe_dyn_cast <gswitch *> (last))
+    {
+      unsigned nlabels = gimple_switch_num_labels (stmt);
+      tree idx = gimple_switch_index (stmt);
+      if (TREE_CODE (idx) != SSA_NAME
+	  || nlabels < 1)
+	return NULL_TREE;
+      /* Index must be invariant.  */
+      def = SSA_NAME_DEF_STMT (idx);
        def_bb = gimple_bb (def);
        if (def_bb
  	  && flow_bb_inside_loop_p (loop, def_bb))
  	return NULL_TREE;
        /* Unswitching on undefined values would introduce undefined
  	 behavior that the original program might never exercise.  */
-      if (is_maybe_undefined (use, stmt, loop))
+      if (is_maybe_undefined (idx, stmt, loop))
  	return NULL_TREE;
-    }
-
-  cond = build2 (gimple_cond_code (stmt), boolean_type_node,
-		 gimple_cond_lhs (stmt), gimple_cond_rhs (stmt));
  
-  return cond;
-}
-
-/* Simplifies COND using checks in front of the entry of the LOOP.  Just very
-   simplish (sufficient to prevent us from duplicating loop in unswitching
-   unnecessarily).  */
-
-static tree
-simplify_using_entry_checks (class loop *loop, tree cond)
-{
-  edge e = loop_preheader_edge (loop);
-  gimple *stmt;
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->succs)
+	{
+	  if (!(e->flags & EDGE_IGNORE))
+	    {
+	      /* Build compound expression for all cases leading
+		 to this edge.  */
+	      tree expr = NULL_TREE;
+	      for (unsigned i = 1; i < gimple_switch_num_labels (stmt); ++i)
+		{
+		  tree lab = gimple_switch_label (stmt, i);
+		  basic_block dest = label_to_block (cfun, CASE_LABEL (lab));
+		  edge e2 = find_edge (gimple_bb (stmt), dest);
+		  if (e == e2)
+		    {
+		      tree cmp;
+		      if (CASE_HIGH (lab) != NULL_TREE)
+			{
+			  tree cmp1 = build2 (GE_EXPR, boolean_type_node, idx,
+					      CASE_LOW (lab));
+			  tree cmp2 = build2 (LE_EXPR, boolean_type_node, idx,
+					      CASE_HIGH (lab));
+			  cmp = build2 (BIT_AND_EXPR, boolean_type_node, cmp1,
+					cmp2);
+			}
+		      else
+			cmp = build2 (EQ_EXPR, boolean_type_node, idx,
+				      CASE_LOW (lab));
+
+		      /* Combine the expression with the existing one.  */
+		      if (expr == NULL_TREE)
+			expr = cmp;
+		      else
+			expr = build2 (BIT_IOR_EXPR, boolean_type_node, expr,
+				       cmp);
+		    }
+		}
  
-  while (1)
-    {
-      stmt = last_stmt (e->src);
-      if (stmt
-	  && gimple_code (stmt) == GIMPLE_COND
-	  && gimple_cond_code (stmt) == TREE_CODE (cond)
-	  && operand_equal_p (gimple_cond_lhs (stmt),
-			      TREE_OPERAND (cond, 0), 0)
-	  && operand_equal_p (gimple_cond_rhs (stmt),
-			      TREE_OPERAND (cond, 1), 0))
-	return (e->flags & EDGE_TRUE_VALUE
-		? boolean_true_node
-		: boolean_false_node);
-
-      if (!single_pred_p (e->src))
-	return cond;
-
-      e = single_pred_edge (e->src);
-      if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
-	return cond;
+	      if (expr != NULL_TREE)
+		{
+		  *cond_edge = e;
+		  return expr;
+		}
+	    }
+	}
      }
+
+  return NULL_TREE;
  }
  
  /* Unswitch single LOOP.  NUM is number of unswitchings done; we do not allow
@@ -269,6 +311,7 @@  tree_unswitch_single_loop (class loop *loop, int num)
    class loop *nloop;
    unsigned i, found;
    tree cond = NULL_TREE;
+  edge cond_edge = NULL;
    gimple *stmt;
    bool changed = false;
    HOST_WIDE_INT iterations;
@@ -311,11 +354,12 @@  tree_unswitch_single_loop (class loop *loop, int num)
    bbs = get_loop_body (loop);
    found = loop->num_nodes;
  
+  gimple_ranger ranger;
    while (1)
      {
        /* Find a bb to unswitch on.  */
        for (; i < loop->num_nodes; i++)
-	if ((cond = tree_may_unswitch_on (bbs[i], loop)))
+	if ((cond = tree_may_unswitch_on (bbs[i], loop, &cond_edge)))
  	  break;
  
        if (i == loop->num_nodes)
@@ -333,24 +377,70 @@  tree_unswitch_single_loop (class loop *loop, int num)
  	  break;
  	}
  
-      cond = simplify_using_entry_checks (loop, cond);
        stmt = last_stmt (bbs[i]);
-      if (integer_nonzerop (cond))
+      gcond *condition = dyn_cast<gcond *> (stmt);
+      gswitch *swtch = dyn_cast<gswitch *> (stmt);
+
+      if (condition != NULL)
  	{
-	  /* Remove false path.  */
-	  gimple_cond_set_condition_from_tree (as_a <gcond *> (stmt),
-					       boolean_true_node);
-	  changed = true;
+	  int_range_max r;
+	  edge edge_true, edge_false;
+	  extract_true_false_edges_from_block (bbs[i], &edge_true, &edge_false);
+	  tree cond = gimple_cond_lhs (stmt);
+
+	  if (r.supports_type_p (TREE_TYPE (cond)))
+	    {
+	      if (ranger.range_on_edge (r, edge_true, cond)
+		  && r.undefined_p ())
+		{
+		  gimple_cond_set_condition_from_tree (condition,
+						       boolean_false_node);
+		  update_stmt (condition);
+		  changed = true;
+		}
+	      else if(ranger.range_on_edge (r, edge_false, gimple_cond_lhs (stmt))
+		      && r.undefined_p ())
+		{
+		  gimple_cond_set_condition_from_tree (condition,
+						       boolean_true_node);
+		  update_stmt (condition);
+		  changed = true;
+		}
+	    }
  	}
-      else if (integer_zerop (cond))
+      else if (swtch != NULL)
  	{
-	  /* Remove true path.  */
-	  gimple_cond_set_condition_from_tree (as_a <gcond *> (stmt),
-					       boolean_false_node);
-	  changed = true;
+	  hash_set<edge> ignored_edges;
+	  unsigned nlabels = gimple_switch_num_labels (swtch);
+	  tree index_candidate = NULL_TREE;
+
+	  for (unsigned i = 0; i < nlabels; ++i)
+	    {
+	      tree lab = gimple_switch_label (swtch, i);
+	      basic_block dest = label_to_block (cfun, CASE_LABEL (lab));
+	      edge e = find_edge (gimple_bb (stmt), dest);
+
+	      int_range_max r;
+	      if (ranger.range_on_edge (r, e, gimple_switch_index (swtch))
+		  && r.undefined_p ())
+		{
+		  e->flags |= EDGE_IGNORE;
+		  ignored_edges.add (e);
+		}
+	      else
+		index_candidate = CASE_LOW (lab);
+	    }
+
+	  if (index_candidate != NULL_TREE
+	      && ignored_edges.elements () == EDGE_COUNT (bbs[i]->succs) - 1)
+	    {
+	      gimple_switch_set_index (swtch, index_candidate);
+	      update_stmt (swtch);
+	    }
  	}
+
        /* Do not unswitch too much.  */
-      else if (num > param_max_unswitch_level)
+      if (num > param_max_unswitch_level)
  	{
  	  i++;
  	  continue;
@@ -371,7 +461,6 @@  tree_unswitch_single_loop (class loop *loop, int num)
  	  break;
  	}
  
-      update_stmt (stmt);
        i++;
      }
  
@@ -435,7 +524,7 @@  tree_unswitch_single_loop (class loop *loop, int num)
        /* Find a bb to unswitch on.  */
        for (; found < loop->num_nodes; found++)
  	if ((bbs[found]->flags & BB_REACHABLE)
-	    && (cond = tree_may_unswitch_on (bbs[found], loop)))
+	    && (cond = tree_may_unswitch_on (bbs[found], loop, &cond_edge)))
  	  break;
  
        if (found == loop->num_nodes)
@@ -446,11 +535,15 @@  tree_unswitch_single_loop (class loop *loop, int num)
      }
  
    if (dump_file && (dump_flags & TDF_DETAILS))
-    fprintf (dump_file, ";; Unswitching loop\n");
+    {
+      fprintf (dump_file, ";; Unswitching loop with condition: ");
+      print_generic_expr (dump_file, cond);
+      fprintf (dump_file, "\n");
+    }
  
    initialize_original_copy_tables ();
    /* Unswitch the loop on this condition.  */
-  nloop = tree_unswitch_loop (loop, bbs[found], cond);
+  nloop = tree_unswitch_loop (loop, bbs[found], cond, cond_edge);
    if (!nloop)
      {
        free_original_copy_tables ();
@@ -476,18 +569,15 @@  tree_unswitch_single_loop (class loop *loop, int num)
  
  static class loop *
  tree_unswitch_loop (class loop *loop,
-		    basic_block unswitch_on, tree cond)
+		    basic_block unswitch_on, tree cond, edge cond_edge)
  {
    profile_probability prob_true;
-  edge edge_true, edge_false;
  
    /* Some sanity checking.  */
    gcc_assert (flow_bb_inside_loop_p (loop, unswitch_on));
-  gcc_assert (EDGE_COUNT (unswitch_on->succs) == 2);
    gcc_assert (loop->inner == NULL);
  
-  extract_true_false_edges_from_block (unswitch_on, &edge_true, &edge_false);
-  prob_true = edge_true->probability;
+  prob_true = cond_edge->probability;
    return loop_version (loop, unshare_expr (cond),
  		       NULL, prob_true,
  		       prob_true.invert (),
@@ -965,6 +1055,42 @@  check_exit_phi (class loop *loop)
    return true;
  }
  
+/* Remove all dead cases from switches that are unswitched.  */
+
+static void
+clean_up_switches (void)
+{
+  basic_block bb;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      gimple *last = last_stmt (bb);
+      if (gswitch *stmt = safe_dyn_cast <gswitch *> (last))
+	{
+	  unsigned nlabels = gimple_switch_num_labels (stmt);
+	  unsigned index = 1;
+	  for (unsigned i = 1; i < nlabels; ++i)
+	    {
+	      tree lab = gimple_switch_label (stmt, i);
+	      basic_block dest = label_to_block (cfun, CASE_LABEL (lab));
+	      edge e = find_edge (gimple_bb (stmt), dest);
+	      if (e == NULL)
+		; /* The edge is already removed.  */
+	      else if (e->flags & EDGE_IGNORE)
+		remove_edge (e);
+	      else
+		{
+		  gimple_switch_set_label (stmt, index, lab);
+		  ++index;
+		}
+	    }
+
+	  if (index != nlabels)
+	    gimple_switch_set_num_labels (stmt, index);
+	}
+    }
+}
+
  /* Loop unswitching pass.  */
  
  namespace {