[3/3] Fix loop split incorrect count and probability

Message ID 20211208055416.1415283-4-luoxhu@linux.ibm.com
State Committed
Commit cd5ae148c47c6dee05adb19acd6a523f7187be7f
Headers
Series Dependency patches for hoist LIM code to cold loop |

Commit Message

Xionghu Luo Dec. 8, 2021, 5:54 a.m. UTC
  In tree-ssa-loop-split.c, split_loop and split_loop_on_cond does two
kind of split. split_loop only works for single loop and insert edge at
exit when split, while split_loop_on_cond is not limited to single loop
and insert edge at latch when split.  Both split behavior should consider
loop count and probability update.  For split_loop, loop split condition
is moved in front of loop1 and loop2; But split_loop_on_cond moves the
condition between loop1 and loop2, this patch does:
 1) profile count proportion for both original loop and copied loop
without dropping down the true branch's count;
 2) probability update in the two loops and between the two loops.

Regression tested pass, OK for master?

Changes diff for split_loop and split_loop_on_cond cases:

1) diff base/loop-split.c.151t.lsplit patched/loop-split.c.152t.lsplit
...
   <bb 2> [local count: 118111600]:
   if (beg_5(D) < end_8(D))
     goto <bb 14>; [89.00%]
   else
     goto <bb 6>; [11.00%]

   <bb 14> [local count: 105119324]:
   if (beg2_6(D) < c_9(D))
-    goto <bb 15>; [100.00%]
+    goto <bb 15>; [33.00%]
   else
-    goto <bb 16>; [100.00%]
+    goto <bb 16>; [67.00%]

-  <bb 15> [local count: 105119324]:
+  <bb 15> [local count: 34689377]:
   _25 = beg_5(D) + 1;
   _26 = end_8(D) - beg_5(D);
   _27 = beg2_6(D) + _26;
   _28 = MIN_EXPR <c_9(D), _27>;

-  <bb 3> [local count: 955630225]:
+  <bb 3> [local count: 315357973]:
   # i_16 = PHI <i_11(8), beg_5(D)(15)>
   # j_17 = PHI <j_12(8), beg2_6(D)(15)>
   printf ("a: %d %d\n", i_16, j_17);
   i_11 = i_16 + 1;
   j_12 = j_17 + 1;
   if (j_12 < _28)
-    goto <bb 8>; [89.00%]
+    goto <bb 8>; [29.37%]
   else
-    goto <bb 17>; [11.00%]
+    goto <bb 17>; [70.63%]

-  <bb 8> [local count: 850510901]:
+  <bb 8> [local count: 280668596]:
   goto <bb 3>; [100.00%]

-  <bb 16> [local count: 105119324]:
+  <bb 16> [local count: 70429947]:
   # i_22 = PHI <beg_5(D)(14), i_29(17)>
   # j_23 = PHI <beg2_6(D)(14), j_30(17)>

   <bb 10> [local count: 955630225]:
   # i_2 = PHI <i_22(16), i_20(13)>
   # j_1 = PHI <j_23(16), j_21(13)>
   i_20 = i_2 + 1;
   j_21 = j_1 + 1;
   if (end_8(D) > i_20)
-    goto <bb 13>; [89.00%]
+    goto <bb 13>; [59.63%]
   else
-    goto <bb 9>; [11.00%]
+    goto <bb 9>; [40.37%]

-  <bb 13> [local count: 850510901]:
+  <bb 13> [local count: 569842305]:
   goto <bb 10>; [100.00%]

   <bb 17> [local count: 105119324]:
   # i_29 = PHI <i_11(3)>
   # j_30 = PHI <j_12(3)>
   if (end_8(D) > i_29)
     goto <bb 16>; [80.00%]
   else
     goto <bb 9>; [20.00%]

   <bb 9> [local count: 105119324]:

   <bb 6> [local count: 118111600]:
   return 0;

 }
   <bb 2> [local count: 118111600]:
-  if (beg_5(D) < end_8(D))
+  _1 = end_6(D) - beg_7(D);
+  j_9 = _1 + beg2_8(D);
+  if (end_6(D) > beg_7(D))
     goto <bb 14>; [89.00%]
   else
     goto <bb 6>; [11.00%]

   <bb 14> [local count: 105119324]:
-  if (beg2_6(D) < c_9(D))
-    goto <bb 15>; [100.00%]
+  if (j_9 >= c_11(D))
+    goto <bb 15>; [33.00%]
   else
-    goto <bb 16>; [100.00%]
+    goto <bb 16>; [67.00%]

-  <bb 15> [local count: 105119324]:
-  _25 = beg_5(D) + 1;
-  _26 = end_8(D) - beg_5(D);
-  _27 = beg2_6(D) + _26;
-  _28 = MIN_EXPR <c_9(D), _27>;
-
-  <bb 3> [local count: 955630225]:
-  # i_16 = PHI <i_11(8), beg_5(D)(15)>
-  # j_17 = PHI <j_12(8), beg2_6(D)(15)>
-  printf ("a: %d %d\n", i_16, j_17);
-  i_11 = i_16 + 1;
-  j_12 = j_17 + 1;
-  if (j_12 < _28)
-    goto <bb 8>; [89.00%]
+  <bb 15> [local count: 34689377]:
+  _27 = end_6(D) + -1;
+  _28 = beg_7(D) - end_6(D);
+  _29 = j_9 + _28;
+  _30 = _29 + 1;
+  _31 = MAX_EXPR <c_11(D), _30>;
+
+  <bb 3> [local count: 315357973]:
+  # i_18 = PHI <i_13(8), end_6(D)(15)>
+  # j_19 = PHI <j_14(8), j_9(15)>
+  printf ("a: %d %d\n", i_18, j_19);
+  i_13 = i_18 + -1;
+  j_14 = j_19 + -1;
+  if (j_14 >= _31)
+    goto <bb 8>; [29.37%]
   else
-    goto <bb 17>; [11.00%]
+    goto <bb 17>; [70.63%]

-  <bb 8> [local count: 850510901]:
+  <bb 8> [local count: 280668596]:
   goto <bb 3>; [100.00%]

-  <bb 16> [local count: 105119324]:
-  # i_22 = PHI <beg_5(D)(14), i_29(17)>
-  # j_23 = PHI <beg2_6(D)(14), j_30(17)>
+  <bb 16> [local count: 70429947]:
+  # i_24 = PHI <end_6(D)(14), i_32(17)>
+  # j_25 = PHI <j_9(14), j_33(17)>

   <bb 10> [local count: 955630225]:
-  # i_2 = PHI <i_22(16), i_20(13)>
-  # j_1 = PHI <j_23(16), j_21(13)>
-  i_20 = i_2 + 1;
-  j_21 = j_1 + 1;
-  if (end_8(D) > i_20)
+  # i_3 = PHI <i_24(16), i_22(13)>
+  # j_2 = PHI <j_25(16), j_23(13)>
+  i_22 = i_3 + -1;
+  j_23 = j_2 + -1;
+  if (beg_7(D) < i_22)
     goto <bb 13>; [89.00%]
   else
     goto <bb 9>; [11.00%]

-  <bb 13> [local count: 850510901]:
+  <bb 13> [local count: 569842305]:
   goto <bb 10>; [100.00%]

   <bb 17> [local count: 105119324]:
-  # i_29 = PHI <i_11(3)>
-  # j_30 = PHI <j_12(3)>
-  if (end_8(D) > i_29)
+  # i_32 = PHI <i_13(3)>
+  # j_33 = PHI <j_14(3)>
+  if (beg_7(D) < i_32)
     goto <bb 16>; [80.00%]
   else
     goto <bb 9>; [20.00%]

   <bb 9> [local count: 105119324]:

   <bb 6> [local count: 118111600]:
   return 0;

 }

2) diff base/loop-cond-split-1.c.151t.lsplit  patched/loop-cond-split-1.c.151t.lsplit:
...
   <bb 2> [local count: 118111600]:
   if (n_7(D) > 0)
     goto <bb 4>; [89.00%]
   else
     goto <bb 3>; [11.00%]

   <bb 3> [local count: 118111600]:
   return;

   <bb 4> [local count: 105119324]:
   pretmp_3 = ga;

-  <bb 5> [local count: 955630225]:
+  <bb 5> [local count: 315357973]:
   # i_13 = PHI <i_10(20), 0(4)>
   # prephitmp_12 = PHI <prephitmp_5(20), pretmp_3(4)>
   if (prephitmp_12 != 0)
     goto <bb 6>; [33.00%]
   else
     goto <bb 7>; [67.00%]

   <bb 6> [local count: 315357972]:
   _2 = do_something ();
   ga = _2;

-  <bb 7> [local count: 955630225]:
+  <bb 7> [local count: 315357973]:
   # prephitmp_5 = PHI <prephitmp_12(5), _2(6)>
   i_10 = inc (i_13);
   if (n_7(D) > i_10)
     goto <bb 21>; [89.00%]
   else
     goto <bb 11>; [11.00%]

   <bb 11> [local count: 105119324]:
   goto <bb 3>; [100.00%]

-  <bb 21> [local count: 850510901]:
+  <bb 21> [local count: 280668596]:
   if (prephitmp_12 != 0)
-    goto <bb 20>; [100.00%]
+    goto <bb 20>; [33.00%]
   else
-    goto <bb 19>; [INV]
+    goto <bb 19>; [67.00%]

-  <bb 20> [local count: 850510901]:
+  <bb 20> [local count: 280668596]:
   goto <bb 5>; [100.00%]

-  <bb 19> [count: 0]:
+  <bb 19> [local count: 70429947]:
   # i_23 = PHI <i_10(21)>
   # prephitmp_25 = PHI <prephitmp_5(21)>

-  <bb 12> [local count: 955630225]:
+  <bb 12> [local count: 640272252]:
   # i_15 = PHI <i_23(19), i_22(16)>
   # prephitmp_16 = PHI <prephitmp_25(19), prephitmp_16(16)>
   i_22 = inc (i_15);
   if (n_7(D) > i_22)
     goto <bb 16>; [89.00%]
   else
     goto <bb 11>; [11.00%]

-  <bb 16> [local count: 850510901]:
+  <bb 16> [local count: 569842305]:
   goto <bb 12>; [100.00%]

 }

gcc/ChangeLog:

	* tree-ssa-loop-split.c (split_loop): Fix incorrect
	profile_count and probability.
	(do_split_loop_on_cond): Likewise.
---
 gcc/tree-ssa-loop-split.c | 85 +++++++++++++++++++++++++++++++++++----
 1 file changed, 77 insertions(+), 8 deletions(-)
  

Comments

Jeff Law Dec. 8, 2021, 11:47 p.m. UTC | #1
On 12/7/2021 10:54 PM, Xionghu Luo via Gcc-patches wrote:
> In tree-ssa-loop-split.c, split_loop and split_loop_on_cond does two
> kind of split. split_loop only works for single loop and insert edge at
> exit when split, while split_loop_on_cond is not limited to single loop
> and insert edge at latch when split.  Both split behavior should consider
> loop count and probability update.  For split_loop, loop split condition
> is moved in front of loop1 and loop2; But split_loop_on_cond moves the
> condition between loop1 and loop2, this patch does:
>   1) profile count proportion for both original loop and copied loop
> without dropping down the true branch's count;
>   2) probability update in the two loops and between the two loops.
>
> Regression tested pass, OK for master?
>
> Changes diff for split_loop and split_loop_on_cond cases:
>
> 1) diff base/loop-split.c.151t.lsplit patched/loop-split.c.152t.lsplit
> ...
>     <bb 2> [local count: 118111600]:
>     if (beg_5(D) < end_8(D))
>       goto <bb 14>; [89.00%]
>     else
>       goto <bb 6>; [11.00%]
>
>     <bb 14> [local count: 105119324]:
>     if (beg2_6(D) < c_9(D))
> -    goto <bb 15>; [100.00%]
> +    goto <bb 15>; [33.00%]
>     else
> -    goto <bb 16>; [100.00%]
> +    goto <bb 16>; [67.00%]
>
> -  <bb 15> [local count: 105119324]:
> +  <bb 15> [local count: 34689377]:
>     _25 = beg_5(D) + 1;
>     _26 = end_8(D) - beg_5(D);
>     _27 = beg2_6(D) + _26;
>     _28 = MIN_EXPR <c_9(D), _27>;
>
> -  <bb 3> [local count: 955630225]:
> +  <bb 3> [local count: 315357973]:
>     # i_16 = PHI <i_11(8), beg_5(D)(15)>
>     # j_17 = PHI <j_12(8), beg2_6(D)(15)>
>     printf ("a: %d %d\n", i_16, j_17);
>     i_11 = i_16 + 1;
>     j_12 = j_17 + 1;
>     if (j_12 < _28)
> -    goto <bb 8>; [89.00%]
> +    goto <bb 8>; [29.37%]
>     else
> -    goto <bb 17>; [11.00%]
> +    goto <bb 17>; [70.63%]
>
> -  <bb 8> [local count: 850510901]:
> +  <bb 8> [local count: 280668596]:
>     goto <bb 3>; [100.00%]
>
> -  <bb 16> [local count: 105119324]:
> +  <bb 16> [local count: 70429947]:
>     # i_22 = PHI <beg_5(D)(14), i_29(17)>
>     # j_23 = PHI <beg2_6(D)(14), j_30(17)>
>
>     <bb 10> [local count: 955630225]:
>     # i_2 = PHI <i_22(16), i_20(13)>
>     # j_1 = PHI <j_23(16), j_21(13)>
>     i_20 = i_2 + 1;
>     j_21 = j_1 + 1;
>     if (end_8(D) > i_20)
> -    goto <bb 13>; [89.00%]
> +    goto <bb 13>; [59.63%]
>     else
> -    goto <bb 9>; [11.00%]
> +    goto <bb 9>; [40.37%]
>
> -  <bb 13> [local count: 850510901]:
> +  <bb 13> [local count: 569842305]:
>     goto <bb 10>; [100.00%]
>
>     <bb 17> [local count: 105119324]:
>     # i_29 = PHI <i_11(3)>
>     # j_30 = PHI <j_12(3)>
>     if (end_8(D) > i_29)
>       goto <bb 16>; [80.00%]
>     else
>       goto <bb 9>; [20.00%]
>
>     <bb 9> [local count: 105119324]:
>
>     <bb 6> [local count: 118111600]:
>     return 0;
>
>   }
>     <bb 2> [local count: 118111600]:
> -  if (beg_5(D) < end_8(D))
> +  _1 = end_6(D) - beg_7(D);
> +  j_9 = _1 + beg2_8(D);
> +  if (end_6(D) > beg_7(D))
>       goto <bb 14>; [89.00%]
>     else
>       goto <bb 6>; [11.00%]
>
>     <bb 14> [local count: 105119324]:
> -  if (beg2_6(D) < c_9(D))
> -    goto <bb 15>; [100.00%]
> +  if (j_9 >= c_11(D))
> +    goto <bb 15>; [33.00%]
>     else
> -    goto <bb 16>; [100.00%]
> +    goto <bb 16>; [67.00%]
>
> -  <bb 15> [local count: 105119324]:
> -  _25 = beg_5(D) + 1;
> -  _26 = end_8(D) - beg_5(D);
> -  _27 = beg2_6(D) + _26;
> -  _28 = MIN_EXPR <c_9(D), _27>;
> -
> -  <bb 3> [local count: 955630225]:
> -  # i_16 = PHI <i_11(8), beg_5(D)(15)>
> -  # j_17 = PHI <j_12(8), beg2_6(D)(15)>
> -  printf ("a: %d %d\n", i_16, j_17);
> -  i_11 = i_16 + 1;
> -  j_12 = j_17 + 1;
> -  if (j_12 < _28)
> -    goto <bb 8>; [89.00%]
> +  <bb 15> [local count: 34689377]:
> +  _27 = end_6(D) + -1;
> +  _28 = beg_7(D) - end_6(D);
> +  _29 = j_9 + _28;
> +  _30 = _29 + 1;
> +  _31 = MAX_EXPR <c_11(D), _30>;
> +
> +  <bb 3> [local count: 315357973]:
> +  # i_18 = PHI <i_13(8), end_6(D)(15)>
> +  # j_19 = PHI <j_14(8), j_9(15)>
> +  printf ("a: %d %d\n", i_18, j_19);
> +  i_13 = i_18 + -1;
> +  j_14 = j_19 + -1;
> +  if (j_14 >= _31)
> +    goto <bb 8>; [29.37%]
>     else
> -    goto <bb 17>; [11.00%]
> +    goto <bb 17>; [70.63%]
>
> -  <bb 8> [local count: 850510901]:
> +  <bb 8> [local count: 280668596]:
>     goto <bb 3>; [100.00%]
>
> -  <bb 16> [local count: 105119324]:
> -  # i_22 = PHI <beg_5(D)(14), i_29(17)>
> -  # j_23 = PHI <beg2_6(D)(14), j_30(17)>
> +  <bb 16> [local count: 70429947]:
> +  # i_24 = PHI <end_6(D)(14), i_32(17)>
> +  # j_25 = PHI <j_9(14), j_33(17)>
>
>     <bb 10> [local count: 955630225]:
> -  # i_2 = PHI <i_22(16), i_20(13)>
> -  # j_1 = PHI <j_23(16), j_21(13)>
> -  i_20 = i_2 + 1;
> -  j_21 = j_1 + 1;
> -  if (end_8(D) > i_20)
> +  # i_3 = PHI <i_24(16), i_22(13)>
> +  # j_2 = PHI <j_25(16), j_23(13)>
> +  i_22 = i_3 + -1;
> +  j_23 = j_2 + -1;
> +  if (beg_7(D) < i_22)
>       goto <bb 13>; [89.00%]
>     else
>       goto <bb 9>; [11.00%]
>
> -  <bb 13> [local count: 850510901]:
> +  <bb 13> [local count: 569842305]:
>     goto <bb 10>; [100.00%]
>
>     <bb 17> [local count: 105119324]:
> -  # i_29 = PHI <i_11(3)>
> -  # j_30 = PHI <j_12(3)>
> -  if (end_8(D) > i_29)
> +  # i_32 = PHI <i_13(3)>
> +  # j_33 = PHI <j_14(3)>
> +  if (beg_7(D) < i_32)
>       goto <bb 16>; [80.00%]
>     else
>       goto <bb 9>; [20.00%]
>
>     <bb 9> [local count: 105119324]:
>
>     <bb 6> [local count: 118111600]:
>     return 0;
>
>   }
>
> 2) diff base/loop-cond-split-1.c.151t.lsplit  patched/loop-cond-split-1.c.151t.lsplit:
> ...
>     <bb 2> [local count: 118111600]:
>     if (n_7(D) > 0)
>       goto <bb 4>; [89.00%]
>     else
>       goto <bb 3>; [11.00%]
>
>     <bb 3> [local count: 118111600]:
>     return;
>
>     <bb 4> [local count: 105119324]:
>     pretmp_3 = ga;
>
> -  <bb 5> [local count: 955630225]:
> +  <bb 5> [local count: 315357973]:
>     # i_13 = PHI <i_10(20), 0(4)>
>     # prephitmp_12 = PHI <prephitmp_5(20), pretmp_3(4)>
>     if (prephitmp_12 != 0)
>       goto <bb 6>; [33.00%]
>     else
>       goto <bb 7>; [67.00%]
>
>     <bb 6> [local count: 315357972]:
>     _2 = do_something ();
>     ga = _2;
>
> -  <bb 7> [local count: 955630225]:
> +  <bb 7> [local count: 315357973]:
>     # prephitmp_5 = PHI <prephitmp_12(5), _2(6)>
>     i_10 = inc (i_13);
>     if (n_7(D) > i_10)
>       goto <bb 21>; [89.00%]
>     else
>       goto <bb 11>; [11.00%]
>
>     <bb 11> [local count: 105119324]:
>     goto <bb 3>; [100.00%]
>
> -  <bb 21> [local count: 850510901]:
> +  <bb 21> [local count: 280668596]:
>     if (prephitmp_12 != 0)
> -    goto <bb 20>; [100.00%]
> +    goto <bb 20>; [33.00%]
>     else
> -    goto <bb 19>; [INV]
> +    goto <bb 19>; [67.00%]
>
> -  <bb 20> [local count: 850510901]:
> +  <bb 20> [local count: 280668596]:
>     goto <bb 5>; [100.00%]
>
> -  <bb 19> [count: 0]:
> +  <bb 19> [local count: 70429947]:
>     # i_23 = PHI <i_10(21)>
>     # prephitmp_25 = PHI <prephitmp_5(21)>
>
> -  <bb 12> [local count: 955630225]:
> +  <bb 12> [local count: 640272252]:
>     # i_15 = PHI <i_23(19), i_22(16)>
>     # prephitmp_16 = PHI <prephitmp_25(19), prephitmp_16(16)>
>     i_22 = inc (i_15);
>     if (n_7(D) > i_22)
>       goto <bb 16>; [89.00%]
>     else
>       goto <bb 11>; [11.00%]
>
> -  <bb 16> [local count: 850510901]:
> +  <bb 16> [local count: 569842305]:
>     goto <bb 12>; [100.00%]
>
>   }
>
> gcc/ChangeLog:
>
> 	* tree-ssa-loop-split.c (split_loop): Fix incorrect
> 	profile_count and probability.
> 	(do_split_loop_on_cond): Likewise.
> ---
>   gcc/tree-ssa-loop-split.c | 85 +++++++++++++++++++++++++++++++++++----
>   1 file changed, 77 insertions(+), 8 deletions(-)
>
> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
> index 3f6ad046623..33128061aab 100644
> --- a/gcc/tree-ssa-loop-split.c
> +++ b/gcc/tree-ssa-loop-split.c
>
> @@ -607,6 +610,38 @@ split_loop (class loop *loop1)
>   	tree guard_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop1));
>   	patch_loop_exit (loop1, guard_stmt, guard_next, newend, initial_true);
>   
> +	update_ssa (TODO_update_ssa);
> +
> +	/* Proportion first loop's bb counts except those dominated by true
> +	   branch to avoid drop 1s down.  */
> +	basic_block *bbs1, *bbs2;
> +	bbs1 = get_loop_body (loop1);
> +	unsigned j;
> +	for (j = 0; j < loop1->num_nodes; j++)
> +	  if (bbs1[j] == loop1->latch
> +	      || !dominated_by_p (CDI_DOMINATORS, bbs1[j], true_edge->dest))
> +	    bbs1[j]->count
> +	      = bbs1[j]->count.apply_probability (true_edge->probability);
> +	free (bbs1);
It looks like there's two copies of this code in this patch, one in 
split_loop and the other in do_split_loop_on_cond.  Would it make sense 
to factor it out into its own little function?


> +
> +	/* Proportion second loop's bb counts except those dominated by false
> +	   branch to avoid drop 1s down.  */
> +	basic_block bbi_copy = get_bb_copy (false_edge->dest);
> +	bbs2 = get_loop_body (loop2);
> +	for (j = 0; j < loop2->num_nodes; j++)
> +	  if (bbs2[j] == loop2->latch
> +	      || !dominated_by_p (CDI_DOMINATORS, bbs2[j], bbi_copy))
> +	    bbs2[j]->count = bbs2[j]->count.apply_probability (
> +	      true_edge->probability.invert ());
> +	free (bbs2);
Similarly for this block of code.

If those can be reasonably factored out into two helper functions to be 
called from split_loop and do_split_loop_on_cond, then this is OK with 
the refactoring.

jeff
  
Xionghu Luo Dec. 13, 2021, 8:57 a.m. UTC | #2
On 2021/12/9 07:47, Jeff Law wrote:
>> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
>> index 3f6ad046623..33128061aab 100644
>> --- a/gcc/tree-ssa-loop-split.c
>> +++ b/gcc/tree-ssa-loop-split.c
>>
>> @@ -607,6 +610,38 @@ split_loop (class loop *loop1)
>>       tree guard_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge
>> (loop1));
>>       patch_loop_exit (loop1, guard_stmt, guard_next, newend,
>> initial_true);
>>   +    update_ssa (TODO_update_ssa);
>> +
>> +    /* Proportion first loop's bb counts except those dominated by true
>> +       branch to avoid drop 1s down.  */
>> +    basic_block *bbs1, *bbs2;
>> +    bbs1 = get_loop_body (loop1);
>> +    unsigned j;
>> +    for (j = 0; j < loop1->num_nodes; j++)
>> +      if (bbs1[j] == loop1->latch
>> +          || !dominated_by_p (CDI_DOMINATORS, bbs1[j], true_edge->dest))
>> +        bbs1[j]->count
>> +          = bbs1[j]->count.apply_probability (true_edge->probability);
>> +    free (bbs1);
> It looks like there's two copies of this code in this patch, one in
> split_loop and the other in do_split_loop_on_cond.  Would it make sense
> to factor it out into its own little function?
> 
> 
>> +
>> +    /* Proportion second loop's bb counts except those dominated by
>> false
>> +       branch to avoid drop 1s down.  */
>> +    basic_block bbi_copy = get_bb_copy (false_edge->dest);
>> +    bbs2 = get_loop_body (loop2);
>> +    for (j = 0; j < loop2->num_nodes; j++)
>> +      if (bbs2[j] == loop2->latch
>> +          || !dominated_by_p (CDI_DOMINATORS, bbs2[j], bbi_copy))
>> +        bbs2[j]->count = bbs2[j]->count.apply_probability (
>> +          true_edge->probability.invert ());
>> +    free (bbs2);
> Similarly for this block of code.
> 
> If those can be reasonably factored out into two helper functions to be
> called from split_loop and do_split_loop_on_cond, then this is OK with
> the refactoring.
> 
> jeff


Thanks for the comments, updated as below.  Will commit this patchset and the
approved patch for LIM if there are no objections:


[PATCH v2 3/3] Fix loop split incorrect count and probability

In tree-ssa-loop-split.c, split_loop and split_loop_on_cond does two
kind of split. split_loop only works for single loop and insert edge at
exit when split, while split_loop_on_cond is not limited to single loop
and insert edge at latch when split.  Both split behavior should consider
loop count and probability update.  For split_loop, loop split condition
is moved in front of loop1 and loop2; But split_loop_on_cond moves the
condition between loop1 and loop2, this patch does:
 1) profile count proportion for both original loop and copied loop
without dropping down the true branch's count;
 2) probability update in the two loops and between the two loops.

Regression tested pass, OK for master?

Changes diff for split_loop and split_loop_on_cond cases:

1) diff base/loop-split.c.151t.lsplit patched/loop-split.c.152t.lsplit
...
   <bb 2> [local count: 118111600]:
   _1 = end_6(D) - beg_7(D);
   j_9 = _1 + beg2_8(D);
   if (end_6(D) > beg_7(D))
     goto <bb 14>; [89.00%]
   else
     goto <bb 6>; [11.00%]

   <bb 14> [local count: 105119324]:
   if (j_9 >= c_11(D))
-    goto <bb 15>; [100.00%]
+    goto <bb 15>; [33.00%]
   else
-    goto <bb 16>; [100.00%]
+    goto <bb 16>; [67.00%]

-  <bb 15> [local count: 105119324]:
+  <bb 15> [local count: 34689377]:
   _27 = end_6(D) + -1;
   _28 = beg_7(D) - end_6(D);
   _29 = j_9 + _28;
   _30 = _29 + 1;
   _31 = MAX_EXPR <c_11(D), _30>;

-  <bb 3> [local count: 955630225]:
+  <bb 3> [local count: 315357973]:
   # i_18 = PHI <i_13(8), end_6(D)(15)>
   # j_19 = PHI <j_14(8), j_9(15)>
   printf ("a: %d %d\n", i_18, j_19);
   i_13 = i_18 + -1;
   j_14 = j_19 + -1;
   if (j_14 >= _31)
-    goto <bb 8>; [89.00%]
+    goto <bb 8>; [29.37%]
   else
-    goto <bb 17>; [11.00%]
+    goto <bb 17>; [70.63%]

-  <bb 8> [local count: 850510901]:
+  <bb 8> [local count: 280668596]:
   goto <bb 3>; [100.00%]

-  <bb 16> [local count: 105119324]:
+  <bb 16> [local count: 70429947]:
   # i_24 = PHI <end_6(D)(14), i_32(17)>
   # j_25 = PHI <j_9(14), j_33(17)>

   <bb 10> [local count: 955630225]:
   # i_3 = PHI <i_24(16), i_22(13)>
   # j_2 = PHI <j_25(16), j_23(13)>
   i_22 = i_3 + -1;
   j_23 = j_2 + -1;
   if (beg_7(D) < i_22)
     goto <bb 13>; [89.00%]
   else
     goto <bb 9>; [11.00%]

-  <bb 13> [local count: 850510901]:
+  <bb 13> [local count: 569842305]:
   goto <bb 10>; [100.00%]

   <bb 17> [local count: 105119324]:
   # i_32 = PHI <i_13(3)>
   # j_33 = PHI <j_14(3)>
   if (beg_7(D) < i_32)
     goto <bb 16>; [80.00%]
   else
     goto <bb 9>; [20.00%]

   <bb 9> [local count: 105119324]:

   <bb 6> [local count: 118111600]:
   return 0;

 }

2) diff base/loop-cond-split-1.c.151t.lsplit  patched/loop-cond-split-1.c.151t.lsplit:
...
   <bb 2> [local count: 118111600]:
   if (n_7(D) > 0)
     goto <bb 4>; [89.00%]
   else
     goto <bb 3>; [11.00%]

   <bb 3> [local count: 118111600]:
   return;

   <bb 4> [local count: 105119324]:
   pretmp_3 = ga;

-  <bb 5> [local count: 955630225]:
+  <bb 5> [local count: 315357973]:
   # i_13 = PHI <i_10(20), 0(4)>
   # prephitmp_12 = PHI <prephitmp_5(20), pretmp_3(4)>
   if (prephitmp_12 != 0)
     goto <bb 6>; [33.00%]
   else
     goto <bb 7>; [67.00%]

   <bb 6> [local count: 315357972]:
   _2 = do_something ();
   ga = _2;

-  <bb 7> [local count: 955630225]:
+  <bb 7> [local count: 315357973]:
   # prephitmp_5 = PHI <prephitmp_12(5), _2(6)>
   i_10 = inc (i_13);
   if (n_7(D) > i_10)
     goto <bb 21>; [89.00%]
   else
     goto <bb 11>; [11.00%]

   <bb 11> [local count: 105119324]:
   goto <bb 3>; [100.00%]

-  <bb 21> [local count: 850510901]:
+  <bb 21> [local count: 280668596]:
   if (prephitmp_12 != 0)
-    goto <bb 20>; [100.00%]
+    goto <bb 20>; [33.00%]
   else
-    goto <bb 19>; [INV]
+    goto <bb 19>; [67.00%]

-  <bb 20> [local count: 850510901]:
+  <bb 20> [local count: 280668596]:
   goto <bb 5>; [100.00%]

-  <bb 19> [count: 0]:
+  <bb 19> [local count: 70429947]:
   # i_23 = PHI <i_10(21)>
   # prephitmp_25 = PHI <prephitmp_5(21)>

-  <bb 12> [local count: 955630225]:
+  <bb 12> [local count: 640272252]:
   # i_15 = PHI <i_23(19), i_22(16)>
   # prephitmp_16 = PHI <prephitmp_25(19), prephitmp_16(16)>
   i_22 = inc (i_15);
   if (n_7(D) > i_22)
     goto <bb 16>; [89.00%]
   else
     goto <bb 11>; [11.00%]

-  <bb 16> [local count: 850510901]:
+  <bb 16> [local count: 569842305]:
   goto <bb 12>; [100.00%]
 }

gcc/ChangeLog:

	* tree-ssa-loop-split.c (Fix_loop_bb_probability): New function.
	(split_loop): Fix incorrect profile_count and probability.
	(do_split_loop_on_cond): Likewise.
---
 gcc/tree-ssa-loop-split.c | 72 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 64 insertions(+), 8 deletions(-)

diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
index 3f6ad046623..55011aeed97 100644
--- a/gcc/tree-ssa-loop-split.c
+++ b/gcc/tree-ssa-loop-split.c
@@ -484,6 +484,39 @@ compute_new_first_bound (gimple_seq *stmts, class tree_niter_desc *niter,
   return newend;
 }
 
+/* Fix the two loop's bb count after split based on the split edge probability,
+   don't adjust the bbs dominated by true branches of that loop to avoid
+   dropping 1s down.  */
+static void
+fix_loop_bb_probability (class loop *loop1, class loop *loop2, edge true_edge,
+			 edge false_edge)
+{
+  update_ssa (TODO_update_ssa);
+
+  /* Proportion first loop's bb counts except those dominated by true
+     branch to avoid drop 1s down.  */
+  basic_block *bbs1, *bbs2;
+  bbs1 = get_loop_body (loop1);
+  unsigned j;
+  for (j = 0; j < loop1->num_nodes; j++)
+    if (bbs1[j] == loop1->latch
+	|| !dominated_by_p (CDI_DOMINATORS, bbs1[j], true_edge->dest))
+      bbs1[j]->count
+	= bbs1[j]->count.apply_probability (true_edge->probability);
+  free (bbs1);
+
+  /* Proportion second loop's bb counts except those dominated by false
+     branch to avoid drop 1s down.  */
+  basic_block bbi_copy = get_bb_copy (false_edge->dest);
+  bbs2 = get_loop_body (loop2);
+  for (j = 0; j < loop2->num_nodes; j++)
+    if (bbs2[j] == loop2->latch
+	|| !dominated_by_p (CDI_DOMINATORS, bbs2[j], bbi_copy))
+      bbs2[j]->count
+	= bbs2[j]->count.apply_probability (true_edge->probability.invert ());
+  free (bbs2);
+}
+
 /* Checks if LOOP contains an conditional block whose condition
    depends on which side in the iteration space it is, and if so
    splits the iteration space into two loops.  Returns true if the
@@ -575,7 +608,10 @@ split_loop (class loop *loop1)
 					    stmts2);
 	tree cond = build2 (guard_code, boolean_type_node, guard_init, border);
 	if (!initial_true)
-	  cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); 
+	  cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond);
+
+	edge true_edge, false_edge;
+	extract_true_false_edges_from_block (bbs[i], &true_edge, &false_edge);
 
 	/* Now version the loop, placing loop2 after loop1 connecting
 	   them, and fix up SSA form for that.  */
@@ -583,11 +619,11 @@ split_loop (class loop *loop1)
 	basic_block cond_bb;
 
 	class loop *loop2 = loop_version (loop1, cond, &cond_bb,
-					   profile_probability::always (),
-					   profile_probability::always (),
-					   profile_probability::always (),
-					   profile_probability::always (),
-					   true);
+					  true_edge->probability,
+					  true_edge->probability.invert (),
+					  profile_probability::always (),
+					  profile_probability::always (),
+					  true);
 	gcc_assert (loop2);
 
 	edge new_e = connect_loops (loop1, loop2);
@@ -607,6 +643,15 @@ split_loop (class loop *loop1)
 	tree guard_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop1));
 	patch_loop_exit (loop1, guard_stmt, guard_next, newend, initial_true);
 
+	fix_loop_bb_probability (loop1, loop2, true_edge, false_edge);
+
+	/* Fix first loop's exit probability after scaling.  */
+	edge exit_to_latch1 = single_pred_edge (loop1->latch);
+	exit_to_latch1->probability = exit_to_latch1->probability.apply_scale (
+	  true_edge->probability.to_reg_br_prob_base (), REG_BR_PROB_BASE);
+	single_exit (loop1)->probability
+	  = exit_to_latch1->probability.invert ();
+
 	/* Finally patch out the two copies of the condition to be always
 	   true/false (or opposite).  */
 	gcond *force_true = as_a<gcond *> (last_stmt (bbs[i]));
@@ -1486,8 +1531,8 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch)
   initialize_original_copy_tables ();
 
   struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL,
-				     profile_probability::always (),
-				     profile_probability::never (),
+				     invar_branch->probability.invert (),
+				     invar_branch->probability,
 				     profile_probability::always (),
 				     profile_probability::always (),
 				     true);
@@ -1535,6 +1580,17 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch)
      between loop1 and loop2.  */
   connect_loop_phis (loop1, loop2, to_loop2);
 
+  edge true_edge, false_edge, skip_edge1, skip_edge2;
+  extract_true_false_edges_from_block (cond_bb, &true_edge, &false_edge);
+
+  skip_edge1 = true_invar ? false_edge : true_edge;
+  skip_edge2 = true_invar ? true_edge : false_edge;
+  fix_loop_bb_probability (loop1, loop2, skip_edge1, skip_edge2);
+
+  /* Fix first loop's exit probability after scaling.  */
+  to_loop1->probability = invar_branch->probability.invert ();
+  to_loop2->probability = invar_branch->probability;
+
   free_original_copy_tables ();
 
   return true;
  
Xionghu Luo Dec. 21, 2021, 3:57 a.m. UTC | #3
On 2021/12/13 16:57, Xionghu Luo via Gcc-patches wrote:
> 
> 
> On 2021/12/9 07:47, Jeff Law wrote:
>>> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
>>> index 3f6ad046623..33128061aab 100644
>>> --- a/gcc/tree-ssa-loop-split.c
>>> +++ b/gcc/tree-ssa-loop-split.c
>>>
>>> @@ -607,6 +610,38 @@ split_loop (class loop *loop1)
>>>       tree guard_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge
>>> (loop1));
>>>       patch_loop_exit (loop1, guard_stmt, guard_next, newend,
>>> initial_true);
>>>   +    update_ssa (TODO_update_ssa);
>>> +
>>> +    /* Proportion first loop's bb counts except those dominated by true
>>> +       branch to avoid drop 1s down.  */
>>> +    basic_block *bbs1, *bbs2;
>>> +    bbs1 = get_loop_body (loop1);
>>> +    unsigned j;
>>> +    for (j = 0; j < loop1->num_nodes; j++)
>>> +      if (bbs1[j] == loop1->latch
>>> +          || !dominated_by_p (CDI_DOMINATORS, bbs1[j], true_edge->dest))
>>> +        bbs1[j]->count
>>> +          = bbs1[j]->count.apply_probability (true_edge->probability);
>>> +    free (bbs1);
>> It looks like there's two copies of this code in this patch, one in
>> split_loop and the other in do_split_loop_on_cond.  Would it make sense
>> to factor it out into its own little function?
>>
>>
>>> +
>>> +    /* Proportion second loop's bb counts except those dominated by
>>> false
>>> +       branch to avoid drop 1s down.  */
>>> +    basic_block bbi_copy = get_bb_copy (false_edge->dest);
>>> +    bbs2 = get_loop_body (loop2);
>>> +    for (j = 0; j < loop2->num_nodes; j++)
>>> +      if (bbs2[j] == loop2->latch
>>> +          || !dominated_by_p (CDI_DOMINATORS, bbs2[j], bbi_copy))
>>> +        bbs2[j]->count = bbs2[j]->count.apply_probability (
>>> +          true_edge->probability.invert ());
>>> +    free (bbs2);
>> Similarly for this block of code.
>>
>> If those can be reasonably factored out into two helper functions to be
>> called from split_loop and do_split_loop_on_cond, then this is OK with
>> the refactoring.
>>
>> jeff
> 
> 
> Thanks for the comments, updated as below.  Will commit this patchset and the
> approved patch for LIM if there are no objections:

This patch is committed to r12-6086.

> 
> 
> [PATCH v2 3/3] Fix loop split incorrect count and probability
> 
> In tree-ssa-loop-split.c, split_loop and split_loop_on_cond does two
> kind of split. split_loop only works for single loop and insert edge at
> exit when split, while split_loop_on_cond is not limited to single loop
> and insert edge at latch when split.  Both split behavior should consider
> loop count and probability update.  For split_loop, loop split condition
> is moved in front of loop1 and loop2; But split_loop_on_cond moves the
> condition between loop1 and loop2, this patch does:
>  1) profile count proportion for both original loop and copied loop
> without dropping down the true branch's count;
>  2) probability update in the two loops and between the two loops.
> 
> Regression tested pass, OK for master?
> 
> Changes diff for split_loop and split_loop_on_cond cases:
> 
> 1) diff base/loop-split.c.151t.lsplit patched/loop-split.c.152t.lsplit
> ...
>    <bb 2> [local count: 118111600]:
>    _1 = end_6(D) - beg_7(D);
>    j_9 = _1 + beg2_8(D);
>    if (end_6(D) > beg_7(D))
>      goto <bb 14>; [89.00%]
>    else
>      goto <bb 6>; [11.00%]
> 
>    <bb 14> [local count: 105119324]:
>    if (j_9 >= c_11(D))
> -    goto <bb 15>; [100.00%]
> +    goto <bb 15>; [33.00%]
>    else
> -    goto <bb 16>; [100.00%]
> +    goto <bb 16>; [67.00%]
> 
> -  <bb 15> [local count: 105119324]:
> +  <bb 15> [local count: 34689377]:
>    _27 = end_6(D) + -1;
>    _28 = beg_7(D) - end_6(D);
>    _29 = j_9 + _28;
>    _30 = _29 + 1;
>    _31 = MAX_EXPR <c_11(D), _30>;
> 
> -  <bb 3> [local count: 955630225]:
> +  <bb 3> [local count: 315357973]:
>    # i_18 = PHI <i_13(8), end_6(D)(15)>
>    # j_19 = PHI <j_14(8), j_9(15)>
>    printf ("a: %d %d\n", i_18, j_19);
>    i_13 = i_18 + -1;
>    j_14 = j_19 + -1;
>    if (j_14 >= _31)
> -    goto <bb 8>; [89.00%]
> +    goto <bb 8>; [29.37%]
>    else
> -    goto <bb 17>; [11.00%]
> +    goto <bb 17>; [70.63%]
> 
> -  <bb 8> [local count: 850510901]:
> +  <bb 8> [local count: 280668596]:
>    goto <bb 3>; [100.00%]
> 
> -  <bb 16> [local count: 105119324]:
> +  <bb 16> [local count: 70429947]:
>    # i_24 = PHI <end_6(D)(14), i_32(17)>
>    # j_25 = PHI <j_9(14), j_33(17)>
> 
>    <bb 10> [local count: 955630225]:
>    # i_3 = PHI <i_24(16), i_22(13)>
>    # j_2 = PHI <j_25(16), j_23(13)>
>    i_22 = i_3 + -1;
>    j_23 = j_2 + -1;
>    if (beg_7(D) < i_22)
>      goto <bb 13>; [89.00%]
>    else
>      goto <bb 9>; [11.00%]
> 
> -  <bb 13> [local count: 850510901]:
> +  <bb 13> [local count: 569842305]:
>    goto <bb 10>; [100.00%]
> 
>    <bb 17> [local count: 105119324]:
>    # i_32 = PHI <i_13(3)>
>    # j_33 = PHI <j_14(3)>
>    if (beg_7(D) < i_32)
>      goto <bb 16>; [80.00%]
>    else
>      goto <bb 9>; [20.00%]
> 
>    <bb 9> [local count: 105119324]:
> 
>    <bb 6> [local count: 118111600]:
>    return 0;
> 
>  }
> 
> 2) diff base/loop-cond-split-1.c.151t.lsplit  patched/loop-cond-split-1.c.151t.lsplit:
> ...
>    <bb 2> [local count: 118111600]:
>    if (n_7(D) > 0)
>      goto <bb 4>; [89.00%]
>    else
>      goto <bb 3>; [11.00%]
> 
>    <bb 3> [local count: 118111600]:
>    return;
> 
>    <bb 4> [local count: 105119324]:
>    pretmp_3 = ga;
> 
> -  <bb 5> [local count: 955630225]:
> +  <bb 5> [local count: 315357973]:
>    # i_13 = PHI <i_10(20), 0(4)>
>    # prephitmp_12 = PHI <prephitmp_5(20), pretmp_3(4)>
>    if (prephitmp_12 != 0)
>      goto <bb 6>; [33.00%]
>    else
>      goto <bb 7>; [67.00%]
> 
>    <bb 6> [local count: 315357972]:
>    _2 = do_something ();
>    ga = _2;
> 
> -  <bb 7> [local count: 955630225]:
> +  <bb 7> [local count: 315357973]:
>    # prephitmp_5 = PHI <prephitmp_12(5), _2(6)>
>    i_10 = inc (i_13);
>    if (n_7(D) > i_10)
>      goto <bb 21>; [89.00%]
>    else
>      goto <bb 11>; [11.00%]
> 
>    <bb 11> [local count: 105119324]:
>    goto <bb 3>; [100.00%]
> 
> -  <bb 21> [local count: 850510901]:
> +  <bb 21> [local count: 280668596]:
>    if (prephitmp_12 != 0)
> -    goto <bb 20>; [100.00%]
> +    goto <bb 20>; [33.00%]
>    else
> -    goto <bb 19>; [INV]
> +    goto <bb 19>; [67.00%]
> 
> -  <bb 20> [local count: 850510901]:
> +  <bb 20> [local count: 280668596]:
>    goto <bb 5>; [100.00%]
> 
> -  <bb 19> [count: 0]:
> +  <bb 19> [local count: 70429947]:
>    # i_23 = PHI <i_10(21)>
>    # prephitmp_25 = PHI <prephitmp_5(21)>
> 
> -  <bb 12> [local count: 955630225]:
> +  <bb 12> [local count: 640272252]:
>    # i_15 = PHI <i_23(19), i_22(16)>
>    # prephitmp_16 = PHI <prephitmp_25(19), prephitmp_16(16)>
>    i_22 = inc (i_15);
>    if (n_7(D) > i_22)
>      goto <bb 16>; [89.00%]
>    else
>      goto <bb 11>; [11.00%]
> 
> -  <bb 16> [local count: 850510901]:
> +  <bb 16> [local count: 569842305]:
>    goto <bb 12>; [100.00%]
>  }
> 
> gcc/ChangeLog:
> 
> 	* tree-ssa-loop-split.c (Fix_loop_bb_probability): New function.
> 	(split_loop): Fix incorrect profile_count and probability.
> 	(do_split_loop_on_cond): Likewise.
> ---
>  gcc/tree-ssa-loop-split.c | 72 ++++++++++++++++++++++++++++++++++-----
>  1 file changed, 64 insertions(+), 8 deletions(-)
> 
> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
> index 3f6ad046623..55011aeed97 100644
> --- a/gcc/tree-ssa-loop-split.c
> +++ b/gcc/tree-ssa-loop-split.c
> @@ -484,6 +484,39 @@ compute_new_first_bound (gimple_seq *stmts, class tree_niter_desc *niter,
>    return newend;
>  }
> 
> +/* Fix the two loop's bb count after split based on the split edge probability,
> +   don't adjust the bbs dominated by true branches of that loop to avoid
> +   dropping 1s down.  */
> +static void
> +fix_loop_bb_probability (class loop *loop1, class loop *loop2, edge true_edge,
> +			 edge false_edge)
> +{
> +  update_ssa (TODO_update_ssa);
> +
> +  /* Proportion first loop's bb counts except those dominated by true
> +     branch to avoid drop 1s down.  */
> +  basic_block *bbs1, *bbs2;
> +  bbs1 = get_loop_body (loop1);
> +  unsigned j;
> +  for (j = 0; j < loop1->num_nodes; j++)
> +    if (bbs1[j] == loop1->latch
> +	|| !dominated_by_p (CDI_DOMINATORS, bbs1[j], true_edge->dest))
> +      bbs1[j]->count
> +	= bbs1[j]->count.apply_probability (true_edge->probability);
> +  free (bbs1);
> +
> +  /* Proportion second loop's bb counts except those dominated by false
> +     branch to avoid drop 1s down.  */
> +  basic_block bbi_copy = get_bb_copy (false_edge->dest);
> +  bbs2 = get_loop_body (loop2);
> +  for (j = 0; j < loop2->num_nodes; j++)
> +    if (bbs2[j] == loop2->latch
> +	|| !dominated_by_p (CDI_DOMINATORS, bbs2[j], bbi_copy))
> +      bbs2[j]->count
> +	= bbs2[j]->count.apply_probability (true_edge->probability.invert ());
> +  free (bbs2);
> +}
> +
>  /* Checks if LOOP contains an conditional block whose condition
>     depends on which side in the iteration space it is, and if so
>     splits the iteration space into two loops.  Returns true if the
> @@ -575,7 +608,10 @@ split_loop (class loop *loop1)
>  					    stmts2);
>  	tree cond = build2 (guard_code, boolean_type_node, guard_init, border);
>  	if (!initial_true)
> -	  cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); 
> +	  cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond);
> +
> +	edge true_edge, false_edge;
> +	extract_true_false_edges_from_block (bbs[i], &true_edge, &false_edge);
> 
>  	/* Now version the loop, placing loop2 after loop1 connecting
>  	   them, and fix up SSA form for that.  */
> @@ -583,11 +619,11 @@ split_loop (class loop *loop1)
>  	basic_block cond_bb;
> 
>  	class loop *loop2 = loop_version (loop1, cond, &cond_bb,
> -					   profile_probability::always (),
> -					   profile_probability::always (),
> -					   profile_probability::always (),
> -					   profile_probability::always (),
> -					   true);
> +					  true_edge->probability,
> +					  true_edge->probability.invert (),
> +					  profile_probability::always (),
> +					  profile_probability::always (),
> +					  true);
>  	gcc_assert (loop2);
> 
>  	edge new_e = connect_loops (loop1, loop2);
> @@ -607,6 +643,15 @@ split_loop (class loop *loop1)
>  	tree guard_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop1));
>  	patch_loop_exit (loop1, guard_stmt, guard_next, newend, initial_true);
> 
> +	fix_loop_bb_probability (loop1, loop2, true_edge, false_edge);
> +
> +	/* Fix first loop's exit probability after scaling.  */
> +	edge exit_to_latch1 = single_pred_edge (loop1->latch);
> +	exit_to_latch1->probability = exit_to_latch1->probability.apply_scale (
> +	  true_edge->probability.to_reg_br_prob_base (), REG_BR_PROB_BASE);
> +	single_exit (loop1)->probability
> +	  = exit_to_latch1->probability.invert ();
> +
>  	/* Finally patch out the two copies of the condition to be always
>  	   true/false (or opposite).  */
>  	gcond *force_true = as_a<gcond *> (last_stmt (bbs[i]));
> @@ -1486,8 +1531,8 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch)
>    initialize_original_copy_tables ();
> 
>    struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL,
> -				     profile_probability::always (),
> -				     profile_probability::never (),
> +				     invar_branch->probability.invert (),
> +				     invar_branch->probability,
>  				     profile_probability::always (),
>  				     profile_probability::always (),
>  				     true);
> @@ -1535,6 +1580,17 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch)
>       between loop1 and loop2.  */
>    connect_loop_phis (loop1, loop2, to_loop2);
> 
> +  edge true_edge, false_edge, skip_edge1, skip_edge2;
> +  extract_true_false_edges_from_block (cond_bb, &true_edge, &false_edge);
> +
> +  skip_edge1 = true_invar ? false_edge : true_edge;
> +  skip_edge2 = true_invar ? true_edge : false_edge;
> +  fix_loop_bb_probability (loop1, loop2, skip_edge1, skip_edge2);
> +
> +  /* Fix first loop's exit probability after scaling.  */
> +  to_loop1->probability = invar_branch->probability.invert ();
> +  to_loop2->probability = invar_branch->probability;
> +
>    free_original_copy_tables ();
> 
>    return true;
  

Patch

diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
index 3f6ad046623..33128061aab 100644
--- a/gcc/tree-ssa-loop-split.c
+++ b/gcc/tree-ssa-loop-split.c
@@ -575,7 +575,10 @@  split_loop (class loop *loop1)
 					    stmts2);
 	tree cond = build2 (guard_code, boolean_type_node, guard_init, border);
 	if (!initial_true)
-	  cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); 
+	  cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond);
+
+	edge true_edge, false_edge;
+	extract_true_false_edges_from_block (bbs[i], &true_edge, &false_edge);
 
 	/* Now version the loop, placing loop2 after loop1 connecting
 	   them, and fix up SSA form for that.  */
@@ -583,11 +586,11 @@  split_loop (class loop *loop1)
 	basic_block cond_bb;
 
 	class loop *loop2 = loop_version (loop1, cond, &cond_bb,
-					   profile_probability::always (),
-					   profile_probability::always (),
-					   profile_probability::always (),
-					   profile_probability::always (),
-					   true);
+					  true_edge->probability,
+					  true_edge->probability.invert (),
+					  profile_probability::always (),
+					  profile_probability::always (),
+					  true);
 	gcc_assert (loop2);
 
 	edge new_e = connect_loops (loop1, loop2);
@@ -607,6 +610,38 @@  split_loop (class loop *loop1)
 	tree guard_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop1));
 	patch_loop_exit (loop1, guard_stmt, guard_next, newend, initial_true);
 
+	update_ssa (TODO_update_ssa);
+
+	/* Proportion first loop's bb counts except those dominated by true
+	   branch to avoid drop 1s down.  */
+	basic_block *bbs1, *bbs2;
+	bbs1 = get_loop_body (loop1);
+	unsigned j;
+	for (j = 0; j < loop1->num_nodes; j++)
+	  if (bbs1[j] == loop1->latch
+	      || !dominated_by_p (CDI_DOMINATORS, bbs1[j], true_edge->dest))
+	    bbs1[j]->count
+	      = bbs1[j]->count.apply_probability (true_edge->probability);
+	free (bbs1);
+
+	/* Fix first loop's exit probability after scaling.  */
+	edge exit_to_latch1 = single_pred_edge (loop1->latch);
+	exit_to_latch1->probability = exit_to_latch1->probability.apply_scale (
+	  true_edge->probability.to_reg_br_prob_base (), REG_BR_PROB_BASE);
+	single_exit (loop1)->probability
+	  = exit_to_latch1->probability.invert ();
+
+	/* Proportion second loop's bb counts except those dominated by false
+	   branch to avoid drop 1s down.  */
+	basic_block bbi_copy = get_bb_copy (false_edge->dest);
+	bbs2 = get_loop_body (loop2);
+	for (j = 0; j < loop2->num_nodes; j++)
+	  if (bbs2[j] == loop2->latch
+	      || !dominated_by_p (CDI_DOMINATORS, bbs2[j], bbi_copy))
+	    bbs2[j]->count = bbs2[j]->count.apply_probability (
+	      true_edge->probability.invert ());
+	free (bbs2);
+
 	/* Finally patch out the two copies of the condition to be always
 	   true/false (or opposite).  */
 	gcond *force_true = as_a<gcond *> (last_stmt (bbs[i]));
@@ -1486,8 +1521,8 @@  do_split_loop_on_cond (struct loop *loop1, edge invar_branch)
   initialize_original_copy_tables ();
 
   struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL,
-				     profile_probability::always (),
-				     profile_probability::never (),
+				     invar_branch->probability.invert (),
+				     invar_branch->probability,
 				     profile_probability::always (),
 				     profile_probability::always (),
 				     true);
@@ -1535,6 +1570,40 @@  do_split_loop_on_cond (struct loop *loop1, edge invar_branch)
      between loop1 and loop2.  */
   connect_loop_phis (loop1, loop2, to_loop2);
 
+  update_ssa (TODO_update_ssa);
+
+  edge true_edge, false_edge, skip_edge1, skip_edge2;
+  extract_true_false_edges_from_block (cond_bb, &true_edge, &false_edge);
+
+  /* Proportion first loop's bb counts except those dominated by true
+     branch to avoid drop 1s down.  */
+  skip_edge1 = true_invar ? false_edge : true_edge;
+  skip_edge2 = true_invar ? true_edge : false_edge;
+  basic_block *bbs1, *bbs2;
+  bbs1 = get_loop_body (loop1);
+  unsigned j;
+  for (j = 0; j < loop1->num_nodes; j++)
+    if (bbs1[j] == loop1->latch
+	|| !dominated_by_p (CDI_DOMINATORS, bbs1[j], skip_edge1->dest))
+      bbs1[j]->count
+	= bbs1[j]->count.apply_probability (skip_edge1->probability);
+  free (bbs1);
+
+  /* Fix first loop's exit probability after scaling.  */
+  to_loop1->probability = invar_branch->probability.invert ();
+  to_loop2->probability = invar_branch->probability;
+
+  /* Proportion second loop's bb counts except those dominated by false
+     branch to avoid drop 1s down.  */
+  basic_block bbi_copy = get_bb_copy (skip_edge2->dest);
+  bbs2 = get_loop_body (loop2);
+  for (j = 0; j < loop2->num_nodes; j++)
+    if (bbs2[j] == loop2->latch
+	|| !dominated_by_p (CDI_DOMINATORS, bbs2[j], bbi_copy))
+      bbs2[j]->count
+	= bbs2[j]->count.apply_probability (skip_edge2->probability);
+  free (bbs2);
+
   free_original_copy_tables ();
 
   return true;