[RFC] Pell strcoll initial iteration to improve performance.

Message ID 20150629145339.GA13548@domone
State New, archived
Headers

Commit Message

Ondrej Bilka June 29, 2015, 2:53 p.m. UTC
  Hi Leonhard,

As we try to reduce initialization overhead you should try to peel first
loop iteration. Witch following patch to do it I could have around 10% 
speedup. Where do you have script to create performance difference?

I needed to add initialization of seq1.save_idx and seq1.back_us as gcc 
thought it could be used uninitialized.

Comments?

	* string/strcoll_l.c (STRCOLL): Peel first loop iteration.
  

Comments

Leonhard Holz June 30, 2015, 5:35 a.m. UTC | #1
Hi Ondřej,

just run a "make benchmark" and see build/benchtest/bench-strcoll.out.

Best,
Leonhard

Am 29.06.2015 um 16:53 schrieb Ondřej Bílka:
> Hi Leonhard,
> 
> As we try to reduce initialization overhead you should try to peel first
> loop iteration. Witch following patch to do it I could have around 10% 
> speedup. Where do you have script to create performance difference?
> 
> I needed to add initialization of seq1.save_idx and seq1.back_us as gcc 
> thought it could be used uninitialized.
> 
> Comments?
> 
> 	* string/strcoll_l.c (STRCOLL): Peel first loop iteration.
> 
> 
> diff --git a/string/strcoll_l.c b/string/strcoll_l.c
> index 8f1225f..91b36f2 100644
> --- a/string/strcoll_l.c
> +++ b/string/strcoll_l.c
> @@ -311,7 +311,7 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
>    assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
>    assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
>    assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
> -
> +  int pass = 0;
>    int result = 0, rule = 0;
>  
>    coll_seq seq1, seq2;
> @@ -321,7 +321,61 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
>    seq2.len = 0;
>    seq2.idxmax = 0;
>  
> -  for (int pass = 0; pass < nrules; ++pass)
> +  seq1.save_idx = 0;
> +  seq1.back_us = NULL;
> +
> +  seq1.idxcnt = 0;
> +  seq1.idx = 0;
> +  seq2.idx = 0;
> +  seq1.backw_stop = ~0ul;
> +  seq1.backw = ~0ul;
> +  seq2.idxcnt = 0;
> +  seq2.backw_stop = ~0ul;
> +  seq2.backw = ~0ul;
> +
> +  /* We need the elements of the strings as unsigned values since they
> +     are used as indices.  */
> +  seq1.us = (const USTRING_TYPE *) s1;
> +  seq2.us = (const USTRING_TYPE *) s2;
> +
> +  /* We assume that if a rule has defined `position' in one section
> +     this is true for all of them.  Please note that the localedef programs
> +     makes sure that `position' is not used at the first level.  */
> +
> +  int position = rulesets[rule * nrules + pass] & sort_position;
> +
> +  get_next_seq (&seq1, nrules, rulesets, weights, table,
> +      extra, indirect, pass);
> +  get_next_seq (&seq2, nrules, rulesets, weights, table,
> +      extra, indirect, pass);
> +  /* See whether any or both strings are empty.  */
> +  if (seq1.len == 0 || seq2.len == 0)
> +    {
> +      if (seq1.len == seq2.len)
> +        {
> +          /* Both strings ended and are equal at this level.  Do a
> +	      byte-level comparison to ensure that we don't waste time
> +	      going through multiple passes for totally equal strings
> +	      before proceeding to subsequent passes.  */
> +          if (pass == 0 && encoding == __cet_other &&
> +	       STRCMP (s1, s2) == 0)
> +	     return result;
> +          else
> +	     goto next;
> +        }
> +
> +        /* This means one string is shorter than the other.  Find out
> +           which one and return an appropriate value.  */
> +      return seq1.len == 0 ? -1 : 1;
> +    }
> +
> +  result = do_compare (&seq1, &seq2, position, weights);
> +  if (result != 0)
> +    return result;
> +  goto start_while;
> +
> +
> +  for (pass = 0; pass < nrules; ++pass)
>      {
>        seq1.idxcnt = 0;
>        seq1.idx = 0;
> @@ -341,10 +395,11 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
>  	 this is true for all of them.  Please note that the localedef programs
>  	 makes sure that `position' is not used at the first level.  */
>  
> -      int position = rulesets[rule * nrules + pass] & sort_position;
> +      position = rulesets[rule * nrules + pass] & sort_position;
>  
>        while (1)
>  	{
> +start_while:
>  	  get_next_seq (&seq1, nrules, rulesets, weights, table,
>  				    extra, indirect, pass);
>  	  get_next_seq (&seq2, nrules, rulesets, weights, table,
> @@ -374,7 +429,7 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
>  	  if (result != 0)
>  	    return result;
>  	}
> -
> +      next:;
>        rule = seq1.rule;
>      }
>  
>
  
Ondrej Bilka June 30, 2015, 9:45 a.m. UTC | #2
On Tue, Jun 30, 2015 at 07:35:22AM +0200, Leonhard Holz wrote:
> Hi Ondřej,
> 
> just run a "make benchmark" and see build/benchtest/bench-strcoll.out.
> 
I did that you need to read two json files to see what changed which
isn't user friendly. How do you produce your percentage gains for locale?

Here are results from my computer.
"strcoll": {
   "filelist#C": {
    "duration": 2.15405e+08,
    "iterations": 16,
    "mean": 1.34628e+07
   },
   "filelist#en_US.UTF-8": {
    "duration": 6.10243e+08,
    "iterations": 16,
    "mean": 3.81402e+07
   },
   "lorem_ipsum#vi_VN.UTF-8": {
    "duration": 2.07991e+07,
    "iterations": 16,
    "mean": 1.29994e+06
   },
   "lorem_ipsum#ar_SA.UTF-8": {
    "duration": 2.30098e+07,
    "iterations": 16,
    "mean": 1.43811e+06
   },
   "lorem_ipsum#en_US.UTF-8": {
    "duration": 1.9611e+07,
    "iterations": 16,
    "mean": 1.22569e+06
   },
   "lorem_ipsum#zh_CN.UTF-8": {
    "duration": 1.26659e+07,
    "iterations": 16,
    "mean": 791621
   },
   "lorem_ipsum#cs_CZ.UTF-8": {
    "duration": 2.95334e+07,
    "iterations": 16,
    "mean": 1.84584e+06
   },
   "lorem_ipsum#en_GB.UTF-8": {
    "duration": 2.7764e+07,
    "iterations": 16,
    "mean": 1.73525e+06
   },
   "lorem_ipsum#da_DK.UTF-8": {
    "duration": 1.97664e+07,
    "iterations": 16,
    "mean": 1.2354e+06
   },
   "lorem_ipsum#pl_PL.UTF-8": {
    "duration": 2.06391e+07,
    "iterations": 16,
    "mean": 1.28994e+06
   },
   "lorem_ipsum#fr_FR.UTF-8": {
    "duration": 3.0974e+07,
    "iterations": 16,
    "mean": 1.93587e+06
   },
   "lorem_ipsum#pt_PT.UTF-8": {
    "duration": 3.07025e+07,
    "iterations": 16,
    "mean": 1.9189e+06
   },
   "lorem_ipsum#el_GR.UTF-8": {
    "duration": 3.65892e+07,
    "iterations": 16,
    "mean": 2.28682e+06
   },
   "lorem_ipsum#ru_RU.UTF-8": {
    "duration": 2.86577e+07,
    "iterations": 16,
    "mean": 1.79111e+06
   },
   "lorem_ipsum#iw_IL.UTF-8": {
    "duration": 2.02726e+07,
    "iterations": 16,
    "mean": 1.26704e+06
   },
   "lorem_ipsum#es_ES.UTF-8": {
    "duration": 2.5725e+07,
    "iterations": 16,
    "mean": 1.60781e+06
   },
   "lorem_ipsum#hi_IN.UTF-8": {
    "duration": 1.36606e+09,
    "iterations": 16,
    "mean": 8.53786e+07
   },
   "lorem_ipsum#sv_SE.UTF-8": {
    "duration": 1.95816e+07,
    "iterations": 16,
    "mean": 1.22385e+06
   },
   "lorem_ipsum#hu_HU.UTF-8": {
    "duration": 3.96907e+07,
    "iterations": 16,
    "mean": 2.48067e+06
   },
   "lorem_ipsum#tr_TR.UTF-8": {
    "duration": 2.50615e+07,
    "iterations": 16,
    "mean": 1.56635e+06
   },
   "lorem_ipsum#is_IS.UTF-8": {
    "duration": 2.05687e+07,
    "iterations": 16,
    "mean": 1.28554e+06
   },
   "lorem_ipsum#it_IT.UTF-8": {
    "duration": 2.92119e+07,
    "iterations": 16,
    "mean": 1.82574e+06
   },
   "lorem_ipsum#sr_RS.UTF-8": {
    "duration": 2.26173e+07,
    "iterations": 16,
    "mean": 1.41358e+06
   },
   "lorem_ipsum#ja_JP.UTF-8": {
    "duration": 1.55291e+07,
    "iterations": 16,
    "mean": 970571
   }
  }
"strcoll": {
   "filelist#C": {
    "duration": 2.64715e+08,
    "iterations": 16,
    "mean": 1.65447e+07
   },
   "filelist#en_US.UTF-8": {
    "duration": 5.7002e+08,
    "iterations": 16,
    "mean": 3.56263e+07
   },
   "lorem_ipsum#vi_VN.UTF-8": {
    "duration": 1.96996e+07,
    "iterations": 16,
    "mean": 1.23122e+06
   },
   "lorem_ipsum#ar_SA.UTF-8": {
    "duration": 2.21054e+07,
    "iterations": 16,
    "mean": 1.38159e+06
   },
   "lorem_ipsum#en_US.UTF-8": {
    "duration": 1.83283e+07,
    "iterations": 16,
    "mean": 1.14552e+06
   },
   "lorem_ipsum#zh_CN.UTF-8": {
    "duration": 1.25622e+07,
    "iterations": 16,
    "mean": 785138
   },
   "lorem_ipsum#cs_CZ.UTF-8": {
    "duration": 2.86762e+07,
    "iterations": 16,
    "mean": 1.79226e+06
   },
   "lorem_ipsum#en_GB.UTF-8": {
    "duration": 2.70558e+07,
    "iterations": 16,
    "mean": 1.69099e+06
   },
   "lorem_ipsum#da_DK.UTF-8": {
    "duration": 1.87433e+07,
    "iterations": 16,
    "mean": 1.17146e+06
   },
   "lorem_ipsum#pl_PL.UTF-8": {
    "duration": 1.95195e+07,
    "iterations": 16,
    "mean": 1.21997e+06
   },
   "lorem_ipsum#fr_FR.UTF-8": {
    "duration": 3.03286e+07,
    "iterations": 16,
    "mean": 1.89554e+06
   },
   "lorem_ipsum#pt_PT.UTF-8": {
    "duration": 2.98208e+07,
    "iterations": 16,
    "mean": 1.8638e+06
   },
   "lorem_ipsum#el_GR.UTF-8": {
    "duration": 3.62641e+07,
    "iterations": 16,
    "mean": 2.2665e+06
   },
   "lorem_ipsum#ru_RU.UTF-8": {
    "duration": 2.82956e+07,
    "iterations": 16,
    "mean": 1.76847e+06
   },
   "lorem_ipsum#iw_IL.UTF-8": {
    "duration": 1.92638e+07,
    "iterations": 16,
    "mean": 1.20399e+06
   },
   "lorem_ipsum#es_ES.UTF-8": {
    "duration": 2.47567e+07,
    "iterations": 16,
    "mean": 1.54729e+06
   },
   "lorem_ipsum#hi_IN.UTF-8": {
    "duration": 1.37025e+09,
    "iterations": 16,
    "mean": 8.56405e+07
   },
   "lorem_ipsum#sv_SE.UTF-8": {
    "duration": 1.84928e+07,
    "iterations": 16,
    "mean": 1.1558e+06
   },
   "lorem_ipsum#hu_HU.UTF-8": {
    "duration": 3.8506e+07,
    "iterations": 16,
    "mean": 2.40662e+06
   },
   "lorem_ipsum#tr_TR.UTF-8": {
    "duration": 2.41097e+07,
    "iterations": 16,
    "mean": 1.50686e+06
   },
   "lorem_ipsum#is_IS.UTF-8": {
    "duration": 1.94291e+07,
    "iterations": 16,
    "mean": 1.21432e+06
   },
   "lorem_ipsum#it_IT.UTF-8": {
    "duration": 2.82923e+07,
    "iterations": 16,
    "mean": 1.76827e+06
   },
   "lorem_ipsum#sr_RS.UTF-8": {
    "duration": 2.14058e+07,
    "iterations": 16,
    "mean": 1.33787e+06
   },
   "lorem_ipsum#ja_JP.UTF-8": {
    "duration": 1.5057e+07,
    "iterations": 16,
    "mean": 941065
   }
  }
  

Patch

diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index 8f1225f..91b36f2 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -311,7 +311,7 @@  STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
-
+  int pass = 0;
   int result = 0, rule = 0;
 
   coll_seq seq1, seq2;
@@ -321,7 +321,61 @@  STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   seq2.len = 0;
   seq2.idxmax = 0;
 
-  for (int pass = 0; pass < nrules; ++pass)
+  seq1.save_idx = 0;
+  seq1.back_us = NULL;
+
+  seq1.idxcnt = 0;
+  seq1.idx = 0;
+  seq2.idx = 0;
+  seq1.backw_stop = ~0ul;
+  seq1.backw = ~0ul;
+  seq2.idxcnt = 0;
+  seq2.backw_stop = ~0ul;
+  seq2.backw = ~0ul;
+
+  /* We need the elements of the strings as unsigned values since they
+     are used as indices.  */
+  seq1.us = (const USTRING_TYPE *) s1;
+  seq2.us = (const USTRING_TYPE *) s2;
+
+  /* We assume that if a rule has defined `position' in one section
+     this is true for all of them.  Please note that the localedef programs
+     makes sure that `position' is not used at the first level.  */
+
+  int position = rulesets[rule * nrules + pass] & sort_position;
+
+  get_next_seq (&seq1, nrules, rulesets, weights, table,
+      extra, indirect, pass);
+  get_next_seq (&seq2, nrules, rulesets, weights, table,
+      extra, indirect, pass);
+  /* See whether any or both strings are empty.  */
+  if (seq1.len == 0 || seq2.len == 0)
+    {
+      if (seq1.len == seq2.len)
+        {
+          /* Both strings ended and are equal at this level.  Do a
+	      byte-level comparison to ensure that we don't waste time
+	      going through multiple passes for totally equal strings
+	      before proceeding to subsequent passes.  */
+          if (pass == 0 && encoding == __cet_other &&
+	       STRCMP (s1, s2) == 0)
+	     return result;
+          else
+	     goto next;
+        }
+
+        /* This means one string is shorter than the other.  Find out
+           which one and return an appropriate value.  */
+      return seq1.len == 0 ? -1 : 1;
+    }
+
+  result = do_compare (&seq1, &seq2, position, weights);
+  if (result != 0)
+    return result;
+  goto start_while;
+
+
+  for (pass = 0; pass < nrules; ++pass)
     {
       seq1.idxcnt = 0;
       seq1.idx = 0;
@@ -341,10 +395,11 @@  STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
 	 this is true for all of them.  Please note that the localedef programs
 	 makes sure that `position' is not used at the first level.  */
 
-      int position = rulesets[rule * nrules + pass] & sort_position;
+      position = rulesets[rule * nrules + pass] & sort_position;
 
       while (1)
 	{
+start_while:
 	  get_next_seq (&seq1, nrules, rulesets, weights, table,
 				    extra, indirect, pass);
 	  get_next_seq (&seq2, nrules, rulesets, weights, table,
@@ -374,7 +429,7 @@  STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
 	  if (result != 0)
 	    return result;
 	}
-
+      next:;
       rule = seq1.rule;
     }