From patchwork Mon Jun 29 14:53:39 2015
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Ondrej Bilka <neleai@seznam.cz>
X-Patchwork-Id: 7419
Received: (qmail 30565 invoked by alias); 29 Jun 2015 14:53:51 -0000
Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
List-Id: <libc-alpha.sourceware.org>
List-Unsubscribe: <mailto:libc-alpha-unsubscribe-##L=##H@sourceware.org>
List-Subscribe: <mailto:libc-alpha-subscribe@sourceware.org>
List-Archive: <http://sourceware.org/ml/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-help@sourceware.org>,
	<http://sourceware.org/ml/#faqs>
Sender: libc-alpha-owner@sourceware.org
Delivered-To: mailing list libc-alpha@sourceware.org
Received: (qmail 30549 invoked by uid 89); 29 Jun 2015 14:53:51 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=0.3 required=5.0 tests=AWL, BAYES_20,
	FREEMAIL_FROM, SPF_NEUTRAL autolearn=no version=3.3.2
X-HELO: popelka.ms.mff.cuni.cz
Date: Mon, 29 Jun 2015 16:53:39 +0200
From: =?utf-8?B?T25kxZllaiBCw61sa2E=?= <neleai@seznam.cz>
To: Leonhard Holz <leonhard.holz@web.de>
Cc: libc-alpha@sourceware.org
Subject: [PATCH RFC] Pell strcoll initial iteration to improve performance.
Message-ID: <20150629145339.GA13548@domone>
MIME-Version: 1.0
Content-Disposition: inline
User-Agent: Mutt/1.5.20 (2009-06-14)

Hi Leonhard,

As we try to reduce initialization overhead you should try to peel first
loop iteration. Witch following patch to do it I could have around 10% 
speedup. Where do you have script to create performance difference?

I needed to add initialization of seq1.save_idx and seq1.back_us as gcc 
thought it could be used uninitialized.

Comments?

	* string/strcoll_l.c (STRCOLL): Peel first loop iteration.

diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index 8f1225f..91b36f2 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -311,7 +311,7 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
-
+  int pass = 0;
   int result = 0, rule = 0;
 
   coll_seq seq1, seq2;
@@ -321,7 +321,61 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   seq2.len = 0;
   seq2.idxmax = 0;
 
-  for (int pass = 0; pass < nrules; ++pass)
+  seq1.save_idx = 0;
+  seq1.back_us = NULL;
+
+  seq1.idxcnt = 0;
+  seq1.idx = 0;
+  seq2.idx = 0;
+  seq1.backw_stop = ~0ul;
+  seq1.backw = ~0ul;
+  seq2.idxcnt = 0;
+  seq2.backw_stop = ~0ul;
+  seq2.backw = ~0ul;
+
+  /* We need the elements of the strings as unsigned values since they
+     are used as indices.  */
+  seq1.us = (const USTRING_TYPE *) s1;
+  seq2.us = (const USTRING_TYPE *) s2;
+
+  /* We assume that if a rule has defined `position' in one section
+     this is true for all of them.  Please note that the localedef programs
+     makes sure that `position' is not used at the first level.  */
+
+  int position = rulesets[rule * nrules + pass] & sort_position;
+
+  get_next_seq (&seq1, nrules, rulesets, weights, table,
+      extra, indirect, pass);
+  get_next_seq (&seq2, nrules, rulesets, weights, table,
+      extra, indirect, pass);
+  /* See whether any or both strings are empty.  */
+  if (seq1.len == 0 || seq2.len == 0)
+    {
+      if (seq1.len == seq2.len)
+        {
+          /* Both strings ended and are equal at this level.  Do a
+	      byte-level comparison to ensure that we don't waste time
+	      going through multiple passes for totally equal strings
+	      before proceeding to subsequent passes.  */
+          if (pass == 0 && encoding == __cet_other &&
+	       STRCMP (s1, s2) == 0)
+	     return result;
+          else
+	     goto next;
+        }
+
+        /* This means one string is shorter than the other.  Find out
+           which one and return an appropriate value.  */
+      return seq1.len == 0 ? -1 : 1;
+    }
+
+  result = do_compare (&seq1, &seq2, position, weights);
+  if (result != 0)
+    return result;
+  goto start_while;
+
+
+  for (pass = 0; pass < nrules; ++pass)
     {
       seq1.idxcnt = 0;
       seq1.idx = 0;
@@ -341,10 +395,11 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
 	 this is true for all of them.  Please note that the localedef programs
 	 makes sure that `position' is not used at the first level.  */
 
-      int position = rulesets[rule * nrules + pass] & sort_position;
+      position = rulesets[rule * nrules + pass] & sort_position;
 
       while (1)
 	{
+start_while:
 	  get_next_seq (&seq1, nrules, rulesets, weights, table,
 				    extra, indirect, pass);
 	  get_next_seq (&seq2, nrules, rulesets, weights, table,
@@ -374,7 +429,7 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
 	  if (result != 0)
 	    return result;
 	}
-
+      next:;
       rule = seq1.rule;
     }