From patchwork Wed May 13 08:58:10 2015
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Ondrej Bilka <neleai@seznam.cz>
X-Patchwork-Id: 6689
Received: (qmail 33646 invoked by alias); 13 May 2015 08:58:23 -0000
Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
List-Id: <libc-alpha.sourceware.org>
List-Unsubscribe: <mailto:libc-alpha-unsubscribe-##L=##H@sourceware.org>
List-Subscribe: <mailto:libc-alpha-subscribe@sourceware.org>
List-Archive: <http://sourceware.org/ml/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-help@sourceware.org>,
	<http://sourceware.org/ml/#faqs>
Sender: libc-alpha-owner@sourceware.org
Delivered-To: mailing list libc-alpha@sourceware.org
Received: (qmail 33636 invoked by uid 89); 13 May 2015 08:58:22 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=0.3 required=5.0 tests=AWL, BAYES_40,
	FREEMAIL_FROM, SPF_NEUTRAL autolearn=no version=3.3.2
X-HELO: popelka.ms.mff.cuni.cz
Date: Wed, 13 May 2015 10:58:10 +0200
From: =?utf-8?B?T25kxZllaiBCw61sa2E=?= <neleai@seznam.cz>
To: libc-alpha@sourceware.org
Subject: [PATCH 1/3] Refactor strdiff.
Message-ID: <20150513085810.GA31782@domone>
MIME-Version: 1.0
Content-Disposition: inline
User-Agent: Mutt/1.5.20 (2009-06-14)

Hi, as I want to improve strcasecmp with strdiff, the first step is to
move it to a separate file. I also factored out the UTF-8 handling and
added a micro-optimization for finding the start of a character: an
a <= x <= b check can be done with a single subtraction and comparison,
and the loop is unrolled because it can run at most three times.

Then there is the wide-character handling. I pass an explicit encoding
there, because the wide-character version can use the result directly.

OK with this?

	* string/strdiff.h: New file.
	* string/strcoll_l.c: Move out STRDIFF implementation.

diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index 0fa005f..297ec9c 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -30,6 +30,7 @@
 # define USTRING_TYPE unsigned char
 # define STRCOLL __strcoll_l
 # define STRDIFF __strdiff
+# define STRDIFF_L __strdiff_l
 # define STRCMP strcmp
 # define WEIGHT_H "../locale/weight.h"
 # define SUFFIX	MB
@@ -42,19 +43,7 @@
 #include "../locale/localeinfo.h"
 #include WEIGHT_H
 
-#define MASK_UTF8_7BIT  (1 << 7)
-#define MASK_UTF8_START (3 << 6)
-
-size_t
-STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
-{
-  size_t n;
-
-  for (n = 0; *s != '\0' && *s++ == *t++; ++n)
-    continue;
-
-  return n;
-}
+#include "string/strdiff.h"
 
 /* Track status while looking for sequences in a string.  */
 typedef struct
@@ -274,24 +263,14 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   if (nrules == 0)
     return STRCMP (s1, s2);
 
-  /* Fast forward to the position of the first difference.  Needs to be
-     encoding aware as the byte-by-byte comparison can stop in the middle
-     of a char sequence for multibyte encodings like UTF-8.  */
+  /* Fast forward to the position of the first difference.  */
   uint_fast32_t encoding =
     current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
-  if (encoding != __cet_other)
-    {
-      size_t diff = STRDIFF (s1, s2);
-      if (diff > 0)
-	{
-	  if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
-	    do
-	      diff--;
-	    while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
-	  s1 += diff;
-	  s2 += diff;
-	}
-    }
+
+  if (sizeof (STRING_TYPE) > 1)
+    STRDIFF_L (&s1, &s2, __cet_8bit);
+  else if (encoding != __cet_other)
+    STRDIFF_L (&s1, &s2, encoding);
 
   /* Catch empty strings.  */
   if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
diff --git a/string/strdiff.h b/string/strdiff.h
new file mode 100644
index 0000000..224d899
--- /dev/null
+++ b/string/strdiff.h
@@ -0,0 +1,36 @@
+static size_t
+STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
+{
+  /* Length, in elements, of the longest common prefix of S and T.  */
+  size_t common = 0;
+  while (s[common] != '\0' && s[common] == t[common])
+    ++common;
+
+  return common;
+}
+
+/* UTF-8 continuation bytes are 0x80..0xBF; 0xC2 and above are lead bytes.  */
+#define UTF8_CONT_START 128
+#define UTF8_CONT_END 191
+
+/* Advance *S1 and *S2 past their common prefix.  For __cet_utf8, back up
+   over continuation bytes so the position is at the start of a character.  */
+static void
+STRDIFF_L (const STRING_TYPE **s1, const STRING_TYPE **s2, uint_fast32_t encoding)
+{
+  size_t diff = STRDIFF (*s1, *s2);
+
+  if (encoding == __cet_utf8)
+    {
+      /* Unrolled: a character has at most three continuation bytes.  */
+      USTRING_TYPE c = (*s1)[diff];
+      if (diff > 0 && UTF8_CONT_START <= c && c <= UTF8_CONT_END)
+        c = (*s1)[--diff];
+      if (diff > 0 && UTF8_CONT_START <= c && c <= UTF8_CONT_END)
+        c = (*s1)[--diff];
+      if (diff > 0 && UTF8_CONT_START <= c && c <= UTF8_CONT_END)
+        diff--;
+    }
+  *s1 += diff;
+  *s2 += diff;
+}