From patchwork Wed May 13 08:58:10 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ondrej Bilka X-Patchwork-Id: 6689 Received: (qmail 33646 invoked by alias); 13 May 2015 08:58:23 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 33636 invoked by uid 89); 13 May 2015 08:58:22 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=0.3 required=5.0 tests=AWL, BAYES_40, FREEMAIL_FROM, SPF_NEUTRAL autolearn=no version=3.3.2 X-HELO: popelka.ms.mff.cuni.cz Date: Wed, 13 May 2015 10:58:10 +0200 From: =?utf-8?B?T25kxZllaiBCw61sa2E=?= To: libc-alpha@sourceware.org Subject: [PATCH 1/3] Refactor strdiff. Message-ID: <20150513085810.GA31782@domone> MIME-Version: 1.0 Content-Disposition: inline User-Agent: Mutt/1.5.20 (2009-06-14)

Hi,

As I want to improve strcasecmp using strdiff, the first step is to move strdiff to a separate file. I also factored out the UTF-8 handling.

I also added a micro-optimization for finding the start of a character: an a < x < b check can be done with a single subtraction and comparison, and the loop is unrolled, as it can iterate at most three times.

Then there is the wide-character handling. I pass an explicit encoding there, so that the wide-character version can be used directly.

OK with this?

	* string/strdiff.h: New file.
	* string/strcoll_l.c: Move out STRDIFF implementation.

diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index 0fa005f..297ec9c 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -30,6 +30,7 @@
 # define USTRING_TYPE unsigned char
 # define STRCOLL __strcoll_l
 # define STRDIFF __strdiff
+# define STRDIFF_L __strdiff_l
 # define STRCMP strcmp
 # define WEIGHT_H "../locale/weight.h"
 # define SUFFIX MB
@@ -42,19 +43,7 @@
 #include "../locale/localeinfo.h"
 #include WEIGHT_H
 
-#define MASK_UTF8_7BIT (1 << 7)
-#define MASK_UTF8_START (3 << 6)
-
-size_t
-STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
-{
-  size_t n;
-
-  for (n = 0; *s != '\0' && *s++ == *t++; ++n)
-    continue;
-
-  return n;
-}
+#include "string/strdiff.h"
 
 /* Track status while looking for sequences in a string.  */
 typedef struct
@@ -274,24 +263,14 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   if (nrules == 0)
     return STRCMP (s1, s2);
 
-  /* Fast forward to the position of the first difference.  Needs to be
-     encoding aware as the byte-by-byte comparison can stop in the middle
-     of a char sequence for multibyte encodings like UTF-8.  */
+  /* Fast forward to the position of the first difference.  */
   uint_fast32_t encoding =
     current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
-  if (encoding != __cet_other)
-    {
-      size_t diff = STRDIFF (s1, s2);
-      if (diff > 0)
-        {
-          if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
-            do
-              diff--;
-            while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
-          s1 += diff;
-          s2 += diff;
-        }
-    }
+
+  if (sizeof (STRING_TYPE) > 1)
+    STRDIFF_L (&s1, &s2, __cet_8bit);
+  else if (encoding != __cet_other)
+    STRDIFF_L (&s1, &s2, encoding);
 
   /* Catch empty strings.  */
   if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
diff --git a/string/strdiff.h b/string/strdiff.h
new file mode 100644
index 0000000..224d899
--- /dev/null
+++ b/string/strdiff.h
@@ -0,0 +1,46 @@
+/* Return the length of the common prefix of S and T, i.e. the number
+   of leading elements that compare equal before the first difference
+   or the terminating null of S.  */
+static size_t
+STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
+{
+  size_t n;
+
+  for (n = 0; *s != '\0' && *s++ == *t++; ++n)
+    continue;
+
+  return n;
+}
+
+/* UTF-8 continuation bytes have the form 10xxxxxx, i.e. the range
+   0x80 .. 0xBF (128 .. 191).  Bytes from 0xC0 upwards are lead bytes and
+   must not be treated as continuation bytes.  */
+#define UTF8_CONT_START 128
+#define UTF8_CONT_END 191
+
+/* Advance *S1 and *S2 past their common prefix.  For __cet_utf8, if the
+   first differing element of *S1 is a continuation byte, step back to the
+   lead byte of that character.  A UTF-8 sequence has at most three
+   continuation bytes, so the search is unrolled three times.  */
+static void
+STRDIFF_L (const STRING_TYPE **s1, const STRING_TYPE **s2, uint_fast32_t encoding)
+{
+  size_t diff = STRDIFF (*s1, *s2);
+
+  if (encoding == __cet_utf8)
+    {
+      USTRING_TYPE c = *(*s1 + diff);
+      if (((UTF8_CONT_START <= c) & (c <= UTF8_CONT_END)) && diff > 0)
+        {
+          diff--;
+          c = *(*s1 + diff);
+          if (((UTF8_CONT_START <= c) & (c <= UTF8_CONT_END)) && diff > 0)
+            diff--;
+          c = *(*s1 + diff);
+          if (((UTF8_CONT_START <= c) & (c <= UTF8_CONT_END)) && diff > 0)
+            diff--;
+        }
+    }
+  *s1 += diff;
+  *s2 += diff;
+}