[3/*,v6] Generic comparison functions (strcmp, strncmp, strcasecmp, strncasecmp, strverscmp)

Message ID 20150603012505.GA27411@domone
State New, archived
Headers

Commit Message

Ondrej Bilka June 3, 2015, 1:25 a.m. UTC
  On Mon, Jun 01, 2015 at 12:01:31AM +0200, Ondřej Bílka wrote:
> And here is an updated version of the comparing functions.
> 
> The main change here is that I introduced a splitting loop.  If an
> architecture has slow unaligned loads and it is faster to create them
> with bit shifts in a loop, or we emulate them, then we generate two
> loops, one with aligned loads and one with unaligned loads.
> 
> These help for str* functions, where the inputs are co-aligned around
> half of the time.  An aligned loop is a waste of space for memcmp,
> where only around 1/8 of inputs are co-aligned.
> 
> Also this optimization increases size, so we would need to do some live
> profiling to measure the impact.
> 
But then I realized that we don't need to use clz for the comparison:
we can just mask the bytes after the first mismatch and compare the
masked vectors.

When I tried to make that branchless I didn't realize that a faster way
is to use conditional moves, which gcc should generate in this case.
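
Roughly, the masked comparison looks like this (a minimal sketch assuming
64-bit little-endian words; the helper names are illustrative, not the
ones the patch uses):

  #include <stdint.h>

  #define ONES  0x0101010101010101ULL
  #define HIGHS 0x8080808080808080ULL

  /* High bit of a byte is set where X has a zero byte (false positives can
     only appear above a real zero, which does not matter when we only need
     the first one on little endian).  */
  static inline uint64_t
  contains_zero (uint64_t x)
  {
    return (x - ONES) & ~x & HIGHS;
  }

  /* Order two words by the first byte that differs or ends the string,
     without clz: widen everything up to and including that byte into a
     byte mask and compare the masked words.  gcc should turn the two
     comparisons at the end into conditional moves/sets.  */
  static inline int
  compare_words (uint64_t v1, uint64_t v2)
  {
    uint64_t diff = (v1 ^ v2) | contains_zero (v2);
    if (diff == 0)
      return 0;
    uint64_t mask = ((diff ^ (diff - 1)) & ONES) * 255;
    if ((v1 & mask) > (v2 & mask))
      return 1;
    return (v1 & mask) < (v2 & mask) ? -1 : 0;
  }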

For strn?casecmp I did some profiling with dryrun; a summary of what I
collected is:

average size   0.7 calls    11118 succeed  87.0%

s1    aligned to 4 bytes  65.4% aligned to 8 bytes  60.7% aligned to 16 bytes  57.7%
s2    aligned to 4 bytes  37.7% aligned to 8 bytes  25.3% aligned to 16 bytes  19.1%
s1-s2 aligned to 4 bytes  36.0% aligned to 8 bytes  24.3% aligned to 16 bytes  17.8%

n <= 0:  84.7%  n <= 1:  87.6%  n <= 2:  88.0%  n <= 3:  93.5%
n <= 4:  96.5%  n <= 8:  97.8%  n <= 16: 99.6%  n <= 32: 100.0%  n <= 64: 100.0%

average case mismatches   0.122

This is a problem: on this workload vectorization doesn't make a lot of
sense when 84% of mismatches are in the first byte.

So I added bytewise checks first; only after four identical characters
in a row do we switch to the vector search.  That addresses the problem
that the vector search has a high startup cost and would slow us down if
it repeatedly advanced only a few bytes due to a case mismatch.
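
To make the handoff concrete, here is a simplified sketch of the strategy
(not the patch code; vector_diff below is just a scalar stand-in for the
real vector search):

  #include <ctype.h>

  /* Stand-in for the vector search: advance both pointers to the next
     position where the bytes differ exactly or s1 ends.  The real code
     does this a vector at a time, which only pays off for long runs.  */
  static void
  vector_diff (const unsigned char **p1, const unsigned char **p2)
  {
    while (**p1 != '\0' && **p1 == **p2)
      (*p1)++, (*p2)++;
  }

  static int
  casecmp_sketch (const unsigned char *p1, const unsigned char *p2)
  {
    for (;;)
      {
        int run = 0;
        /* Stay bytewise until four consecutive characters match exactly;
           a plain case mismatch resets the run.  */
        while (run < 4)
          {
            int result = tolower (*p1) - tolower (*p2);
            if (result != 0 || *p1 == '\0')
              return result;
            run = (*p1 == *p2) ? run + 1 : 0;
            p1++, p2++;
          }
        /* Only now pay the startup cost of the bulk search.  */
        vector_diff (&p1, &p2);
      }
  }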

I now have doubts about the x64 implementation, namely whether
converting the whole vector to lowercase is the right approach or
whether it would be cheaper to just handle case mismatches separately
and pay that price.

 
 	* sysdeps/generic/string_diff_skeleton.h: New file.
 	* string/strcasecmp_const.c: New file.
 	* string/strcmp.c: Use skeleton.
 	* string/strncmp.c: Likewise.
 	* string/strverscmp.c: Likewise.
 	* string/strcasecmp.c: Likewise.
 	* string/strncase.c: Likewise.
  

Patch

diff --git a/string/strcasecmp.c b/string/strcasecmp.c
index 6b14912..f268d90 100644
--- a/string/strcasecmp.c
+++ b/string/strcasecmp.c
@@ -41,6 +41,24 @@ 
 # define LOCALE_PARAM_DECL
 #endif
 
+
+#include <string_vector.h>
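+/* A byte of EXPRESSION (c1, c2) is nonzero where the two vectors differ or
+   where the second string terminates, so the first nonzero byte marks the
+   position that decides the comparison.  */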
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+
+static __always_inline
+size_t 
+byte_loop (char *x, char *y, size_t n, size_t end)
+{
+  size_t i;
+  for (i = 0; i < n; i++)
+    if (x[i] == '\0' || x[i] != y[i])
+      return i;
+
+  return n;
+}
+
+#include <string_diff_skeleton.h>
+
 /* Compare S1 and S2, ignoring case, returning less than, equal to or
    greater than zero if S1 is lexicographically less than,
    equal to or greater than S2.  */
@@ -56,15 +74,45 @@  __strcasecmp (s1, s2 LOCALE_PARAM)
   const unsigned char *p1 = (const unsigned char *) s1;
   const unsigned char *p2 = (const unsigned char *) s2;
   int result;
+  size_t i = 0; 
+
+  /* Here we face a buy-or-rent problem with the vector implementation.
+     It has a big startup cost, which is problematic in a loop that may
+     advance only one character at a time.  To solve that we first use a
+     small finite state machine and call the vector implementation only
+     after four consecutive identical characters.  We use goto since the
+     generated assembly should have the same control flow.  */
+  bytewise:
+  if ((result = TOLOWER (p1[i]) - TOLOWER (p2[i])) || p1[i] == '\0')
+    return result;
+  i++;
+  if (p1[i] != p2[i] || p1[i] == '\0')
+    goto bytewise;
+  i++;
+  if (p1[i] != p2[i] || p1[i] == '\0')
+    goto bytewise;
+  i++;
+  if (p1[i] != p2[i] || p1[i] == '\0')
+    goto bytewise;
 
-  if (p1 == p2)
-    return 0;
+  p1 += i;
+  p2 += i;
 
-  while ((result = TOLOWER (*p1) - TOLOWER (*p2++)) == 0)
-    if (*p1++ == '\0')
-      break;
+  diff_skeleton ((char **) &p1, (char **) &p2, 0);
 
-  return result;
+  if ((result = TOLOWER (*p1) - TOLOWER (*p2)) || *p1 == '\0')
+    return result;
+  else
+    {
+      i = 1; 
+      goto bytewise;
+    }
 }
 #ifndef __strcasecmp
 libc_hidden_def (__strcasecmp)
diff --git a/string/strcasecmp_const.c b/string/strcasecmp_const.c
new file mode 100644
index 0000000..60f089b
--- /dev/null
+++ b/string/strcasecmp_const.c
@@ -0,0 +1,63 @@ 
+/* Copyright (C) 1991-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+
+#ifndef _LIBC
+# define TOLOWER(Ch) tolower (Ch)
+#else
+# include <locale/localeinfo.h>
+# ifdef USE_IN_EXTENDED_LOCALE_MODEL
+# define __strcasecmp_const __strcasecmp_const_l
+# endif
+# define TOLOWER(Ch) __tolower_l ((Ch), loc)
+#endif
+
+#ifdef USE_IN_EXTENDED_LOCALE_MODEL
+# define LOCALE_PARAM , __locale_t loc
+#else
+# define LOCALE_PARAM
+#endif
+
+#include <string_vector.h>
+
+
+/* Compare S1 and S2, ignoring case, using data precomputed for S2.  */
+int
+__strcasecmp_const (char *s1, struct __precomputed_strcase *p LOCALE_PARAM)
+{
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+  __locale_t loc = _NL_CURRENT_LOCALE;
+#endif
+  struct __locale_data *ctype = loc->__locales[LC_CTYPE];
+  int nonascii = ctype->values[_NL_ITEM_INDEX (_NL_CTYPE_NONASCII_CASE)].word;
+
+  if (nonascii || CROSS_PAGE(s1, LSIZE))
+    return __strcasecmp(s1, p->str);
+
+  vector_int v1 = LOADU(s1);
+  int i = first_nonzero_byte (((v1 ^ p->v2) & p->mask) | p->endm);
+
+  return TOLOWER ((unsigned char) SHIFT_BYTES (v1, i)) 
+         - ((unsigned char) SHIFT_BYTES (p->v2, i));
+}
diff --git a/string/strcmp.c b/string/strcmp.c
index 4d4c044..7c85b61 100644
--- a/string/strcmp.c
+++ b/string/strcmp.c
@@ -16,28 +16,57 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
-
 #undef strcmp
 
 /* Compare S1 and S2, returning less than, equal to or
    greater than zero if S1 is lexicographically less than,
    equal to or greater than S2.  */
+
+#include <string_vector.h>
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+
+static __always_inline
+size_t 
+byte_loop(char *x, char *y, size_t n, size_t end)
+{
+  size_t i;
+  for (i = 0; i < n; i++)
+    if (x[i] == '\0' || x[i] != y[i])
+      return i;
+
+  return n;
+}
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+# define CUSTOM_RETURN
+# define RETURN_BYTE(sa,sb,i) return ((unsigned char) sa[i]) - ((unsigned char)sb[i])
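+/* Build a byte mask covering everything up to and including the first
+   differing or terminating byte: mask ^ (mask - 1) keeps the lowest set bit
+   and everything below it, and (& ones) * 255 widens that to whole bytes.
+   Comparing the masked words then orders the strings directly on little
+   endian, with no need to extract the byte index.  */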
+#define RETURN(sa, sb, mask) \
+  { \
+    vector_int va = LOADU(sa), vb = LOADU(sb); \
+    mask = (mask ^ (mask - 1)); \
+    mask = (mask & ones) * 255; \
+    if ((va & mask) > (vb & mask)) \
+      return 1; \
+    if ((va & mask) < (vb & mask)) \
+      return -1; \
+    return 0; \
+  }
+
+#endif
+
+#include <string_diff_skeleton.h>
+
 int
-strcmp (const char *p1, const char *p2)
+strcmp (const char *s1_start, const char *s2_start)
 {
-  const unsigned char *s1 = (const unsigned char *) p1;
-  const unsigned char *s2 = (const unsigned char *) p2;
-  unsigned char c1, c2;
-
-  do
-    {
-      c1 = (unsigned char) *s1++;
-      c2 = (unsigned char) *s2++;
-      if (c1 == '\0')
-	return c1 - c2;
-    }
-  while (c1 == c2);
-
-  return c1 - c2;
+  unsigned char *p1 = (unsigned char *) s1_start;
+  unsigned char *p2 = (unsigned char *) s2_start;
+#ifdef CUSTOM_RETURN
+  return diff_skeleton ((char **) &p1, (char **) &p2, 0);
+#else
+  diff_skeleton ((char **) &p1, (char **) &p2, 0); 
+  return *p1 - *p2;
+#endif
 }
+
 libc_hidden_builtin_def (strcmp)
diff --git a/string/strncmp.c b/string/strncmp.c
index 2a1137a..d8c7f76 100644
--- a/string/strncmp.c
+++ b/string/strncmp.c
@@ -16,59 +16,74 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
-#include <memcopy.h>
 
 #undef strncmp
 
-#ifndef STRNCMP
-#define STRNCMP strncmp
-#endif
+/* Compare no more than N characters of S1 and S2,
+   returning less than, equal to or greater than zero
+   if S1 is lexicographically less than, equal to or
+   greater than S2.  */
 
-/* Compare no more than N characters of S1 and S2,
-   returning less than, equal to or greater than zero
-   if S1 is lexicographically less than, equal to or
-   greater than S2.  */
-int
-STRNCMP (const char *s1, const char *s2, size_t n)
-{
-  unsigned char c1 = '\0';
-  unsigned char c2 = '\0';
+#include <string_vector.h>
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+#define CHECK_N
 
-  if (n >= 4)
-    {
-      size_t n4 = n >> 2;
-      do
-	{
-	  c1 = (unsigned char) *s1++;
-	  c2 = (unsigned char) *s2++;
-	  if (c1 == '\0' || c1 != c2)
-	    return c1 - c2;
-	  c1 = (unsigned char) *s1++;
-	  c2 = (unsigned char) *s2++;
-	  if (c1 == '\0' || c1 != c2)
-	    return c1 - c2;
-	  c1 = (unsigned char) *s1++;
-	  c2 = (unsigned char) *s2++;
-	  if (c1 == '\0' || c1 != c2)
-	    return c1 - c2;
-	  c1 = (unsigned char) *s1++;
-	  c2 = (unsigned char) *s2++;
-	  if (c1 == '\0' || c1 != c2)
-	    return c1 - c2;
-	} while (--n4 > 0);
-      n &= 3;
-    }
-
-  while (n > 0)
+static __always_inline
+size_t 
+byte_loop(char *x, char *y, size_t n, size_t end)
+{
+  size_t i;
+  for (i = 0; i < n; i++)
     {
-      c1 = (unsigned char) *s1++;
-      c2 = (unsigned char) *s2++;
-      if (c1 == '\0' || c1 != c2)
-	return c1 - c2;
-      n--;
+      if (i == end)
+        return SIZE_MAX;
+      if (x[i] == '\0' || x[i] != y[i])
+        return i;
     }
 
-  return c1 - c2;
+  return n;
 }
 
-libc_hidden_builtin_def (STRNCMP)
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+# define CUSTOM_RETURN
+# define RETURN_BYTE(sa,sb,i) return ((unsigned char) sa[i]) - ((unsigned char)sb[i])
+#define RETURN(sa, sb, mask) \
+  { \
+    vector_int va = LOADU(sa), vb = LOADU(sb); \
+    mask = (mask ^ (mask - 1)); \
+    mask = (mask & ones) * 255; \
+    size_t read = sa - *p1; \
+    if (n <= read) \
+      return 0; \
+    if (n < read + LSIZE) \
+      { \
+        mask = (mask << (8 * (read + LSIZE - n))); \
+        mask = (mask >> (8 * (read + LSIZE - n))); \
+      } \
+    if ((va & mask) > (vb & mask)) \
+      return 1; \
+    if ((va & mask) < (vb & mask)) \
+      return -1; \
+    return 0; \
+  }
+
+#endif
+
+
+
+#include <string_diff_skeleton.h>
+
+int
+strncmp (const char *s1_start, const char *s2_start, size_t n)
+{
+  unsigned char *p1 = (unsigned char *) s1_start;
+  unsigned char *p2 = (unsigned char *) s2_start;
+#ifdef CUSTOM_RETURN
+  return diff_skeleton ((char **) &p1, (char **) &p2, n);
+#else
+  if (!diff_skeleton ((char **) &p1, (char **) &p2, n))
+    return 0;
+
+  return *p1 - *p2;
+#endif
+}
+libc_hidden_builtin_def (strncmp)
diff --git a/string/strverscmp.c b/string/strverscmp.c
index 38bf5e2..db3331c 100644
--- a/string/strverscmp.c
+++ b/string/strverscmp.c
@@ -33,6 +33,28 @@ 
 #define  LEN    3
 
 
+
+
+
+#include <string_vector.h>
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+
+static __always_inline
+size_t 
+byte_loop(char *x, char *y, size_t n, size_t end)
+{
+  size_t i;
+  for (i = 0; i < n; i++)
+    if (x[i] == '\0' || x[i] != y[i])
+      return i;
+
+  return n;
+}
+
+#include <string_diff_skeleton.h>
+
+
+
 /* Compare S1 and S2 as strings holding indices/version numbers,
    returning less than, equal to or greater than zero if S1 is less than,
    equal to or greater than S2 (for more info, see the texinfo doc).
@@ -46,6 +68,19 @@  __strverscmp (s1, s2)
   const unsigned char *p1 = (const unsigned char *) s1;
   const unsigned char *p2 = (const unsigned char *) s2;
 
+  diff_skeleton ((char **) &p1, (char **) &p2, 0);
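+  /* Back up over any digits in front of the difference so that the state
+     machine below sees the whole digit run (leading zeroes change the
+     ordering).  */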
+  if (isdigit (*p2) && p2 > (const unsigned char *) s2)
+    {
+      p1--;
+      p2--;
+    }
+
+  while (isdigit (*p1) && p1 > (const unsigned char *) s1)
+    {
+      p1--;
+      p2--;
+    }
+
   /* Symbol(s)    0       [1-9]   others
      Transition   (10) 0  (01) d  (00) x   */
   static const uint8_t next_state[] =
diff --git a/sysdeps/generic/string_diff_skeleton.h b/sysdeps/generic/string_diff_skeleton.h
new file mode 100644
index 0000000..804548d
--- /dev/null
+++ b/sysdeps/generic/string_diff_skeleton.h
@@ -0,0 +1,282 @@ 
+/* Skeleton of generic string comparison functions.
+   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <string.h>
+#include <libc-internal.h>
+#include <stdint.h>
+
+/* On big endian a carry in the zero-byte check could cause a false positive
+   in a preceding byte, so use the carry-free expression there.  */
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+# undef EXPRESSION
+# define EXPRESSION(x, y) EXPRESSION_NOCARRY (x, y)
+#endif
+
+
+#define LOOP_SIZE (LOOP_UNROLL * LSIZE)
+
+
+#ifndef FIRST_CHECK_BYTEWISE
+# define FIRST_CHECK_BYTEWISE 0
+#endif
+
+#ifdef CHECK_N
+# define _CHECK_N(x) x
+#else
+# define _CHECK_N(x) 0
+#endif
+
+/* A strdiff variant.  Sets *p1 and *p2 to the position of the first
+   difference.  If it reads n bytes without finding a difference it returns
+   0, otherwise 1 (unless CUSTOM_RETURN supplies its own return value).  */
+
+static __always_inline
+int maybe_aligned_loop (char **p1, char **p2, size_t n, int aligned);
+
+static __always_inline
+int
+diff_skeleton (char **p1, char **p2, size_t n)
+{
+  vector_int mask;
+  char *s1 = *p1, *s2 = *p2;
+  vector_int v1, v2;
+  vector_int __attribute__ ((unused)) previous = 0;
+
+#ifndef CUSTOM_RETURN
+#define RETURN_BYTE(sa, sb, i) \
+  do 				                       \
+    { 				                       \
+      *p1 = sa + i;			               \
+      *p2 = sb + i;			               \
+      return _CHECK_N (n <= sa + i - s1) ? 0 : 1;      \
+    }					               \
+  while (0)
+
+#define RETURN(sa, sb, mask) RETURN_BYTE (sa, sb, first_nonzero_byte (mask))
+#endif
+
+  /*  Most comparisons differ in the first byte and checking that is cheap.
+      A vector check is more expensive, so the question is whether the saving
+      from the byte check exceeds the penalty of a branch misprediction.
+      The same reasoning applies to the second byte, with the additional
+      penalty of having checked the first one.  So we need to test
+      empirically what is optimal for a given architecture.  */
+
+  size_t i = byte_loop (s1, s2, FIRST_CHECK_BYTEWISE, n);
+  if (i < FIRST_CHECK_BYTEWISE)
+    RETURN_BYTE (s1, s2, i);
+  if (i == SIZE_MAX)
+    return 0;
+  
+  s1 += FIRST_CHECK_BYTEWISE; 
+  s2 += FIRST_CHECK_BYTEWISE; 
+  n -= FIRST_CHECK_BYTEWISE; 
+
+  if (_CHECK_N (n == 0))
+    return 0;
+
+  /* We fetch UNALIGNED_HEADER_UNROLL * LSIZE bytes (32 in the typical
+     configuration) while not crossing a page boundary.  Most strings in
+     practice are of that size, so we avoid a loop.  This looks best in
+     practice; the alternative below uses aligned loads but is slower when a
+     string starts just a few bytes before a 32-byte boundary.  The tradeoff
+     is that we may occasionally fetch an extra cache line without needing
+     it, but the optimization still pays for that.  */
+
+
+  if (!CROSS_PAGE (s1, UNALIGNED_HEADER_UNROLL * LSIZE) 
+      && !CROSS_PAGE (s2, UNALIGNED_HEADER_UNROLL * LSIZE) 
+      && UNALIGNED_HEADER_UNROLL > 0)
+    {
+
+  /* TODO: When emulating unaligned loads we could align s1 after the
+     first iteration, or when FIRST_CHECK_BYTEWISE >= LSIZE.  */
+
+#define UNALIGNED_HEADER_CHECK(i) \
+  if (UNALIGNED_HEADER_UNROLL >= i)                                      \
+    {                                                                    \
+      v1 = LOADU (s1 + (i - 1) * LSIZE);                                 \
+      v2 = LOADU (s2 + (i - 1) * LSIZE);                                 \
+      mask = EXPRESSION (v1, v2);                                        \
+      if (mask)                                                          \
+	RETURN (s1 + (i - 1) * LSIZE, s2 + (i - 1) * LSIZE,              \
+                mask);       			 			\
+    }
+
+      UNALIGNED_HEADER_CHECK (1);
+      UNALIGNED_HEADER_CHECK (2);
+      UNALIGNED_HEADER_CHECK (3);
+      UNALIGNED_HEADER_CHECK (4);
+      UNALIGNED_HEADER_CHECK (5);
+      UNALIGNED_HEADER_CHECK (6);
+      UNALIGNED_HEADER_CHECK (7);
+      UNALIGNED_HEADER_CHECK (8);
+
+      if (_CHECK_N (n <= LSIZE * UNALIGNED_HEADER_UNROLL))
+	return 0;
+    }
+  else
+    {
+      size_t i = byte_loop (s1, s2, LOOP_SIZE, n);
+      if (i < LOOP_SIZE)
+        RETURN_BYTE (s1, s2, i);
+      if (i == SIZE_MAX)
+        return 0;
+    }
+#ifndef ALIGNING_HELPS
+# define ALIGNING_HELPS 0
+#endif
+  return maybe_aligned_loop (p1, p2, n, ALIGNING_HELPS && ((s1 - s2) % LSIZE == 0));
+}
+
+  /* Now we have read enough bytes to start the main loop.  */
+
+static __always_inline
+int
+maybe_aligned_loop (char **p1, char **p2, size_t n, int aligned)
+{
+#define MAYBE_ALIGNED_LOAD(a) (aligned ? LOAD (a) : LOADU(a))
+  vector_int mask;
+  char *s1 = *p1, *s2 = *p2;
+  vector_int v1, v2;
+  vector_int __attribute__ ((unused)) previous = 0;
+
+  assert (LOOP_UNROLL <= UNALIGNED_HEADER_UNROLL
+	  || UNALIGNED_HEADER_UNROLL == 0);
+
+  char *s1_loop = PTR_ALIGN_DOWN (s1, LOOP_SIZE);
+  char *s2_loop = s2 - (s1 - s1_loop);
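+
+  /* Number of LOOP_SIZE blocks that can be handled before a load from
+     s2_loop would cross a page boundary; when it reaches zero the block
+     below re-reads the data with s2 aligned and masks off the bytes that
+     were already checked.  */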
+  int until_cross_page = (4096 - (((uintptr_t) (s2_loop + LOOP_SIZE))
+                                   % 4096)) / LOOP_SIZE;
+
+#ifdef CHECK_N
+# ifdef SAVE_N_ITERS_REGISTER
+  while (s1 + n - s1_loop >= LOOP_SIZE)
+# else
+  size_t n_iters = (n - (s1_loop + LOOP_SIZE - s1)
+                      + LOOP_SIZE - 1) / (LOOP_SIZE);
+  while (n_iters--)
+# endif
+#else
+  while (1)
+#endif
+    {
+      s1_loop += LOOP_SIZE;
+      s2_loop += LOOP_SIZE;
+
+      if (until_cross_page == 0)
+        {
+          uintptr_t shift = ((uintptr_t) s2_loop) % LOOP_SIZE;
+      
+     /* A tricky part here is that we need to mask bytes that could have been
+        read in the previous iteration; these were already checked, and when
+        we come from the header they may even lie before the start of the
+        string.  */
+
+#define CROSSING_CHECK(i) \
+  if (i <= LOOP_UNROLL)							    \
+    {									    \
+      v1 = MAYBE_ALIGNED_LOAD (s1_loop - shift + (i - 1) * LSIZE);          \
+      v2 = LOAD (s2_loop - shift + (i - 1) * LSIZE);                        \
+      mask = EXPRESSION (v1, v2);                                           \
+      mask = forget_bytes (mask, shift - LSIZE * (i - 1));                  \
+      if (mask)                                                             \
+	RETURN (s1_loop - shift + (i - 1) * LSIZE,                          \
+                s2_loop - shift + (i - 1) * LSIZE,                          \
+                mask);       			    			    \
+    }
+         CROSSING_CHECK (1);
+         CROSSING_CHECK (2);
+         CROSSING_CHECK (3);
+         CROSSING_CHECK (4);
+         CROSSING_CHECK (5);
+         CROSSING_CHECK (6);
+         CROSSING_CHECK (7);
+         CROSSING_CHECK (8);
+         
+         until_cross_page = 4096 / LOOP_SIZE;
+         if (_CHECK_N (n <= s1_loop + LOOP_SIZE - shift - s1))
+           return 0;
+       }
+      until_cross_page--;
+
+      vector_int masks[9];
+      masks[0] = 0;
+
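+/* Or together the per-vector difference masks: masks[i] is nonzero as soon
+   as any of the first i vectors contains a difference or end of string, so
+   testing masks[LOOP_UNROLL] once decides whether this block needs the tail
+   checks below.  */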
+#define MERGE_MASK(i) \
+  if (LOOP_UNROLL >= i)                                                 \
+    {                                                                   \
+      v1 = LOAD (s1_loop + (i - 1) * LSIZE);                            \
+      v2 = MAYBE_ALIGNED_LOAD (s2_loop + (i - 1) * LSIZE);              \
+      masks[i] = masks[i - 1] | EXPRESSION (v1, v2);                    \
+    }
+
+      MERGE_MASK (1);
+      MERGE_MASK (2);
+      MERGE_MASK (3);
+      MERGE_MASK (4);
+      MERGE_MASK (5);
+      MERGE_MASK (6);
+      MERGE_MASK (7);
+      MERGE_MASK (8);
+
+      if (masks[LOOP_UNROLL])
+	{
+
+          /* Here we have two possibilities depending on register pressure.
+             When there are not enough registers, recalculating the result
+             creates the fastest loop for large inputs.  However it is likely
+             affected by a gcc bug where gcc tries to save the intermediate
+             results, causing a spill in each iteration just to speed up the
+             final iteration a bit.  To avoid that we need a compiler barrier
+             here.  */
+
+#ifdef RECALCULATE_ON_TAIL
+          asm volatile ("" : : : "memory");
+# define CHECK_MASK(i) \
+      v1 = LOAD (s1_loop + (i - 1) * LSIZE);                             \
+      v2 = MAYBE_ALIGNED_LOAD (s2_loop + (i - 1) * LSIZE);               \
+      mask = EXPRESSION (v1, v2);                                        \
+      if (mask)                                                          \
+	RETURN (s1_loop + (i - 1) * LSIZE, s2_loop + (i - 1) * LSIZE,    \
+                mask);       			 
+
+# else
+          /* On the other hand, when there are enough free registers we can
+             save the intermediate ors.  When checking the masks we know that
+             to reach mask i all previous ones must have been zero, so the or
+             of the previous masks is exactly the current one.  */
+
+# define CHECK_MASK(i) \
+  if (masks[i])                                                          \
+	RETURN (s1_loop + (i - 1) * LSIZE, s2_loop + (i - 1) * LSIZE,    \
+                masks[i]);
+
+# endif
+
+	  CHECK_MASK (1);
+	  CHECK_MASK (2);
+	  CHECK_MASK (3);
+	  CHECK_MASK (4);
+	  CHECK_MASK (5);
+	  CHECK_MASK (6);
+	  CHECK_MASK (7);
+	  CHECK_MASK (8);
+	}
+    }
+  return 0;
+}