[3/*,v5] Generic comparison functions (strcmp, strncmp, strcasecmp, strncasecmp, strverscmp)

Message ID 20150531220131.GA666@domone
State New, archived

Commit Message

Ondrej Bilka May 31, 2015, 10:01 p.m. UTC
Here is an updated version of the comparison functions.

The main change is that I introduced loop splitting. If an architecture
has slow unaligned loads, and it is faster to synthesize them with bit
shifts in the loop (or we have to emulate them entirely), then we
generate two loops: one with aligned loads and one with unaligned loads.
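
A minimal sketch of the idea (not the skeleton itself): it only finds the
first differing index in a buffer of known length, uses 8-byte words, and
emulates unaligned loads with memcpy; the real skeleton additionally has
to handle the terminating null byte, page crossing and the vector size of
the target. The name first_diff and the structure are illustrative only.

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  /* Sketch: index of first differing byte within N bytes.  */
  static size_t
  first_diff (const char *s1, const char *s2, size_t n)
  {
    size_t i = 0;

    if (((uintptr_t) s1 - (uintptr_t) s2) % sizeof (uint64_t) == 0)
      {
        /* Co-aligned: align s1 bytewise, then both loads are aligned.  */
        while (i < n && ((uintptr_t) (s1 + i) % sizeof (uint64_t)) != 0)
          {
            if (s1[i] != s2[i])
              return i;
            i++;
          }
        while (i + sizeof (uint64_t) <= n
               && *(const uint64_t *) (s1 + i) == *(const uint64_t *) (s2 + i))
          i += sizeof (uint64_t);
      }
    else
      {
        /* Not co-aligned: unaligned loads, emulated here with memcpy.  */
        uint64_t v1, v2;
        while (i + sizeof (uint64_t) <= n)
          {
            memcpy (&v1, s1 + i, sizeof v1);
            memcpy (&v2, s2 + i, sizeof v2);
            if (v1 != v2)
              break;
            i += sizeof (uint64_t);
          }
      }

    /* Bytewise tail pins down the exact index.  */
    while (i < n && s1[i] == s2[i])
      i++;
    return i;
  }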

This helps the str* functions, where the inputs are co-aligned about half
of the time. The aligned loop is a waste of space for memcmp, where only
about 1/8 of inputs are co-aligned.

This optimization also increases code size, so we would need some live
profiling to measure its impact.

strcmp and strncmp are done.
strcasecmp and strncasecmp are different. For now I just added the same
optimization of skipping the initial segment of identical bytes that
strn?cmp uses. I could add a more precise match that ignores case for
ASCII-compatible encodings, but it is questionable whether that is worth
it. My worry is that it would be useless: the first exact difference is
usually also a difference when case is ignored, so the case conversion
would just waste cycles.
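
A minimal scalar sketch of the structure this gives (using plain tolower
rather than the locale-aware TOLOWER in the real code): the vectorizable
part skips only exactly-equal bytes, and the ordinary caseless loop then
resumes from the first exact difference, even when that difference is
only one of case ('D' versus 'd').

  #include <ctype.h>

  /* Sketch only: prefix skip of exactly-equal bytes, then the usual
     caseless loop from the first exact difference.  */
  static int
  casecmp_sketch (const char *a, const char *b)
  {
    const unsigned char *p1 = (const unsigned char *) a;
    const unsigned char *p2 = (const unsigned char *) b;

    /* Stand-in for the vectorized prefix skip.  */
    while (*p1 != '\0' && *p1 == *p2)
      p1++, p2++;

    /* Ordinary caseless loop.  */
    while (tolower (*p1) == tolower (*p2))
      {
        if (*p1 == '\0')
          return 0;
        p1++, p2++;
      }
    return tolower (*p1) - tolower (*p2);
  }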

For caseless comparison, however, a header makes much more sense, and I
will submit a header patch for that. The reason is that we could
precompute a mask that removes the case bits. For strcasecmp with a
constant argument of fewer than 8 characters I could reduce the call to a
few cycles, provided we export a TLS locale variable that lets us check
this (sketched in the fragment further below).

The last one is strverscmp, where my improvement is to first find the
first differing byte and then step back over the digits in front of it,
so that the version comparison sees the digit run from its start. I would
say that we could even ignore identical digits, since they compare equal,
but I have not read the sort-order rules in enough detail.
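
A rough sketch of that fast path (hypothetical helper name; the patch
itself reuses diff_skeleton plus a small rewind loop): locate the first
differing byte with a plain scan, then step back over the digits
immediately before it so the unchanged state machine sees the whole digit
run.

  #include <ctype.h>

  /* Sketch: a plain scan stands in for diff_skeleton; the rewind makes
     the digit run start again at *p1/*p2.  */
  static void
  skip_prefix_for_verscmp (const unsigned char **p1, const unsigned char **p2)
  {
    const unsigned char *s1 = *p1, *s2 = *p2;

    while (*s1 != '\0' && *s1 == *s2)
      s1++, s2++;

    while (s1 > *p1 && isdigit (s1[-1]))
      s1--, s2--;

    *p1 = s1;
    *p2 = s2;
  }

With the pointers repositioned this way the existing comparison still
gives the documented ordering, e.g. strverscmp ("item-9", "item-10") < 0.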


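The fragment below sketches the constant-argument strcasecmp idea from
above in pseudocode: y_chars and filter_case would be precomputed from
the constant argument, and CROSS_PAGE, LOADU and first_nonzero_byte are
the same helpers the new skeleton uses.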
  if (!CROSS_PAGE (x) && ascii_compatible())
    {
      int idx = first_nonzero_byte((LOADU (x) ^ y_chars) & filter_case);
      return tolower(x[idx]) - y[idx];
    }

	* sysdeps/generic/string_diff_skeleton.h: New file.
	* string/strcmp.c: Use skeleton.
	* string/strncmp.c: Likewise.
	* string/strverscmp.c: Likewise.
	* string/strcasecmp.c: Likewise.
	* string/strncase.c: Likewise.
  

Patch

diff --git a/string/strcasecmp.c b/string/strcasecmp.c
index 6b14912..09f2059 100644
--- a/string/strcasecmp.c
+++ b/string/strcasecmp.c
@@ -41,6 +41,24 @@ 
 # define LOCALE_PARAM_DECL
 #endif
 
+
+#include <string_vector.h>
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+
+static __always_inline
+size_t 
+byte_loop (char *x, char *y, size_t n, size_t end)
+{
+  size_t i;
+  for (i = 0; i < n; i++)
+    if (x[i] == '\0' || x[i] != y[i])
+      return i;
+
+  return n;
+}
+
+#include <string_diff_skeleton.h>
+
 /* Compare S1 and S2, ignoring case, returning less than, equal to or
    greater than zero if S1 is lexicographically less than,
    equal to or greater than S2.  */
@@ -56,9 +74,8 @@  __strcasecmp (s1, s2 LOCALE_PARAM)
   const unsigned char *p1 = (const unsigned char *) s1;
   const unsigned char *p2 = (const unsigned char *) s2;
   int result;
-
-  if (p1 == p2)
-    return 0;
+  
+  diff_skeleton ((char **) &p1, (char **) &p2, 0);
 
   while ((result = TOLOWER (*p1) - TOLOWER (*p2++)) == 0)
     if (*p1++ == '\0')
diff --git a/string/strcmp.c b/string/strcmp.c
index 4d4c044..9930af8 100644
--- a/string/strcmp.c
+++ b/string/strcmp.c
@@ -16,28 +16,37 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
-
 #undef strcmp
 
 /* Compare S1 and S2, returning less than, equal to or
    greater than zero if S1 is lexicographically less than,
    equal to or greater than S2.  */
+
+#include <string_vector.h>
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+
+static __always_inline
+size_t 
+byte_loop(char *x, char *y, size_t n, size_t end)
+{
+  size_t i;
+  for (i = 0; i < n; i++)
+    if (x[i] == '\0' || x[i] != y[i])
+      return i;
+
+  return n;
+}
+
+#include <string_diff_skeleton.h>
+
 int
-strcmp (const char *p1, const char *p2)
+strcmp (const char *s1_start, const char *s2_start)
 {
-  const unsigned char *s1 = (const unsigned char *) p1;
-  const unsigned char *s2 = (const unsigned char *) p2;
-  unsigned char c1, c2;
-
-  do
-    {
-      c1 = (unsigned char) *s1++;
-      c2 = (unsigned char) *s2++;
-      if (c1 == '\0')
-	return c1 - c2;
-    }
-  while (c1 == c2);
-
-  return c1 - c2;
+  unsigned char *p1 = (unsigned char *) s1_start;
+  unsigned char *p2 = (unsigned char *) s2_start;
+
+  diff_skeleton ((char **) &p1, (char **) &p2, 0); 
+
+  return *p1 - *p2;
 }
 libc_hidden_builtin_def (strcmp)
diff --git a/string/strncase.c b/string/strncase.c
index a8fc046..2fcb1f9 100644
--- a/string/strncase.c
+++ b/string/strncase.c
@@ -43,6 +43,33 @@ 
 # define LOCALE_PARAM_DECL
 #endif
 
+
+
+
+#include <string_vector.h>
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+#define CHECK_N
+
+static __always_inline
+size_t 
+byte_loop(char *x, char *y, size_t n, size_t end)
+{
+  size_t i;
+  for (i = 0; i < n; i++)
+    {
+      if (i == end)
+        return SIZE_MAX;
+      if (x[i] == '\0' || x[i] != y[i])
+        return i;
+    }
+
+  return n;
+}
+
+#include <string_diff_skeleton.h>
+
+
+
 /* Compare no more than N characters of S1 and S2,
    ignoring case, returning less than, equal to or
    greater than zero if S1 is lexicographically less
@@ -61,7 +88,7 @@  __strncasecmp (s1, s2, n LOCALE_PARAM)
   const unsigned char *p2 = (const unsigned char *) s2;
   int result;
 
-  if (p1 == p2 || n == 0)
+  if (!diff_skeleton ((char **) &p1, (char **) &p2, n))
     return 0;
 
   while ((result = TOLOWER (*p1) - TOLOWER (*p2++)) == 0)
diff --git a/string/strncmp.c b/string/strncmp.c
index 2a1137a..cfa33b5 100644
--- a/string/strncmp.c
+++ b/string/strncmp.c
@@ -16,59 +16,44 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
-#include <memcopy.h>
 
 #undef strncmp
 
-#ifndef STRNCMP
-#define STRNCMP strncmp
-#endif
+/* Compare no more than N characters of S1 and S2, returning less than,
+   equal to or greater than zero if S1 is lexicographically less than,
+   equal to or greater than S2.  */
 
-/* Compare no more than N characters of S1 and S2,
-   returning less than, equal to or greater than zero
-   if S1 is lexicographically less than, equal to or
-   greater than S2.  */
-int
-STRNCMP (const char *s1, const char *s2, size_t n)
-{
-  unsigned char c1 = '\0';
-  unsigned char c2 = '\0';
-
-  if (n >= 4)
-    {
-      size_t n4 = n >> 2;
-      do
-	{
-	  c1 = (unsigned char) *s1++;
-	  c2 = (unsigned char) *s2++;
-	  if (c1 == '\0' || c1 != c2)
-	    return c1 - c2;
-	  c1 = (unsigned char) *s1++;
-	  c2 = (unsigned char) *s2++;
-	  if (c1 == '\0' || c1 != c2)
-	    return c1 - c2;
-	  c1 = (unsigned char) *s1++;
-	  c2 = (unsigned char) *s2++;
-	  if (c1 == '\0' || c1 != c2)
-	    return c1 - c2;
-	  c1 = (unsigned char) *s1++;
-	  c2 = (unsigned char) *s2++;
-	  if (c1 == '\0' || c1 != c2)
-	    return c1 - c2;
-	} while (--n4 > 0);
-      n &= 3;
-    }
+#include <string_vector.h>
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+#define CHECK_N
 
-  while (n > 0)
+static __always_inline
+size_t 
+byte_loop(char *x, char *y, size_t n, size_t end)
+{
+  size_t i;
+  for (i = 0; i < n; i++)
     {
-      c1 = (unsigned char) *s1++;
-      c2 = (unsigned char) *s2++;
-      if (c1 == '\0' || c1 != c2)
-	return c1 - c2;
-      n--;
+      if (i == end)
+        return SIZE_MAX;
+      if (x[i] == '\0' || x[i] != y[i])
+        return i;
     }
 
-  return c1 - c2;
+  return n;
 }
 
-libc_hidden_builtin_def (STRNCMP)
+#include <string_diff_skeleton.h>
+
+int
+strncmp (const char *s1_start, const char *s2_start, size_t n)
+{
+  unsigned char *p1 = (unsigned char *) s1_start;
+  unsigned char *p2 = (unsigned char *) s2_start;
+
+  if (!diff_skeleton ((char **) &p1, (char **) &p2, n))
+    return 0;
+
+  return *p1 - *p2;
+}
+libc_hidden_builtin_def (strncmp)
diff --git a/string/strverscmp.c b/string/strverscmp.c
index 38bf5e2..c7303f8 100644
--- a/string/strverscmp.c
+++ b/string/strverscmp.c
@@ -33,6 +33,28 @@ 
 #define  LEN    3
 
 
+
+
+
+#include <string_vector.h>
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+
+static __always_inline
+size_t 
+byte_loop(char *x, char *y, size_t n, size_t end)
+{
+  size_t i;
+  for (i = 0; i < n; i++)
+    if (x[i] == '\0' || x[i] != y[i])
+      return i;
+
+  return n;
+}
+
+#include <string_diff_skeleton.h>
+
+
+
 /* Compare S1 and S2 as strings holding indices/version numbers,
    returning less than, equal to or greater than zero if S1 is less than,
    equal to or greater than S2 (for more info, see the texinfo doc).
@@ -46,6 +68,13 @@  __strverscmp (s1, s2)
   const unsigned char *p1 = (const unsigned char *) s1;
   const unsigned char *p2 = (const unsigned char *) s2;
 
+  diff_skeleton ((char **) &p1, (char **) &p2, 0); 
+  while (isdigit (*p1) && p1 > (const unsigned char *) s1)
+    {
+      p1--;
+      p2--;
+    }
+
   /* Symbol(s)    0       [1-9]   others
      Transition   (10) 0  (01) d  (00) x   */
   static const uint8_t next_state[] =
diff --git a/sysdeps/generic/string_diff_skeleton.h b/sysdeps/generic/string_diff_skeleton.h
new file mode 100644
index 0000000..068f4e2
--- /dev/null
+++ b/sysdeps/generic/string_diff_skeleton.h
@@ -0,0 +1,279 @@ 
+/* Skeleton of generic string comparison functions.
+   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <string.h>
+#include <libc-internal.h>
+#include <stdint.h>
+
+/* On big endian a carry could cause a false positive in the previous byte.  */
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+# undef EXPRESSION
+# define EXPRESSION(x, y) EXPRESSION_NOCARRY (x, y)
+#endif
+
+
+#define LOOP_SIZE (LOOP_UNROLL * LSIZE)
+
+
+#ifndef FIRST_CHECK_BYTEWISE
+# define FIRST_CHECK_BYTEWISE 0
+#endif
+
+#ifdef CHECK_N
+# define _CHECK_N(x) x
+#else
+# define _CHECK_N(x) 0
+#endif
+
+/* A strdiff variant.  Sets *p1 and *p2 to the first difference.
+   If n bytes are read without a difference, returns 0; otherwise 1.  */
+
+static __always_inline
+int maybe_aligned_loop (char **p1, char **p2, size_t n, int aligned);
+
+static __always_inline
+int
+diff_skeleton (char **p1, char **p2, size_t n)
+{
+  vector_int mask;
+  char *s1 = *p1, *s2 = *p2;
+  vector_int v1, v2;
+  vector_int __attribute__ ((unused)) previous = 0;
+
+#define RETURN(sa, sb, i) \
+  do 				                       \
+    { 				                       \
+      *p1 = sa + i;			               \
+      *p2 = sb + i;			               \
+      return _CHECK_N (n <= sa + i - s1) ? 0 : 1;      \
+    }					               \
+  while (0)
+
+
+  /*  Most comparisons differ in the first byte, and checking that is cheap.
+      A vector check is more expensive, so the question is whether the
+      saving from the byte check outweighs the branch misprediction penalty.
+      The same reasoning applies to the second byte, with the additional
+      penalty of having checked the first byte.  So we need to test
+      empirically what is optimal for a given architecture.  */
+
+  size_t i = byte_loop (s1, s2, FIRST_CHECK_BYTEWISE, n);
+  if (i < FIRST_CHECK_BYTEWISE)
+    RETURN (s1, s2, i);
+  if (i == SIZE_MAX)
+    return 0;
+  
+  s1 += FIRST_CHECK_BYTEWISE; 
+  s2 += FIRST_CHECK_BYTEWISE; 
+  n -= FIRST_CHECK_BYTEWISE; 
+
+  if (_CHECK_N (n == 0))
+    return 0;
+
+  /* We fetch 32 bytes as long as we do not cross a page boundary.
+     Most strings in practice fit in that size, so we avoid a loop.
+     This looks best in practice; the alternative below of using aligned
+     loads is slower when a string starts just a few
+     bytes before a 32-byte boundary.  The tradeoff is that we may rarely
+     fetch an extra cache line without needing it, but this optimization
+     pays for that.  */
+
+
+  if (!CROSS_PAGE (s1, UNALIGNED_HEADER_UNROLL * LSIZE) 
+      && !CROSS_PAGE (s2, UNALIGNED_HEADER_UNROLL * LSIZE) 
+      && UNALIGNED_HEADER_UNROLL > 0)
+    {
+
+  /* TODO: When emulating unaligned loads we could align s1 after the
+     first iteration, or when FIRST_CHECK_BYTEWISE >= LSIZE.  */
+
+#define UNALIGNED_HEADER_CHECK(i) \
+  if (UNALIGNED_HEADER_UNROLL >= i)                                      \
+    {                                                                    \
+      v1 = LOADU (s1 + (i - 1) * LSIZE);                                 \
+      v2 = LOADU (s2 + (i - 1) * LSIZE);                                 \
+      mask = EXPRESSION (v1, v2);                                        \
+      if (mask)                                                          \
+	RETURN (s1 + (i - 1) * LSIZE, s2 + (i - 1) * LSIZE,              \
+               first_nonzero_byte (mask));       			 \
+    }
+
+      UNALIGNED_HEADER_CHECK (1);
+      UNALIGNED_HEADER_CHECK (2);
+      UNALIGNED_HEADER_CHECK (3);
+      UNALIGNED_HEADER_CHECK (4);
+      UNALIGNED_HEADER_CHECK (5);
+      UNALIGNED_HEADER_CHECK (6);
+      UNALIGNED_HEADER_CHECK (7);
+      UNALIGNED_HEADER_CHECK (8);
+
+      if (_CHECK_N (n <= LSIZE * UNALIGNED_HEADER_UNROLL))
+	return 0;
+    }
+  else
+    {
+      size_t i = byte_loop (s1, s2, LOOP_SIZE, n);
+      if (i < LOOP_SIZE)
+        RETURN (s1, s2, i);
+      if (i == SIZE_MAX)
+        return 0;
+    }
+#ifndef ALIGNING_HELPS
+# define ALIGNING_HELPS 0
+#endif
+  return maybe_aligned_loop (p1, p2, n, ALIGNING_HELPS && ((s1 - s2) % LSIZE == 0));
+}
+
+  /* At this point enough bytes have been read to start the main loop.  */
+
+static __always_inline
+int
+maybe_aligned_loop (char **p1, char **p2, size_t n, int aligned)
+{
+#define MAYBE_ALIGNED_LOAD(a) (aligned ? LOAD (a) : LOADU(a))
+  vector_int mask;
+  char *s1 = *p1, *s2 = *p2;
+  vector_int v1, v2;
+  vector_int __attribute__ ((unused)) previous = 0;
+
+  assert (LOOP_UNROLL <= UNALIGNED_HEADER_UNROLL
+	  || UNALIGNED_HEADER_UNROLL == 0);
+
+  char *s1_loop = PTR_ALIGN_DOWN (s1, LOOP_SIZE);
+  char *s2_loop = s2 - (s1 - s1_loop);
+  int until_cross_page = (4096 - (((uintptr_t) (s2_loop + LOOP_SIZE))
+                                   % 4096)) / LOOP_SIZE;
+
+#ifdef CHECK_N
+# ifdef SAVE_N_ITERS_REGISTER
+  while (s1 + n - s1_loop >= LOOP_SIZE)
+# else
+  size_t n_iters = (n - (s1_loop + LOOP_SIZE - s1)
+                      + LOOP_SIZE - 1) / (LOOP_SIZE);
+  while (n_iters--)
+# endif
+#else
+  while (1)
+#endif
+    {
+      s1_loop += LOOP_SIZE;
+      s2_loop += LOOP_SIZE;
+
+      if (until_cross_page == 0)
+        {
+          uintptr_t shift = ((uintptr_t) s2_loop) % LOOP_SIZE;
+      
+     /* A tricky part here is that we need to mask out bytes that could have
+        been read in the previous iteration; these were already checked, and
+        in the header case they may lie before the start of the string.  */
+
+#define CROSSING_CHECK(i) \
+  if (i <= LOOP_UNROLL)							    \
+    {									    \
+      v1 = MAYBE_ALIGNED_LOAD (s1_loop - shift + (i - 1) * LSIZE);          \
+      v2 = LOAD (s2_loop - shift + (i - 1) * LSIZE);                        \
+      mask = EXPRESSION (v1, v2);                                           \
+      mask = forget_bytes (mask, shift - LSIZE * (i - 1));                  \
+      if (mask)                                                             \
+	RETURN (s1_loop - shift + (i - 1) * LSIZE,                          \
+                s2_loop - shift + (i - 1) * LSIZE,                          \
+                first_nonzero_byte (mask));       			    \
+    }
+         CROSSING_CHECK (1);
+         CROSSING_CHECK (2);
+         CROSSING_CHECK (3);
+         CROSSING_CHECK (4);
+         CROSSING_CHECK (5);
+         CROSSING_CHECK (6);
+         CROSSING_CHECK (7);
+         CROSSING_CHECK (8);
+         
+         until_cross_page = 4096 / LOOP_SIZE;
+         if (_CHECK_N (n <= s1_loop + LOOP_SIZE - shift - s1))
+           return 0;
+       }
+      until_cross_page--;
+
+      vector_int masks[9];
+      masks[0] = 0;
+
+#define MERGE_MASK(i) \
+  if (LOOP_UNROLL >= i)                                                 \
+    {                                                                   \
+      v1 = LOAD (s1_loop + (i - 1) * LSIZE);                            \
+      v2 = MAYBE_ALIGNED_LOAD (s2_loop + (i - 1) * LSIZE);              \
+      masks[i] = masks[i - 1] | EXPRESSION (v1, v2);                    \
+    }
+
+      MERGE_MASK (1);
+      MERGE_MASK (2);
+      MERGE_MASK (3);
+      MERGE_MASK (4);
+      MERGE_MASK (5);
+      MERGE_MASK (6);
+      MERGE_MASK (7);
+      MERGE_MASK (8);
+
+      if (masks[LOOP_UNROLL])
+	{
+
+          /* Here we have two possibilities depending on register pressure.
+             When you don't have enough registers, recalculating the
+             results gives the fastest loop for large inputs.  However,
+             it is likely affected by a gcc bug where gcc tries to keep
+             intermediate results, causing a spill in each iteration to speed
+             up the final iteration a bit.  To avoid that we need a compiler
+             barrier here.  */
+
+#ifdef RECALCULATE_ON_TAIL
+          asm volatile ("" : : : "memory");
+# define CHECK_MASK(i) \
+      v1 = LOAD (s1_loop + (i - 1) * LSIZE);                             \
+      v2 = MAYBE_ALIGNED_LOAD (s2_loop + (i - 1) * LSIZE);               \
+      mask = EXPRESSION (v1, v2);                                        \
+      if (mask)                                                          \
+	RETURN (s1_loop + (i - 1) * LSIZE, s2_loop + (i - 1) * LSIZE,    \
+               first_nonzero_byte (mask));       			 
+
+# else
+          /* On the other hand, when you have enough free registers you can
+             keep the intermediate ors.  When checking the masks, you know
+             that to reach mask i all previous masks must be zero, so the or
+             of the previous masks is exactly the current one.  */
+
+# define CHECK_MASK(i) \
+  if (masks[i])                                                          \
+	RETURN (s1_loop + (i - 1) * LSIZE, s2_loop + (i - 1) * LSIZE,    \
+               first_nonzero_byte (masks[i]));
+
+# endif
+
+	  CHECK_MASK (1);
+	  CHECK_MASK (2);
+	  CHECK_MASK (3);
+	  CHECK_MASK (4);
+	  CHECK_MASK (5);
+	  CHECK_MASK (6);
+	  CHECK_MASK (7);
+	  CHECK_MASK (8);
+	}
+    }
+  return 0;
+}