diff mbox

[5/*] Generic string function optimization: strcmp and strncmp

Message ID 20150528152331.GA24617@domone
State New
Headers show

Commit Message

Ondrej Bilka May 28, 2015, 3:23 p.m. UTC
This adds a platform generic optimization of strcmp and strncmp.
I adapted a x64 skeleton from strcmp. I correctly added end checks which 
I didn't do before so I will also send optimized strncmp.

 	* string/diff_skeleton.h: New file.
 	* string/strcmp.c: Use diff skeleton.
	* string/strncmp.c: Likewise.
diff mbox

Patch

diff --git a/string/diff_skeleton.h b/string/diff_skeleton.h
new file mode 100644
index 0000000..a2cdccc
--- /dev/null
+++ b/string/diff_skeleton.h
@@ -0,0 +1,181 @@ 
+/* Skeleton of *cmp string functions.
+   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <libc-internal.h>
+#include <stdint.h>
+
+#ifndef BOUND
+# define BOUND(x) 0
+#endif
+
+
+static __always_inline
+int
+found_in_long_bytes(char *s1, char *s2, char **r1, char **r2)
+{
+  const unsigned long int *lptr1 = (const unsigned long int *) s1;
+  const unsigned long int *lptr2 = (const unsigned long int *) s2;
+
+  unsigned long int mask = EXPRESSION(*lptr1, *lptr2);
+  if (mask)
+    {
+      size_t found = first_nonzero_byte (mask);
+      *r1 = s1 + found;
+      *r2 = s2 + found;
+      return 1;
+    }
+  else
+    return 0;
+}
+
+static __always_inline
+int
+diff_skeleton (char **p1, char **p2, char *end)
+{
+  unsigned long int mask;
+  const unsigned long int *lptr1, *lptr2;
+  char *s1 = *p1, *s2 = *p2;
+
+
+#if _STRING_ARCH_unaligned == 0
+  /* We don't optimize for architectures without aligned load yet.
+     Problem is that header is hot and you need different tricks 
+     with aligned load. However loop would be relatively easy by 
+     emulating loads with mix of byte shifts. */
+
+  size_t i = byte_loop(s1, s2, SIZE_MAX, end);
+  *p1 = s1 + i;
+  *p2 = s2 + i;
+  return i != SIZE_MAX;
+#endif
+
+  /* We fetch 32 bytes while not crossing page boundary. 
+     Most strings in practice are of that size and we avoid a loop.
+     This looks as best in practice, alternative below uses aligned load 
+     but is slower when string starts just few 
+     bytes before 32 byte boundary. A tradeoff is that we rarely could 
+     fetch extra cache line without needing it but this optimization 
+     does pay for that. */
+  if (!CROSS_PAGE (s1, 32) && !CROSS_PAGE (s2, 32))
+    {
+      if (found_in_long_bytes (s1 + 0 * LSIZE, s2 + 0 * LSIZE, p1, p2))
+        return 1;
+      if (found_in_long_bytes (s1 + 1 * LSIZE, s2 + 1 * LSIZE, p1, p2))
+        return 1;
+      if (found_in_long_bytes (s1 + 2 * LSIZE, s2 + 2 * LSIZE, p1, p2))
+        return 1;
+      if (found_in_long_bytes (s1 + 3 * LSIZE, s2 + 3 * LSIZE, p1, p2))
+        return 1;
+      if (sizeof (unsigned long int) == 4)
+        {
+          if (found_in_long_bytes (s1 + 4 * LSIZE, s2 + 4 * LSIZE, p1, p2))
+            return 1;
+          if (found_in_long_bytes (s1 + 5 * LSIZE, s2 + 5 * LSIZE, p1, p2))
+            return 1;
+          if (found_in_long_bytes (s1 + 6 * LSIZE, s2 + 6 * LSIZE, p1, p2))
+            return 1;
+          if (found_in_long_bytes (s1 + 7 * LSIZE, s2 + 7 * LSIZE, p1, p2))
+            return 1;
+        }
+
+      if (BOUND (s1 + 32))
+        return 0;
+    }
+  else
+    {
+      size_t i = byte_loop(s1, s2, 32, end);
+
+      if (i==SIZE_MAX)
+        return 0;
+
+      if (i < 32)
+        {
+          *p1 = s1 + i;
+	  *p2 = s2 + i;
+          return 1;
+        }    
+    }
+   /* Now we read enough bytes to start a loop.  */
+
+  char *s1_loop = PTR_ALIGN_DOWN (s1, 4 * LSIZE);
+  char *s2_loop = s2 - (s1 - s1_loop);
+  int until_cross_page = (4096 - (((uintptr_t) (s2_loop + 4 * LSIZE)) % 4096))\
+                         / (4 * LSIZE);
+
+#ifdef COALIGN_HELP
+  if (((uintptr_t)s2_loop)%32==0)
+    until_cross_page = 1000000;
+#endif
+  while (!BOUND (s1_loop + 4 * LSIZE))
+    {
+      s1_loop += 4 * LSIZE;
+      s2_loop += 4 * LSIZE;
+
+
+      if (until_cross_page == 0)
+        {
+          uintptr_t shift = ((uintptr_t) s2_loop) % (4 * LSIZE);
+          if (found_in_long_bytes (s1_loop - shift + 0 * LSIZE, 
+                                   s2_loop - shift + 0 * LSIZE, p1, p2))
+            return 1;
+          if (found_in_long_bytes (s1_loop - shift + 1 * LSIZE, 
+                                   s2_loop - shift + 1 * LSIZE, p1, p2))
+            return 1;
+          if (found_in_long_bytes (s1_loop - shift + 2 * LSIZE, 
+                                   s2_loop - shift + 2 * LSIZE, p1, p2))
+            return 1;
+          if (found_in_long_bytes (s1_loop - shift + 3 * LSIZE, 
+                                   s2_loop - shift + 3 * LSIZE, p1, p2))
+            return 1;
+
+          until_cross_page = 4096 / (4 * LSIZE);
+          if (BOUND (s1_loop + 4 * LSIZE - shift))
+            return 0;
+        }
+
+      until_cross_page--;
+
+      lptr1 = (const unsigned long int *) (s1_loop + 0 * LSIZE);
+      lptr2 = (const unsigned long int *) (s2_loop + 0 * LSIZE);
+      mask = EXPRESSION (*lptr1, *lptr2);
+      lptr1 = (const unsigned long int *) (s1_loop + 1 * LSIZE);
+      lptr2 = (const unsigned long int *) (s2_loop + 1 * LSIZE);
+      mask |= EXPRESSION (*lptr1, *lptr2);
+      lptr1 = (const unsigned long int *) (s1_loop + 2 * LSIZE);
+      lptr2 = (const unsigned long int *) (s2_loop + 2 * LSIZE);
+      mask |= EXPRESSION (*lptr1, *lptr2);
+      lptr1 = (const unsigned long int *) (s1_loop + 3 * LSIZE);
+      lptr2 = (const unsigned long int *) (s2_loop + 3 * LSIZE);
+      mask |= EXPRESSION (*lptr1, *lptr2);
+
+      if (mask)
+        {
+          if (found_in_long_bytes (s1_loop + 0 * LSIZE, s2_loop + 0 * LSIZE, p1, p2))
+            return 1;
+          if (found_in_long_bytes (s1_loop + 1 * LSIZE, s2_loop + 1 * LSIZE, p1, p2))
+            return 1;
+          if (found_in_long_bytes (s1_loop + 2 * LSIZE, s2_loop + 2 * LSIZE, p1, p2))
+            return 1;
+
+          found_in_long_bytes (s1_loop + 3 * LSIZE, s2_loop + 3 * LSIZE, p1, p2);
+          return 1;
+        }
+    }
+ return 0;
+}
diff --git a/string/strcmp.c b/string/strcmp.c
index 4d4c044..c1d4c22 100644
--- a/string/strcmp.c
+++ b/string/strcmp.c
@@ -16,28 +16,37 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
-
 #undef strcmp
 
 /* Compare S1 and S2, returning less than, equal to or
    greater than zero if S1 is lexicographically less than,
    equal to or greater than S2.  */
+
+#include "string/common.h"
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+
+static __always_inline
+size_t 
+byte_loop(char *x, char *y, size_t n, char *end)
+{
+  size_t i;
+  for (i = 0; i < n; i++)
+    if (x[i] == '\0' || x[i] != y[i])
+      return i;
+
+  return n;
+}
+
+#include "string/diff_skeleton.h"
+
 int
-strcmp (const char *p1, const char *p2)
+strcmp (const char *s1_start, const char *s2_start)
 {
-  const unsigned char *s1 = (const unsigned char *) p1;
-  const unsigned char *s2 = (const unsigned char *) p2;
-  unsigned char c1, c2;
-
-  do
-    {
-      c1 = (unsigned char) *s1++;
-      c2 = (unsigned char) *s2++;
-      if (c1 == '\0')
-	return c1 - c2;
-    }
-  while (c1 == c2);
-
-  return c1 - c2;
+  unsigned char *p1 = (unsigned char *) s1_start;
+  unsigned char *p2 = (unsigned char *) s2_start;
+
+  diff_skeleton ((char **) &p1, (char **) &p2, NULL); 
+
+  return *p1 - *p2;
 }
 libc_hidden_builtin_def (strcmp)
diff --git a/string/strncmp.c b/string/strncmp.c
index 2a1137a..4c4e9f7 100644
--- a/string/strncmp.c
+++ b/string/strncmp.c
@@ -16,59 +16,55 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
-#include <memcopy.h>
 
 #undef strncmp
 
-#ifndef STRNCMP
-#define STRNCMP strncmp
-#endif
+/* Compare S1 and S2, returning less than, equal to or
+   greater than zero if S1 is lexicographically less than,
+   equal to or greater than S2.  */
 
-/* Compare no more than N characters of S1 and S2,
-   returning less than, equal to or greater than zero
-   if S1 is lexicographically less than, equal to or
-   greater than S2.  */
-int
-STRNCMP (const char *s1, const char *s2, size_t n)
-{
-  unsigned char c1 = '\0';
-  unsigned char c2 = '\0';
+#include "string/common.h"
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+#define BOUND(p) ((uintptr_t) p >= (uintptr_t) end)
 
-  if (n >= 4)
+static __always_inline
+size_t 
+byte_loop(char *x, char *y, size_t c, char *end)
+{
+  size_t i;
+  for (i = 0; i < c; i++)
     {
-      size_t n4 = n >> 2;
-      do
-	{
-	  c1 = (unsigned char) *s1++;
-	  c2 = (unsigned char) *s2++;
-	  if (c1 == '\0' || c1 != c2)
-	    return c1 - c2;
-	  c1 = (unsigned char) *s1++;
-	  c2 = (unsigned char) *s2++;
-	  if (c1 == '\0' || c1 != c2)
-	    return c1 - c2;
-	  c1 = (unsigned char) *s1++;
-	  c2 = (unsigned char) *s2++;
-	  if (c1 == '\0' || c1 != c2)
-	    return c1 - c2;
-	  c1 = (unsigned char) *s1++;
-	  c2 = (unsigned char) *s2++;
-	  if (c1 == '\0' || c1 != c2)
-	    return c1 - c2;
-	} while (--n4 > 0);
-      n &= 3;
+      if (x + i == end)
+        return SIZE_MAX;
+      if (x[i] == '\0' || x[i] != y[i])
+        return i;
     }
 
-  while (n > 0)
+  return c;
+}
+
+#include "string/diff_skeleton.h"
+
+int
+strncmp (const char *s1_start, const char *s2_start, size_t n)
+{
+  unsigned char *p1 = (unsigned char *) s1_start;
+  unsigned char *p2 = (unsigned char *) s2_start;
+  char *end = (char *) (((uintptr_t) p1) + n);
+  if ((uintptr_t) end <= (uintptr_t) p1)
     {
-      c1 = (unsigned char) *s1++;
-      c2 = (unsigned char) *s2++;
-      if (c1 == '\0' || c1 != c2)
-	return c1 - c2;
-      n--;
+      if (n == 0)
+        return 0;
+      end = (char *) UINTPTR_MAX;
     }
 
-  return c1 - c2;
-}
+  if (!diff_skeleton ((char **) &p1, (char **) &p2, end))
+    return 0;
 
-libc_hidden_builtin_def (STRNCMP)
+  if (BOUND (p1))
+    return 0;
+
+
+  return *p1 - *p2;
+}
+libc_hidden_builtin_def (strncmp)