diff mbox

[RFC,3/3] Exploit that strchr needle is mostly ascii

Message ID 20150526181035.GA27596@domone
State RFC, archived
Headers show

Commit Message

Ondrej Bilka May 26, 2015, 6:10 p.m. UTC
I realized that strchrnul could be optimized more with exploiting
statistical properties of input. A idea is that needle is most time in
range 0-127

When we handle bytes 128-255 separately it allows considerable
simplification as we do remove false positive 128 only once instead of
twice as previously done.

Comments?


	* string/strchrnul.c: Exploit that c is most times in ascii.
diff mbox

Patch

diff --git b/string/strchrnul.c a/string/strchrnul.c
index ea91195..5019773 100644
--- b/string/strchrnul.c
+++ a/string/strchrnul.c
@@ -47,12 +47,24 @@  contains_zero (unsigned long int s)
   return ((s + add) ^ ~s) & high_bits;
 }
 
+
+/* Here idea is still use the result of expression
+   contains_zero (*p) | contains_zero (*p ^ cmask)
+   but we can optimize it. By moving or to compute
+   ((s + add) | ((s ^ cmask) + add) a highest bit is set only for characters
+   0, c, 128, c + 128
+   As we ensured that c has highest byte zero a ^ ~*p eliminates both 128 and 
+   128 + c instead doing xor twice.  */
+
+
 static always_inline
 int
 found_in_long_bytes(char *s, unsigned long int cmask, char **result)
 {
   const unsigned long int *lptr = (const unsigned long int *) s;
-  unsigned long int mask = contains_zero (*lptr) | contains_zero (*lptr ^ cmask);
+  unsigned long int mask = (((*lptr + add) | ((*lptr ^ cmask) + add)) 
+                            ^ ~*lptr) & high_bits;
+
   if (mask)
     {
       *result = s + ffsl (mask) / 8 - 1;
@@ -76,9 +88,30 @@  STRCHRNUL (const char *s_in, int c_in)
   const unsigned long int *lptr;
   char *s = (char *) s_in;
   unsigned char c = (unsigned char) c_in;
-  char *r;
+  char *r, *s_aligned;
   unsigned long int cmask = c * ones;
 
+  if (__libc_unlikely (c > 127))
+    {
+      s_aligned = PTR_ALIGN_DOWN (s, LSIZE);
+      lptr = (const unsigned long int *) s_aligned;
+      mask = (contains_zero (*lptr) 
+              | contains_zero (*lptr ^ cmask))
+             >> (8 * (s_aligned - s));
+
+      if (mask)
+        return s + ffsl (mask) / 8 - 1;
+      while (1)
+        {
+          s_aligned += LSIZE;
+          lptr = (const unsigned long int *) s_aligned;
+          mask = (contains_zero (*lptr)
+               | contains_zero (*lptr ^ cmask));
+          if (mask)       
+            return s_aligned + ffsl (mask) / 8 - 1;
+        }
+    }
+
 #if _STRING_ARCH_unaligned
   /* We fetch 32 bytes while not crossing page boundary. 
      Most strings in practice are of that size and we avoid a loop.
@@ -115,7 +148,7 @@  STRCHRNUL (const char *s_in, int c_in)
   /* We need use aligned loads. For first load we read some bytes before 
      start that we discard by shifting them down. */
  
-      char *s_aligned = PTR_ALIGN_DOWN (s, LSIZE);
+      s_aligned = PTR_ALIGN_DOWN (s, LSIZE);
       lptr = (const unsigned long int *) s_aligned;
       mask = (contains_zero (*lptr) 
               | contains_zero (*lptr ^ cmask))