[2/4] Improve generic strspn performance

Message ID 56FA8C69.8010709@linaro.org
State Superseded
Headers

Commit Message

Adhemerval Zanella March 29, 2016, 2:08 p.m. UTC
  On 29-03-2016 10:02, Wilco Dijkstra wrote:
> Adhemerval Zanella wrote:
> 
>> +  if (accept[0] == '\0')
>> +    return 0;
>> +  if (accept[1] == '\0')
>> +    { 
> 
> GCC doesn't get the static branch prediction correct for the 2nd if,
> so it would be useful to add __glibc_unlikely given that single-character
> accepts are rare.
> 
>> +  s = (unsigned char *) ((size_t)(s) & ~3);
>> +  unsigned int c0, c1, c2, c3; 
>> +  do {
>> +      s += 4;
>> +      c0 = p[s[0]];
>> +      c1 = p[s[1]];
>> +      c2 = p[s[2]];
>> +      c3 = p[s[3]];
>> +  } while ((c0 && c1 && c2 && c3) == 1);
> 
> That should use '&' rather than '&&', and '!= 0' rather than '== 1', similar
> to how I did it in strcspn.  This will use 3 AND(S) instructions and a single branch.
> 
>> +
>> +  size_t count = s - (unsigned char *) str;
>> +  return (c0 && c1) == 0 ? count - !c0 + 1 : count - !c2 + 3;
> 
> Again, c0 & c1 is better and allows CSE with the while expression above.
> Also, -!c0 + 1 is equivalent to c0, and -!c2 + 3 is equivalent to c2 + 2;
> this is simpler and faster.
> 
> Otherwise it looks good, and thanks for doing this one too!
> 
> Cheers,
> Wilco
> 

Thanks for the review.  I think this version fixes the points you noted:
  

Patch

diff --git a/string/strspn.c b/string/strspn.c
index f0635c1..15d7fa7 100644
--- a/string/strspn.c
+++ b/string/strspn.c
@@ -25,23 +25,49 @@ 
 /* Return the length of the maximum initial segment
    of S which contains only characters in ACCEPT.  */
 size_t
-STRSPN (const char *s, const char *accept)
+STRSPN (const char *str, const char *accept)
 {
-  const char *p;
-  const char *a;
-  size_t count = 0;
-
-  for (p = s; *p != '\0'; ++p)
-    {
-      for (a = accept; *a != '\0'; ++a)
-	if (*p == *a)
-	  break;
-      if (*a == '\0')
-	return count;
-      else
-	++count;
+  if (accept[0] == '\0')
+    return 0;
+  if (__glibc_unlikely (accept[1] == '\0'))
+    { 
+      const char *a = str;
+      for (; *str == *accept; str++);
+      return str - a;
     }
 
-  return count;
+  /* Use multiple small memsets to enable inlining on most targets.  */
+  unsigned char table[256];
+  unsigned char *p = memset (table, 0, 64);
+  memset (p + 64, 0, 64);
+  memset (p + 128, 0, 64);
+  memset (p + 192, 0, 64);
+
+  unsigned char *s = (unsigned char*) accept;
+  /* Unlike strcspn, the NUL terminator is not added to the table, so
+     there is no need to check whether str[i] is NUL: table['\0'] is 0,
+     which stops the loop.  */
+  do
+    p[*s++] = 1;
+  while (*s);
+
+  s = (unsigned char*) str;
+  if (!p[s[0]]) return 0;
+  if (!p[s[1]]) return 1;
+  if (!p[s[2]]) return 2;
+  if (!p[s[3]]) return 3;
+          
+  s = (unsigned char *) ((size_t)(s) & ~3);
+  unsigned int c0, c1, c2, c3; 
+  do {
+      s += 4;
+      c0 = p[s[0]];
+      c1 = p[s[1]];
+      c2 = p[s[2]];
+      c3 = p[s[3]];
+  } while ((c0 & c1 & c2 & c3) != 0);
+
+  size_t count = s - (unsigned char *) str;
+  return (c0 & c1) == 0 ? count + c0 : count + c2 + 2;
 }
 libc_hidden_builtin_def (strspn)