Improve generic strcspn performance

Message ID AM3PR08MB0088F82AB469E058650B493283F60@AM3PR08MB0088.eurprd08.prod.outlook.com
State Superseded
Headers

Commit Message

Wilco Dijkstra Jan. 8, 2016, 6:40 p.m. UTC
  Improve strcspn performance using a much faster algorithm. It is kept simple 
so it works well on most targets. It is generally at least 10 times faster 
than the existing implementation on bench-strcspn on a few AArch64 
implementations, and for some tests 100 times as fast (repeatedly calling 
strchr on a small string is extremely slow...). In fact the string/bits/string2.h
inlines make no longer sense, as GCC already uses strlen if reject is an empty
string, strchrnul is 5 times as fast as __strcspn_c1, while __strcspn_c2 and
__strcspn_c3 are slower than the strcspn main loop for large strings (though reject
length 2-4 could be special cased in the future to gain even more performance).

Built and tested on AArch64. OK for GLIBC 2.23?

ChangeLog:
2016-01-08  Wilco Dijkstra  <wdijkstr@arm.com>

	* string/strcspn.c (strcspn): Rewrite function.
	* string/bits/string2.h (strcspn): Use __builtin_strcspn.
	(__strcspn_c1) Remove inline function.
	(__strcspn_c2) Likewise.
	(__strcspn_c3) Likewise.
  

Comments

Adhemerval Zanella Jan. 8, 2016, 8:05 p.m. UTC | #1
Some comments below:

On 08-01-2016 16:40, Wilco Dijkstra wrote:
> Improve strcspn performance using a much faster algorithm. It is kept simple 
> so it works well on most targets. It is generally at least 10 times faster 
> than the existing implementation on bench-strcspn on a few AArch64 
> implementations, and for some tests 100 times as fast (repeatedly calling 
> strchr on a small string is extremely slow...). In fact the string/bits/string2.h
> inlines make no longer sense, as GCC already uses strlen if reject is an empty
> string, strchrnul is 5 times as fast as __strcspn_c1, while __strcspn_c2 and
> __strcspn_c3 are slower than the strcspn main loop for large strings (though reject
> length 2-4 could be special cased in the future to gain even more performance).
> 
> Built and tested on AArch64. OK for GLIBC 2.23?
> 
> ChangeLog:
> 2016-01-08  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* string/strcspn.c (strcspn): Rewrite function.
> 	* string/bits/string2.h (strcspn): Use __builtin_strcspn.
> 	(__strcspn_c1) Remove inline function.
> 	(__strcspn_c2) Likewise.
> 	(__strcspn_c3) Likewise.
> 
> diff --git a/string/bits/string2.h b/string/bits/string2.h
> index e18c67530ec78338c9591015f3d95c785de54726..d0926f1e7898a13852081f45d54bff5145751695 100644
> --- a/string/bits/string2.h
> +++ b/string/bits/string2.h
> @@ -199,62 +199,10 @@ extern void *__rawmemchr (const void *__s, int __c);
>  
>  /* Return the length of the initial segment of S which
>     consists entirely of characters not in REJECT.  */
> -#if !defined _HAVE_STRING_ARCH_strcspn
> -# ifndef _HAVE_STRING_ARCH_strcspn
> -#  if __GNUC_PREREQ (3, 2)
> -#   define strcspn(s, reject) \
> -  __extension__								      \
> -  ({ char __r0, __r1, __r2;						      \
> -     (__builtin_constant_p (reject) && __string2_1bptr_p (reject)	      \
> -      ? ((__builtin_constant_p (s) && __string2_1bptr_p (s))		      \
> -	 ? __builtin_strcspn (s, reject)				      \
> -	 : ((__r0 = ((const char *) (reject))[0], __r0 == '\0')		      \
> -	    ? strlen (s)						      \
> -	    : ((__r1 = ((const char *) (reject))[1], __r1 == '\0')	      \
> -	       ? __strcspn_c1 (s, __r0)					      \
> -	       : ((__r2 = ((const char *) (reject))[2], __r2 == '\0')	      \
> -		  ? __strcspn_c2 (s, __r0, __r1)			      \
> -		  : (((const char *) (reject))[3] == '\0'		      \
> -		     ? __strcspn_c3 (s, __r0, __r1, __r2)		      \
> -		     : __builtin_strcspn (s, reject))))))		      \
> -      : __builtin_strcspn (s, reject)); })
> -#  endif
> +#ifndef _HAVE_STRING_ARCH_strcspn
> +# if __GNUC_PREREQ (3, 2)
> +#  define strcspn(s, reject) __builtin_strcspn (s, reject)
>  # endif
> -
> -__STRING_INLINE size_t __strcspn_c1 (const char *__s, int __reject);
> -__STRING_INLINE size_t
> -__strcspn_c1 (const char *__s, int __reject)
> -{
> -  size_t __result = 0;
> -  while (__s[__result] != '\0' && __s[__result] != __reject)
> -    ++__result;
> -  return __result;
> -}
> -
> -__STRING_INLINE size_t __strcspn_c2 (const char *__s, int __reject1,
> -				     int __reject2);
> -__STRING_INLINE size_t
> -__strcspn_c2 (const char *__s, int __reject1, int __reject2)
> -{
> -  size_t __result = 0;
> -  while (__s[__result] != '\0' && __s[__result] != __reject1
> -	 && __s[__result] != __reject2)
> -    ++__result;
> -  return __result;
> -}
> -
> -__STRING_INLINE size_t __strcspn_c3 (const char *__s, int __reject1,
> -				     int __reject2, int __reject3);
> -__STRING_INLINE size_t
> -__strcspn_c3 (const char *__s, int __reject1, int __reject2,
> -	      int __reject3)
> -{
> -  size_t __result = 0;
> -  while (__s[__result] != '\0' && __s[__result] != __reject1
> -	 && __s[__result] != __reject2 && __s[__result] != __reject3)
> -    ++__result;
> -  return __result;
> -}
>  #endif
>  
>  
> diff --git a/string/strcspn.c b/string/strcspn.c
> index 2694d2ab0e807d0712d6b103dbdfd75ef164ebf1..cdb2df315c394889fe04b69980c63ea4ddbdb286 100644
> --- a/string/strcspn.c
> +++ b/string/strcspn.c
> @@ -26,16 +26,48 @@
>  /* Return the length of the maximum initial segment of S
>     which contains no characters from REJECT.  */
>  size_t
> -STRCSPN (const char *s, const char *reject)
> +STRCSPN (const char *str, const char *reject)
>  {
> -  size_t count = 0;
> +  unsigned char table[256];
> +  unsigned char *p, *s;
> +  unsigned int c0, c1, c2, c3;
> +  size_t count;
>  
> -  while (*s != '\0')
> -    if (strchr (reject, *s++) == NULL)
> -      ++count;
> -    else
> -      return count;
> +  if (reject[0] == '\0')
> +    return strlen (str);
> +  if (reject[1] == '\0')
> +    return __strchrnul (str, reject [0]) - str;

I am not sure how often strcspn is used with empty or one char argument to
validate this optimization in specific since it adds more branch cases for
more general inputs.

>  
> -  return count;
> +  /* Use multiple small memsets to enable inlining on most targets.  */
> +  p = memset (table, 0, 64);
> +  memset (p + 64, 0, 64);
> +  memset (p + 128, 0, 64);
> +  memset (p + 192, 0, 64);

It is unfortunate we need to use this to force inline instead to let the
compiler handle it directly (and also simplifying the code by using
c99 initializers).  I noted x86_64 does no inline, although aarch64 and
powerpc64le calls memset.  How bad is avoiding this explicit calls now
and work on compiler side to detect this aligned memset? 

> +
> +  s = (unsigned char*) reject;
> +  do
> +    p[c0 = *s++] = 1;
> +  while (c0);
> +
> +  s = (unsigned char*) str;
> +  if (p[s[0]]) return 0;
> +  if (p[s[1]]) return 1;
> +  if (p[s[2]]) return 2;
> +  if (p[s[3]]) return 3;
> +
> +  s = (unsigned char *) ((size_t)s & ~3);
> +
> +  do
> +    {
> +      s += 4;
> +      c0 = p[s[0]];
> +      c1 = p[s[1]];
> +      c2 = p[s[2]];
> +      c3 = p[s[3]];
> +    }
> +  while ((c0 | c1 | c2 | c3) == 0);
> +
> +  count = s - (unsigned char *) str;
> +  return (c0 | c1) != 0 ? count - c0 + 1 : count - c2 + 3;
>  }
>  libc_hidden_builtin_def (strcspn)
>
  
Wilco Dijkstra Jan. 18, 2016, 7:09 p.m. UTC | #2
Adhemerval Zanella Netto - Jan. 8, 2016, 8:05 p.m. wrote:
> > +  if (reject[0] == '\0')
> > +    return strlen (str);
> > +  if (reject[1] == '\0')
> > +    return __strchrnul (str, reject [0]) - str;
>
> I am not sure how often strcspn is used with empty or one char argument to
> validate this optimization in specific since it adds more branch cases for
> more general inputs.

An empty string is extremely unlikely, however one and two characters seem to
occur frequently (grep the GLIC sources for str(c)spn/strpbrk). My goal was
to get rid of the odd inlines in the headers and enable the generic C implementation
to beat the special cases by a good margin. Compared to the overhead of the
initialization of the table, these extra checks cost very little (and once you check
for a single-character string, you also need to check for an empty string).

> > -  return count;
> > +  /* Use multiple small memsets to enable inlining on most targets.  */
> > +  p = memset (table, 0, 64);
> > +  memset (p + 64, 0, 64);
> > +  memset (p + 128, 0, 64);
> > +  memset (p + 192, 0, 64);
>
> It is unfortunate we need to use this to force inline instead to let the
> compiler handle it directly (and also simplifying the code by using
> c99 initializers).  I noted x86_64 does no inline, although aarch64 and
> powerpc64le calls memset.  How bad is avoiding this explicit calls now
> and work on compiler side to detect this aligned memset?

Yes but unfortunately inlining of memset is essential to get reasonable
performance on small sizes. Eg. for sizes 30-60 the overhead of not inlining
is 25-30% on Cortex-A57.

We could maybe add a --param max-inline-memset=N option to a future GCC for 
building GLIBC (or just these files), however this doesn't help when GLIBC is
built using any current GCC versions.

Another possibility might be to write a loop with stores of size_t and build with
a huge value for max-completely-peeled-insns. Or just give up and use macros
to write out all stores explicitly...

Wilco
  

Patch

diff --git a/string/bits/string2.h b/string/bits/string2.h
index e18c67530ec78338c9591015f3d95c785de54726..d0926f1e7898a13852081f45d54bff5145751695 100644
--- a/string/bits/string2.h
+++ b/string/bits/string2.h
@@ -199,62 +199,10 @@  extern void *__rawmemchr (const void *__s, int __c);
 
 /* Return the length of the initial segment of S which
    consists entirely of characters not in REJECT.  */
-#if !defined _HAVE_STRING_ARCH_strcspn
-# ifndef _HAVE_STRING_ARCH_strcspn
-#  if __GNUC_PREREQ (3, 2)
-#   define strcspn(s, reject) \
-  __extension__								      \
-  ({ char __r0, __r1, __r2;						      \
-     (__builtin_constant_p (reject) && __string2_1bptr_p (reject)	      \
-      ? ((__builtin_constant_p (s) && __string2_1bptr_p (s))		      \
-	 ? __builtin_strcspn (s, reject)				      \
-	 : ((__r0 = ((const char *) (reject))[0], __r0 == '\0')		      \
-	    ? strlen (s)						      \
-	    : ((__r1 = ((const char *) (reject))[1], __r1 == '\0')	      \
-	       ? __strcspn_c1 (s, __r0)					      \
-	       : ((__r2 = ((const char *) (reject))[2], __r2 == '\0')	      \
-		  ? __strcspn_c2 (s, __r0, __r1)			      \
-		  : (((const char *) (reject))[3] == '\0'		      \
-		     ? __strcspn_c3 (s, __r0, __r1, __r2)		      \
-		     : __builtin_strcspn (s, reject))))))		      \
-      : __builtin_strcspn (s, reject)); })
-#  endif
+#ifndef _HAVE_STRING_ARCH_strcspn
+# if __GNUC_PREREQ (3, 2)
+#  define strcspn(s, reject) __builtin_strcspn (s, reject)
 # endif
-
-__STRING_INLINE size_t __strcspn_c1 (const char *__s, int __reject);
-__STRING_INLINE size_t
-__strcspn_c1 (const char *__s, int __reject)
-{
-  size_t __result = 0;
-  while (__s[__result] != '\0' && __s[__result] != __reject)
-    ++__result;
-  return __result;
-}
-
-__STRING_INLINE size_t __strcspn_c2 (const char *__s, int __reject1,
-				     int __reject2);
-__STRING_INLINE size_t
-__strcspn_c2 (const char *__s, int __reject1, int __reject2)
-{
-  size_t __result = 0;
-  while (__s[__result] != '\0' && __s[__result] != __reject1
-	 && __s[__result] != __reject2)
-    ++__result;
-  return __result;
-}
-
-__STRING_INLINE size_t __strcspn_c3 (const char *__s, int __reject1,
-				     int __reject2, int __reject3);
-__STRING_INLINE size_t
-__strcspn_c3 (const char *__s, int __reject1, int __reject2,
-	      int __reject3)
-{
-  size_t __result = 0;
-  while (__s[__result] != '\0' && __s[__result] != __reject1
-	 && __s[__result] != __reject2 && __s[__result] != __reject3)
-    ++__result;
-  return __result;
-}
 #endif
 
 
diff --git a/string/strcspn.c b/string/strcspn.c
index 2694d2ab0e807d0712d6b103dbdfd75ef164ebf1..cdb2df315c394889fe04b69980c63ea4ddbdb286 100644
--- a/string/strcspn.c
+++ b/string/strcspn.c
@@ -26,16 +26,48 @@ 
 /* Return the length of the maximum initial segment of S
    which contains no characters from REJECT.  */
 size_t
-STRCSPN (const char *s, const char *reject)
+STRCSPN (const char *str, const char *reject)
 {
-  size_t count = 0;
+  unsigned char table[256];
+  unsigned char *p, *s;
+  unsigned int c0, c1, c2, c3;
+  size_t count;
 
-  while (*s != '\0')
-    if (strchr (reject, *s++) == NULL)
-      ++count;
-    else
-      return count;
+  if (reject[0] == '\0')
+    return strlen (str);
+  if (reject[1] == '\0')
+    return __strchrnul (str, reject [0]) - str;
 
-  return count;
+  /* Use multiple small memsets to enable inlining on most targets.  */
+  p = memset (table, 0, 64);
+  memset (p + 64, 0, 64);
+  memset (p + 128, 0, 64);
+  memset (p + 192, 0, 64);
+
+  s = (unsigned char*) reject;
+  do
+    p[c0 = *s++] = 1;
+  while (c0);
+
+  s = (unsigned char*) str;
+  if (p[s[0]]) return 0;
+  if (p[s[1]]) return 1;
+  if (p[s[2]]) return 2;
+  if (p[s[3]]) return 3;
+
+  s = (unsigned char *) ((size_t)s & ~3);
+
+  do
+    {
+      s += 4;
+      c0 = p[s[0]];
+      c1 = p[s[1]];
+      c2 = p[s[2]];
+      c3 = p[s[3]];
+    }
+  while ((c0 | c1 | c2 | c3) == 0);
+
+  count = s - (unsigned char *) str;
+  return (c0 | c1) != 0 ? count - c0 + 1 : count - c2 + 3;
 }
 libc_hidden_builtin_def (strcspn)