Patchwork [v2] Improve strstr performance

login
register
mail settings
Submitter Wilco Dijkstra
Date June 30, 2018, 2:30 p.m.
Message ID <DB5PR08MB10306C82FE5593353F04E18E834D0@DB5PR08MB1030.eurprd08.prod.outlook.com>
Download mbox | patch
Permalink /patch/28172/
State New
Headers show

Comments

Wilco Dijkstra - June 30, 2018, 2:30 p.m.
Version 2 uses __strnlen to avoid namespace issues.

Improve strstr performance.  Strstr tends to be slow because it uses
many calls to memchr and a slow byte loop to scan for the next match.
Performance is significantly improved by using strnlen on larger blocks
and using strchr to search for the next matching character.  strcasestr
can also use strnlen to scan ahead, and memmem can use memchr to check
for the next match.

On the GLIBC bench tests the performance gains on Cortex-A72 are:
strstr: +25%
strcasestr: +4.3%
memmem: +18%

On a 256KB dataset strstr performance improves by 67%, strcasestr by 47%.

ChangeLog:
2018-06-30  Wilco Dijkstra  <wdijkstr@arm.com>

	* benchtests/bench-strcasestr.c: Rename __strnlen to strnlen.
	* benchtests/bench-strstr.c: Likewise.
	* string/memmem.c (FASTSEARCH): Define.	
	* string/str-two-way.h (two_way_short_needle): Minor cleanups.
	Add support for FASTSEARCH.
	* string/strcasestr.c (AVAILABLE): Use read-ahead __strnlen.
	* string/strstr.c (AVAILABLE): Use read-ahead __strnlen.
	(FASTSEARCH): Define.
	* string/test-strcasestr.c: Rename __strnlen to strnlen.
	* string/test-strstr.c: Likewise.
--
Adhemerval Zanella Netto - July 3, 2018, 12:49 p.m.
On 30/06/2018 11:30, Wilco Dijkstra wrote:
> Version 2 uses __strnlen to avoid namespace issues.
> 
> Improve strstr performance.  Strstr tends to be slow because it uses
> many calls to memchr and a slow byte loop to scan for the next match.
> Performance is significantly improved by using strnlen on larger blocks
> and using strchr to search for the next matching character.  strcasestr
> can also use strnlen to scan ahead, and memmem can use memchr to check
> for the next match.
> 
> On the GLIBC bench tests the performance gains on Cortex-A72 are:
> strstr: +25%
> strcasestr: +4.3%
> memmem: +18%
> 
> On a 256KB dataset strstr performance improves by 67%, strcasestr by 47%.
> 
> ChangeLog:
> 2018-06-30  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* benchtests/bench-strcasestr.c: Rename __strnlen to strnlen.
> 	* benchtests/bench-strstr.c: Likewise.
> 	* string/memmem.c (FASTSEARCH): Define.	
> 	* string/str-two-way.h (two_way_short_needle): Minor cleanups.
> 	Add support for FASTSEARCH.
> 	* string/strcasestr.c (AVAILABLE): Use read-ahead __strnlen.
> 	* string/strstr.c (AVAILABLE): Use read-ahead __strnlen.
> 	(FASTSEARCH): Define.
> 	* string/test-strcasestr.c: Rename __strnlen to strnlen.
> 	* string/test-strstr.c: Likewise.

LGTM thanks.

> --
> diff --git a/benchtests/bench-strcasestr.c b/benchtests/bench-strcasestr.c
> index e6659ea79ec9b41427b8376f21b67dc326f60ec0..4337e0c18dbd5ed4a3743af51046da2dc6b6fe3a 100644
> --- a/benchtests/bench-strcasestr.c
> +++ b/benchtests/bench-strcasestr.c
> @@ -24,6 +24,7 @@
>  #define STRCASESTR simple_strcasestr
>  #define NO_ALIAS
>  #define __strncasecmp strncasecmp
> +#define __strnlen strnlen
>  #include "../string/strcasestr.c"
>  
>  
> diff --git a/benchtests/bench-strstr.c b/benchtests/bench-strstr.c
> index c30cd1078586fe66c9177707ba603bb6ba0476a7..a31294e3c96d80a4fd61bb5b423a825fe54d3227 100644
> --- a/benchtests/bench-strstr.c
> +++ b/benchtests/bench-strstr.c
> @@ -23,6 +23,7 @@
>  
>  #define STRSTR simple_strstr
>  #define libc_hidden_builtin_def(X)
> +#define __strnlen strnlen
>  #include "../string/strstr.c"
>  
>  
> diff --git a/string/memmem.c b/string/memmem.c
> index c17e1cf6a63c0709df0c1c7a45410c5178b2876e..43efaa3fb718e7a22c3b4122c278720768644d0f 100644
> --- a/string/memmem.c
> +++ b/string/memmem.c
> @@ -31,6 +31,7 @@
>  
>  #define RETURN_TYPE void *
>  #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
> +#define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N))
>  #include "str-two-way.h"
>  
>  #undef memmem
> diff --git a/string/str-two-way.h b/string/str-two-way.h
> index cd2605857d9b5c432c3c2b1c5e35f55d8e9285b6..523d946c59412e1f1f65b8ec3778553b83191952 100644
> --- a/string/str-two-way.h
> +++ b/string/str-two-way.h
> @@ -281,50 +281,50 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
>      }
>    else
>      {
> -      const unsigned char *phaystack = &haystack[suffix];
> +      const unsigned char *phaystack;
>        /* The comparison always starts from needle[suffix], so cache it
>  	 and use an optimized first-character loop.  */
>        unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]);
>  
> -#if CHECK_EOL
> -      /* We start matching from the SUFFIX'th element, so make sure we
> -	 don't hit '\0' before that.  */
> -      if (haystack_len < suffix + 1
> -	  && !AVAILABLE (haystack, haystack_len, 0, suffix + 1))
> -	return NULL;
> -#endif
> -
>        /* The two halves of needle are distinct; no extra memory is
>  	 required, and any mismatch results in a maximal shift.  */
>        period = MAX (suffix, needle_len - suffix) + 1;
>        j = 0;
> -      while (1
> -#if !CHECK_EOL
> -	     && AVAILABLE (haystack, haystack_len, j, needle_len)
> -#endif
> -	     )
> +      while (AVAILABLE (haystack, haystack_len, j, needle_len))
>  	{
>  	  unsigned char haystack_char;
>  	  const unsigned char *pneedle;
>  
> -	  /* TODO: The first-character loop can be sped up by adapting
> -	     longword-at-a-time implementation of memchr/strchr.  */
> -	  if (needle_suffix
> +	  phaystack = &haystack[suffix + j];
> +
> +#ifdef FASTSEARCH
> +	  if (*phaystack++ != needle_suffix)
> +	    {
> +	      phaystack = FASTSEARCH (phaystack, needle_suffix,
> +				      haystack_len - needle_len - j);
> +	      if (phaystack == NULL)
> +		goto ret0;
> +	      j = phaystack - &haystack[suffix];
> +	      phaystack++;
> +	    }
> +#else
> +	  while (needle_suffix
>  	      != (haystack_char = CANON_ELEMENT (*phaystack++)))
>  	    {
>  	      RET0_IF_0 (haystack_char);
> -#if !CHECK_EOL
> +# if !CHECK_EOL
>  	      ++j;
> -#endif
> -	      continue;
> +	      if (!AVAILABLE (haystack, haystack_len, j, needle_len))
> +		goto ret0;
> +# endif
>  	    }
>  
> -#if CHECK_EOL
> +# if CHECK_EOL
>  	  /* Calculate J if it wasn't kept up-to-date in the first-character
>  	     loop.  */
>  	  j = phaystack - &haystack[suffix] - 1;
> +# endif
>  #endif
> -
>  	  /* Scan for matches in right half.  */
>  	  i = suffix + 1;
>  	  pneedle = &needle[i];
> @@ -338,6 +338,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
>  		}
>  	      ++i;
>  	    }
> +#if CHECK_EOL
> +	  /* Update minimal length of haystack.  */
> +	  if (phaystack > haystack + haystack_len)
> +	    haystack_len = phaystack - haystack;
> +#endif
>  	  if (needle_len <= i)
>  	    {
>  	      /* Scan for matches in left half.  */
> @@ -360,13 +365,6 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
>  	    }
>  	  else
>  	    j += i - suffix + 1;
> -
> -#if CHECK_EOL
> -	  if (!AVAILABLE (haystack, haystack_len, j, needle_len))
> -	    break;
> -#endif
> -
> -	  phaystack = &haystack[suffix + j];
>  	}
>      }
>   ret0: __attribute__ ((unused))
> diff --git a/string/strcasestr.c b/string/strcasestr.c
> index 90ba189790481a58eb7627eb8c396530f988f985..5909fe3cdba88e476c7a989a020f3611bbfeb1de 100644
> --- a/string/strcasestr.c
> +++ b/string/strcasestr.c
> @@ -37,8 +37,8 @@
>  /* Two-Way algorithm.  */
>  #define RETURN_TYPE char *
>  #define AVAILABLE(h, h_l, j, n_l)			\
> -  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
> -   && ((h_l) = (j) + (n_l)))
> +  (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \
> +			      (j) + (n_l) <= (h_l)))
>  #define CHECK_EOL (1)
>  #define RET0_IF_0(a) if (!a) goto ret0
>  #define CANON_ELEMENT(c) TOLOWER (c)
> diff --git a/string/strstr.c b/string/strstr.c
> index b3b5deb673758510616d32849bcf2fc15ce941cd..265e9f310ce507ce63740cc42d8ceea1d28dab01 100644
> --- a/string/strstr.c
> +++ b/string/strstr.c
> @@ -33,10 +33,11 @@
>  
>  #define RETURN_TYPE char *
>  #define AVAILABLE(h, h_l, j, n_l)			\
> -  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
> -   && ((h_l) = (j) + (n_l)))
> +  (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \
> +			      (j) + (n_l) <= (h_l)))
>  #define CHECK_EOL (1)
>  #define RET0_IF_0(a) if (!a) goto ret0
> +#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C))
>  #include "str-two-way.h"
>  
>  #undef strstr
> diff --git a/string/test-strcasestr.c b/string/test-strcasestr.c
> index 2b0a38ea734974ec389ad206874081a8b67d8066..9b1088df54c64b82348c555e16d35cf967e19dd9 100644
> --- a/string/test-strcasestr.c
> +++ b/string/test-strcasestr.c
> @@ -25,6 +25,7 @@
>  #define STRCASESTR simple_strcasestr
>  #define NO_ALIAS
>  #define __strncasecmp strncasecmp
> +#define __strnlen strnlen
>  #include "strcasestr.c"
>  
>  
> diff --git a/string/test-strstr.c b/string/test-strstr.c
> index acf6ff8224608737701046a421faed2be9f52f68..8d99716ff39cc2c22ebebdacb5f30438f9b32ffc 100644
> --- a/string/test-strstr.c
> +++ b/string/test-strstr.c
> @@ -24,6 +24,7 @@
>  
>  #define STRSTR simple_strstr
>  #define libc_hidden_builtin_def(arg) /* nothing */
> +#define __strnlen strnlen
>  #include "strstr.c"
>  
>  
>

Patch

diff --git a/benchtests/bench-strcasestr.c b/benchtests/bench-strcasestr.c
index e6659ea79ec9b41427b8376f21b67dc326f60ec0..4337e0c18dbd5ed4a3743af51046da2dc6b6fe3a 100644
--- a/benchtests/bench-strcasestr.c
+++ b/benchtests/bench-strcasestr.c
@@ -24,6 +24,7 @@ 
 #define STRCASESTR simple_strcasestr
 #define NO_ALIAS
 #define __strncasecmp strncasecmp
+#define __strnlen strnlen
 #include "../string/strcasestr.c"
 
 
diff --git a/benchtests/bench-strstr.c b/benchtests/bench-strstr.c
index c30cd1078586fe66c9177707ba603bb6ba0476a7..a31294e3c96d80a4fd61bb5b423a825fe54d3227 100644
--- a/benchtests/bench-strstr.c
+++ b/benchtests/bench-strstr.c
@@ -23,6 +23,7 @@ 
 
 #define STRSTR simple_strstr
 #define libc_hidden_builtin_def(X)
+#define __strnlen strnlen
 #include "../string/strstr.c"
 
 
diff --git a/string/memmem.c b/string/memmem.c
index c17e1cf6a63c0709df0c1c7a45410c5178b2876e..43efaa3fb718e7a22c3b4122c278720768644d0f 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -31,6 +31,7 @@ 
 
 #define RETURN_TYPE void *
 #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
+#define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N))
 #include "str-two-way.h"
 
 #undef memmem
diff --git a/string/str-two-way.h b/string/str-two-way.h
index cd2605857d9b5c432c3c2b1c5e35f55d8e9285b6..523d946c59412e1f1f65b8ec3778553b83191952 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -281,50 +281,50 @@  two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
     }
   else
     {
-      const unsigned char *phaystack = &haystack[suffix];
+      const unsigned char *phaystack;
       /* The comparison always starts from needle[suffix], so cache it
 	 and use an optimized first-character loop.  */
       unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]);
 
-#if CHECK_EOL
-      /* We start matching from the SUFFIX'th element, so make sure we
-	 don't hit '\0' before that.  */
-      if (haystack_len < suffix + 1
-	  && !AVAILABLE (haystack, haystack_len, 0, suffix + 1))
-	return NULL;
-#endif
-
       /* The two halves of needle are distinct; no extra memory is
 	 required, and any mismatch results in a maximal shift.  */
       period = MAX (suffix, needle_len - suffix) + 1;
       j = 0;
-      while (1
-#if !CHECK_EOL
-	     && AVAILABLE (haystack, haystack_len, j, needle_len)
-#endif
-	     )
+      while (AVAILABLE (haystack, haystack_len, j, needle_len))
 	{
 	  unsigned char haystack_char;
 	  const unsigned char *pneedle;
 
-	  /* TODO: The first-character loop can be sped up by adapting
-	     longword-at-a-time implementation of memchr/strchr.  */
-	  if (needle_suffix
+	  phaystack = &haystack[suffix + j];
+
+#ifdef FASTSEARCH
+	  if (*phaystack++ != needle_suffix)
+	    {
+	      phaystack = FASTSEARCH (phaystack, needle_suffix,
+				      haystack_len - needle_len - j);
+	      if (phaystack == NULL)
+		goto ret0;
+	      j = phaystack - &haystack[suffix];
+	      phaystack++;
+	    }
+#else
+	  while (needle_suffix
 	      != (haystack_char = CANON_ELEMENT (*phaystack++)))
 	    {
 	      RET0_IF_0 (haystack_char);
-#if !CHECK_EOL
+# if !CHECK_EOL
 	      ++j;
-#endif
-	      continue;
+	      if (!AVAILABLE (haystack, haystack_len, j, needle_len))
+		goto ret0;
+# endif
 	    }
 
-#if CHECK_EOL
+# if CHECK_EOL
 	  /* Calculate J if it wasn't kept up-to-date in the first-character
 	     loop.  */
 	  j = phaystack - &haystack[suffix] - 1;
+# endif
 #endif
-
 	  /* Scan for matches in right half.  */
 	  i = suffix + 1;
 	  pneedle = &needle[i];
@@ -338,6 +338,11 @@  two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
 		}
 	      ++i;
 	    }
+#if CHECK_EOL
+	  /* Update minimal length of haystack.  */
+	  if (phaystack > haystack + haystack_len)
+	    haystack_len = phaystack - haystack;
+#endif
 	  if (needle_len <= i)
 	    {
 	      /* Scan for matches in left half.  */
@@ -360,13 +365,6 @@  two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
 	    }
 	  else
 	    j += i - suffix + 1;
-
-#if CHECK_EOL
-	  if (!AVAILABLE (haystack, haystack_len, j, needle_len))
-	    break;
-#endif
-
-	  phaystack = &haystack[suffix + j];
 	}
     }
  ret0: __attribute__ ((unused))
diff --git a/string/strcasestr.c b/string/strcasestr.c
index 90ba189790481a58eb7627eb8c396530f988f985..5909fe3cdba88e476c7a989a020f3611bbfeb1de 100644
--- a/string/strcasestr.c
+++ b/string/strcasestr.c
@@ -37,8 +37,8 @@ 
 /* Two-Way algorithm.  */
 #define RETURN_TYPE char *
 #define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
+  (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \
+			      (j) + (n_l) <= (h_l)))
 #define CHECK_EOL (1)
 #define RET0_IF_0(a) if (!a) goto ret0
 #define CANON_ELEMENT(c) TOLOWER (c)
diff --git a/string/strstr.c b/string/strstr.c
index b3b5deb673758510616d32849bcf2fc15ce941cd..265e9f310ce507ce63740cc42d8ceea1d28dab01 100644
--- a/string/strstr.c
+++ b/string/strstr.c
@@ -33,10 +33,11 @@ 
 
 #define RETURN_TYPE char *
 #define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
+  (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \
+			      (j) + (n_l) <= (h_l)))
 #define CHECK_EOL (1)
 #define RET0_IF_0(a) if (!a) goto ret0
+#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C))
 #include "str-two-way.h"
 
 #undef strstr
diff --git a/string/test-strcasestr.c b/string/test-strcasestr.c
index 2b0a38ea734974ec389ad206874081a8b67d8066..9b1088df54c64b82348c555e16d35cf967e19dd9 100644
--- a/string/test-strcasestr.c
+++ b/string/test-strcasestr.c
@@ -25,6 +25,7 @@ 
 #define STRCASESTR simple_strcasestr
 #define NO_ALIAS
 #define __strncasecmp strncasecmp
+#define __strnlen strnlen
 #include "strcasestr.c"
 
 
diff --git a/string/test-strstr.c b/string/test-strstr.c
index acf6ff8224608737701046a421faed2be9f52f68..8d99716ff39cc2c22ebebdacb5f30438f9b32ffc 100644
--- a/string/test-strstr.c
+++ b/string/test-strstr.c
@@ -24,6 +24,7 @@ 
 
 #define STRSTR simple_strstr
 #define libc_hidden_builtin_def(arg) /* nothing */
+#define __strnlen strnlen
 #include "strstr.c"