Improve strstr performance

Message ID DB5PR08MB1030DA9F862D70A49C3FA40383780@DB5PR08MB1030.eurprd08.prod.outlook.com
State New, archived
Headers

Commit Message

Wilco Dijkstra June 11, 2018, 10:31 a.m. UTC
  Improve strstr performance.  Strstr tends to be slow because it uses
many calls to memchr and a slow byte loop to scan for the next match.
Performance is significantly improved by using strnlen on larger blocks
and using strchr to search for the next matching character.  strcasestr
can also use strnlen to scan ahead, and memmem can use memchr to check
for the next match.

On the GLIBC bench tests the overall performance gains on Cortex-A72 are:
strstr: +25%
strcasestr: +4.3%
memmem: +18%

On a 256KB dataset strstr performance improves by 67%, strcasestr by 47%.

Tested on AArch64, passes GLIBC tests.

ChangeLog:
2018-06-08  Wilco Dijkstra  <wdijkstr@arm.com>

	* string/memmem.c (FASTSEARCH): Define.	
	* string/str-two-way.h (two_way_short_needle): Minor cleanups.
	Add support for FASTSEARCH.
	* string/strcasestr.c (AVAILABLE): Use read-ahead strnlen.
	* string/strstr.c (AVAILABLE): Use read-ahead strnlen.
	(FASTSEARCH): Define.
--
  

Comments

Adhemerval Zanella Netto June 29, 2018, 7:51 p.m. UTC | #1
On 11/06/2018 07:31, Wilco Dijkstra wrote:
> Improve strstr performance.  Strstr tends to be slow because it uses
> many calls to memchr and a slow byte loop to scan for the next match.
> Performance is significantly improved by using strnlen on larger blocks
> and using strchr to search for the next matching character.  strcasestr
> can also use strnlen to scan ahead, and memmem can use memchr to check
> for the next match.
> 
> On the GLIBC bench tests the overall performance gains on Cortex-A72 are:
> strstr: +25%
> strcasestr: +4.3%
> memmem: +18%
> 
> On a 256KB dataset strstr performance improves by 67%, strcasestr by 47%.
> 
> Tested on AArch64, passes GLIBC tests.

I am checking if it is worth to keep the implementation in sync with gnulib,
since it contains some changes over the years which did not make into glibc.

I create a branch with these changes [1] along with your changes, but
results are mixed:

   - For strstr/strcasestr performance is the same for neddles larger than
     4, but faster for shorter needles.

   - For memmem performance is faster for needle of size 2, but slower for
     large haystack.

My plan is to check if we can simplify the code a bit, as gnulib one has
done; but in overall the change should be ok (modulo one fix below).

[1] https://sourceware.org/git/?p=glibc.git;a=shortlog;h=refs/heads/azanella/str-two-way-optimization

> diff --git a/string/strcasestr.c b/string/strcasestr.c
> index 90ba189790481a58eb7627eb8c396530f988f985..d54c4b56853735fb8f5ebd59419b222ca8827651 100644
> --- a/string/strcasestr.c
> +++ b/string/strcasestr.c
> @@ -37,8 +37,8 @@
>  /* Two-Way algorithm.  */
>  #define RETURN_TYPE char *
>  #define AVAILABLE(h, h_l, j, n_l)			\
> -  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
> -   && ((h_l) = (j) + (n_l)))
> +  (((j) + (n_l) <= (h_l)) || ((h_l) += strnlen ((void*)((h) + (h_l)), 512), \
> +			      (j) + (n_l) <= (h_l)))

Why did it not trigger linknamespace issues with strnlen usage? For instance:

$ cat conform/ISO/assert.h/linknamespace.out 
[initial] __assert_fail -> [libc.a(assert.o)] __dcgettext -> [libc.a(dcgettext.o)] __dcigettext -> [libc.a(dcigettext.o)] strstr -> [libc.a(strstr.o)] strnlen

On the branch I created it contains a fix for this [2].

[2] https://sourceware.org/git/?p=glibc.git;a=commit;h=6cc58a9708ff256323c34d200fd9362f25bf66f7

>  #define CHECK_EOL (1)
>  #define RET0_IF_0(a) if (!a) goto ret0
>  #define CANON_ELEMENT(c) TOLOWER (c)
> diff --git a/string/strstr.c b/string/strstr.c
> index b3b5deb673758510616d32849bcf2fc15ce941cd..346f303c7423c95de0fbcdeb1a9f31293b303372 100644
> --- a/string/strstr.c
> +++ b/string/strstr.c
> @@ -33,10 +33,11 @@
>  
>  #define RETURN_TYPE char *
>  #define AVAILABLE(h, h_l, j, n_l)			\
> -  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
> -   && ((h_l) = (j) + (n_l)))
> +  (((j) + (n_l) <= (h_l)) || ((h_l) += strnlen ((void*)((h) + (h_l)), 512), \
> +			      (j) + (n_l) <= (h_l)))
>  #define CHECK_EOL (1)
>  #define RET0_IF_0(a) if (!a) goto ret0
> +#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C))
>  #include "str-two-way.h"
>  
>  #undef strstr
>
  

Patch

diff --git a/string/memmem.c b/string/memmem.c
index c17e1cf6a63c0709df0c1c7a45410c5178b2876e..43efaa3fb718e7a22c3b4122c278720768644d0f 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -31,6 +31,7 @@ 
 
 #define RETURN_TYPE void *
 #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
+#define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N))
 #include "str-two-way.h"
 
 #undef memmem
diff --git a/string/str-two-way.h b/string/str-two-way.h
index cd2605857d9b5c432c3c2b1c5e35f55d8e9285b6..6ce6798ea4d595bb4c95335233eb92595f447a28 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -281,50 +281,50 @@  two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
     }
   else
     {
-      const unsigned char *phaystack = &haystack[suffix];
+      const unsigned char *phaystack;
       /* The comparison always starts from needle[suffix], so cache it
 	 and use an optimized first-character loop.  */
       unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]);
 
-#if CHECK_EOL
-      /* We start matching from the SUFFIX'th element, so make sure we
-	 don't hit '\0' before that.  */
-      if (haystack_len < suffix + 1
-	  && !AVAILABLE (haystack, haystack_len, 0, suffix + 1))
-	return NULL;
-#endif
-
       /* The two halves of needle are distinct; no extra memory is
 	 required, and any mismatch results in a maximal shift.  */
       period = MAX (suffix, needle_len - suffix) + 1;
       j = 0;
-      while (1
-#if !CHECK_EOL
-	     && AVAILABLE (haystack, haystack_len, j, needle_len)
-#endif
-	     )
+      while (AVAILABLE (haystack, haystack_len, j, needle_len))
 	{
 	  unsigned char haystack_char;
 	  const unsigned char *pneedle;
 
-	  /* TODO: The first-character loop can be sped up by adapting
-	     longword-at-a-time implementation of memchr/strchr.  */
-	  if (needle_suffix
+	  phaystack = &haystack[suffix + j];
+
+#ifdef FASTSEARCH
+	  if (*phaystack++ != needle_suffix)
+	    {
+	      phaystack = FASTSEARCH (phaystack, needle_suffix,
+				      haystack_len - needle_len - j);
+	      if (phaystack == NULL)
+		goto ret0;
+	      j = phaystack - &haystack[suffix];
+	      phaystack++;
+	    }
+#else
+	  while (needle_suffix
 	      != (haystack_char = CANON_ELEMENT (*phaystack++)))
 	    {
 	      RET0_IF_0 (haystack_char);
-#if !CHECK_EOL
+# if !CHECK_EOL
 	      ++j;
-#endif
-	      continue;
+	      if (!AVAILABLE (haystack, haystack_len, j, needle_len))
+		goto ret0;
+# endif
 	    }
 
-#if CHECK_EOL
+# if CHECK_EOL
 	  /* Calculate J if it wasn't kept up-to-date in the first-character
 	     loop.  */
 	  j = phaystack - &haystack[suffix] - 1;
+# endif
 #endif
-
 	  /* Scan for matches in right half.  */
 	  i = suffix + 1;
 	  pneedle = &needle[i];
@@ -338,6 +338,11 @@  two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
 		}
 	      ++i;
 	    }
+#if CHECK_EOL
+	  /* Update minimal length of haystack.  */
+	  if (phaystack > haystack + haystack_len)
+	    haystack_len = phaystack - haystack;
+#endif
 	  if (needle_len <= i)
 	    {
 	      /* Scan for matches in left half.  */
@@ -360,13 +365,6 @@  two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
 	    }
 	  else
 	    j += i - suffix + 1;
-
-#if CHECK_EOL
-	  if (!AVAILABLE (haystack, haystack_len, j, needle_len))
-	    break;
-#endif
-
-	  phaystack = &haystack[suffix + j];
 	}
     }
  ret0: __attribute__ ((unused))
diff --git a/string/strcasestr.c b/string/strcasestr.c
index 90ba189790481a58eb7627eb8c396530f988f985..d54c4b56853735fb8f5ebd59419b222ca8827651 100644
--- a/string/strcasestr.c
+++ b/string/strcasestr.c
@@ -37,8 +37,8 @@ 
 /* Two-Way algorithm.  */
 #define RETURN_TYPE char *
 #define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
+  (((j) + (n_l) <= (h_l)) || ((h_l) += strnlen ((void*)((h) + (h_l)), 512), \
+			      (j) + (n_l) <= (h_l)))
 #define CHECK_EOL (1)
 #define RET0_IF_0(a) if (!a) goto ret0
 #define CANON_ELEMENT(c) TOLOWER (c)
diff --git a/string/strstr.c b/string/strstr.c
index b3b5deb673758510616d32849bcf2fc15ce941cd..346f303c7423c95de0fbcdeb1a9f31293b303372 100644
--- a/string/strstr.c
+++ b/string/strstr.c
@@ -33,10 +33,11 @@ 
 
 #define RETURN_TYPE char *
 #define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
+  (((j) + (n_l) <= (h_l)) || ((h_l) += strnlen ((void*)((h) + (h_l)), 512), \
+			      (j) + (n_l) <= (h_l)))
 #define CHECK_EOL (1)
 #define RET0_IF_0(a) if (!a) goto ret0
+#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C))
 #include "str-two-way.h"
 
 #undef strstr