@@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len,
   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
-static RETURN_TYPE
+static inline RETURN_TYPE
 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
                      const unsigned char *needle, size_t needle_len)
 {
@@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
   sublinear performance is not possible. */
-static RETURN_TYPE
+__attribute__((noinline)) static RETURN_TYPE
 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
                     const unsigned char *needle, size_t needle_len)
 {
@@ -16,21 +16,12 @@
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>. */
Â
-/* This particular implementation was written by Eric Blake, 2008. */
-
 #ifndef _LIBC
 # include <config.h>
 #endif
Â
-/* Specification of strstr. */
 #include <string.h>
Â
-#include <stdbool.h>
-
-#ifndef _LIBC
-# define __builtin_expect(expr, val)Â Â (expr)
-#endif
-
 #define RETURN_TYPE char *
 #define AVAILABLE(h, h_l, j, n_l)                      \
  (((j) + (n_l) <= (h_l)) \
@@ -47,47 +38,122 @@
 #define STRSTR strstr
 #endif
Â
-/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK
-Â Â if NEEDLE is empty, otherwise NULL if NEEDLE is not found in
-  HAYSTACK. */
-char *
-STRSTR (const char *haystack, const char *needle)
+static inline char *
+strstr2 (const unsigned char *hs, const unsigned char *ne)
 {
- size_t needle_len; /* Length of NEEDLE. */
- size_t haystack_len; /* Known minimum length of HAYSTACK. */
-
- /* Handle empty NEEDLE special case. */
-Â if (needle[0] == '\0')
-Â Â Â return (char *) haystack;
+Â uint32_t h1 = (ne[0] << 16) | ne[1];
+Â uint32_t h2 = 0;
+Â for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
+Â Â Â Â Â h2 = (h2 << 16) | c;
+Â return h1 == h2 ? (char *)hs - 2 : NULL;
+}
Â
- /* Skip until we find the first matching char from NEEDLE. */
-Â haystack = strchr (haystack, needle[0]);
-Â if (haystack == NULL || needle[1] == '\0')
-Â Â Â return (char *) haystack;
+static inline char *
+strstr3 (const unsigned char *hs, const unsigned char *ne)
+{
+Â uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8);
+Â uint32_t h2 = 0;
+Â for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
+Â Â Â Â Â h2 = (h2 | c) << 8;
+Â return h1 == h2 ? (char *)hs - 3 : NULL;
+}
Â
-Â /* Ensure HAYSTACK length is at least as long as NEEDLE length.
-Â Â Â Â Since a match may occur early on in a huge HAYSTACK, use strnlen
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast strstr algorithm with guaranteed linear-time performance.
+  Small needles up to size 2 use a dedicated linear search. Longer needles
+  up to size 256 use a novel modified Horspool algorithm. It hashes pairs
+  of characters to quickly skip past mismatches. The main search loop only
+Â Â exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+  and allowing for a larger skip if there is no match. A self-adapting
+Â Â filtering check is used to quickly detect mismatches in long needles.
+Â Â By limiting the needle length to 256, the shift table can be reduced to 8
+Â Â bits per entry, lowering preprocessing overhead and minimizing cache effects.
+Â Â The limit also implies worst-case performance is linear.
+  Needles larger than 256 characters use the linear-time Two-Way algorithm. */
+char *
+STRSTR (const char *haystack, const char *needle)
+{
+Â const unsigned char *hs = (const unsigned char *) haystack;
+Â const unsigned char *ne = (const unsigned char *) needle;
+
+ /* Handle short needle special cases first. */
+Â if (ne[0] == '\0')
+Â Â Â return (char *)hs;
+Â hs = (const unsigned char *)strchr ((const char*)hs, ne[0]);
+Â if (hs == NULL || ne[1] == '\0')
+Â Â Â return (char*)hs;
+Â if (ne[2] == '\0')
+Â Â Â return strstr2 (hs, ne);
+Â if (ne[3] == '\0')
+Â Â Â return strstr3 (hs, ne);
+
+Â /* Ensure haystack length is at least as long as needle length.
+Â Â Â Â Since a match may occur early on in a huge haystack, use strnlen
     and read ahead a few cachelines for improved performance. */
-Â needle_len = strlen (needle);
-Â haystack_len = __strnlen (haystack, needle_len + 256);
-Â if (haystack_len < needle_len)
+Â size_t ne_len = strlen ((const char*)ne);
+Â size_t hs_len = __strnlen ((const char*)hs, ne_len | 512);
+Â if (hs_len < ne_len)
    return NULL;
Â
- /* Check whether we have a match. This improves performance since we avoid
-    the initialization overhead of the two-way algorithm. */
-Â if (memcmp (haystack, needle, needle_len) == 0)
-Â Â Â return (char *) haystack;
-
- /* Perform the search. Abstract memory is considered to be an array
-    of 'unsigned char' values, not an array of 'char' values. See
-    ISO C 99 section 6.2.6.1. */
-Â if (needle_len < LONG_NEEDLE_THRESHOLD)
-Â Â Â return two_way_short_needle ((const unsigned char *) haystack,
-Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â haystack_len,
-Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â (const unsigned char *) needle, needle_len);
-Â return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
-Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â (const unsigned char *) needle, needle_len);
+ /* Check whether we have a match. This improves performance since we
+    avoid initialization overheads. */
+Â if (memcmp (hs, ne, ne_len) == 0)
+Â Â Â return (char *) hs;
+
+ /* Use Two-Way algorithm for very long needles. */
+Â if (__glibc_unlikely (ne_len > 256))
+Â Â Â return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+Â const unsigned char *end = hs + hs_len - ne_len;
+Â uint8_t shift[256];
+Â size_t tmp, shift1;
+Â size_t m1 = ne_len - 1;
+Â size_t offset = 0;
+
+ /* Initialize bad character shift hash table. */
+Â memset (shift, 0, sizeof (shift));
+Â for (int i = 1; i < m1; i++)
+Â Â Â shift[hash2 (ne + i)] = i;
+Â shift1 = m1 - shift[hash2 (ne + m1)];
+Â shift[hash2 (ne + m1)] = m1;
+
+Â while (1)
+Â Â Â {
+Â Â Â Â Â if (__glibc_unlikely (hs > end))
+Â Â Â Â Â Â {
+Â Â Â Â Â Â Â Â end += __strnlen ((const char*)end + m1 + 1, 2048);
+Â Â Â Â Â Â Â Â if (hs > end)
+Â Â Â Â Â Â Â Â Â Â return NULL;
+Â Â Â Â Â Â }
+
+     /* Skip past character pairs not in the needle. */
+Â Â Â Â Â do
+Â Â Â Â Â Â {
+Â Â Â Â Â Â Â Â hs += m1;
+Â Â Â Â Â Â Â Â tmp = shift[hash2 (hs)];
+Â Â Â Â Â Â }
+Â Â Â Â Â while (hs <= end && tmp == 0);
+
+Â Â Â Â Â /* If the match is not at the end of the needle, shift to the end
+       and continue until we match the last 2 characters. */
+Â Â Â Â Â hs -= tmp;
+Â Â Â Â Â if (tmp < m1)
+Â Â Â Â Â Â continue;
+
+     /* The last 2 characters match. If the needle is long, check a
+       fixed number of characters first to quickly filter out mismatches. */
+Â Â Â Â Â if (memcmp (hs + offset, ne + offset, sizeof (int)) == 0)
+Â Â Â Â Â Â {
+Â Â Â Â Â Â Â Â if (memcmp (hs, ne, m1) == 0)
+Â Â Â Â Â Â Â Â Â Â return (void *) hs;
+
+        /* Adjust filter offset when it doesn't find the mismatch. */
+Â Â Â Â Â Â Â Â offset = (offset >= sizeof (int) ? offset : m1) - sizeof (int);
+Â Â Â Â Â Â }
+
+     /* Skip based on matching the last 2 characters. */
+Â Â Â Â Â hs += shift1;
+Â Â Â }
 }
 libc_hidden_builtin_def (strstr)
-
-#undef LONG_NEEDLE_THRESHOLD