diff --git a/string/memmem.c b/string/memmem.c
index d72b8249e62a744c2d031c9ccb0157f141df641f..150777800456075d981bd589e5f00388e9aa54f0 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -15,17 +15,13 @@
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>. */
Â
-/* This particular implementation was written by Eric Blake, 2008. */
-
 #ifndef _LIBC
 # include
 #endif
Â
-/* Specification of memmem. */
 #include
Â
 #ifndef _LIBC
-# define __builtin_expect(expr, val)Â Â (expr)
 # define __memmem      memmem
 #endif
Â
@@ -36,51 +32,90 @@
Â
 #undef memmem
Â
-/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK
-Â Â if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
-  HAYSTACK. */
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Return the first occurrence of NEEDLE (NE_LEN bytes) in HAYSTACK (HS_LEN
+   bytes), HAYSTACK itself if NE_LEN is 0, or NULL if NEEDLE is not found.
+   Fast memmem algorithm with guaranteed linear-time performance.  Needles
+   of size 1 use memchr and of size 2 a dedicated linear search.  Longer
+   needles up to size 256 use a novel modified Horspool algorithm: hash2
+   hashes pairs of characters into SHIFT to quickly skip past mismatches.
+   The main search loop only exits if the hash of the last 2 characters
+   matches, avoiding needless memcmp calls and allowing a larger skip.  A
+   self-adapting filter quickly detects mismatches in long needles.  The
+   256 limit permits 8-bit SHIFT entries (less preprocessing and cache use)
+   and a linear worst case.  Larger needles use linear-time Two-Way.  */
 void *
-__memmem (const void *haystack_start, size_t haystack_len,
-Â Â Â Â Â Â Â Â const void *needle_start, size_t needle_len)
+__memmem (const void *haystack, size_t hs_len,
+Â Â Â Â Â Â Â Â const void *needle, size_t ne_len)
 {
-Â /* Abstract memory is considered to be an array of 'unsigned char' values,
-    not an array of 'char' values. See ISO C 99 section 6.2.6.1. */
-Â const unsigned char *haystack = (const unsigned char *) haystack_start;
-Â const unsigned char *needle = (const unsigned char *) needle_start;
-
-Â if (needle_len == 0)
-Â Â Â /* The first occurrence of the empty string is deemed to occur at
-      the beginning of the string. */
-Â Â Â return (void *) haystack;
-
-Â /* Sanity check, otherwise the loop might search through the whole
-    memory. */
-Â if (__glibc_unlikely (haystack_len < needle_len))
+Â const unsigned char *hs = (const unsigned char *) haystack;
+Â const unsigned char *ne = (const unsigned char *) needle;
+
+Â if (ne_len == 0)
+Â Â Â return (void *) hs;
+Â if (ne_len == 1)
+Â Â Â return (void *) memchr (hs, ne[0], hs_len);
+
+ /* Haystack shorter than needle: no match, and END below would be invalid.  */
+Â if (hs_len < ne_len)
    return NULL;
Â
-Â /* Use optimizations in memchr when possible, to reduce the search
-Â Â Â Â size of haystack using a linear algorithm with a smaller
-    coefficient. However, avoid memchr for long needles, since we
-    can often achieve sublinear performance. */
-Â if (needle_len < LONG_NEEDLE_THRESHOLD)
+Â const unsigned char *end = hs + hs_len - ne_len;
+
+Â if (ne_len == 2)  /* Slide a 2-byte window HW over HS until it equals NW.  */
+Â Â Â {
+Â Â Â Â Â uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+Â Â Â Â Â for (hs++; hs <= end && hw != nw; )
+Â Â Â Â Â Â hw = hw << 16 | *++hs;
+Â Â Â Â Â return hw == nw ? (void *)hs - 1 : NULL;  /* HS is the window's 2nd byte.  */
+Â Â Â }
+
+ /* Needles over 256 bytes do not fit the 8-bit SHIFT entries; use Two-Way.  */
+Â if (__builtin_expect (ne_len > 256, 0))
+Â Â Â return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+Â uint8_t shift[256];
+Â size_t tmp, shift1;
+Â size_t m1 = ne_len - 1;
+Â size_t offset = 0;
+
+Â memset (shift, 0, sizeof (shift));  /* 0 = pair hash absent from needle.  */
+Â for (int i = 1; i < m1; i++)  /* Keep the last position of each pair hash.  */
+Â Â Â shift[hash2 (ne + i)] = i;
+Â shift1 = m1 - shift[hash2 (ne + m1)];  /* Skip after a failed verification.  */
+Â shift[hash2 (ne + m1)] = m1;
+
+Â for ( ; hs <= end; )
    {
-Â Â Â Â Â haystack = memchr (haystack, *needle, haystack_len);
-Â Â Â Â Â if (!haystack || __builtin_expect (needle_len == 1, 0))
-Â Â Â Â Â Â return (void *) haystack;
-Â Â Â Â Â haystack_len -= haystack - (const unsigned char *) haystack_start;
-Â Â Â Â Â if (haystack_len < needle_len)
-Â Â Â Â Â Â return NULL;
-     /* Check whether we have a match. This improves performance since we
-       avoid the initialization overhead of the two-way algorithm. */
-Â Â Â Â Â if (memcmp (haystack, needle, needle_len) == 0)
-Â Â Â Â Â Â return (void *) haystack;
-Â Â Â Â Â return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+     /* Advance M1 bytes at a time while the pair hash is not in the needle.  */
+Â Â Â Â Â do
+Â Â Â Â Â Â {
+Â Â Â Â Â Â Â Â hs += m1;
+Â Â Â Â Â Â Â Â tmp = shift[hash2 (hs)];
+Â Â Â Â Â Â }
+Â Â Â Â Â while (tmp == 0 && hs <= end);
+
+Â Â Â Â Â /* TMP < M1: the pair hashes like an earlier needle position; align HS
+         with that position and rescan until the needle-end hash matches.  */
+Â Â Â Â Â hs -= tmp;
+Â Â Â Â Â if (tmp < m1)
+Â Â Â Â Â Â continue;
+
+Â Â Â Â Â if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)
+Â Â Â Â Â Â {  /* Hash matched, so comparing M1 bytes also confirms the last byte.  */
+Â Â Â Â Â Â Â Â if (memcmp (hs, ne, m1) == 0)
+Â Â Â Â Â Â Â Â Â Â return (void *) hs;
+
+        /* Filter missed the mismatch: lower OFFSET (only read if M1 > 15).  */
+Â Â Â Â Â Â Â Â offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);
+Â Â Â Â Â Â }
+
+     /* Needle-end hash matched but the needle did not: skip by SHIFT1.  */
+Â Â Â Â Â hs += shift1;
    }
-Â else
-Â Â Â return two_way_long_needle (haystack, haystack_len, needle, needle_len);
+Â return NULL;
 }
 libc_hidden_def (__memmem)
 weak_alias (__memmem, memmem)
 libc_hidden_weak (memmem)
-
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/str-two-way.h b/string/str-two-way.h
index 31c3f18fb057cdd999c3ac9e9d894a8b62a98a70..5a800e0eaf1c7505a9340a7aabd149326958df4a 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -383,7 +383,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
   sublinear performance is not possible. */
-static RETURN_TYPE
+__attribute__((noinline)) static RETURN_TYPE
 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
                     const unsigned char *needle, size_t needle_len)
 {