sysdeps/memmem-avx2.c: add memmem-avx2.c

Message ID 20231215170315.1806024-1-tirtajames45@gmail.com
State Superseded
Headers
Series sysdeps/memmem-avx2.c: add memmem-avx2.c |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch fail Patch failed to apply to master at the time it was sent
redhat-pt-bot/TryBot-32bit fail Patch series failed to apply

Commit Message

James Tirta Halim Dec. 15, 2023, 5:03 p.m. UTC
  Find the rarest byte in NE. Do a naive loop until HS is aligned. Once
aligned, find the parts of HS that match the rare byte and the byte after
it, shift back to the position in HS that should match NE, and do a memcmp.

Average timings (Core i5 8400):
__memmem_avx2	basic_memmem	twoway_memmem	memmem
1342.942864	19100.87074	3335.335377	2745.971856

Passes make check.

---
 sysdeps/x86_64/multiarch/memmem-avx2.c | 83 ++++++++++++++++----------
 1 file changed, 50 insertions(+), 33 deletions(-)
  

Comments

Carlos O'Donell Dec. 15, 2023, 7:53 p.m. UTC | #1
On 12/15/23 12:03, James Tirta Halim wrote:
> Find the rarest byte in NE. Do a naive loop until HS is aligned. Once aligned, find
> the parts of HS that matches the rare byte and the byte after it, shift
> back to the position of HS that should match NE and do a memcmp.

Patch fails pre-commit CI -- Doesn't apply.

https://patchwork.sourceware.org/project/glibc/patch/20231215170315.1806024-1-tirtajames45@gmail.com/

This looks like it depends on the up-thread patch.

Please send patches as a series, e.g. git format-patch HEAD~1; then use git send-email.

Please review the contribution checklist:
https://sourceware.org/glibc/wiki/Contribution%20checklist

Please review Copyright and license:
https://sourceware.org/glibc/wiki/Contribution%20checklist#Copyright_and_license

This patch needs either DCO or assignment.

> Average timings (Core i5 8400):
> __memmem_avx2	basic_memmem	twoway_memmem	memmem
> 1342.942864	19100.87074	3335.335377	2745.971856
> 
> Passes make check.
> 
> ---
>  sysdeps/x86_64/multiarch/memmem-avx2.c | 83 ++++++++++++++++----------
>  1 file changed, 50 insertions(+), 33 deletions(-)
> 
> diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c
> index b0cced73aa..524d0fe45f 100644
> --- a/sysdeps/x86_64/multiarch/memmem-avx2.c
> +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c
> @@ -3,53 +3,70 @@
>  #include <inttypes.h>
>  #include <libc-pointer-arith.h>
>  
> +static inline void *
> +__find_rarest_byte (const void *ne,
> +                      size_t n)
> +{
> +  static const unsigned char rarebyte_table[256] = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, 85, 49, 50, 51 };
> +  const unsigned char *rare = (const unsigned char *)  ne;
> +  const unsigned char *p = (const unsigned char *)  ne;
> +  int c_rare = rarebyte_table[*rare];
> +  int c;
> +  for (; n--; ++p)
> +    {
> +    c = rarebyte_table[*p];
> +    if (c < c_rare) {
> +      rare = p;
> +      c_rare = c;
> +    }
> +    }
> +  return (void *) rare;
> +}
> +
>  void *
> -__memmem_avx2 (const void *hs, size_t hs_len, const void *ne, size_t ne_len)
> +__memmem_avx2 (const void *hs,
> +              size_t hs_len,
> +              const void *ne,
> +              size_t ne_len)
>  {
>    if (ne_len == 1)
>      return (void *) memchr (hs, *(unsigned char *) ne, hs_len);
>    if (__glibc_unlikely (ne_len == 0))
>      return (void *) hs;
> -  if (__glibc_unlikely (hs_len == ne_len))
> -    return !memcmp (hs, ne, ne_len) ? (void *) hs : NULL;
>    if (__glibc_unlikely (hs_len < ne_len))
>      return NULL;
> -  const __m256i nv = _mm256_set1_epi8 (*(char *) ne);
>    const unsigned char *h = (const unsigned char *) hs;
> -  const unsigned char *n = (const unsigned char *) ne;
>    const unsigned char *const end = h + hs_len - ne_len;
> -  const int c1 = *(n + 1);
> -  n += 2, ne_len -= 2;
> -  __m256i hv;
> -  uint32_t i, m;
> -  if (!PTR_IS_ALIGNED (h)) {
> -    hv = _mm256_loadu_si256 ((const __m256i *) h);
> -    m = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv));
> -    for (; m; m = _blsr_u32 (m)) {
> -      i = _tzcnt_u32 (m);
> -      if (__glibc_unlikely (h + i > end))
> -        return NULL;
> -      if (*(h + i + 1) == c1 && !memcmp (h + i + 2, n, ne_len))
> -        return (char *) h + i;
> -    }
> -    h += sizeof (__m256i);
> -    if (__glibc_unlikely (h > end))
> +  size_t shift = PTR_DIFF (__find_rarest_byte (ne, ne_len), ne);
> +  if (shift == ne_len - 1)
> +    --shift;
> +  h += shift;
> +  for (; !PTR_IS_ALIGNED (h, sizeof (__m256i)); ++h)
> +    {
> +    if (__glibc_unlikely (h - shift > end))
>        return NULL;
> -    h = (const unsigned char *) PTR_ALIGN_UP (h, sizeof (__m256i));
> -  }
> -  for (;;) {
> +    if (*h == *((unsigned char *) ne + shift) && !memcmp (h - shift, ne, ne_len))
> +      return (void *) (h - shift);
> +    }
> +  const __m256i nv = _mm256_set1_epi8 (*((char *) ne + shift));
> +  const __m256i nv1 = _mm256_set1_epi8 (*((char *) ne + shift + 1));
> +  __m256i hv, hv1;
> +  uint32_t i, hm0, hm1, m;
> +  for (; h - shift <= end; h += sizeof (__m256i)) {
>      hv = _mm256_load_si256 ((const __m256i *) h);
> -    m = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv));
> -    for (; m; m = _blsr_u32 (m)) {
> +    hv1 = _mm256_loadu_si256 ((const __m256i *) (h + 1));
> +    hm0 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv));
> +    hm1 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv1, nv1));
> +    m = hm0 & hm1;
> +    while (m)
> +      {
>        i = _tzcnt_u32 (m);
> -      if (__glibc_unlikely (h + i > end))
> +      m = _blsr_u32 (m);
> +      if (__glibc_unlikely (h + i - shift > end))
>          return NULL;
> -      if (*(h + i + 1) == c1 && !memcmp (h + i + 2, n, ne_len))
> -        return (char *) h + i;
> -    }
> -    h += sizeof (__m256i);
> -    if (__glibc_unlikely (h > end))
> -      return NULL;
> +      if (!memcmp (h + i - shift, ne, ne_len))
> +        return (char *) h + i - shift;
> +      }
>    }
>    return NULL;
>  }
  

Patch

diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c
index b0cced73aa..524d0fe45f 100644
--- a/sysdeps/x86_64/multiarch/memmem-avx2.c
+++ b/sysdeps/x86_64/multiarch/memmem-avx2.c
@@ -3,53 +3,70 @@ 
 #include <inttypes.h>
 #include <libc-pointer-arith.h>
 
+static inline void *
+__find_rarest_byte (const void *ne,
+                      size_t n)
+{
+  static const unsigned char rarebyte_table[256] = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, 85, 49, 50, 51 };
+  const unsigned char *rare = (const unsigned char *)  ne;
+  const unsigned char *p = (const unsigned char *)  ne;
+  int c_rare = rarebyte_table[*rare];
+  int c;
+  for (; n--; ++p)
+    {
+    c = rarebyte_table[*p];
+    if (c < c_rare) {
+      rare = p;
+      c_rare = c;
+    }
+    }
+  return (void *) rare;
+}
+
 void *
-__memmem_avx2 (const void *hs, size_t hs_len, const void *ne, size_t ne_len)
+__memmem_avx2 (const void *hs,
+              size_t hs_len,
+              const void *ne,
+              size_t ne_len)
 {
   if (ne_len == 1)
     return (void *) memchr (hs, *(unsigned char *) ne, hs_len);
   if (__glibc_unlikely (ne_len == 0))
     return (void *) hs;
-  if (__glibc_unlikely (hs_len == ne_len))
-    return !memcmp (hs, ne, ne_len) ? (void *) hs : NULL;
   if (__glibc_unlikely (hs_len < ne_len))
     return NULL;
-  const __m256i nv = _mm256_set1_epi8 (*(char *) ne);
   const unsigned char *h = (const unsigned char *) hs;
-  const unsigned char *n = (const unsigned char *) ne;
   const unsigned char *const end = h + hs_len - ne_len;
-  const int c1 = *(n + 1);
-  n += 2, ne_len -= 2;
-  __m256i hv;
-  uint32_t i, m;
-  if (!PTR_IS_ALIGNED (h)) {
-    hv = _mm256_loadu_si256 ((const __m256i *) h);
-    m = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv));
-    for (; m; m = _blsr_u32 (m)) {
-      i = _tzcnt_u32 (m);
-      if (__glibc_unlikely (h + i > end))
-        return NULL;
-      if (*(h + i + 1) == c1 && !memcmp (h + i + 2, n, ne_len))
-        return (char *) h + i;
-    }
-    h += sizeof (__m256i);
-    if (__glibc_unlikely (h > end))
+  size_t shift = PTR_DIFF (__find_rarest_byte (ne, ne_len), ne);
+  if (shift == ne_len - 1)
+    --shift;
+  h += shift;
+  for (; !PTR_IS_ALIGNED (h, sizeof (__m256i)); ++h)
+    {
+    if (__glibc_unlikely (h - shift > end))
       return NULL;
-    h = (const unsigned char *) PTR_ALIGN_UP (h, sizeof (__m256i));
-  }
-  for (;;) {
+    if (*h == *((unsigned char *) ne + shift) && !memcmp (h - shift, ne, ne_len))
+      return (void *) (h - shift);
+    }
+  const __m256i nv = _mm256_set1_epi8 (*((char *) ne + shift));
+  const __m256i nv1 = _mm256_set1_epi8 (*((char *) ne + shift + 1));
+  __m256i hv, hv1;
+  uint32_t i, hm0, hm1, m;
+  for (; h - shift <= end; h += sizeof (__m256i)) {
     hv = _mm256_load_si256 ((const __m256i *) h);
-    m = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv));
-    for (; m; m = _blsr_u32 (m)) {
+    hv1 = _mm256_loadu_si256 ((const __m256i *) (h + 1));
+    hm0 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv));
+    hm1 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv1, nv1));
+    m = hm0 & hm1;
+    while (m)
+      {
       i = _tzcnt_u32 (m);
-      if (__glibc_unlikely (h + i > end))
+      m = _blsr_u32 (m);
+      if (__glibc_unlikely (h + i - shift > end))
         return NULL;
-      if (*(h + i + 1) == c1 && !memcmp (h + i + 2, n, ne_len))
-        return (char *) h + i;
-    }
-    h += sizeof (__m256i);
-    if (__glibc_unlikely (h > end))
-      return NULL;
+      if (!memcmp (h + i - shift, ne, ne_len))
+        return (char *) h + i - shift;
+      }
   }
   return NULL;
 }