sysdeps/x86_64/multiarch/memmem-avx2.c: add memmem-avx2.c

Message ID 20231216043334.72176-1-tirtajames45@gmail.com
State Superseded
Headers
Series sysdeps/x86_64/multiarch/memmem-avx2.c: add memmem-avx2.c |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Testing passed

Commit Message

James Tirta Halim Dec. 16, 2023, 4:33 a.m. UTC
  Find the rarest byte in NE.  Do a naive loop until HS is aligned.  Once aligned, find
the parts of HS that match the rare byte and the byte after it, shift
back to the position in HS that should match NE and do a memcmp.

Average timings (Core i5 8400):
__memmem_avx2   basic_memmem    twoway_memmem   memmem
1342.942864     19100.87074     3335.335377     2745.971856

---
 sysdeps/x86_64/multiarch/memmem-avx2.c | 72 ++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c
  

Comments

Carlos O'Donell Dec. 18, 2023, 2:12 p.m. UTC | #1
On 12/15/23 23:33, James Tirta Halim wrote:
> Find the rarest byte in NE. Do a naive loop until HS is aligned. Once aligned, find
> the parts of HS that matches the rare byte and the byte after it, shift
> back to the position of HS that should match NE and do a memcmp.

James,

Please clarify your assignment status or provide DCO for these changes.

Please review the contribution checklist here:
https://sourceware.org/glibc/wiki/Contribution%20checklist


> Average timings (Core i5 8400):
> __memmem_avx2   basic_memmem    twoway_memmem   memmem
> 1342.942864     19100.87074     3335.335377     2745.971856
> 
> ---
>  sysdeps/x86_64/multiarch/memmem-avx2.c | 72 ++++++++++++++++++++++++++
>  1 file changed, 72 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c
> 
> diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c
> new file mode 100644
> index 0000000000..524d0fe45f
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c
> @@ -0,0 +1,72 @@
> +#include <immintrin.h>
> +#include <string.h>
> +#include <inttypes.h>
> +#include <libc-pointer-arith.h>
> +
> +static inline void *
> +__find_rarest_byte (const void *ne,
> +                      size_t n)
> +{
> +  static const unsigned char rarebyte_table[256] = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, 85, 49, 50, 51 };
> +  const unsigned char *rare = (const unsigned char *)  ne;
> +  const unsigned char *p = (const unsigned char *)  ne;
> +  int c_rare = rarebyte_table[*rare];
> +  int c;
> +  for (; n--; ++p)
> +    {
> +    c = rarebyte_table[*p];
> +    if (c < c_rare) {
> +      rare = p;
> +      c_rare = c;
> +    }
> +    }
> +  return (void *) rare;
> +}
> +
> +void *
> +__memmem_avx2 (const void *hs,
> +              size_t hs_len,
> +              const void *ne,
> +              size_t ne_len)
> +{
> +  if (ne_len == 1)
> +    return (void *) memchr (hs, *(unsigned char *) ne, hs_len);
> +  if (__glibc_unlikely (ne_len == 0))
> +    return (void *) hs;
> +  if (__glibc_unlikely (hs_len < ne_len))
> +    return NULL;
> +  const unsigned char *h = (const unsigned char *) hs;
> +  const unsigned char *const end = h + hs_len - ne_len;
> +  size_t shift = PTR_DIFF (__find_rarest_byte (ne, ne_len), ne);
> +  if (shift == ne_len - 1)
> +    --shift;
> +  h += shift;
> +  for (; !PTR_IS_ALIGNED (h, sizeof (__m256i)); ++h)
> +    {
> +    if (__glibc_unlikely (h - shift > end))
> +      return NULL;
> +    if (*h == *((unsigned char *) ne + shift) && !memcmp (h - shift, ne, ne_len))
> +      return (void *) (h - shift);
> +    }
> +  const __m256i nv = _mm256_set1_epi8 (*((char *) ne + shift));
> +  const __m256i nv1 = _mm256_set1_epi8 (*((char *) ne + shift + 1));
> +  __m256i hv, hv1;
> +  uint32_t i, hm0, hm1, m;
> +  for (; h - shift <= end; h += sizeof (__m256i)) {
> +    hv = _mm256_load_si256 ((const __m256i *) h);
> +    hv1 = _mm256_loadu_si256 ((const __m256i *) (h + 1));
> +    hm0 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv));
> +    hm1 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv1, nv1));
> +    m = hm0 & hm1;
> +    while (m)
> +      {
> +      i = _tzcnt_u32 (m);
> +      m = _blsr_u32 (m);
> +      if (__glibc_unlikely (h + i - shift > end))
> +        return NULL;
> +      if (!memcmp (h + i - shift, ne, ne_len))
> +        return (char *) h + i - shift;
> +      }
> +  }
> +  return NULL;
> +}
  
Noah Goldstein Dec. 18, 2023, 5:48 p.m. UTC | #2
On Fri, Dec 15, 2023 at 10:37 PM James Tirta Halim
<tirtajames45@gmail.com> wrote:
>
> Find the rarest byte in NE. Do a naive loop until HS is aligned. Once aligned, find
> the parts of HS that matches the rare byte and the byte after it, shift
> back to the position of HS that should match NE and do a memcmp.
>
> Average timings (Core i5 8400):
> __memmem_avx2   basic_memmem    twoway_memmem   memmem
> 1342.942864     19100.87074     3335.335377     2745.971856
can you attach the .out result file?
>
> ---
>  sysdeps/x86_64/multiarch/memmem-avx2.c | 72 ++++++++++++++++++++++++++
>  1 file changed, 72 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c
>
> diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c
> new file mode 100644
> index 0000000000..524d0fe45f
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c
> @@ -0,0 +1,72 @@
> +#include <immintrin.h>
> +#include <string.h>
> +#include <inttypes.h>
> +#include <libc-pointer-arith.h>
> +
> +static inline void *
> +__find_rarest_byte (const void *ne,
> +                      size_t n)
> +{
> +  static const unsigned char rarebyte_table[256] = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, 85, 49, 50, 51 };
can you add a comment explaining how this table was generated / what it is?
> +  const unsigned char *rare = (const unsigned char *)  ne;
> +  const unsigned char *p = (const unsigned char *)  ne;
> +  int c_rare = rarebyte_table[*rare];
> +  int c;
> +  for (; n--; ++p)
> +    {
> +    c = rarebyte_table[*p];
> +    if (c < c_rare) {
> +      rare = p;
> +      c_rare = c;
> +    }
> +    }
> +  return (void *) rare;
> +}
> +
> +void *
> +__memmem_avx2 (const void *hs,
> +              size_t hs_len,
> +              const void *ne,
> +              size_t ne_len)
> +{
> +  if (ne_len == 1)
> +    return (void *) memchr (hs, *(unsigned char *) ne, hs_len);
> +  if (__glibc_unlikely (ne_len == 0))
> +    return (void *) hs;
> +  if (__glibc_unlikely (hs_len < ne_len))
> +    return NULL;
> +  const unsigned char *h = (const unsigned char *) hs;
> +  const unsigned char *const end = h + hs_len - ne_len;
> +  size_t shift = PTR_DIFF (__find_rarest_byte (ne, ne_len), ne);
> +  if (shift == ne_len - 1)
> +    --shift;
> +  h += shift;
> +  for (; !PTR_IS_ALIGNED (h, sizeof (__m256i)); ++h)
> +    {
> +    if (__glibc_unlikely (h - shift > end))
> +      return NULL;
> +    if (*h == *((unsigned char *) ne + shift) && !memcmp (h - shift, ne, ne_len))
should be `__memcmp` or you could directly use `__memcmpeq_avx2`
(probably the fastest here).
likewise below.
> +      return (void *) (h - shift);
> +    }
> +  const __m256i nv = _mm256_set1_epi8 (*((char *) ne + shift));
> +  const __m256i nv1 = _mm256_set1_epi8 (*((char *) ne + shift + 1));
> +  __m256i hv, hv1;
> +  uint32_t i, hm0, hm1, m;
> +  for (; h - shift <= end; h += sizeof (__m256i)) {
> +    hv = _mm256_load_si256 ((const __m256i *) h);
> +    hv1 = _mm256_loadu_si256 ((const __m256i *) (h + 1));
> +    hm0 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv));
> +    hm1 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv1, nv1));
think faster is:
m = _mm256_movemask_epi8(_mm256_and_si256(_mm256_cmpeq_epi8(hv, nv),
_mm256_cmpeq_epi8(hv1, nv)));
> +    m = hm0 & hm1;
> +    while (m)
> +      {
> +      i = _tzcnt_u32 (m);
> +      m = _blsr_u32 (m);
> +      if (__glibc_unlikely (h + i - shift > end))
> +        return NULL;
> +      if (!memcmp (h + i - shift, ne, ne_len))
> +        return (char *) h + i - shift;
> +      }
> +  }
> +  return NULL;
> +}
> --
> 2.43.0
>
  

Patch

diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c
new file mode 100644
index 0000000000..524d0fe45f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmem-avx2.c
@@ -0,0 +1,72 @@ 
+#include <immintrin.h>
+#include <string.h>
+#include <inttypes.h>
+#include <libc-pointer-arith.h>
+
/* Return a pointer to the "rarest" byte of the NE_LEN-byte needle NE,
   i.e. the byte expected to occur least frequently in typical input.
   Searching for a rare byte first minimizes false-positive candidate
   positions in the vectorized scan.  Ties keep the earliest position.  */
static inline void *
__find_rarest_byte (const void *ne, size_t n)
{
  /* rarebyte_table[b] is the frequency rank of byte value b: 0 means
     rarest, 255 means most common.  NOTE(review): presumably derived from
     byte-frequency statistics over some corpus — the generation method
     should be documented here (requested in review).  */
  static const unsigned char rarebyte_table[256] = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, 85, 49, 50, 51 };
  const unsigned char *rare = (const unsigned char *) ne;
  const unsigned char *p = (const unsigned char *) ne;
  int c_rare = rarebyte_table[*rare];
  int c;
  for (; n--; ++p)
    {
      c = rarebyte_table[*p];
      /* Strict < keeps the earliest occurrence on equal ranks.  */
      if (c < c_rare)
	{
	  rare = p;
	  c_rare = c;
	}
    }
  return (void *) rare;
}
+
/* Find the first occurrence of the NE_LEN-byte needle NE in the HS_LEN-byte
   haystack HS.  Returns a pointer into HS, or NULL if not found.

   Strategy: pick the rarest byte of NE (per the static frequency table),
   advance scalar-wise until HS is 32-byte aligned, then use AVX2 to find
   positions where both the rare byte and the byte after it match, and
   confirm each candidate with memcmp.

   Fix vs. the submitted patch: the vector loop previously only checked
   `h - shift <= end', so the unaligned load at H+1 could read up to ~30
   bytes past the end of HS (a potential fault at a page boundary).  The
   loop is now additionally bounded so the full [H, H+33) load window stays
   inside HS, with a scalar tail loop covering the remaining positions.  */
void *
__memmem_avx2 (const void *hs, size_t hs_len, const void *ne, size_t ne_len)
{
  if (ne_len == 1)
    return (void *) memchr (hs, *(unsigned char *) ne, hs_len);
  if (__glibc_unlikely (ne_len == 0))
    return (void *) hs;
  if (__glibc_unlikely (hs_len < ne_len))
    return NULL;
  const unsigned char *h = (const unsigned char *) hs;
  /* Last position at which a full needle can still start.  */
  const unsigned char *const end = h + hs_len - ne_len;
  /* One past the last byte of HS; vector loads must not reach this.  */
  const unsigned char *const hs_end = h + hs_len;
  size_t shift = PTR_DIFF (__find_rarest_byte (ne, ne_len), ne);
  /* The vector loop also matches the byte AFTER the rare byte, so the
     chosen position must not be the last byte of NE.  */
  if (shift == ne_len - 1)
    --shift;
  h += shift;
  /* Scalar loop until H is 32-byte aligned, enabling an aligned load in
     the main loop.  */
  for (; !PTR_IS_ALIGNED (h, sizeof (__m256i)); ++h)
    {
      if (__glibc_unlikely (h - shift > end))
	return NULL;
      if (*h == *((unsigned char *) ne + shift)
	  && !memcmp (h - shift, ne, ne_len))
	return (void *) (h - shift);
    }
  const __m256i nv = _mm256_set1_epi8 (*((char *) ne + shift));
  const __m256i nv1 = _mm256_set1_epi8 (*((char *) ne + shift + 1));
  __m256i hv, hv1;
  uint32_t i, m;
  /* Main loop: iterate only while both loads ([H, H+32) aligned and
     [H+1, H+33) unaligned) fit entirely inside HS.  */
  for (; h - shift <= end
	 && (size_t) (hs_end - h) >= sizeof (__m256i) + 1;
       h += sizeof (__m256i))
    {
      hv = _mm256_load_si256 ((const __m256i *) h);
      hv1 = _mm256_loadu_si256 ((const __m256i *) (h + 1));
      /* AND the compare results before movemask: one movemask instead of
	 two (suggested in review).  */
      m = (uint32_t) _mm256_movemask_epi8
	(_mm256_and_si256 (_mm256_cmpeq_epi8 (hv, nv),
			   _mm256_cmpeq_epi8 (hv1, nv1)));
      while (m)
	{
	  i = _tzcnt_u32 (m);
	  m = _blsr_u32 (m);
	  /* Candidates come in increasing order of I, so once one is past
	     END no later candidate can be valid either.  */
	  if (__glibc_unlikely (h + i - shift > end))
	    return NULL;
	  if (!memcmp (h + i - shift, ne, ne_len))
	    return (char *) h + i - shift;
	}
    }
  /* Scalar tail: positions near the end of HS that the vector loop could
     not inspect without over-reading.  */
  for (; h - shift <= end; ++h)
    if (*h == *((unsigned char *) ne + shift)
	&& !memcmp (h - shift, ne, ne_len))
      return (void *) (h - shift);
  return NULL;
}