sysdeps/x86_64/multiarch/memmem-avx2.c: add memmem-avx2.c
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 |
success
|
Testing passed
|
redhat-pt-bot/TryBot-32bit |
success
|
Build for i686
|
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_build--master-arm |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-arm |
success
|
Testing passed
|
Commit Message
Find the rarest byte in NE. Do a naive loop until HS is aligned. Once aligned, find
the parts of HS that match the rare byte and the byte after it, shift
back to the position of HS that should match NE, and do a memcmp.
Average timings (Core i5 8400):
__memmem_avx2 basic_memmem twoway_memmem memmem
1342.942864 19100.87074 3335.335377 2745.971856
---
sysdeps/x86_64/multiarch/memmem-avx2.c | 72 ++++++++++++++++++++++++++
1 file changed, 72 insertions(+)
create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c
Comments
On 12/15/23 23:33, James Tirta Halim wrote:
> Find the rarest byte in NE. Do a naive loop until HS is aligned. Once aligned, find
> the parts of HS that matches the rare byte and the byte after it, shift
> back to the position of HS that should match NE and do a memcmp.
James,
Please clarify your assignment status or provide DCO for these changes.
Please review the contribution checklist here:
https://sourceware.org/glibc/wiki/Contribution%20checklist
> Average timings (Core i5 8400):
> __memmem_avx2 basic_memmem twoway_memmem memmem
> 1342.942864 19100.87074 3335.335377 2745.971856
>
> ---
> sysdeps/x86_64/multiarch/memmem-avx2.c | 72 ++++++++++++++++++++++++++
> 1 file changed, 72 insertions(+)
> create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c
>
> diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c
> new file mode 100644
> index 0000000000..524d0fe45f
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c
> @@ -0,0 +1,72 @@
> +#include <immintrin.h>
> +#include <string.h>
> +#include <inttypes.h>
> +#include <libc-pointer-arith.h>
> +
> +static inline void *
> +__find_rarest_byte (const void *ne,
> + size_t n)
> +{
> + static const unsigned char rarebyte_table[256] = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, 85, 49, 50, 51 };
> + const unsigned char *rare = (const unsigned char *) ne;
> + const unsigned char *p = (const unsigned char *) ne;
> + int c_rare = rarebyte_table[*rare];
> + int c;
> + for (; n--; ++p)
> + {
> + c = rarebyte_table[*p];
> + if (c < c_rare) {
> + rare = p;
> + c_rare = c;
> + }
> + }
> + return (void *) rare;
> +}
> +
> +void *
> +__memmem_avx2 (const void *hs,
> + size_t hs_len,
> + const void *ne,
> + size_t ne_len)
> +{
> + if (ne_len == 1)
> + return (void *) memchr (hs, *(unsigned char *) ne, hs_len);
> + if (__glibc_unlikely (ne_len == 0))
> + return (void *) hs;
> + if (__glibc_unlikely (hs_len < ne_len))
> + return NULL;
> + const unsigned char *h = (const unsigned char *) hs;
> + const unsigned char *const end = h + hs_len - ne_len;
> + size_t shift = PTR_DIFF (__find_rarest_byte (ne, ne_len), ne);
> + if (shift == ne_len - 1)
> + --shift;
> + h += shift;
> + for (; !PTR_IS_ALIGNED (h, sizeof (__m256i)); ++h)
> + {
> + if (__glibc_unlikely (h - shift > end))
> + return NULL;
> + if (*h == *((unsigned char *) ne + shift) && !memcmp (h - shift, ne, ne_len))
> + return (void *) (h - shift);
> + }
> + const __m256i nv = _mm256_set1_epi8 (*((char *) ne + shift));
> + const __m256i nv1 = _mm256_set1_epi8 (*((char *) ne + shift + 1));
> + __m256i hv, hv1;
> + uint32_t i, hm0, hm1, m;
> + for (; h - shift <= end; h += sizeof (__m256i)) {
> + hv = _mm256_load_si256 ((const __m256i *) h);
> + hv1 = _mm256_loadu_si256 ((const __m256i *) (h + 1));
> + hm0 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv));
> + hm1 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv1, nv1));
> + m = hm0 & hm1;
> + while (m)
> + {
> + i = _tzcnt_u32 (m);
> + m = _blsr_u32 (m);
> + if (__glibc_unlikely (h + i - shift > end))
> + return NULL;
> + if (!memcmp (h + i - shift, ne, ne_len))
> + return (char *) h + i - shift;
> + }
> + }
> + return NULL;
> +}
On Fri, Dec 15, 2023 at 10:37 PM James Tirta Halim
<tirtajames45@gmail.com> wrote:
>
> Find the rarest byte in NE. Do a naive loop until HS is aligned. Once aligned, find
> the parts of HS that matches the rare byte and the byte after it, shift
> back to the position of HS that should match NE and do a memcmp.
>
> Average timings (Core i5 8400):
> __memmem_avx2 basic_memmem twoway_memmem memmem
> 1342.942864 19100.87074 3335.335377 2745.971856
can you attach the .out result file?
>
> ---
> sysdeps/x86_64/multiarch/memmem-avx2.c | 72 ++++++++++++++++++++++++++
> 1 file changed, 72 insertions(+)
> create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c
>
> diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c
> new file mode 100644
> index 0000000000..524d0fe45f
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c
> @@ -0,0 +1,72 @@
> +#include <immintrin.h>
> +#include <string.h>
> +#include <inttypes.h>
> +#include <libc-pointer-arith.h>
> +
> +static inline void *
> +__find_rarest_byte (const void *ne,
> + size_t n)
> +{
> + static const unsigned char rarebyte_table[256] = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, 85, 49, 50, 51 };
can you add a comment explaining how this table was generated / what it is?
> + const unsigned char *rare = (const unsigned char *) ne;
> + const unsigned char *p = (const unsigned char *) ne;
> + int c_rare = rarebyte_table[*rare];
> + int c;
> + for (; n--; ++p)
> + {
> + c = rarebyte_table[*p];
> + if (c < c_rare) {
> + rare = p;
> + c_rare = c;
> + }
> + }
> + return (void *) rare;
> +}
> +
> +void *
> +__memmem_avx2 (const void *hs,
> + size_t hs_len,
> + const void *ne,
> + size_t ne_len)
> +{
> + if (ne_len == 1)
> + return (void *) memchr (hs, *(unsigned char *) ne, hs_len);
> + if (__glibc_unlikely (ne_len == 0))
> + return (void *) hs;
> + if (__glibc_unlikely (hs_len < ne_len))
> + return NULL;
> + const unsigned char *h = (const unsigned char *) hs;
> + const unsigned char *const end = h + hs_len - ne_len;
> + size_t shift = PTR_DIFF (__find_rarest_byte (ne, ne_len), ne);
> + if (shift == ne_len - 1)
> + --shift;
> + h += shift;
> + for (; !PTR_IS_ALIGNED (h, sizeof (__m256i)); ++h)
> + {
> + if (__glibc_unlikely (h - shift > end))
> + return NULL;
> + if (*h == *((unsigned char *) ne + shift) && !memcmp (h - shift, ne, ne_len))
should be `__memcmp` or you could directly use `__memcmpeq_avx2`
(probably the fastest here).
likewise below.
> + return (void *) (h - shift);
> + }
> + const __m256i nv = _mm256_set1_epi8 (*((char *) ne + shift));
> + const __m256i nv1 = _mm256_set1_epi8 (*((char *) ne + shift + 1));
> + __m256i hv, hv1;
> + uint32_t i, hm0, hm1, m;
> + for (; h - shift <= end; h += sizeof (__m256i)) {
> + hv = _mm256_load_si256 ((const __m256i *) h);
> + hv1 = _mm256_loadu_si256 ((const __m256i *) (h + 1));
> + hm0 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv));
> + hm1 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv1, nv1));
think faster is:
m = _mm256_movemask_epi8 (_mm256_and_si256 (_mm256_cmpeq_epi8 (hv, nv),
_mm256_cmpeq_epi8 (hv1, nv1)));
> + m = hm0 & hm1;
> + while (m)
> + {
> + i = _tzcnt_u32 (m);
> + m = _blsr_u32 (m);
> + if (__glibc_unlikely (h + i - shift > end))
> + return NULL;
> + if (!memcmp (h + i - shift, ne, ne_len))
> + return (char *) h + i - shift;
> + }
> + }
> + return NULL;
> +}
> --
> 2.43.0
>
new file mode 100644
@@ -0,0 +1,72 @@
+#include <immintrin.h>
+#include <string.h>
+#include <inttypes.h>
+#include <libc-pointer-arith.h>
+
/* Return a pointer to the byte of NE (length N) that is expected to be
   the least frequent in typical input, according to a static
   frequency-rank table (lower rank = rarer; e.g. NUL ranks 0, space
   ranks 255).  Presumably the table was derived from byte-frequency
   statistics over some corpus -- TODO(review): document its origin.
   On rank ties the earliest byte of NE wins.  */
static inline void *
__find_rarest_byte (const void *ne,
		    size_t n)
{
  static const unsigned char rarebyte_table[256] = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, 85, 49, 50, 51 };
  const unsigned char *const bytes = (const unsigned char *) ne;
  /* Track the index of the best (lowest-rank) byte seen so far.  */
  size_t best_pos = 0;
  int best_rank = rarebyte_table[bytes[0]];
  size_t pos;
  for (pos = 0; pos < n; ++pos)
    {
      int rank = rarebyte_table[bytes[pos]];
      /* Strict '<' keeps the earliest byte on equal ranks.  */
      if (rank < best_rank)
	{
	  best_rank = rank;
	  best_pos = pos;
	}
    }
  return (void *) (bytes + best_pos);
}
+
/* Return a pointer to the first occurrence of the NE_LEN-byte needle NE
   inside the HS_LEN-byte haystack HS, or NULL if there is none.

   Strategy: locate the needle byte that a static frequency table ranks
   rarest, walk HS scalar-wise until 32-byte alignment, then scan 32
   bytes at a time for positions where both the rare byte and the byte
   after it match, confirming each candidate with memcmp.  */
void *
__memmem_avx2 (const void *hs,
	       size_t hs_len,
	       const void *ne,
	       size_t ne_len)
{
  /* A one-byte needle is a plain byte search.  */
  if (ne_len == 1)
    return (void *) memchr (hs, *(unsigned char *) ne, hs_len);
  /* An empty needle matches at the start of HS.  */
  if (__glibc_unlikely (ne_len == 0))
    return (void *) hs;
  if (__glibc_unlikely (hs_len < ne_len))
    return NULL;
  const unsigned char *h = (const unsigned char *) hs;
  /* Last haystack position at which a full match can still begin.  */
  const unsigned char *const end = h + hs_len - ne_len;
  /* Offset of the rarest byte within the needle.  */
  size_t shift = PTR_DIFF (__find_rarest_byte (ne, ne_len), ne);
  /* The vector loop also compares the byte at SHIFT + 1, so keep that
     index inside the needle (NE_LEN >= 2 is guaranteed here).  */
  if (shift == ne_len - 1)
    --shift;
  /* H now points at the haystack byte aligned with NE[shift].  */
  h += shift;
  /* Scalar loop until H reaches a 32-byte boundary.  */
  for (; !PTR_IS_ALIGNED (h, sizeof (__m256i)); ++h)
    {
      if (__glibc_unlikely (h - shift > end))
	return NULL;
      /* Cheap rare-byte check first, then the full comparison.  */
      if (*h == *((unsigned char *) ne + shift) && !memcmp (h - shift, ne, ne_len))
	return (void *) (h - shift);
    }
  /* Broadcast the rare byte and its successor for the SIMD compares.  */
  const __m256i nv = _mm256_set1_epi8 (*((char *) ne + shift));
  const __m256i nv1 = _mm256_set1_epi8 (*((char *) ne + shift + 1));
  __m256i hv, hv1;
  uint32_t i, hm0, hm1, m;
  for (; h - shift <= end; h += sizeof (__m256i)) {
    /* Aligned 32-byte load: stays within one page even if it reads a
       little past the logical end of the haystack.  */
    hv = _mm256_load_si256 ((const __m256i *) h);
    /* NOTE(review): this unaligned load reads 32 bytes starting at
       H + 1 and can run past the end of the haystack buffer, possibly
       crossing into an unmapped page -- needs bounds handling before
       this can be merged.  */
    hv1 = _mm256_loadu_si256 ((const __m256i *) (h + 1));
    hm0 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv));
    hm1 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv1, nv1));
    /* Bit I of M is set when both the rare byte and its successor
       match at H + I.  */
    m = hm0 & hm1;
    while (m)
      {
	i = _tzcnt_u32 (m);	/* Lowest candidate offset.  */
	m = _blsr_u32 (m);	/* Clear that bit for the next round.  */
	/* Candidates are visited in ascending order, so one past END
	   means no further match is possible.  */
	if (__glibc_unlikely (h + i - shift > end))
	  return NULL;
	if (!memcmp (h + i - shift, ne, ne_len))
	  return (char *) h + i - shift;
      }
  }
  return NULL;
}