[1/1] x86_64: Add strstr function with 512-bit EVEX
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
dj/TryBot-32bit |
success
|
Build for i686
|
Commit Message
Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
(1) We spend a few cycles at the beginning to peek into the needle. We
locate an edge in the needle (first occurrence of 2 consecutive distinct
characters) and also store the first 64-bytes into a zmm register.
(2) We search for the edge in the haystack by looking into one cache
line of the haystack at a time. This avoids having to read past a page
boundary which can cause a seg fault.
(3) If an edge is found in the haystack we first compare the first
64-bytes of the needle (already stored in a zmm register) before we
proceed with a full string compare performed byte by byte.
Benchmarking data on ICX shows up to 2x speed up when compared to
__strstr_sse2_unaligned (including partial benchtests data from
bench-strstr.out):
|---------------------------------+---------------+-----------------------|
| | strstr_avx512 | strstr_sse2_unaligned |
|---------------------------------+---------------+-----------------------|
| Length 16384/ 16, 1/11, found: | 1939.75 | 3458.44 |
| Length 16384/ 16, 14/ 5, fail : | 1967.75 | 3541.12 |
| Length 16384/ 32, 1/11, found: | 1540.38 | 2908.25 |
| Length 16384/ 32, 14/ 5, fail : | 1345.94 | 2866.31 |
| Length 16384/ 64, 1/11, found: | 1968.81 | 4327.56 |
| Length 16384/ 64, 14/ 5, fail : | 1993.75 | 4215.69 |
| Length 16384/128, 1/11, found: | 1535.44 | 3780.56 |
| Length 16384/128, 14/ 5, fail : | 1414.75 | 3595.25 |
| Length 16384/256, 1/11, found: | 2957.75 | 5501.44 |
| Length 16384/256, 14/ 5, fail : | 2682.62 | 5099.88 |
| Length 32768/ 16, 1/11, found: | 7820.19 | 11262.9 |
| Length 32768/ 16, 14/ 5, fail : | 8196.88 | 10871.2 |
| Length 32768/ 32, 1/11, found: | 5709.19 | 6611.56 |
| Length 32768/ 32, 14/ 5, fail : | 5716.12 | 6647.06 |
| Length 32768/ 64, 1/11, found: | 7160.44 | 10143.7 |
| Length 32768/ 64, 14/ 5, fail : | 7021.38 | 10150.6 |
| Length 32768/128, 1/11, found: | 4935.31 | 6756.56 |
| Length 32768/128, 14/ 5, fail : | 4774.38 | 6746.19 |
| Length 32768/256, 1/11, found: | 7933.19 | 12563.8 |
| Length 32768/256, 14/ 5, fail : | 7975 | 12558.6 |
| Length 65536/ 16, 1/11, found: | 9066.69 | 9419.62 |
| Length 65536/ 16, 14/ 5, fail : | 8496 | 9384.75 |
| Length 65536/ 32, 1/11, found: | 10258.8 | 11192.4 |
| Length 65536/ 32, 14/ 5, fail : | 8712.12 | 11172.3 |
| Length 65536/ 64, 1/11, found: | 11085.2 | 18162.1 |
| Length 65536/ 64, 14/ 5, fail : | 11219.6 | 17921.5 |
| Length 65536/128, 1/11, found: | 9753.56 | 18704.6 |
| Length 65536/128, 14/ 5, fail : | 9588.81 | 18465.6 |
| Length 65536/256, 1/11, found: | 18333.3 | 28505.2 |
| Length 65536/256, 14/ 5, fail : | 18018.8 | 27990.8 |
|---------------------------------+---------------+-----------------------|
---
sysdeps/x86_64/multiarch/Makefile | 2 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
sysdeps/x86_64/multiarch/strstr-avx512.c | 208 +++++++++++++++++++++
sysdeps/x86_64/multiarch/strstr.c | 24 ++-
4 files changed, 236 insertions(+), 4 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
Comments
On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
>
> (1) We spend a few cycles at the begining to peek into the needle. We
> locate an edge in the needle (first occurance of 2 consequent distinct
> characters) and also store the first 64-bytes into a zmm register.
>
> (2) We search for the edge in the haystack by looking into one cache
> line of the haystack at a time. This avoids having to read past a page
> boundary which can cause a seg fault.
>
> (3) If an edge is found in the haystack we first compare the first
> 64-bytes of the needle (already stored in a zmm register) before we
> proceed with a full string compare performed byte by byte.
>
> Benchmarking data on ICX shows upto 2x speed up when compared to
> __strstr_sse2_unaligned (including partial benchtests data from
> bench-strstr.out):
>
> |---------------------------------+---------------+-----------------------|
> | | strstr_avx512 | strstr_sse2_unaligned |
> |---------------------------------+---------------+-----------------------|
> | Length 16384/ 16, 1/11, found: | 1939.75 | 3458.44 |
> | Length 16384/ 16, 14/ 5, fail : | 1967.75 | 3541.12 |
> | Length 16384/ 32, 1/11, found: | 1540.38 | 2908.25 |
> | Length 16384/ 32, 14/ 5, fail : | 1345.94 | 2866.31 |
> | Length 16384/ 64, 1/11, found: | 1968.81 | 4327.56 |
> | Length 16384/ 64, 14/ 5, fail : | 1993.75 | 4215.69 |
> | Length 16384/128, 1/11, found: | 1535.44 | 3780.56 |
> | Length 16384/128, 14/ 5, fail : | 1414.75 | 3595.25 |
> | Length 16384/256, 1/11, found: | 2957.75 | 5501.44 |
> | Length 16384/256, 14/ 5, fail : | 2682.62 | 5099.88 |
> | Length 32768/ 16, 1/11, found: | 7820.19 | 11262.9 |
> | Length 32768/ 16, 14/ 5, fail : | 8196.88 | 10871.2 |
> | Length 32768/ 32, 1/11, found: | 5709.19 | 6611.56 |
> | Length 32768/ 32, 14/ 5, fail : | 5716.12 | 6647.06 |
> | Length 32768/ 64, 1/11, found: | 7160.44 | 10143.7 |
> | Length 32768/ 64, 14/ 5, fail : | 7021.38 | 10150.6 |
> | Length 32768/128, 1/11, found: | 4935.31 | 6756.56 |
> | Length 32768/128, 14/ 5, fail : | 4774.38 | 6746.19 |
> | Length 32768/256, 1/11, found: | 7933.19 | 12563.8 |
> | Length 32768/256, 14/ 5, fail : | 7975 | 12558.6 |
> | Length 65536/ 16, 1/11, found: | 9066.69 | 9419.62 |
> | Length 65536/ 16, 14/ 5, fail : | 8496 | 9384.75 |
> | Length 65536/ 32, 1/11, found: | 10258.8 | 11192.4 |
> | Length 65536/ 32, 14/ 5, fail : | 8712.12 | 11172.3 |
> | Length 65536/ 64, 1/11, found: | 11085.2 | 18162.1 |
> | Length 65536/ 64, 14/ 5, fail : | 11219.6 | 17921.5 |
> | Length 65536/128, 1/11, found: | 9753.56 | 18704.6 |
> | Length 65536/128, 14/ 5, fail : | 9588.81 | 18465.6 |
> | Length 65536/256, 1/11, found: | 18333.3 | 28505.2 |
> | Length 65536/256, 14/ 5, fail : | 18018.8 | 27990.8 |
> |---------------------------------+---------------+-----------------------|
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
> sysdeps/x86_64/multiarch/strstr-avx512.c | 208 +++++++++++++++++++++
> sysdeps/x86_64/multiarch/strstr.c | 24 ++-
> 4 files changed, 236 insertions(+), 4 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e7b413edad..6dc54a7265 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -126,6 +126,7 @@ sysdep_routines += \
> strrchr-sse2 \
> strspn-c \
> strspn-sse2 \
> + strstr-avx512 \
> strstr-sse2-unaligned \
> varshift \
> # sysdep_routines
> @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
> CFLAGS-strcspn-c.c += -msse4
> CFLAGS-strpbrk-c.c += -msse4
> CFLAGS-strspn-c.c += -msse4
> +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
> endif
>
> ifeq ($(subdir),wcsmbs)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a594f4176e..cc9a7eaaa1 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strstr.c. */
> IFUNC_IMPL (i, name, strstr,
> + IFUNC_IMPL_ADD (array, i, strstr,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (AVX512DQ)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __strstr_avx512)
> IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
>
> diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> new file mode 100644
> index 0000000000..4082a75a1b
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> @@ -0,0 +1,208 @@
> +/* strstr optimized with 512-bit AVX-512 instructions
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <immintrin.h>
> +#include <inttypes.h>
> +#include <stdbool.h>
> +#include <string.h>
> +
> +#define FULL_MMASK64 0xffffffffffffffff
> +#define ONE_64BIT 0x1ull
> +#define ZMM_SIZE_IN_BYTES 64
> +
> +/*
> + Returns the index of the first edge within the needle, returns 0 if no edge
> + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> + */
> +static inline size_t
> +find_edge_in_needle (const char *ned)
> +{
> + size_t ind = 0;
> + while (ned[ind + 1] != '\0')
> + {
> + if (ned[ind] != ned[ind + 1])
> + return ind;
> + else
> + ind = ind + 1;
> + }
> + return 0;
> +}
> +
> +/*
> + Compare needle with haystack byte by byte at specified location
> + */
> +static inline bool
> +verify_string_match (const char *hay, const size_t hay_index, const char *ned,
> + size_t ind)
> +{
> + while (ned[ind] != '\0')
> + {
strcmp? (you might be able to use memcmp which will be faster
but will need a bit of refactor to keep true nedlen and check for page
cross on hay)
> + if (ned[ind] != hay[hay_index + ind])
> + return false;
> + ind = ind + 1;
> + }
> + return true;
> +}
> +
> +/*
> + Compare needle with haystack at specified location. The first 64 bytes are
> + compared using a ZMM register.
> + */
> +static inline bool
> +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> + const char *ned, const __mmask64 ned_mask,
> + const __m512i ned_zmm)
> +{
> + /* check first 64 bytes using zmm and then scalar */
> + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
> + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
> + if (match != 0x0) // failed the first few chars
> + return false;
> + else if (ned_mask == FULL_MMASK64)
> + return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
> + return true;
> +}
> +
> +char *
> +__strstr_avx512 (const char *haystack, const char *ned)
> +{
> + char first = ned[0];
> + if (first == '\0')
> + return (char *)haystack;
> + if (ned[1] == '\0')
> + return (char *)strchr (haystack, ned[0]);
> +
> + size_t edge = find_edge_in_needle (ned);
> +
> + /* ensure haystack is as long as the pos of edge in needle */
> + for (int ii = 0; ii < edge; ++ii)
> + {
strnlen
> + if (haystack[ii] == '\0')
> + return NULL;
> + }
> +
> + const __m512i null = _mm512_setzero_si512 (); // '\0'
> +
> + /*
> + Load 64 bytes of the needle and save it to a zmm register
> + Read one cache line at a time to avoid loading across a page boundary
> + */
> + __mmask64 ned_load_mask
> + = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
FULL_MMASK64 >> (((-(uintptr_t)ned) & 63));
> + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
Maybe make this conditional on the (highly unlikely) page cross; the
masked load is very expensive if it causes a page walk
> + __mmask64 ned_nullmask
> + = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm, null);
_mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm)
likewise at all other compares with null unless it breaks
microfusion more than once.
If you can replace all then get rid of null
> + if (__glibc_unlikely (ned_nullmask == 0x0))
> + {
> + ned_zmm = _mm512_loadu_si512 (ned);
> + ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> + if (ned_nullmask != 0x0)
> + ned_load_mask = ned_load_mask >> 1;
> + }
> + else
> + {
> + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> + ned_load_mask = ned_load_mask >> 1;
I think you can get away with just ned_load_mask =
ned_nullmask - ONE_64BIT because you only use this after
checking haystack no null-term
> + }
> + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
> + const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> +
> + /*
> + Read the bytes of haystack in the current cache line
> + */
> + size_t hay_index = edge;
> + __mmask64 loadmask = _bzhi_u64 (
> + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> + /* First load is a partial cache line */
> + __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> + /* Search for NULL and compare only till null char */
> + __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask, hay0, null);
> + __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> + cmpmask = _kand_mask64 (cmpmask, loadmask);
nullmask ^ (nullmask - ONE_64BIT); codegen ends up actually
using kand_mask here. Since loadmask and nullmask both go through
GPR (nullmask for the blsmsk) you can do this explicitly in uint64_t
to help GCC out.
> + /* Search for the 2 charaters of needle */
> + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> + k1 = _kshiftri_mask64 (k1, 1);
> + /* k2 masks tell us if both chars from needle match */
> + uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> + /* For every match, search for the entire needle for a full match */
> + while (k2)
> + {
> + uint64_t bitcount = _tzcnt_u64(k2);
> + k2 = _blsr_u64(k2);
> + size_t match_pos = hay_index + bitcount - edge;
> + if (nullmask == 0)
> + {
> + if (verify_string_match_avx512 (haystack, match_pos, ned,
> + ned_load_mask, ned_zmm))
> + return (char *)haystack + match_pos;
> + }
> + else
> + {
> + if (verify_string_match (haystack, match_pos, ned, 0))
> + return (char *)haystack + match_pos;
> + }
> + }
> + /* We haven't checked for potential match at the last char yet */
> + hay_index += _mm_popcnt_u64 (loadmask) - 1;
hay_index = 0; haystack |= 63; You might want to check codegen and
ensure hay_index is being optimized out. AFAICT you just need a
pointer.
> +
> + /*
> + Loop over one cache line at a time to prevent reading over page
> + boundary
> + */
> + __m512i hay1;
> + while (nullmask == 0)
> + {
> + hay0 = _mm512_loadu_si512 (haystack + hay_index);
> + hay1 = _mm512_load_si512 (haystack + hay_index
> + + 1); // Always 64 byte aligned
Is this really faster than using kshiftri?
> + nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> + /* Compare only till null char */
> + cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> + /* k2 masks tell us if both chars from needle match */
> + k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> + /* For every match, compare full strings for potential match */
> + while (k2)
> + {
> + uint64_t bitcount = _tzcnt_u64(k2);
> + k2 = _blsr_u64(k2);
> + size_t match_pos = hay_index + bitcount - edge;
> + if (nullmask == 0)
> + {
> + /*
> + Since the haystack doesn't terminate at the current cache
> + line, we can use zmm register to compare the first 64 bytes
> + */
> + if (verify_string_match_avx512 (haystack, match_pos, ned,
> + ned_load_mask, ned_zmm))
> + return (char *)haystack + match_pos;
> + }
> + else
> + {
> + /* Compare byte by byte */
> + if (verify_string_match (haystack, match_pos, ned, 0))
> + return (char *)haystack + match_pos;
> + }
> + }
> + hay_index += ZMM_SIZE_IN_BYTES;
> + }
> + return NULL;
> +}
> diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
> index 95600a9de5..2fb8b169b6 100644
> --- a/sysdeps/x86_64/multiarch/strstr.c
> +++ b/sysdeps/x86_64/multiarch/strstr.c
> @@ -35,16 +35,32 @@
>
> extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
> extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
> +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
>
> #include "init-arch.h"
>
> /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> ifunc symbol properly. */
> extern __typeof (__redirect_strstr) __libc_strstr;
> -libc_ifunc (__libc_strstr,
> - HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> - ? __strstr_sse2_unaligned
> - : __strstr_sse2)
>
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> + const struct cpu_features *cpu_features = __get_cpu_features ();
> +
> + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> + return __strstr_avx512;
> +
> + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> + return __strstr_sse2_unaligned;
> +
> + return __strstr_sse2;
> +}
> +
> +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
> #undef strstr
> strong_alias (__libc_strstr, strstr)
> --
> 2.36.1
>
On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
>
> (1) We spend a few cycles at the begining to peek into the needle. We
> locate an edge in the needle (first occurance of 2 consequent distinct
> characters) and also store the first 64-bytes into a zmm register.
>
> (2) We search for the edge in the haystack by looking into one cache
> line of the haystack at a time. This avoids having to read past a page
> boundary which can cause a seg fault.
>
> (3) If an edge is found in the haystack we first compare the first
> 64-bytes of the needle (already stored in a zmm register) before we
> proceed with a full string compare performed byte by byte.
>
> Benchmarking data on ICX shows upto 2x speed up when compared to
> __strstr_sse2_unaligned (including partial benchtests data from
> bench-strstr.out):
>
> |---------------------------------+---------------+-----------------------|
> | | strstr_avx512 | strstr_sse2_unaligned |
> |---------------------------------+---------------+-----------------------|
> | Length 16384/ 16, 1/11, found: | 1939.75 | 3458.44 |
> | Length 16384/ 16, 14/ 5, fail : | 1967.75 | 3541.12 |
> | Length 16384/ 32, 1/11, found: | 1540.38 | 2908.25 |
> | Length 16384/ 32, 14/ 5, fail : | 1345.94 | 2866.31 |
> | Length 16384/ 64, 1/11, found: | 1968.81 | 4327.56 |
> | Length 16384/ 64, 14/ 5, fail : | 1993.75 | 4215.69 |
> | Length 16384/128, 1/11, found: | 1535.44 | 3780.56 |
> | Length 16384/128, 14/ 5, fail : | 1414.75 | 3595.25 |
> | Length 16384/256, 1/11, found: | 2957.75 | 5501.44 |
> | Length 16384/256, 14/ 5, fail : | 2682.62 | 5099.88 |
> | Length 32768/ 16, 1/11, found: | 7820.19 | 11262.9 |
> | Length 32768/ 16, 14/ 5, fail : | 8196.88 | 10871.2 |
> | Length 32768/ 32, 1/11, found: | 5709.19 | 6611.56 |
> | Length 32768/ 32, 14/ 5, fail : | 5716.12 | 6647.06 |
> | Length 32768/ 64, 1/11, found: | 7160.44 | 10143.7 |
> | Length 32768/ 64, 14/ 5, fail : | 7021.38 | 10150.6 |
> | Length 32768/128, 1/11, found: | 4935.31 | 6756.56 |
> | Length 32768/128, 14/ 5, fail : | 4774.38 | 6746.19 |
> | Length 32768/256, 1/11, found: | 7933.19 | 12563.8 |
> | Length 32768/256, 14/ 5, fail : | 7975 | 12558.6 |
> | Length 65536/ 16, 1/11, found: | 9066.69 | 9419.62 |
> | Length 65536/ 16, 14/ 5, fail : | 8496 | 9384.75 |
> | Length 65536/ 32, 1/11, found: | 10258.8 | 11192.4 |
> | Length 65536/ 32, 14/ 5, fail : | 8712.12 | 11172.3 |
> | Length 65536/ 64, 1/11, found: | 11085.2 | 18162.1 |
> | Length 65536/ 64, 14/ 5, fail : | 11219.6 | 17921.5 |
> | Length 65536/128, 1/11, found: | 9753.56 | 18704.6 |
> | Length 65536/128, 14/ 5, fail : | 9588.81 | 18465.6 |
> | Length 65536/256, 1/11, found: | 18333.3 | 28505.2 |
> | Length 65536/256, 14/ 5, fail : | 18018.8 | 27990.8 |
> |---------------------------------+---------------+-----------------------|
Can you add an aggregate geomean across all benchmarks for sse2 / evex512?
Also, can you include the full set of numbers in the email?
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
> sysdeps/x86_64/multiarch/strstr-avx512.c | 208 +++++++++++++++++++++
> sysdeps/x86_64/multiarch/strstr.c | 24 ++-
> 4 files changed, 236 insertions(+), 4 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e7b413edad..6dc54a7265 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -126,6 +126,7 @@ sysdep_routines += \
> strrchr-sse2 \
> strspn-c \
> strspn-sse2 \
> + strstr-avx512 \
> strstr-sse2-unaligned \
> varshift \
> # sysdep_routines
> @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
> CFLAGS-strcspn-c.c += -msse4
> CFLAGS-strpbrk-c.c += -msse4
> CFLAGS-strspn-c.c += -msse4
> +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
> endif
>
> ifeq ($(subdir),wcsmbs)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a594f4176e..cc9a7eaaa1 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strstr.c. */
> IFUNC_IMPL (i, name, strstr,
> + IFUNC_IMPL_ADD (array, i, strstr,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (AVX512DQ)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __strstr_avx512)
> IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
>
> diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> new file mode 100644
> index 0000000000..4082a75a1b
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> @@ -0,0 +1,208 @@
> +/* strstr optimized with 512-bit AVX-512 instructions
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <immintrin.h>
> +#include <inttypes.h>
> +#include <stdbool.h>
> +#include <string.h>
> +
> +#define FULL_MMASK64 0xffffffffffffffff
> +#define ONE_64BIT 0x1ull
> +#define ZMM_SIZE_IN_BYTES 64
> +
> +/*
> + Returns the index of the first edge within the needle, returns 0 if no edge
> + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> + */
> +static inline size_t
> +find_edge_in_needle (const char *ned)
> +{
> + size_t ind = 0;
> + while (ned[ind + 1] != '\0')
> + {
> + if (ned[ind] != ned[ind + 1])
> + return ind;
> + else
> + ind = ind + 1;
> + }
> + return 0;
> +}
> +
> +/*
> + Compare needle with haystack byte by byte at specified location
> + */
> +static inline bool
> +verify_string_match (const char *hay, const size_t hay_index, const char *ned,
> + size_t ind)
> +{
> + while (ned[ind] != '\0')
> + {
> + if (ned[ind] != hay[hay_index + ind])
> + return false;
> + ind = ind + 1;
> + }
> + return true;
> +}
> +
> +/*
> + Compare needle with haystack at specified location. The first 64 bytes are
> + compared using a ZMM register.
> + */
> +static inline bool
> +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> + const char *ned, const __mmask64 ned_mask,
> + const __m512i ned_zmm)
> +{
> + /* check first 64 bytes using zmm and then scalar */
> + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
> + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
> + if (match != 0x0) // failed the first few chars
> + return false;
> + else if (ned_mask == FULL_MMASK64)
> + return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
> + return true;
> +}
> +
> +char *
> +__strstr_avx512 (const char *haystack, const char *ned)
> +{
> + char first = ned[0];
> + if (first == '\0')
> + return (char *)haystack;
> + if (ned[1] == '\0')
> + return (char *)strchr (haystack, ned[0]);
> +
> + size_t edge = find_edge_in_needle (ned);
> +
> + /* ensure haystack is as long as the pos of edge in needle */
> + for (int ii = 0; ii < edge; ++ii)
> + {
> + if (haystack[ii] == '\0')
> + return NULL;
> + }
> +
> + const __m512i null = _mm512_setzero_si512 (); // '\0'
> +
> + /*
> + Load 64 bytes of the needle and save it to a zmm register
> + Read one cache line at a time to avoid loading across a page boundary
> + */
> + __mmask64 ned_load_mask
> + = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
> + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
> + __mmask64 ned_nullmask
> + = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm, null);
> + if (__glibc_unlikely (ned_nullmask == 0x0))
> + {
> + ned_zmm = _mm512_loadu_si512 (ned);
> + ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> + if (ned_nullmask != 0x0)
> + ned_load_mask = ned_load_mask >> 1;
> + }
> + else
> + {
> + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> + ned_load_mask = ned_load_mask >> 1;
> + }
> + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
> + const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> +
> + /*
> + Read the bytes of haystack in the current cache line
> + */
> + size_t hay_index = edge;
> + __mmask64 loadmask = _bzhi_u64 (
> + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> + /* First load is a partial cache line */
> + __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> + /* Search for NULL and compare only till null char */
> + __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask, hay0, null);
> + __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> + cmpmask = _kand_mask64 (cmpmask, loadmask);
> + /* Search for the 2 charaters of needle */
> + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> + k1 = _kshiftri_mask64 (k1, 1);
> + /* k2 masks tell us if both chars from needle match */
> + uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> + /* For every match, search for the entire needle for a full match */
> + while (k2)
> + {
> + uint64_t bitcount = _tzcnt_u64(k2);
> + k2 = _blsr_u64(k2);
> + size_t match_pos = hay_index + bitcount - edge;
> + if (nullmask == 0)
> + {
> + if (verify_string_match_avx512 (haystack, match_pos, ned,
> + ned_load_mask, ned_zmm))
> + return (char *)haystack + match_pos;
> + }
> + else
> + {
> + if (verify_string_match (haystack, match_pos, ned, 0))
> + return (char *)haystack + match_pos;
> + }
> + }
> + /* We haven't checked for potential match at the last char yet */
> + hay_index += _mm_popcnt_u64 (loadmask) - 1;
> +
> + /*
> + Loop over one cache line at a time to prevent reading over page
> + boundary
> + */
> + __m512i hay1;
> + while (nullmask == 0)
> + {
> + hay0 = _mm512_loadu_si512 (haystack + hay_index);
> + hay1 = _mm512_load_si512 (haystack + hay_index
> + + 1); // Always 64 byte aligned
> + nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> + /* Compare only till null char */
> + cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> + /* k2 masks tell us if both chars from needle match */
> + k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> + /* For every match, compare full strings for potential match */
> + while (k2)
> + {
> + uint64_t bitcount = _tzcnt_u64(k2);
> + k2 = _blsr_u64(k2);
> + size_t match_pos = hay_index + bitcount - edge;
> + if (nullmask == 0)
> + {
> + /*
> + Since the haystack doesn't terminate at the current cache
> + line, we can use zmm register to compare the first 64 bytes
> + */
> + if (verify_string_match_avx512 (haystack, match_pos, ned,
> + ned_load_mask, ned_zmm))
> + return (char *)haystack + match_pos;
> + }
> + else
> + {
> + /* Compare byte by byte */
> + if (verify_string_match (haystack, match_pos, ned, 0))
> + return (char *)haystack + match_pos;
> + }
> + }
> + hay_index += ZMM_SIZE_IN_BYTES;
> + }
> + return NULL;
> +}
> diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
> index 95600a9de5..2fb8b169b6 100644
> --- a/sysdeps/x86_64/multiarch/strstr.c
> +++ b/sysdeps/x86_64/multiarch/strstr.c
> @@ -35,16 +35,32 @@
>
> extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
> extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
> +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
>
> #include "init-arch.h"
>
> /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> ifunc symbol properly. */
> extern __typeof (__redirect_strstr) __libc_strstr;
> -libc_ifunc (__libc_strstr,
> - HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> - ? __strstr_sse2_unaligned
> - : __strstr_sse2)
>
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> + const struct cpu_features *cpu_features = __get_cpu_features ();
> +
> + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> + return __strstr_avx512;
> +
> + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> + return __strstr_sse2_unaligned;
> +
> + return __strstr_sse2;
> +}
> +
> +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
> #undef strstr
> strong_alias (__libc_strstr, strstr)
> --
> 2.36.1
>
On Thu, May 26, 2022 at 4:41 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> >
> > (1) We spend a few cycles at the begining to peek into the needle. We
> > locate an edge in the needle (first occurance of 2 consequent distinct
> > characters) and also store the first 64-bytes into a zmm register.
> >
> > (2) We search for the edge in the haystack by looking into one cache
> > line of the haystack at a time. This avoids having to read past a page
> > boundary which can cause a seg fault.
> >
> > (3) If an edge is found in the haystack we first compare the first
> > 64-bytes of the needle (already stored in a zmm register) before we
> > proceed with a full string compare performed byte by byte.
> >
> > Benchmarking data on ICX shows upto 2x speed up when compared to
> > __strstr_sse2_unaligned (including partial benchtests data from
> > bench-strstr.out):
> >
> > |---------------------------------+---------------+-----------------------|
> > | | strstr_avx512 | strstr_sse2_unaligned |
> > |---------------------------------+---------------+-----------------------|
> > | Length 16384/ 16, 1/11, found: | 1939.75 | 3458.44 |
> > | Length 16384/ 16, 14/ 5, fail : | 1967.75 | 3541.12 |
> > | Length 16384/ 32, 1/11, found: | 1540.38 | 2908.25 |
> > | Length 16384/ 32, 14/ 5, fail : | 1345.94 | 2866.31 |
> > | Length 16384/ 64, 1/11, found: | 1968.81 | 4327.56 |
> > | Length 16384/ 64, 14/ 5, fail : | 1993.75 | 4215.69 |
> > | Length 16384/128, 1/11, found: | 1535.44 | 3780.56 |
> > | Length 16384/128, 14/ 5, fail : | 1414.75 | 3595.25 |
> > | Length 16384/256, 1/11, found: | 2957.75 | 5501.44 |
> > | Length 16384/256, 14/ 5, fail : | 2682.62 | 5099.88 |
> > | Length 32768/ 16, 1/11, found: | 7820.19 | 11262.9 |
> > | Length 32768/ 16, 14/ 5, fail : | 8196.88 | 10871.2 |
> > | Length 32768/ 32, 1/11, found: | 5709.19 | 6611.56 |
> > | Length 32768/ 32, 14/ 5, fail : | 5716.12 | 6647.06 |
> > | Length 32768/ 64, 1/11, found: | 7160.44 | 10143.7 |
> > | Length 32768/ 64, 14/ 5, fail : | 7021.38 | 10150.6 |
> > | Length 32768/128, 1/11, found: | 4935.31 | 6756.56 |
> > | Length 32768/128, 14/ 5, fail : | 4774.38 | 6746.19 |
> > | Length 32768/256, 1/11, found: | 7933.19 | 12563.8 |
> > | Length 32768/256, 14/ 5, fail : | 7975 | 12558.6 |
> > | Length 65536/ 16, 1/11, found: | 9066.69 | 9419.62 |
> > | Length 65536/ 16, 14/ 5, fail : | 8496 | 9384.75 |
> > | Length 65536/ 32, 1/11, found: | 10258.8 | 11192.4 |
> > | Length 65536/ 32, 14/ 5, fail : | 8712.12 | 11172.3 |
> > | Length 65536/ 64, 1/11, found: | 11085.2 | 18162.1 |
> > | Length 65536/ 64, 14/ 5, fail : | 11219.6 | 17921.5 |
> > | Length 65536/128, 1/11, found: | 9753.56 | 18704.6 |
> > | Length 65536/128, 14/ 5, fail : | 9588.81 | 18465.6 |
> > | Length 65536/256, 1/11, found: | 18333.3 | 28505.2 |
> > | Length 65536/256, 14/ 5, fail : | 18018.8 | 27990.8 |
> > |---------------------------------+---------------+-----------------------|
>
> Can you add the aggregate geomean across all benchmarks (sse2 vs. evex512)?
>
> Also, can you add all the numbers to the email.
> > ---
> > sysdeps/x86_64/multiarch/Makefile | 2 +
> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
> > sysdeps/x86_64/multiarch/strstr-avx512.c | 208 +++++++++++++++++++++
> > sysdeps/x86_64/multiarch/strstr.c | 24 ++-
> > 4 files changed, 236 insertions(+), 4 deletions(-)
> > create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index e7b413edad..6dc54a7265 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -126,6 +126,7 @@ sysdep_routines += \
> > strrchr-sse2 \
> > strspn-c \
> > strspn-sse2 \
> > + strstr-avx512 \
> > strstr-sse2-unaligned \
> > varshift \
> > # sysdep_routines
> > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
> > CFLAGS-strcspn-c.c += -msse4
> > CFLAGS-strpbrk-c.c += -msse4
> > CFLAGS-strspn-c.c += -msse4
> > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
> > endif
> >
> > ifeq ($(subdir),wcsmbs)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a594f4176e..cc9a7eaaa1 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> > /* Support sysdeps/x86_64/multiarch/strstr.c. */
> > IFUNC_IMPL (i, name, strstr,
> > + IFUNC_IMPL_ADD (array, i, strstr,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (AVX512DQ)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __strstr_avx512)
> > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> >
> > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > new file mode 100644
> > index 0000000000..4082a75a1b
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > @@ -0,0 +1,208 @@
> > +/* strstr optimized with 512-bit AVX-512 instructions
> > + Copyright (C) 2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#include <immintrin.h>
> > +#include <inttypes.h>
> > +#include <stdbool.h>
> > +#include <string.h>
> > +
> > +#define FULL_MMASK64 0xffffffffffffffff
> > +#define ONE_64BIT 0x1ull
> > +#define ZMM_SIZE_IN_BYTES 64
> > +
> > +/*
> > + Returns the index of the first edge within the needle, returns 0 if no edge
> > + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > + */
> > +static inline size_t
> > +find_edge_in_needle (const char *ned)
> > +{
> > + size_t ind = 0;
> > + while (ned[ind + 1] != '\0')
> > + {
> > + if (ned[ind] != ned[ind + 1])
> > + return ind;
> > + else
> > + ind = ind + 1;
> > + }
> > + return 0;
> > +}
> > +
> > +/*
> > + Compare needle with haystack byte by byte at specified location
> > + */
> > +static inline bool
> > +verify_string_match (const char *hay, const size_t hay_index, const char *ned,
> > + size_t ind)
> > +{
> > + while (ned[ind] != '\0')
> > + {
> > + if (ned[ind] != hay[hay_index + ind])
> > + return false;
> > + ind = ind + 1;
> > + }
> > + return true;
> > +}
> > +
> > +/*
> > + Compare needle with haystack at specified location. The first 64 bytes are
> > + compared using a ZMM register.
> > + */
> > +static inline bool
> > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > + const char *ned, const __mmask64 ned_mask,
> > + const __m512i ned_zmm)
> > +{
> > + /* check first 64 bytes using zmm and then scalar */
> > + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
> > + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
> > + if (match != 0x0) // failed the first few chars
> > + return false;
> > + else if (ned_mask == FULL_MMASK64)
> > + return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
> > + return true;
> > +}
> > +
> > +char *
> > +__strstr_avx512 (const char *haystack, const char *ned)
> > +{
> > + char first = ned[0];
> > + if (first == '\0')
> > + return (char *)haystack;
> > + if (ned[1] == '\0')
> > + return (char *)strchr (haystack, ned[0]);
> > +
> > + size_t edge = find_edge_in_needle (ned);
> > +
> > + /* ensure haystack is as long as the pos of edge in needle */
> > + for (int ii = 0; ii < edge; ++ii)
> > + {
> > + if (haystack[ii] == '\0')
> > + return NULL;
> > + }
> > +
> > + const __m512i null = _mm512_setzero_si512 (); // '\0'
> > +
> > + /*
> > + Load 64 bytes of the needle and save it to a zmm register
> > + Read one cache line at a time to avoid loading across a page boundary
> > + */
> > + __mmask64 ned_load_mask
> > + = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
> > + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
> > + __mmask64 ned_nullmask
> > + = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm, null);
> > + if (__glibc_unlikely (ned_nullmask == 0x0))
> > + {
> > + ned_zmm = _mm512_loadu_si512 (ned);
> > + ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > + if (ned_nullmask != 0x0)
> > + ned_load_mask = ned_load_mask >> 1;
> > + }
> > + else
> > + {
> > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > + ned_load_mask = ned_load_mask >> 1;
> > + }
> > + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
> > + const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > +
> > + /*
> > + Read the bytes of haystack in the current cache line
> > + */
> > + size_t hay_index = edge;
> > + __mmask64 loadmask = _bzhi_u64 (
> > + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> > + /* First load is a partial cache line */
> > + __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > + /* Search for NULL and compare only till null char */
> > + __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask, hay0, null);
> > + __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > + cmpmask = _kand_mask64 (cmpmask, loadmask);
> > + /* Search for the 2 charaters of needle */
> > + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > + k1 = _kshiftri_mask64 (k1, 1);
> > + /* k2 masks tell us if both chars from needle match */
> > + uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> > + /* For every match, search for the entire needle for a full match */
> > + while (k2)
> > + {
> > + uint64_t bitcount = _tzcnt_u64(k2);
> > + k2 = _blsr_u64(k2);
> > + size_t match_pos = hay_index + bitcount - edge;
> > + if (nullmask == 0)
> > + {
> > + if (verify_string_match_avx512 (haystack, match_pos, ned,
> > + ned_load_mask, ned_zmm))
> > + return (char *)haystack + match_pos;
> > + }
> > + else
> > + {
> > + if (verify_string_match (haystack, match_pos, ned, 0))
> > + return (char *)haystack + match_pos;
> > + }
> > + }
> > + /* We haven't checked for potential match at the last char yet */
> > + hay_index += _mm_popcnt_u64 (loadmask) - 1;
> > +
> > + /*
> > + Loop over one cache line at a time to prevent reading over page
> > + boundary
> > + */
> > + __m512i hay1;
> > + while (nullmask == 0)
> > + {
> > + hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > + hay1 = _mm512_load_si512 (haystack + hay_index
> > + + 1); // Always 64 byte aligned
> > + nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > + /* Compare only till null char */
> > + cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > + /* k2 masks tell us if both chars from needle match */
> > + k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> > + /* For every match, compare full strings for potential match */
> > + while (k2)
> > + {
> > + uint64_t bitcount = _tzcnt_u64(k2);
> > + k2 = _blsr_u64(k2);
> > + size_t match_pos = hay_index + bitcount - edge;
> > + if (nullmask == 0)
> > + {
> > + /*
> > + Since the haystack doesn't terminate at the current cache
> > + line, we can use zmm register to compare the first 64 bytes
> > + */
> > + if (verify_string_match_avx512 (haystack, match_pos, ned,
> > + ned_load_mask, ned_zmm))
> > + return (char *)haystack + match_pos;
> > + }
> > + else
> > + {
> > + /* Compare byte by byte */
> > + if (verify_string_match (haystack, match_pos, ned, 0))
> > + return (char *)haystack + match_pos;
> > + }
> > + }
> > + hay_index += ZMM_SIZE_IN_BYTES;
> > + }
> > + return NULL;
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
> > index 95600a9de5..2fb8b169b6 100644
> > --- a/sysdeps/x86_64/multiarch/strstr.c
> > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > @@ -35,16 +35,32 @@
> >
> > extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
> > extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
> > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
> >
> > #include "init-arch.h"
> >
> > /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> > ifunc symbol properly. */
> > extern __typeof (__redirect_strstr) __libc_strstr;
> > -libc_ifunc (__libc_strstr,
> > - HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > - ? __strstr_sse2_unaligned
> > - : __strstr_sse2)
> >
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > + const struct cpu_features *cpu_features = __get_cpu_features ();
> > +
> > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > + return __strstr_avx512;
> > +
> > + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > + return __strstr_sse2_unaligned;
> > +
> > + return __strstr_sse2;
> > +}
> > +
> > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
> > #undef strstr
> > strong_alias (__libc_strstr, strstr)
> > --
> > 2.36.1
> >
Can you run clang-format on this where it makes sense.
Geometric mean per haystack size:
|---------------+--------------+---------------+-----------------+-------------------------+---------------|
| haystack size | basic_strstr | twoway_strstr | __strstr_avx512 | __strstr_sse2_unaligned | __strstr_sse2 |
|---------------+--------------+---------------+-----------------+-------------------------+---------------|
| 64 | 102.24 | 227.66 | 43.99 | 45.85 | 155.96 |
| 96 | 1460.31 | 1899.5 | 167.66 | 305.91 | 547.13 |
| 128 | 177.45 | 305.07 | 42.51 | 70.85 | 178.41 |
| 160 | 230.54 | 515.28 | 59.65 | 68.76 | 229.79 |
| 192 | 2859.48 | 3345.92 | 389.53 | 557.72 | 697.96 |
| 224 | 299.04 | 486.03 | 60.66 | 69.22 | 222.73 |
| 256 | 344.44 | 384.04 | 64.46 | 91.64 | 317.63 |
| 512 | 751.13 | 2032.34 | 119.33 | 186.86 | 1063.78 |
| 1024 | 1452.07 | 2877.57 | 199.28 | 319.36 | 1159.13 |
| 2048 | 3090.89 | 4387.19 | 303.44 | 559.34 | 1410.79 |
| 4096 | 5691.89 | 7149.54 | 558.94 | 1141.53 | 1800.8 |
| 8192 | 11240.29 | 13091.44 | 1479.94 | 2142.81 | 2464.92 |
| 16384 | 22188.11 | 8634.67 | 1934.69 | 3929.45 | 4400.37 |
| 32768 | 44837.56 | 56494.02 | 6724.21 | 9431.22 | 8044.71 |
| 65536 | 88900.08 | 96978.61 | 11453.29 | 17091.89 | 13750.56 |
|---------------+--------------+---------------+-----------------+-------------------------+---------------|
Raw data from bench-strstr.out:
basic_strstr twoway_strstr __strstr_avx512 __strstr_sse2_unaligned __strstr_sse2
Length 64/ 1, alignment 1/ 3, found: 82.5625 47.1875 220.875 223.5 36.8125
Length 64/ 1, alignment 0/ 9, fail : 86 22.875 17.3125 30.1875 21.3125
Length 64/ 2, alignment 1/ 3, found: 90.125 281.562 116.625 25.9375 79.3125
Length 64/ 2, alignment 0/ 9, fail : 88.3125 68.375 91.125 133.188 76.5
Length 64/ 3, alignment 1/ 3, found: 89 143 78.5625 27.25 147.062
Length 64/ 3, alignment 0/ 9, fail : 84.6875 112.875 34.9375 37.5 135.562
Length 64/ 4, alignment 1/ 3, found: 98.0625 184.25 30.0625 26.9375 157.188
Length 64/ 4, alignment 0/ 9, fail : 96.5625 173.438 32.3125 36.5625 144
Length 64/ 5, alignment 1/ 3, found: 96.75 187.75 27.125 29.5 153.438
Length 64/ 5, alignment 0/ 9, fail : 89.5 160.938 32.75 36.3125 147.75
Length 64/ 6, alignment 1/ 3, found: 110 374.062 34.9375 40.9375 164.375
Length 64/ 6, alignment 0/ 9, fail : 103.062 377.812 37.8125 45.6875 153.062
Length 64/ 7, alignment 1/ 3, found: 120.625 367.75 34.9375 39.5 152.562
Length 64/ 7, alignment 0/ 9, fail : 110.75 358.688 42.125 50.4375 150.125
Length 64/ 8, alignment 1/ 3, found: 120.375 295.812 36.1875 37.5625 154.062
Length 64/ 8, alignment 0/ 9, fail : 108.125 309.625 44.875 46.75 150.938
Length 64/ 9, alignment 1/ 3, found: 139.75 162.188 33.375 44.3125 166.125
Length 64/ 9, alignment 0/ 9, fail : 129.5 136.062 32.5 36.125 162.688
Length 64/ 10, alignment 1/ 3, found: 96.375 40.6875 26.5625 39.1875 48.0625
Length 64/ 10, alignment 0/ 9, fail : 83.5 26.8125 34.125 36.5 23.6875
Length 64/ 11, alignment 1/ 3, found: 106.938 186.562 26.625 31.6875 183.75
Length 64/ 11, alignment 0/ 9, fail : 99.0625 148.75 33.4375 38 175.375
Length 64/ 12, alignment 1/ 3, found: 101.062 214.062 28.5 34.0625 199
Length 64/ 12, alignment 0/ 9, fail : 88.875 175.938 32.75 35.9375 196.75
Length 64/ 13, alignment 1/ 3, found: 133.188 305.562 28.3125 36.0625 240.312
Length 64/ 13, alignment 0/ 9, fail : 96.0625 300.938 32.8125 36.625 233.625
Length 64/ 14, alignment 1/ 3, found: 111.125 309.562 26.875 37.5625 230.125
Length 64/ 14, alignment 0/ 9, fail : 89.8125 312.062 32.875 36.3125 221.5
Length 64/ 15, alignment 1/ 3, found: 117.25 387.688 26.5625 39.8125 224
Length 64/ 15, alignment 0/ 9, fail : 92.5625 380.875 32.375 35.8125 219.312
Length 64/ 16, alignment 1/ 3, found: 113.625 380.125 26.5 39.625 225.625
Length 64/ 16, alignment 0/ 9, fail : 98.625 351.125 40.8125 41.875 216.75
Length 96/ 1, alignment 1/ 3, found: 121.188 30.1875 21.6875 32.9375 24.875
Length 96/ 1, alignment 0/ 9, fail : 121 29.9375 18 27.0625 22.9375
Length 96/ 2, alignment 1/ 3, found: 128.188 317.938 44.8125 43.5625 262.5
Length 96/ 2, alignment 0/ 9, fail : 127.812 296.438 33.5 39.5625 221.688
Length 96/ 3, alignment 1/ 3, found: 148.188 151.25 58.0625 50.5 208.688
Length 96/ 3, alignment 0/ 9, fail : 143.062 131.062 45.25 41.5 199.188
Length 96/ 4, alignment 1/ 3, found: 131.875 143.75 56.5 43.875 199.938
Length 96/ 4, alignment 0/ 9, fail : 126.375 119.25 41.3125 36 181
Length 96/ 5, alignment 1/ 3, found: 131.812 321.688 61.0625 52 174.688
Length 96/ 5, alignment 0/ 9, fail : 125.938 310.062 42.75 42.6875 158.812
Length 96/ 6, alignment 1/ 3, found: 150.875 516.688 55.5 45.6875 179.562
Length 96/ 6, alignment 0/ 9, fail : 143.938 524.438 33.5 35.75 164.562
Length 96/ 7, alignment 1/ 3, found: 132.688 430.938 48.0625 48.9375 168.188
Length 96/ 7, alignment 0/ 9, fail : 127.438 364.438 32.6875 36 165.062
Length 96/ 8, alignment 1/ 3, found: 130.875 46.0625 52.875 47.4375 49.6875
Length 96/ 8, alignment 0/ 9, fail : 117.125 22.125 32.3125 36.25 25.1875
Length 96/ 9, alignment 1/ 3, found: 201.75 197.625 63.5625 55.75 179.188
Length 96/ 9, alignment 0/ 9, fail : 190.562 143.5 39.1875 44.125 176.062
Length 96/ 10, alignment 1/ 3, found: 141.312 400.875 54.25 49.0625 208.875
Length 96/ 10, alignment 0/ 9, fail : 128.938 422.25 33.125 36.625 199.812
Length 96/ 11, alignment 1/ 3, found: 170.062 468.75 65.375 62.5 207.438
Length 96/ 11, alignment 0/ 9, fail : 148.375 474.625 38.5625 42.5 192.125
Length 96/ 12, alignment 1/ 3, found: 135.688 466.375 54.9375 55.25 192.5
Length 96/ 12, alignment 0/ 9, fail : 121.625 480.938 32.25 37.9375 192.438
Length 96/ 13, alignment 1/ 3, found: 129.562 39.5625 57.9375 53.5 47.875
Length 96/ 13, alignment 0/ 9, fail : 112.438 23.3125 32.5 36.25 25.125
Length 96/ 14, alignment 1/ 3, found: 140 453.312 72.375 66.0625 207.75
Length 96/ 14, alignment 0/ 9, fail : 126.812 463.438 39.4375 45 208
Length 96/ 15, alignment 1/ 3, found: 176 238.75 63.25 56.375 230.688
Length 96/ 15, alignment 0/ 9, fail : 140.688 192.5 31.9375 36.1875 231.312
Length 96/ 16, alignment 1/ 3, found: 130.25 38.5625 63.625 59.375 42.3125
Length 96/ 16, alignment 0/ 9, fail : 111.562 22.75 32 36.8125 23.5
Length 128/ 1, alignment 1/ 3, found: 152.938 29.0625 22 37.5 23.25
Length 128/ 1, alignment 0/ 9, fail : 150.125 26.1875 22.25 36.3125 24.5625
Length 128/ 2, alignment 1/ 3, found: 195.5 156.125 46.9375 50.875 303.5
Length 128/ 2, alignment 0/ 9, fail : 194.562 134.562 42.1875 48.125 299.125
Length 128/ 3, alignment 1/ 3, found: 164.875 312.25 57.3125 46.1875 268.312
Length 128/ 3, alignment 0/ 9, fail : 158.688 282.562 45.6875 48.5 262.188
Length 128/ 4, alignment 1/ 3, found: 180.375 171.75 56.0625 57.0625 233.125
Length 128/ 4, alignment 0/ 9, fail : 167.125 151.938 54.375 661.688 210.25
Length 128/ 5, alignment 1/ 3, found: 191.688 646 45.6875 51.875 188.438
Length 128/ 5, alignment 0/ 9, fail : 184.938 644 45.25 63.0625 177.625
Length 128/ 6, alignment 1/ 3, found: 185.812 208 48.6875 57 181.938
Length 128/ 6, alignment 0/ 9, fail : 171.75 196.25 51.125 66.25 177.25
Length 128/ 7, alignment 1/ 3, found: 172.938 134.75 46.6875 55.5625 147.625
Length 128/ 7, alignment 0/ 9, fail : 157.875 104.688 44.6875 65.5 146.625
Length 128/ 8, alignment 1/ 3, found: 250.375 298.75 39.6875 50.8125 184.25
Length 128/ 8, alignment 0/ 9, fail : 248.25 265.562 41.1875 47.6875 179.562
Length 128/ 9, alignment 1/ 3, found: 152 41.875 37.6875 46.9375 41.75
Length 128/ 9, alignment 0/ 9, fail : 152.125 36.8125 39.5 46.375 36.1875
Length 128/ 10, alignment 1/ 3, found: 178.625 449.75 49.25 58.25 196.25
Length 128/ 10, alignment 0/ 9, fail : 164.812 468.625 47.75 52.375 188.062
Length 128/ 11, alignment 1/ 3, found: 184.75 608.375 35.4375 51.1875 199.125
Length 128/ 11, alignment 0/ 9, fail : 180.625 619.188 41.9375 47.3125 198.438
Length 128/ 12, alignment 1/ 3, found: 158.438 41.75 36.0625 55.8125 43.8125
Length 128/ 12, alignment 0/ 9, fail : 151.5 34.3125 40.3125 45.1875 38.75
Length 128/ 13, alignment 1/ 3, found: 190.062 571.062 36.25 52.5 210.625
Length 128/ 13, alignment 0/ 9, fail : 162.75 585.938 40.75 45.6875 199.188
Length 128/ 14, alignment 1/ 3, found: 172.188 498.625 42.375 61.6875 219.75
Length 128/ 14, alignment 0/ 9, fail : 158.938 509.188 46.875 51.5625 216.062
Length 128/ 15, alignment 1/ 3, found: 215.312 591.875 37.25 59.25 223.812
Length 128/ 15, alignment 0/ 9, fail : 180.312 594.812 40.9375 44.8125 218.438
Length 128/ 16, alignment 1/ 3, found: 184.875 185.438 36.9375 58.75 236.5
Length 128/ 16, alignment 0/ 9, fail : 163.125 162.062 41.3125 45.625 234.812
Length 160/ 1, alignment 1/ 3, found: 261.125 42.125 27.25 33.75 37.125
Length 160/ 1, alignment 0/ 9, fail : 227.062 35.9375 27.9375 33.625 30.0625
Length 160/ 2, alignment 1/ 3, found: 202.938 334.062 48 51.1875 226.938
Length 160/ 2, alignment 0/ 9, fail : 192.625 296.875 41.625 47.25 216
Length 160/ 3, alignment 1/ 3, found: 256.688 201.188 70.3125 74.5625 396.875
Length 160/ 3, alignment 0/ 9, fail : 226.312 170.688 53.25 67.5625 347
Length 160/ 4, alignment 1/ 3, found: 240.375 402.375 80.375 82 287.625
Length 160/ 4, alignment 0/ 9, fail : 216.5 365.125 65.8125 70.75 271.125
Length 160/ 5, alignment 1/ 3, found: 230 242.25 59.6875 71.9375 242.688
Length 160/ 5, alignment 0/ 9, fail : 211.812 213.938 46.1875 59.375 235.25
Length 160/ 6, alignment 1/ 3, found: 214.75 359.125 60.3125 71.8125 233.375
Length 160/ 6, alignment 0/ 9, fail : 212.062 315.562 45.375 67.0625 225.625
Length 160/ 7, alignment 1/ 3, found: 230 504.375 59.4375 71.1875 226.938
Length 160/ 7, alignment 0/ 9, fail : 214.5 477.062 43.25 62.0625 222.812
Length 160/ 8, alignment 1/ 3, found: 243.875 693 74.625 78.25 240.625
Length 160/ 8, alignment 0/ 9, fail : 220.562 653.062 52.6875 61.5625 221.812
Length 160/ 9, alignment 1/ 3, found: 230 658.938 68.0625 69.5 237.312
Length 160/ 9, alignment 0/ 9, fail : 208.188 617.688 48.125 50 229.625
Length 160/ 10, alignment 1/ 3, found: 244.188 807.312 76.5 82.125 245.75
Length 160/ 10, alignment 0/ 9, fail : 214.188 740.188 58 67.5 230.062
Length 160/ 11, alignment 1/ 3, found: 239.375 734.75 71.8125 64.4375 215
Length 160/ 11, alignment 0/ 9, fail : 216.812 662.625 41.875 47 210.125
Length 160/ 12, alignment 1/ 3, found: 194 570.625 72.625 62.1875 212.625
Length 160/ 12, alignment 0/ 9, fail : 189.5 500.125 43.625 48.375 207.875
Length 160/ 13, alignment 1/ 3, found: 312.875 790 84.0625 107.812 228.125
Length 160/ 13, alignment 0/ 9, fail : 285.5 745.375 55.8125 77.125 218.25
Length 160/ 14, alignment 1/ 3, found: 227.875 705.938 81.5625 96 230.688
Length 160/ 14, alignment 0/ 9, fail : 221.688 700.062 55.4375 70.4375 231.188
Length 160/ 15, alignment 1/ 3, found: 252.312 768.375 85.4375 96.875 253.5
Length 160/ 15, alignment 0/ 9, fail : 231.625 768.25 52.125 72.8125 244.312
Length 160/ 16, alignment 1/ 3, found: 275.688 720.75 95.6875 106.438 250.375
Length 160/ 16, alignment 0/ 9, fail : 232.25 691.25 62 77.875 246.438
Length 192/ 1, alignment 1/ 3, found: 220.25 35.125 30.8125 40 34.3125
Length 192/ 1, alignment 0/ 9, fail : 217.312 34.0625 29.0625 41.25 33.125
Length 192/ 2, alignment 1/ 3, found: 212.438 53.125 55.6875 54.875 43
Length 192/ 2, alignment 0/ 9, fail : 207.312 34.5 49.375 54.5 35.5625
Length 192/ 3, alignment 1/ 3, found: 215.375 48.5 48.8125 52.25 44.5
Length 192/ 3, alignment 0/ 9, fail : 207.188 28.8125 48.375 56.1875 32.6875
Length 192/ 4, alignment 1/ 3, found: 244 369.438 45.375 52.3125 260.562
Length 192/ 4, alignment 0/ 9, fail : 230.75 341.938 46.3125 55.8125 243
Length 192/ 5, alignment 1/ 3, found: 240.688 676.812 44.8125 54.8125 226.875
Length 192/ 5, alignment 0/ 9, fail : 224.5 655.438 46.0625 54.8125 218.875
Length 192/ 6, alignment 1/ 3, found: 275.875 784.312 55.125 64.875 215
Length 192/ 6, alignment 0/ 9, fail : 259.062 770.5 51.5625 62.1875 208.062
Length 192/ 7, alignment 1/ 3, found: 270.688 826.625 69.5 85.875 236.25
Length 192/ 7, alignment 0/ 9, fail : 248.312 773.25 67.1875 82.875 216.25
Length 192/ 8, alignment 1/ 3, found: 334.375 888.25 55.8125 70.1875 257.312
Length 192/ 8, alignment 0/ 9, fail : 330.438 856.25 53.1875 75.625 235.625
Length 192/ 9, alignment 1/ 3, found: 228 162.375 49.875 68.4375 178.688
Length 192/ 9, alignment 0/ 9, fail : 222.25 131.5 52.0625 70.4375 170.75
Length 192/ 10, alignment 1/ 3, found: 233.188 168.688 48.5 64.625 187.5
Length 192/ 10, alignment 0/ 9, fail : 224.312 146.625 48.3125 67.125 178
Length 192/ 11, alignment 1/ 3, found: 222.188 47.3125 45.5 68.3125 51.8125
Length 192/ 11, alignment 0/ 9, fail : 221.812 139.875 48.25 68.8125 184.062
Length 192/ 12, alignment 1/ 3, found: 335.062 481.938 50.6875 78.375 249.5
Length 192/ 12, alignment 0/ 9, fail : 317.562 466.938 53.6875 69.75 239.688
Length 192/ 13, alignment 1/ 3, found: 235.75 234.438 44.125 67.375 222.875
Length 192/ 13, alignment 0/ 9, fail : 213.938 230.312 45.375 53.9375 215.688
Length 192/ 14, alignment 1/ 3, found: 225.375 45.0625 46.3125 69.6875 51.1875
Length 192/ 14, alignment 0/ 9, fail : 214.5 29.875 51.25 53.4375 33
Length 192/ 15, alignment 1/ 3, found: 223.5 44.6875 46.4375 65.0625 47.6875
Length 192/ 15, alignment 0/ 9, fail : 207.562 31.125 44.9375 54.3125 33.25
Length 192/ 16, alignment 1/ 3, found: 224.188 45.9375 43.6875 64.8125 49.0625
Length 192/ 16, alignment 0/ 9, fail : 207.438 30.625 44.6875 53.125 31.5
Length 224/ 1, alignment 1/ 3, found: 302.625 36.4375 31.1875 41.375 37.1875
Length 224/ 1, alignment 0/ 9, fail : 273.625 33.125 31.1875 38.3125 32.75
Length 224/ 2, alignment 1/ 3, found: 300.875 387.188 53.625 57.5625 560.875
Length 224/ 2, alignment 0/ 9, fail : 296.812 356.938 47.0625 55.4375 515.25
Length 224/ 3, alignment 1/ 3, found: 292.562 714.875 76.4375 90.125 493.625
Length 224/ 3, alignment 0/ 9, fail : 278.625 714.438 64.1875 87.0625 455
Length 224/ 4, alignment 1/ 3, found: 314.625 552.438 55.6875 67.5625 303.812
Length 224/ 4, alignment 0/ 9, fail : 293.938 514.812 46.3125 51.1875 303.312
Length 224/ 5, alignment 1/ 3, found: 265.5 309.812 61.75 62.875 192.812
Length 224/ 5, alignment 0/ 9, fail : 258.438 275.75 47.0625 52.875 186.25
Length 224/ 6, alignment 1/ 3, found: 320.125 567.688 83.5 104.438 241.25
Length 224/ 6, alignment 0/ 9, fail : 295.125 559.062 72.4375 90.4375 229.062
Length 224/ 7, alignment 1/ 3, found: 300.875 140.812 73.875 76.1875 222.75
Length 224/ 7, alignment 0/ 9, fail : 292.812 130 55.5625 62.3125 214.188
Length 224/ 8, alignment 1/ 3, found: 302.812 139.438 68.125 77.625 231.062
Length 224/ 8, alignment 0/ 9, fail : 286.438 123.125 50.625 65.0625 220
Length 224/ 9, alignment 1/ 3, found: 297.5 826.562 71.0625 89.9375 237.125
Length 224/ 9, alignment 0/ 9, fail : 271.25 840.25 59.25 72.5 215.938
Length 224/ 10, alignment 1/ 3, found: 289.438 1008.06 70 70.75 235.875
Length 224/ 10, alignment 0/ 9, fail : 274.688 1017.12 46.5 55.0625 221.312
Length 224/ 11, alignment 1/ 3, found: 325.812 1010.56 78.9375 93.875 238.438
Length 224/ 11, alignment 0/ 9, fail : 308.062 1004.31 56.6875 70.625 233.875
Length 224/ 12, alignment 1/ 3, found: 281.062 983.875 70.3125 71.875 250.75
Length 224/ 12, alignment 0/ 9, fail : 259.938 983.75 46.4375 55.1875 243.188
Length 224/ 13, alignment 1/ 3, found: 305.438 55.0625 72.3125 70.1875 52.75
Length 224/ 13, alignment 0/ 9, fail : 281.938 45.8125 46.625 55.125 45.25
Length 224/ 14, alignment 1/ 3, found: 299 49.5625 74.9375 70.4375 51.625
Length 224/ 14, alignment 0/ 9, fail : 284.812 32.8125 46.375 52.6875 33
Length 224/ 15, alignment 1/ 3, found: 311.188 48.5625 78.4375 72.1875 47.4375
Length 224/ 15, alignment 0/ 9, fail : 279.688 30.625 46.5 55 29.875
Length 224/ 16, alignment 1/ 3, found: 429 1053.5 94.8125 105 282.75
Length 224/ 16, alignment 0/ 9, fail : 394.5 1006.56 63.375 74.125 269.125
Length 256/ 1, alignment 1/ 3, found: 344.938 36.5 34.5 43.875 36.6875
Length 256/ 1, alignment 0/ 9, fail : 336.938 40.3125 29.3125 45.4375 38.875
Length 256/ 2, alignment 1/ 3, found: 317.688 158.125 66.25 70.8125 666.375
Length 256/ 2, alignment 0/ 9, fail : 303.125 139.062 55.6875 62.75 630.938
Length 256/ 3, alignment 1/ 3, found: 301.812 243 66.3125 84.125 235.188
Length 256/ 3, alignment 0/ 9, fail : 289.5 230.938 69.125 89.5625 227.562
Length 256/ 4, alignment 1/ 3, found: 431.562 278.5 59.5 70.9375 313.875
Length 256/ 4, alignment 0/ 9, fail : 415.375 259.125 58.5 71.25 289.5
Length 256/ 5, alignment 1/ 3, found: 353.438 1133.94 66.625 81.375 335.125
Length 256/ 5, alignment 0/ 9, fail : 335.812 1082.75 60.375 75.6875 316.625
Length 256/ 6, alignment 1/ 3, found: 321.812 251.125 58.375 72.6875 304.875
Length 256/ 6, alignment 0/ 9, fail : 302.312 208.625 60.3125 72.75 284.312
Length 256/ 7, alignment 1/ 3, found: 463.25 232.875 78.5 97.625 270
Length 256/ 7, alignment 0/ 9, fail : 445.375 210 69.5 89.1875 257.375
Length 256/ 8, alignment 1/ 3, found: 371.062 166.625 69.375 94.0625 266.875
Length 256/ 8, alignment 0/ 9, fail : 355 136.188 66.1875 89.75 251.688
Length 256/ 9, alignment 1/ 3, found: 321.062 164.938 61.0625 80.375 255.375
Length 256/ 9, alignment 0/ 9, fail : 306.625 137.875 59.875 76.6875 242.562
Length 256/ 10, alignment 1/ 3, found: 323.688 166.25 57.4375 73.4375 253.062
Length 256/ 10, alignment 0/ 9, fail : 298.688 138.75 59.625 68.625 250.312
Length 256/ 11, alignment 1/ 3, found: 318.438 161.312 54.8125 80.9375 272.188
Length 256/ 11, alignment 0/ 9, fail : 298.875 132.188 55.4375 68.875 263.812
Length 256/ 12, alignment 1/ 3, found: 325.938 161.875 61.0625 87.6875 255.312
Length 256/ 12, alignment 0/ 9, fail : 301.812 135.125 65 80.1875 250.125
Length 256/ 13, alignment 1/ 3, found: 337.188 401.938 50.6875 73.375 271.438
Length 256/ 13, alignment 0/ 9, fail : 308.438 356.562 49.25 63.8125 258.438
Length 256/ 14, alignment 1/ 3, found: 312.5 242.75 51.125 73.0625 242
Length 256/ 14, alignment 0/ 9, fail : 287.625 206.125 51.125 60.9375 238.5
Length 256/ 15, alignment 1/ 3, found: 293.25 244.188 48.875 73.0625 249.312
Length 256/ 15, alignment 0/ 9, fail : 288 205.188 49.8125 60.3125 239.5
Length 256/ 16, alignment 1/ 3, found: 343.75 51.625 47.9375 75.5625 50.625
Length 256/ 16, alignment 0/ 9, fail : 322.875 35.3125 49.3125 59.9375 33.9375
Length 256/ 16, alignment 1/11, found: 340.938 46.9375 48.625 73.8125 49.4375
Length 256/ 16, alignment 14/ 5, fail : 323.062 32.5625 49.8125 59.625 33.875
Length 256/ 32, alignment 1/11, found: 306.75 894.188 49.6875 105.938 408.625
Length 256/ 32, alignment 14/ 5, fail : 280.5 781.875 49.5 60.5 395.75
Length 256/ 64, alignment 1/11, found: 459 1323.25 75.875 170.438 701.812
Length 256/ 64, alignment 14/ 5, fail : 413.25 1125.06 68.0625 89.5625 687.5
Length 256/128, alignment 1/11, found: 431.188 2134.38 139.688 205.125 1295.88
Length 256/128, alignment 14/ 5, fail : 356.188 1900.06 55.5 61.5 1276.75
Length 256/256, alignment 1/11, found: 548.438 81.125 264 574.625 75.9375
Length 256/256, alignment 14/ 5, fail : 329.5 60.375 65.8125 78.9375 62.3125
Length 512/ 16, alignment 1/11, found: 798.5 1669.25 81.4375 105.75 421.812
Length 512/ 16, alignment 14/ 5, fail : 767.062 1608.88 74.0625 100.25 395.688
Length 512/ 32, alignment 1/11, found: 745.938 1202.19 91.125 180.25 435.625
Length 512/ 32, alignment 14/ 5, fail : 671.5 1104.69 86.875 122.875 419.25
Length 512/ 64, alignment 1/11, found: 1000.75 1475 96.875 229.562 707
Length 512/ 64, alignment 14/ 5, fail : 597.438 1275.12 94.875 143 696.812
Length 512/128, alignment 1/11, found: 713.812 2273.5 170.625 248.25 1316.31
Length 512/128, alignment 14/ 5, fail : 614.688 1924.94 84.4375 116.125 1280.94
Length 512/256, alignment 1/11, found: 914.938 4177.12 299.5 445.812 2491.88
Length 512/256, alignment 14/ 5, fail : 686.625 3612.75 113.5 176.75 2472.44
Length 1024/ 16, alignment 1/11, found: 1423.75 4580.5 184.938 305.062 530
Length 1024/ 16, alignment 14/ 5, fail : 1264.88 4479.94 176.562 277.938 487.5
Length 1024/ 32, alignment 1/11, found: 1894.75 1585.88 137.562 275.562 620.562
Length 1024/ 32, alignment 14/ 5, fail : 1645.44 1470.44 133.938 227.125 599.25
Length 1024/ 64, alignment 1/11, found: 1476.81 1970.31 243 425.375 826.375
Length 1024/ 64, alignment 14/ 5, fail : 1336.94 1771.75 247.125 363.562 799.25
Length 1024/128, alignment 1/11, found: 1439.56 2714.44 213.438 334.062 1363.44
Length 1024/128, alignment 14/ 5, fail : 1202.56 2473.94 128.188 198.938 1326.44
Length 1024/256, alignment 1/11, found: 1598.38 4090.94 373.062 544.312 2544.19
Length 1024/256, alignment 14/ 5, fail : 1237.62 3637.56 154.938 241.625 2494.31
Length 2048/ 16, alignment 1/11, found: 3643.25 9594.62 244.438 427.625 967.125
Length 2048/ 16, alignment 14/ 5, fail : 3444.25 9472.19 244.75 424.375 900.25
Length 2048/ 32, alignment 1/11, found: 3669.06 2153.62 211.312 449.5 843.5
Length 2048/ 32, alignment 14/ 5, fail : 3619.38 2015.75 235.812 394.812 800.312
Length 2048/ 64, alignment 1/11, found: 2559.75 2449.56 241.75 494.125 1088.69
Length 2048/ 64, alignment 14/ 5, fail : 2377.44 2210.19 236.688 423.25 1032.62
Length 2048/128, alignment 1/11, found: 2987.94 3316.69 420.75 794.938 1585.12
Length 2048/128, alignment 14/ 5, fail : 2705.75 3118.12 340.938 648.625 1537.62
Length 2048/256, alignment 1/11, found: 3170.81 5005.38 530.188 893.875 2705
Length 2048/256, alignment 14/ 5, fail : 2731.31 4535.81 327.75 642.312 2647.69
Length 4096/ 16, alignment 1/11, found: 5343.5 19144.8 536.25 1089.5 1553.5
Length 4096/ 16, alignment 14/ 5, fail : 5217.44 19170.2 493.438 993.25 1456.69
Length 4096/ 32, alignment 1/11, found: 7072.5 4249.25 492.188 1066.38 1455.69
Length 4096/ 32, alignment 14/ 5, fail : 6993.19 4061.94 466.812 985.125 1393.56
Length 4096/ 64, alignment 1/11, found: 5117.25 3570.31 465.188 1093.12 1370.81
Length 4096/ 64, alignment 14/ 5, fail : 5051.31 3372.44 653.625 1009 1323.5
Length 4096/128, alignment 1/11, found: 5459.12 3780.44 434.812 954.125 1661.44
Length 4096/128, alignment 14/ 5, fail : 5142.81 3402.81 327.875 809.25 1619.19
Length 4096/256, alignment 1/11, found: 5971.44 5554.25 957.562 1919.31 3112.19
Length 4096/256, alignment 14/ 5, fail : 5550.38 5189 761.625 1496.19 3061.38
Length 8192/ 16, alignment 1/11, found: 9681.88 40017.5 3486.19 1857.81 2314.12
Length 8192/ 16, alignment 14/ 5, fail : 9640.31 39833.3 3078.38 1816.25 2244.5
Length 8192/ 32, alignment 1/11, found: 10085.7 7358.25 744.312 1513 1976.62
Length 8192/ 32, alignment 14/ 5, fail : 9982.5 7094.75 767.312 1469.19 1899.62
Length 8192/ 64, alignment 1/11, found: 10161.9 5016.19 924.062 2150.12 1796.06
Length 8192/ 64, alignment 14/ 5, fail : 9865 4756.69 939.125 2012.38 1726.5
Length 8192/128, alignment 1/11, found: 14496.5 6407.75 873.625 2102.81 2431.75
Length 8192/128, alignment 14/ 5, fail : 14451.4 6055.88 786.938 1988.5 2540.62
Length 8192/256, alignment 1/11, found: 12171.9 7560.44 1695.62 3455.44 3896.44
Length 8192/256, alignment 14/ 5, fail : 11865.8 6813.69 1503.81 3062.56 3823
Length 16384/ 16, alignment 1/11, found: 26405.8 2771 1939.75 3458.44 4665.75
Length 16384/ 16, alignment 14/ 5, fail : 26365.4 2703.5 1967.75 3541.12 4614.38
Length 16384/ 32, alignment 1/11, found: 21014.2 12941.8 1540.38 2908.25 3856.19
Length 16384/ 32, alignment 14/ 5, fail : 20926.2 12508 1345.94 2866.31 3699.19
Length 16384/ 64, alignment 1/11, found: 21047.8 9763.06 1968.81 4327.56 3937.94
Length 16384/ 64, alignment 14/ 5, fail : 20831.2 9370.19 1993.75 4215.69 3806.38
Length 16384/128, alignment 1/11, found: 20901.2 8829 1535.44 3780.56 4409.81
Length 16384/128, alignment 14/ 5, fail : 20554.6 8484.38 1414.75 3595.25 4334.81
Length 16384/256, alignment 1/11, found: 22051.7 9850.38 2957.75 5501.44 5395.38
Length 16384/256, alignment 14/ 5, fail : 21783 9125.38 2682.62 5099.88 5283.88
Length 32768/ 16, alignment 1/11, found: 44759.6 172535 7820.19 11262.9 10746.4
Length 32768/ 16, alignment 14/ 5, fail : 44607.6 165730 8196.88 10871.2 10649.1
Length 32768/ 32, alignment 1/11, found: 44149.3 32756.8 5709.19 6611.56 8476.81
Length 32768/ 32, alignment 14/ 5, fail : 44038.1 33773.8 5716.12 6647.06 8526.75
Length 32768/ 64, alignment 1/11, found: 40456.6 29023.6 7160.44 10143.7 5885.75
Length 32768/ 64, alignment 14/ 5, fail : 40481.6 28606.3 7021.38 10150.6 5737.75
Length 32768/128, alignment 1/11, found: 47779 29011.2 4935.31 6756.56 7652.69
Length 32768/128, alignment 14/ 5, fail : 47737 28753.4 4774.38 6746.19 7519.75
Length 32768/256, alignment 1/11, found: 47323.5 23237.3 7933.19 12563.8 7731.69
Length 32768/256, alignment 14/ 5, fail : 47043.3 21512.8 7975 12558.6 7520.44
Length 65536/ 16, alignment 1/11, found: 83641.9 326590 9066.69 9419.62 22823.9
Length 65536/ 16, alignment 14/ 5, fail : 83851.6 326332 8496 9384.75 22353.8
Length 65536/ 32, alignment 1/11, found: 79622.9 54849.7 10258.8 11192.4 11908.4
Length 65536/ 32, alignment 14/ 5, fail : 79978.1 54414.6 8712.12 11172.3 11775.9
Length 65536/ 64, alignment 1/11, found: 84139.6 49233.6 11085.2 18162.1 12065.2
Length 65536/ 64, alignment 14/ 5, fail : 84550.4 48807 11219.6 17921.5 12006.4
Length 65536/128, alignment 1/11, found: 82355.9 28967.7 9753.56 18704.6 12264.6
Length 65536/128, alignment 14/ 5, fail : 82424.4 27971.3 9588.81 18465.6 12060.8
Length 65536/256, alignment 1/11, found: 114307 26580.4 18333.3 28505.2 10345.7
Length 65536/256, alignment 14/ 5, fail : 114129 26039.8 18018.8 27990.8 9900.94
Length 65536/ 64, complex needle 1: 3.74241e+06 46785.3 16544 866685 862864
Length 65536/ 64, complex needle 2: 4.31153e+06 304627 11677.8 990792 986300
Length 65536/ 64, complex needle 3: 989750 1.57276e+06 97158.6 446779 18592.6
Length 65536/256, complex needle 1: 1.20009e+07 32962.1 47821.9 1.0267e+06 1.00796e+06
Length 65536/256, complex needle 2: 1.79424e+07 275658 64927.6 1.06521e+06 1.03903e+06
Length 65536/256, complex needle 3: 991491 1.5706e+06 105750 459469 8755.81
Length 65536/1024, complex needle 1: 6.14575e+07 24270.9 71510.6 216958 25173.1
Length 65536/1024, complex needle 2: 6.84464e+07 276943 78251.6 442606 341867
Length 65536/1024, complex needle 3: 988035 1.56111e+06 97955.1 429106 1.56998e+06
-----Original Message-----
From: Noah Goldstein <goldstein.w.n@gmail.com>
Sent: Thursday, May 26, 2022 3:27 PM
To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
Cc: GNU C Library <libc-alpha@sourceware.org>
Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
On Thu, May 26, 2022 at 4:41 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> >
> > (1) We spend a few cycles at the begining to peek into the needle.
> > We locate an edge in the needle (first occurance of 2 consequent
> > distinct
> > characters) and also store the first 64-bytes into a zmm register.
> >
> > (2) We search for the edge in the haystack by looking into one cache
> > line of the haystack at a time. This avoids having to read past a
> > page boundary which can cause a seg fault.
> >
> > (3) If an edge is found in the haystack we first compare the first
> > 64-bytes of the needle (already stored in a zmm register) before we
> > proceed with a full string compare performed byte by byte.
> >
> > Benchmarking data on ICX shows upto 2x speed up when compared to
> > __strstr_sse2_unaligned (including partial benchtests data from
> > bench-strstr.out):
> >
> > |---------------------------------+---------------+-----------------------|
> > | | strstr_avx512 |
> > | strstr_sse2_unaligned |
> > |---------------------------------+---------------+-----------------------|
> > | Length 16384/ 16, 1/11, found: | 1939.75 | 3458.44 |
> > | Length 16384/ 16, 14/ 5, fail : | 1967.75 | 3541.12 |
> > | Length 16384/ 32, 1/11, found: | 1540.38 | 2908.25 |
> > | Length 16384/ 32, 14/ 5, fail : | 1345.94 | 2866.31 |
> > | Length 16384/ 64, 1/11, found: | 1968.81 | 4327.56 |
> > | Length 16384/ 64, 14/ 5, fail : | 1993.75 | 4215.69 |
> > | Length 16384/128, 1/11, found: | 1535.44 | 3780.56 |
> > | Length 16384/128, 14/ 5, fail : | 1414.75 | 3595.25 |
> > | Length 16384/256, 1/11, found: | 2957.75 | 5501.44 |
> > | Length 16384/256, 14/ 5, fail : | 2682.62 | 5099.88 |
> > | Length 32768/ 16, 1/11, found: | 7820.19 | 11262.9 |
> > | Length 32768/ 16, 14/ 5, fail : | 8196.88 | 10871.2 |
> > | Length 32768/ 32, 1/11, found: | 5709.19 | 6611.56 |
> > | Length 32768/ 32, 14/ 5, fail : | 5716.12 | 6647.06 |
> > | Length 32768/ 64, 1/11, found: | 7160.44 | 10143.7 |
> > | Length 32768/ 64, 14/ 5, fail : | 7021.38 | 10150.6 |
> > | Length 32768/128, 1/11, found: | 4935.31 | 6756.56 |
> > | Length 32768/128, 14/ 5, fail : | 4774.38 | 6746.19 |
> > | Length 32768/256, 1/11, found: | 7933.19 | 12563.8 |
> > | Length 32768/256, 14/ 5, fail : | 7975 | 12558.6 |
> > | Length 65536/ 16, 1/11, found: | 9066.69 | 9419.62 |
> > | Length 65536/ 16, 14/ 5, fail : | 8496 | 9384.75 |
> > | Length 65536/ 32, 1/11, found: | 10258.8 | 11192.4 |
> > | Length 65536/ 32, 14/ 5, fail : | 8712.12 | 11172.3 |
> > | Length 65536/ 64, 1/11, found: | 11085.2 | 18162.1 |
> > | Length 65536/ 64, 14/ 5, fail : | 11219.6 | 17921.5 |
> > | Length 65536/128, 1/11, found: | 9753.56 | 18704.6 |
> > | Length 65536/128, 14/ 5, fail : | 9588.81 | 18465.6 |
> > | Length 65536/256, 1/11, found: | 18333.3 | 28505.2 |
> > | Length 65536/256, 14/ 5, fail : | 18018.8 | 27990.8 |
> > |---------------------------------+---------------+-----------------------|
>
> Can you add aggregate geomean all benchmarks sse2 / evex512?
>
> Also can you add all numbers to the email.
> > ---
> > sysdeps/x86_64/multiarch/Makefile | 2 +
> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
> > sysdeps/x86_64/multiarch/strstr-avx512.c | 208 +++++++++++++++++++++
> > sysdeps/x86_64/multiarch/strstr.c | 24 ++-
> > 4 files changed, 236 insertions(+), 4 deletions(-) create mode
> > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile
> > b/sysdeps/x86_64/multiarch/Makefile
> > index e7b413edad..6dc54a7265 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -126,6 +126,7 @@ sysdep_routines += \
> > strrchr-sse2 \
> > strspn-c \
> > strspn-sse2 \
> > + strstr-avx512 \
> > strstr-sse2-unaligned \
> > varshift \
> > # sysdep_routines
> > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c
> > += -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4
> > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq
> > +-mavx512bw -mbmi -mbmi2 -O3
> > endif
> >
> > ifeq ($(subdir),wcsmbs)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a594f4176e..cc9a7eaaa1 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name,
> > struct libc_ifunc_impl *array,
> >
> > /* Support sysdeps/x86_64/multiarch/strstr.c. */
> > IFUNC_IMPL (i, name, strstr,
> > + IFUNC_IMPL_ADD (array, i, strstr,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (AVX512DQ)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __strstr_avx512)
> > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> >
> > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > new file mode 100644
> > index 0000000000..4082a75a1b
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > @@ -0,0 +1,208 @@
> > +/* strstr optimized with 512-bit AVX-512 instructions
> > + Copyright (C) 2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#include <immintrin.h>
> > +#include <inttypes.h>
> > +#include <stdbool.h>
> > +#include <string.h>
> > +
> > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull
> > +#define ZMM_SIZE_IN_BYTES 64
> > +
> > +/*
> > + Returns the index of the first edge within the needle, returns 0
> > +if no edge is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > + */
> > +static inline size_t
> > +find_edge_in_needle (const char *ned) {
> > + size_t ind = 0;
> > + while (ned[ind + 1] != '\0')
> > + {
> > + if (ned[ind] != ned[ind + 1])
> > + return ind;
> > + else
> > + ind = ind + 1;
> > + }
> > + return 0;
> > +}
> > +
> > +/*
> > + Compare needle with haystack byte by byte at specified location
> > +*/ static inline bool verify_string_match (const char *hay, const
> > +size_t hay_index, const char *ned,
> > + size_t ind)
> > +{
> > + while (ned[ind] != '\0')
> > + {
> > + if (ned[ind] != hay[hay_index + ind])
> > + return false;
> > + ind = ind + 1;
> > + }
> > + return true;
> > +}
> > +
> > +/*
> > + Compare needle with haystack at specified location. The first 64
> > +bytes are compared using a ZMM register.
> > + */
> > +static inline bool
> > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > + const char *ned, const __mmask64 ned_mask,
> > + const __m512i ned_zmm) {
> > + /* check first 64 bytes using zmm and then scalar */
> > + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe
> > +to do so
> > + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask,
> > +hay_zmm, ned_zmm);
> > + if (match != 0x0) // failed the first few chars
> > + return false;
> > + else if (ned_mask == FULL_MMASK64)
> > + return verify_string_match (hay, hay_index, ned,
> > +ZMM_SIZE_IN_BYTES);
> > + return true;
> > +}
> > +
> > +char *
> > +__strstr_avx512 (const char *haystack, const char *ned) {
> > + char first = ned[0];
> > + if (first == '\0')
> > + return (char *)haystack;
> > + if (ned[1] == '\0')
> > + return (char *)strchr (haystack, ned[0]);
> > +
> > + size_t edge = find_edge_in_needle (ned);
> > +
> > + /* ensure haystack is as long as the pos of edge in needle */
> > + for (int ii = 0; ii < edge; ++ii)
> > + {
> > + if (haystack[ii] == '\0')
> > + return NULL;
> > + }
> > +
> > + const __m512i null = _mm512_setzero_si512 (); // '\0'
> > +
> > + /*
> > + Load 64 bytes of the needle and save it to a zmm register
> > + Read one cache line at a time to avoid loading across a page boundary
> > + */
> > + __mmask64 ned_load_mask
> > + = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
> > + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
> > + __mmask64 ned_nullmask
> > + = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm, null);
> > + if (__glibc_unlikely (ned_nullmask == 0x0))
> > + {
> > + ned_zmm = _mm512_loadu_si512 (ned);
> > + ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > + if (ned_nullmask != 0x0)
> > + ned_load_mask = ned_load_mask >> 1;
> > + }
> > + else
> > + {
> > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > + ned_load_mask = ned_load_mask >> 1;
> > + }
> > + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); const __m512i
> > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > +
> > + /*
> > + Read the bytes of haystack in the current cache line
> > + */
> > + size_t hay_index = edge;
> > + __mmask64 loadmask = _bzhi_u64 (
> > + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) &
> > + 63));
> > + /* First load is a partial cache line */ __m512i hay0 =
> > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > + /* Search for NULL and compare only till null char */
> > + __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask, hay0,
> > + null);
> > + __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT); cmpmask =
> > + _kand_mask64 (cmpmask, loadmask);
> > + /* Search for the 2 charaters of needle */
> > + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > + k1 = _kshiftri_mask64 (k1, 1);
> > + /* k2 masks tell us if both chars from needle match */ uint64_t
> > + k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> > + cmpmask));
> > + /* For every match, search for the entire needle for a full match
> > + */ while (k2)
> > + {
> > + uint64_t bitcount = _tzcnt_u64(k2);
> > + k2 = _blsr_u64(k2);
> > + size_t match_pos = hay_index + bitcount - edge;
> > + if (nullmask == 0)
> > + {
> > + if (verify_string_match_avx512 (haystack, match_pos, ned,
> > + ned_load_mask, ned_zmm))
> > + return (char *)haystack + match_pos;
> > + }
> > + else
> > + {
> > + if (verify_string_match (haystack, match_pos, ned, 0))
> > + return (char *)haystack + match_pos;
> > + }
> > + }
> > + /* We haven't checked for potential match at the last char yet */
> > + hay_index += _mm_popcnt_u64 (loadmask) - 1;
> > +
> > + /*
> > + Loop over one cache line at a time to prevent reading over page
> > + boundary
> > + */
> > + __m512i hay1;
> > + while (nullmask == 0)
> > + {
> > + hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > + hay1 = _mm512_load_si512 (haystack + hay_index
> > + + 1); // Always 64 byte aligned
> > + nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > + /* Compare only till null char */
> > + cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > + /* k2 masks tell us if both chars from needle match */
> > + k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> > + /* For every match, compare full strings for potential match */
> > + while (k2)
> > + {
> > + uint64_t bitcount = _tzcnt_u64(k2);
> > + k2 = _blsr_u64(k2);
> > + size_t match_pos = hay_index + bitcount - edge;
> > + if (nullmask == 0)
> > + {
> > + /*
> > + Since the haystack doesn't terminate at the current cache
> > + line, we can use zmm register to compare the first 64 bytes
> > + */
> > + if (verify_string_match_avx512 (haystack, match_pos, ned,
> > + ned_load_mask, ned_zmm))
> > + return (char *)haystack + match_pos;
> > + }
> > + else
> > + {
> > + /* Compare byte by byte */
> > + if (verify_string_match (haystack, match_pos, ned, 0))
> > + return (char *)haystack + match_pos;
> > + }
> > + }
> > + hay_index += ZMM_SIZE_IN_BYTES;
> > + }
> > + return NULL;
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strstr.c
> > b/sysdeps/x86_64/multiarch/strstr.c
> > index 95600a9de5..2fb8b169b6 100644
> > --- a/sysdeps/x86_64/multiarch/strstr.c
> > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > @@ -35,16 +35,32 @@
> >
> > extern __typeof (__redirect_strstr) __strstr_sse2_unaligned
> > attribute_hidden; extern __typeof (__redirect_strstr) __strstr_sse2
> > attribute_hidden;
> > +extern __typeof (__redirect_strstr) __strstr_avx512
> > +attribute_hidden;
> >
> > #include "init-arch.h"
> >
> > /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> > ifunc symbol properly. */
> > extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc
> > (__libc_strstr,
> > - HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > - ? __strstr_sse2_unaligned
> > - : __strstr_sse2)
> >
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > + const struct cpu_features *cpu_features = __get_cpu_features ();
> > +
> > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > + return __strstr_avx512;
> > +
> > + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > + return __strstr_sse2_unaligned;
> > +
> > + return __strstr_sse2;
> > +}
> > +
> > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr,
> > +IFUNC_SELECTOR ());
> > #undef strstr
> > strong_alias (__libc_strstr, strstr)
> > --
> > 2.36.1
> >
Can you run clang-format on this where it makes sense.
> -----Original Message-----
> From: Noah Goldstein <goldstein.w.n@gmail.com>
> Sent: Thursday, May 26, 2022 2:26 PM
> To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> Cc: GNU C Library <libc-alpha@sourceware.org>
> Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
>
> On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha <libc-
> alpha@sourceware.org> wrote:
> >
> > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> >
> > (1) We spend a few cycles at the begining to peek into the needle. We
> > locate an edge in the needle (first occurance of 2 consequent distinct
> > characters) and also store the first 64-bytes into a zmm register.
> >
> > (2) We search for the edge in the haystack by looking into one cache
> > line of the haystack at a time. This avoids having to read past a page
> > boundary which can cause a seg fault.
> >
> > (3) If an edge is found in the haystack we first compare the first
> > 64-bytes of the needle (already stored in a zmm register) before we
> > proceed with a full string compare performed byte by byte.
> >
> > Benchmarking data on ICX shows upto 2x speed up when compared to
> > __strstr_sse2_unaligned (including partial benchtests data from
> > bench-strstr.out):
> >
> > |---------------------------------+---------------+-----------------------|
> > | | strstr_avx512 |
> > | strstr_sse2_unaligned |
> > |---------------------------------+---------------+-----------------------|
> > | Length 16384/ 16, 1/11, found: | 1939.75 | 3458.44 |
> > | Length 16384/ 16, 14/ 5, fail : | 1967.75 | 3541.12 |
> > | Length 16384/ 32, 1/11, found: | 1540.38 | 2908.25 |
> > | Length 16384/ 32, 14/ 5, fail : | 1345.94 | 2866.31 |
> > | Length 16384/ 64, 1/11, found: | 1968.81 | 4327.56 |
> > | Length 16384/ 64, 14/ 5, fail : | 1993.75 | 4215.69 |
> > | Length 16384/128, 1/11, found: | 1535.44 | 3780.56 |
> > | Length 16384/128, 14/ 5, fail : | 1414.75 | 3595.25 |
> > | Length 16384/256, 1/11, found: | 2957.75 | 5501.44 |
> > | Length 16384/256, 14/ 5, fail : | 2682.62 | 5099.88 |
> > | Length 32768/ 16, 1/11, found: | 7820.19 | 11262.9 |
> > | Length 32768/ 16, 14/ 5, fail : | 8196.88 | 10871.2 |
> > | Length 32768/ 32, 1/11, found: | 5709.19 | 6611.56 |
> > | Length 32768/ 32, 14/ 5, fail : | 5716.12 | 6647.06 |
> > | Length 32768/ 64, 1/11, found: | 7160.44 | 10143.7 |
> > | Length 32768/ 64, 14/ 5, fail : | 7021.38 | 10150.6 |
> > | Length 32768/128, 1/11, found: | 4935.31 | 6756.56 |
> > | Length 32768/128, 14/ 5, fail : | 4774.38 | 6746.19 |
> > | Length 32768/256, 1/11, found: | 7933.19 | 12563.8 |
> > | Length 32768/256, 14/ 5, fail : | 7975 | 12558.6 |
> > | Length 65536/ 16, 1/11, found: | 9066.69 | 9419.62 |
> > | Length 65536/ 16, 14/ 5, fail : | 8496 | 9384.75 |
> > | Length 65536/ 32, 1/11, found: | 10258.8 | 11192.4 |
> > | Length 65536/ 32, 14/ 5, fail : | 8712.12 | 11172.3 |
> > | Length 65536/ 64, 1/11, found: | 11085.2 | 18162.1 |
> > | Length 65536/ 64, 14/ 5, fail : | 11219.6 | 17921.5 |
> > | Length 65536/128, 1/11, found: | 9753.56 | 18704.6 |
> > | Length 65536/128, 14/ 5, fail : | 9588.81 | 18465.6 |
> > | Length 65536/256, 1/11, found: | 18333.3 | 28505.2 |
> > | Length 65536/256, 14/ 5, fail : | 18018.8 | 27990.8 |
> > |---------------------------------+---------------+-----------------------|
> > ---
> > sysdeps/x86_64/multiarch/Makefile | 2 +
> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
> > sysdeps/x86_64/multiarch/strstr-avx512.c | 208
> +++++++++++++++++++++
> > sysdeps/x86_64/multiarch/strstr.c | 24 ++-
> > 4 files changed, 236 insertions(+), 4 deletions(-) create mode
> > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile
> > b/sysdeps/x86_64/multiarch/Makefile
> > index e7b413edad..6dc54a7265 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -126,6 +126,7 @@ sysdep_routines += \
> > strrchr-sse2 \
> > strspn-c \
> > strspn-sse2 \
> > + strstr-avx512 \
> > strstr-sse2-unaligned \
> > varshift \
> > # sysdep_routines
> > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c
> +=
> > -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4
> > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -
> mavx512bw
> > +-mbmi -mbmi2 -O3
> > endif
> >
> > ifeq ($(subdir),wcsmbs)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a594f4176e..cc9a7eaaa1 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct
> > libc_ifunc_impl *array,
> >
> > /* Support sysdeps/x86_64/multiarch/strstr.c. */
> > IFUNC_IMPL (i, name, strstr,
> > + IFUNC_IMPL_ADD (array, i, strstr,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (AVX512DQ)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __strstr_avx512)
> > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> >
> > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > new file mode 100644
> > index 0000000000..4082a75a1b
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > @@ -0,0 +1,208 @@
> > +/* strstr optimized with 512-bit AVX-512 instructions
> > + Copyright (C) 2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#include <immintrin.h>
> > +#include <inttypes.h>
> > +#include <stdbool.h>
> > +#include <string.h>
> > +
> > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull
> > +#define ZMM_SIZE_IN_BYTES 64
> > +
> > +/*
> > + Returns the index of the first edge within the needle, returns 0 if
> > +no edge is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > + */
> > +static inline size_t
> > +find_edge_in_needle (const char *ned) {
> > + size_t ind = 0;
> > + while (ned[ind + 1] != '\0')
> > + {
> > + if (ned[ind] != ned[ind + 1])
> > + return ind;
> > + else
> > + ind = ind + 1;
> > + }
> > + return 0;
> > +}
> > +
> > +/*
> > + Compare needle with haystack byte by byte at specified location */
> > +static inline bool verify_string_match (const char *hay, const size_t
> > +hay_index, const char *ned,
> > + size_t ind)
> > +{
> > + while (ned[ind] != '\0')
> > + {
> strcmp? (you might be able to use memcmp which will be faster
> but will need a bit of refactor to keep true nedlen and check for page
> cross on hay)
Wouldn't strcmp give you the wrong answer here? For example, I would need it to return true when the haystack is "abcdefg" and the needle is "bcd".
> > + if (ned[ind] != hay[hay_index + ind])
> > + return false;
> > + ind = ind + 1;
> > + }
> > + return true;
> > +}
> > +
> > +/*
> > + Compare needle with haystack at specified location. The first 64
> > +bytes are compared using a ZMM register.
> > + */
> > +static inline bool
> > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > + const char *ned, const __mmask64 ned_mask,
> > + const __m512i ned_zmm) {
> > + /* check first 64 bytes using zmm and then scalar */
> > + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe
> to
> > +do so
> > + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask,
> hay_zmm,
> > +ned_zmm);
> > + if (match != 0x0) // failed the first few chars
> > + return false;
> > + else if (ned_mask == FULL_MMASK64)
> > + return verify_string_match (hay, hay_index, ned,
> > +ZMM_SIZE_IN_BYTES);
> > + return true;
> > +}
> > +
> > +char *
> > +__strstr_avx512 (const char *haystack, const char *ned) {
> > + char first = ned[0];
> > + if (first == '\0')
> > + return (char *)haystack;
> > + if (ned[1] == '\0')
> > + return (char *)strchr (haystack, ned[0]);
> > +
> > + size_t edge = find_edge_in_needle (ned);
> > +
> > + /* ensure haystack is as long as the pos of edge in needle */ for
> > + (int ii = 0; ii < edge; ++ii)
> > + {
> strnlen
Makes sense.
> > + if (haystack[ii] == '\0')
> > + return NULL;
> > + }
> > +
> > + const __m512i null = _mm512_setzero_si512 (); // '\0'
> > +
> > + /*
> > + Load 64 bytes of the needle and save it to a zmm register
> > + Read one cache line at a time to avoid loading across a page boundary
> > + */
> > + __mmask64 ned_load_mask
> > + = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
> FULL_MMASK64 >> (((-(uintptr_t)ned) & 63));
+1
> > + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask,
> ned);
> Maybe conditional on highly unlike page cross this is very
> expensive if causes page walk
Elements on the next cache line are zero-masked; shouldn't that prevent a cross-page load?
> > + __mmask64 ned_nullmask
> > + = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm,
> null);
> _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm,
> ned_zmm)
>
> likewise at all other compares with null unless it breaks
> microfusion more than once.
The compiler was already emitting vptestnmb; it doesn't hurt to use it explicitly anyway.
>
> If you can replace all then get rid of null
> > + if (__glibc_unlikely (ned_nullmask == 0x0))
> > + {
> > + ned_zmm = _mm512_loadu_si512 (ned);
> > + ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > + if (ned_nullmask != 0x0)
> > + ned_load_mask = ned_load_mask >> 1;
> > + }
> > + else
> > + {
> > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > + ned_load_mask = ned_load_mask >> 1;
> I think you can get away with just ned_load_mask =
> ned_nullmask - ONE_64BIT because you only use this after
> checking haystack no null-term
Without the >> 1, we will compare the null char of the needle to the haystack, which will give you the wrong answer.
> > + }
> > + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); const __m512i
> > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > +
> > + /*
> > + Read the bytes of haystack in the current cache line
> > + */
> > + size_t hay_index = edge;
> > + __mmask64 loadmask = _bzhi_u64 (
> > + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> > + /* First load is a partial cache line */ __m512i hay0 =
> > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > + /* Search for NULL and compare only till null char */
> > + __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask,
> hay0,
> > + null);
> > + __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT); cmpmask
> =
> > + _kand_mask64 (cmpmask, loadmask);
> nullmask ^ (nullmask - ONE_64BIT); codegen ends up actually
> using kand_mask here. Since loadmask and nullmask both go through
> GPR (nullmask for the blsmsk) you can do this explicitly in uint64_t
> to help GCC out.
>
> > + /* Search for the 2 charaters of needle */
> > + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > + k1 = _kshiftri_mask64 (k1, 1);
> > + /* k2 masks tell us if both chars from needle match */ uint64_t k2
> > + = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> cmpmask));
> > + /* For every match, search for the entire needle for a full match
> > + */ while (k2)
> > + {
> > + uint64_t bitcount = _tzcnt_u64(k2);
> > + k2 = _blsr_u64(k2);
> > + size_t match_pos = hay_index + bitcount - edge;
> > + if (nullmask == 0)
> > + {
> > + if (verify_string_match_avx512 (haystack, match_pos, ned,
> > + ned_load_mask, ned_zmm))
> > + return (char *)haystack + match_pos;
> > + }
> > + else
> > + {
> > + if (verify_string_match (haystack, match_pos, ned, 0))
> > + return (char *)haystack + match_pos;
> > + }
> > + }
> > + /* We haven't checked for potential match at the last char yet */
> > + hay_index += _mm_popcnt_u64 (loadmask) - 1;
> hay_index = 0; haystay |= 63; You might want to check codegen and
> ensure hay_index is being optimized out. AFAICT you just need a
> pointer.
AFAICT, it does get optimized out.
> > +
> > + /*
> > + Loop over one cache line at a time to prevent reading over page
> > + boundary
> > + */
> > + __m512i hay1;
> > + while (nullmask == 0)
> > + {
> > + hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > + hay1 = _mm512_load_si512 (haystack + hay_index
> > + + 1); // Always 64 byte aligned
> Is this really faster than using kshiftri?
Yes (assuming you mean using just one load and using mask shift operations to look for a match). A lot of instructions in this loop are stuck on port 5, and so is kshiftri. Using two loads, which execute on ports 2 and 3, relieves that pressure.
> > + nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > + /* Compare only till null char */
> > + cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > + /* k2 masks tell us if both chars from needle match */
> > + k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> cmpmask));
> > + /* For every match, compare full strings for potential match */
> > + while (k2)
> > + {
> > + uint64_t bitcount = _tzcnt_u64(k2);
> > + k2 = _blsr_u64(k2);
> > + size_t match_pos = hay_index + bitcount - edge;
> > + if (nullmask == 0)
> > + {
> > + /*
> > + Since the haystack doesn't terminate at the current cache
> > + line, we can use zmm register to compare the first 64 bytes
> > + */
> > + if (verify_string_match_avx512 (haystack, match_pos, ned,
> > + ned_load_mask, ned_zmm))
> > + return (char *)haystack + match_pos;
> > + }
> > + else
> > + {
> > + /* Compare byte by byte */
> > + if (verify_string_match (haystack, match_pos, ned, 0))
> > + return (char *)haystack + match_pos;
> > + }
> > + }
> > + hay_index += ZMM_SIZE_IN_BYTES;
> > + }
> > + return NULL;
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strstr.c
> > b/sysdeps/x86_64/multiarch/strstr.c
> > index 95600a9de5..2fb8b169b6 100644
> > --- a/sysdeps/x86_64/multiarch/strstr.c
> > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > @@ -35,16 +35,32 @@
> >
> > extern __typeof (__redirect_strstr) __strstr_sse2_unaligned
> > attribute_hidden; extern __typeof (__redirect_strstr) __strstr_sse2
> > attribute_hidden;
> > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
> >
> > #include "init-arch.h"
> >
> > /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> > ifunc symbol properly. */
> > extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc
> > (__libc_strstr,
> > - HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > - ? __strstr_sse2_unaligned
> > - : __strstr_sse2)
> >
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > + const struct cpu_features *cpu_features = __get_cpu_features ();
> > +
> > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > + return __strstr_avx512;
> > +
> > + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > + return __strstr_sse2_unaligned;
> > +
> > + return __strstr_sse2;
> > +}
> > +
> > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr,
> > +IFUNC_SELECTOR ());
> > #undef strstr
> > strong_alias (__libc_strstr, strstr)
> > --
> > 2.36.1
> >
> -----Original Message-----
> From: Devulapalli, Raghuveer
> Sent: Tuesday, May 31, 2022 12:17 PM
> To: Noah Goldstein <goldstein.w.n@gmail.com>
> Cc: GNU C Library <libc-alpha@sourceware.org>
> Subject: RE: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
>
>
>
> > -----Original Message-----
> > From: Noah Goldstein <goldstein.w.n@gmail.com>
> > Sent: Thursday, May 26, 2022 2:26 PM
> > To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> > Cc: GNU C Library <libc-alpha@sourceware.org>
> > Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
> >
> > On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha
> > <libc- alpha@sourceware.org> wrote:
> > >
> > > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> > >
> > > (1) We spend a few cycles at the begining to peek into the needle.
> > > We locate an edge in the needle (first occurance of 2 consequent
> > > distinct
> > > characters) and also store the first 64-bytes into a zmm register.
> > >
> > > (2) We search for the edge in the haystack by looking into one cache
> > > line of the haystack at a time. This avoids having to read past a
> > > page boundary which can cause a seg fault.
> > >
> > > (3) If an edge is found in the haystack we first compare the first
> > > 64-bytes of the needle (already stored in a zmm register) before we
> > > proceed with a full string compare performed byte by byte.
> > >
> > > Benchmarking data on ICX shows upto 2x speed up when compared to
> > > __strstr_sse2_unaligned (including partial benchtests data from
> > > bench-strstr.out):
> > >
> > > |---------------------------------+---------------+-----------------------|
> > > | | strstr_avx512 |
> > > | strstr_sse2_unaligned |
> > > |---------------------------------+---------------+-----------------------|
> > > | Length 16384/ 16, 1/11, found: | 1939.75 | 3458.44 |
> > > | Length 16384/ 16, 14/ 5, fail : | 1967.75 | 3541.12 |
> > > | Length 16384/ 32, 1/11, found: | 1540.38 | 2908.25 |
> > > | Length 16384/ 32, 14/ 5, fail : | 1345.94 | 2866.31 |
> > > | Length 16384/ 64, 1/11, found: | 1968.81 | 4327.56 |
> > > | Length 16384/ 64, 14/ 5, fail : | 1993.75 | 4215.69 |
> > > | Length 16384/128, 1/11, found: | 1535.44 | 3780.56 |
> > > | Length 16384/128, 14/ 5, fail : | 1414.75 | 3595.25 |
> > > | Length 16384/256, 1/11, found: | 2957.75 | 5501.44 |
> > > | Length 16384/256, 14/ 5, fail : | 2682.62 | 5099.88 |
> > > | Length 32768/ 16, 1/11, found: | 7820.19 | 11262.9 |
> > > | Length 32768/ 16, 14/ 5, fail : | 8196.88 | 10871.2 |
> > > | Length 32768/ 32, 1/11, found: | 5709.19 | 6611.56 |
> > > | Length 32768/ 32, 14/ 5, fail : | 5716.12 | 6647.06 |
> > > | Length 32768/ 64, 1/11, found: | 7160.44 | 10143.7 |
> > > | Length 32768/ 64, 14/ 5, fail : | 7021.38 | 10150.6 |
> > > | Length 32768/128, 1/11, found: | 4935.31 | 6756.56 |
> > > | Length 32768/128, 14/ 5, fail : | 4774.38 | 6746.19 |
> > > | Length 32768/256, 1/11, found: | 7933.19 | 12563.8 |
> > > | Length 32768/256, 14/ 5, fail : | 7975 | 12558.6 |
> > > | Length 65536/ 16, 1/11, found: | 9066.69 | 9419.62 |
> > > | Length 65536/ 16, 14/ 5, fail : | 8496 | 9384.75 |
> > > | Length 65536/ 32, 1/11, found: | 10258.8 | 11192.4 |
> > > | Length 65536/ 32, 14/ 5, fail : | 8712.12 | 11172.3 |
> > > | Length 65536/ 64, 1/11, found: | 11085.2 | 18162.1 |
> > > | Length 65536/ 64, 14/ 5, fail : | 11219.6 | 17921.5 |
> > > | Length 65536/128, 1/11, found: | 9753.56 | 18704.6 |
> > > | Length 65536/128, 14/ 5, fail : | 9588.81 | 18465.6 |
> > > | Length 65536/256, 1/11, found: | 18333.3 | 28505.2 |
> > > | Length 65536/256, 14/ 5, fail : | 18018.8 | 27990.8 |
> > > |---------------------------------+---------------+-----------------------|
> > > ---
> > > sysdeps/x86_64/multiarch/Makefile | 2 +
> > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
> > > sysdeps/x86_64/multiarch/strstr-avx512.c | 208
> > +++++++++++++++++++++
> > > sysdeps/x86_64/multiarch/strstr.c | 24 ++-
> > > 4 files changed, 236 insertions(+), 4 deletions(-) create mode
> > > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile
> > > b/sysdeps/x86_64/multiarch/Makefile
> > > index e7b413edad..6dc54a7265 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -126,6 +126,7 @@ sysdep_routines += \
> > > strrchr-sse2 \
> > > strspn-c \
> > > strspn-sse2 \
> > > + strstr-avx512 \
> > > strstr-sse2-unaligned \
> > > varshift \
> > > # sysdep_routines
> > > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c
> > +=
> > > -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4
> > > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -
> > mavx512bw
> > > +-mbmi -mbmi2 -O3
> > > endif
> > >
> > > ifeq ($(subdir),wcsmbs)
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index a594f4176e..cc9a7eaaa1 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name,
> > > struct libc_ifunc_impl *array,
> > >
> > > /* Support sysdeps/x86_64/multiarch/strstr.c. */
> > > IFUNC_IMPL (i, name, strstr,
> > > + IFUNC_IMPL_ADD (array, i, strstr,
> > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > + && CPU_FEATURE_USABLE (AVX512BW)
> > > + && CPU_FEATURE_USABLE (AVX512DQ)
> > > + && CPU_FEATURE_USABLE (BMI2)),
> > > + __strstr_avx512)
> > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > new file mode 100644
> > > index 0000000000..4082a75a1b
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > @@ -0,0 +1,208 @@
> > > +/* strstr optimized with 512-bit AVX-512 instructions
> > > + Copyright (C) 2022 Free Software Foundation, Inc.
> > > + This file is part of the GNU C Library.
> > > +
> > > + The GNU C Library is free software; you can redistribute it and/or
> > > + modify it under the terms of the GNU Lesser General Public
> > > + License as published by the Free Software Foundation; either
> > > + version 2.1 of the License, or (at your option) any later version.
> > > +
> > > + The GNU C Library is distributed in the hope that it will be useful,
> > > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> > GNU
> > > + Lesser General Public License for more details.
> > > +
> > > + You should have received a copy of the GNU Lesser General Public
> > > + License along with the GNU C Library; if not, see
> > > + <https://www.gnu.org/licenses/>. */
> > > +
> > > +#include <immintrin.h>
> > > +#include <inttypes.h>
> > > +#include <stdbool.h>
> > > +#include <string.h>
> > > +
> > > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull
> > > +#define ZMM_SIZE_IN_BYTES 64
> > > +
> > > +/*
> > > + Returns the index of the first edge within the needle, returns 0
> > > +if no edge is found. Example: 'ab' is the first edge in
> 'aaaaaaaaaabaarddg'
> > > + */
> > > +static inline size_t
> > > +find_edge_in_needle (const char *ned) {
> > > + size_t ind = 0;
> > > + while (ned[ind + 1] != '\0')
> > > + {
> > > + if (ned[ind] != ned[ind + 1])
> > > + return ind;
> > > + else
> > > + ind = ind + 1;
> > > + }
> > > + return 0;
> > > +}
> > > +
> > > +/*
> > > + Compare needle with haystack byte by byte at specified location
> > > +*/ static inline bool verify_string_match (const char *hay, const
> > > +size_t hay_index, const char *ned,
> > > + size_t ind)
> > > +{
> > > + while (ned[ind] != '\0')
> > > + {
> > strcmp? (you might be able to use memcmp which will be faster
> > but will need a bit of refactor to keep true nedlen and check for page
> > cross on hay)
>
> Wouldn't strcmp give you the wrong answer here? For ex: I would need it to
> return true when haystack is "abcdefg" and needle is "bcd"
>
> > > + if (ned[ind] != hay[hay_index + ind])
> > > + return false;
> > > + ind = ind + 1;
> > > + }
> > > + return true;
> > > +}
> > > +
> > > +/*
> > > + Compare needle with haystack at specified location. The first 64
> > > +bytes are compared using a ZMM register.
> > > + */
> > > +static inline bool
> > > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > > + const char *ned, const __mmask64 ned_mask,
> > > + const __m512i ned_zmm) {
> > > + /* check first 64 bytes using zmm and then scalar */
> > > + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); //
> safe
> > to
> > > +do so
> > > + __mmask64 match = _mm512_mask_cmpneq_epi8_mask
> (ned_mask,
> > hay_zmm,
> > > +ned_zmm);
> > > + if (match != 0x0) // failed the first few chars
> > > + return false;
> > > + else if (ned_mask == FULL_MMASK64)
> > > + return verify_string_match (hay, hay_index, ned,
> > > +ZMM_SIZE_IN_BYTES);
> > > + return true;
> > > +}
> > > +
> > > +char *
> > > +__strstr_avx512 (const char *haystack, const char *ned) {
> > > + char first = ned[0];
> > > + if (first == '\0')
> > > + return (char *)haystack;
> > > + if (ned[1] == '\0')
> > > + return (char *)strchr (haystack, ned[0]);
> > > +
> > > + size_t edge = find_edge_in_needle (ned);
> > > +
> > > + /* ensure haystack is as long as the pos of edge in needle */
> > > + for (int ii = 0; ii < edge; ++ii)
> > > + {
> > strnlen
>
> Makes sense.
The function call here negatively affects performance for small-sized haystacks. I will keep this as-is.
>
> > > + if (haystack[ii] == '\0')
> > > + return NULL;
> > > + }
> > > +
> > > + const __m512i null = _mm512_setzero_si512 (); // '\0'
> > > +
> > > + /*
> > > + Load 64 bytes of the needle and save it to a zmm register
> > > + Read one cache line at a time to avoid loading across a page boundary
> > > + */
> > > + __mmask64 ned_load_mask
> > > + = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
> > FULL_MMASK64 >> (((-(uintptr_t)ned) & 63));
>
> +1
>
> > > + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask,
> > ned);
> > Maybe conditional on highly unlike page cross this is very
> > expensive if causes page walk
>
> Elements on the next cache line are zero masked, shouldn't that prevent a
> cross page load?
>
> > > + __mmask64 ned_nullmask
> > > + = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm,
> > null);
> > _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm,
> > ned_zmm)
> >
> > likewise at all other compares with null unless it breaks
> > microfusion more than once.
>
> The compiler was using vptestnmb, doesn't hurt to explicitly use it anyways.
>
> >
> > If you can replace all then get rid of null
> > > + if (__glibc_unlikely (ned_nullmask == 0x0))
> > > + {
> > > + ned_zmm = _mm512_loadu_si512 (ned);
> > > + ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > + if (ned_nullmask != 0x0)
> > > + ned_load_mask = ned_load_mask >> 1;
> > > + }
> > > + else
> > > + {
> > > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > + ned_load_mask = ned_load_mask >> 1;
> > I think you can get away with just ned_load_mask =
> > ned_nullmask - ONE_64BIT because you only use this after
> > checking haystack no null-term
>
> Without the >> 1, we will compare the null char of the needle to the
> haystack which will give you the wrong answer.
>
> > > + }
> > > + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); const
> __m512i
> > > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > > +
> > > + /*
> > > + Read the bytes of haystack in the current cache line
> > > + */
> > > + size_t hay_index = edge;
> > > + __mmask64 loadmask = _bzhi_u64 (
> > > + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) &
> > > + 63));
> > > + /* First load is a partial cache line */ __m512i hay0 =
> > > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > + /* Search for NULL and compare only till null char */
> > > + __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask
> (loadmask,
> > hay0,
> > > + null);
> > > + __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> cmpmask
> > =
> > > + _kand_mask64 (cmpmask, loadmask);
> > nullmask ^ (nullmask - ONE_64BIT); codegen ends up actually
> > using kand_mask here. Since loadmask and nullmask both go through
> > GPR (nullmask for the blsmsk) you can do this explicitly in uint64_t
> > to help GCC out.
> >
> > > + /* Search for the 2 charaters of needle */
> > > + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > + k1 = _kshiftri_mask64 (k1, 1);
> > > + /* k2 masks tell us if both chars from needle match */ uint64_t
> > > + k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> > cmpmask));
> > > + /* For every match, search for the entire needle for a full match
> > > + */ while (k2)
> > > + {
> > > + uint64_t bitcount = _tzcnt_u64(k2);
> > > + k2 = _blsr_u64(k2);
> > > + size_t match_pos = hay_index + bitcount - edge;
> > > + if (nullmask == 0)
> > > + {
> > > + if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > + ned_load_mask, ned_zmm))
> > > + return (char *)haystack + match_pos;
> > > + }
> > > + else
> > > + {
> > > + if (verify_string_match (haystack, match_pos, ned, 0))
> > > + return (char *)haystack + match_pos;
> > > + }
> > > + }
> > > + /* We haven't checked for potential match at the last char yet */
> > > + hay_index += _mm_popcnt_u64 (loadmask) - 1;
> > hay_index = 0; haystay |= 63; You might want to check codegen and
> > ensure hay_index is being optimized out. AFAICT you just need a
> > pointer.
>
> AFAICT, looks like it does optimize it out.
>
> > > +
> > > + /*
> > > + Loop over one cache line at a time to prevent reading over page
> > > + boundary
> > > + */
> > > + __m512i hay1;
> > > + while (nullmask == 0)
> > > + {
> > > + hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > + hay1 = _mm512_load_si512 (haystack + hay_index
> > > + + 1); // Always 64 byte aligned
> > Is this really faster than using kshiftri?
>
> Yes (assuming you mean using just one load and use mask shift operations
> to look for a match). A lot of instructions in this loop are stuck on port 5 and
> so is kshiftri. Using 2 loads which execute on Port 2 and 3 relives that
> pressure.
>
> > > + nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > > + /* Compare only till null char */
> > > + cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > + /* k2 masks tell us if both chars from needle match */
> > > + k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> > cmpmask));
> > > + /* For every match, compare full strings for potential match */
> > > + while (k2)
> > > + {
> > > + uint64_t bitcount = _tzcnt_u64(k2);
> > > + k2 = _blsr_u64(k2);
> > > + size_t match_pos = hay_index + bitcount - edge;
> > > + if (nullmask == 0)
> > > + {
> > > + /*
> > > + Since the haystack doesn't terminate at the current cache
> > > + line, we can use zmm register to compare the first 64 bytes
> > > + */
> > > + if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > + ned_load_mask, ned_zmm))
> > > + return (char *)haystack + match_pos;
> > > + }
> > > + else
> > > + {
> > > + /* Compare byte by byte */
> > > + if (verify_string_match (haystack, match_pos, ned, 0))
> > > + return (char *)haystack + match_pos;
> > > + }
> > > + }
> > > + hay_index += ZMM_SIZE_IN_BYTES;
> > > + }
> > > + return NULL;
> > > +}
> > > diff --git a/sysdeps/x86_64/multiarch/strstr.c
> > > b/sysdeps/x86_64/multiarch/strstr.c
> > > index 95600a9de5..2fb8b169b6 100644
> > > --- a/sysdeps/x86_64/multiarch/strstr.c
> > > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > > @@ -35,16 +35,32 @@
> > >
> > > extern __typeof (__redirect_strstr) __strstr_sse2_unaligned
> > > attribute_hidden; extern __typeof (__redirect_strstr) __strstr_sse2
> > > attribute_hidden;
> > > +extern __typeof (__redirect_strstr) __strstr_avx512
> > > +attribute_hidden;
> > >
> > > #include "init-arch.h"
> > >
> > > /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> > > ifunc symbol properly. */
> > > extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc
> > > (__libc_strstr,
> > > - HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > > - ? __strstr_sse2_unaligned
> > > - : __strstr_sse2)
> > >
> > > +static inline void *
> > > +IFUNC_SELECTOR (void)
> > > +{
> > > + const struct cpu_features *cpu_features = __get_cpu_features ();
> > > +
> > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > > + return __strstr_avx512;
> > > +
> > > + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > + return __strstr_sse2_unaligned;
> > > +
> > > + return __strstr_sse2;
> > > +}
> > > +
> > > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr,
> > > +IFUNC_SELECTOR ());
> > > #undef strstr
> > > strong_alias (__libc_strstr, strstr)
> > > --
> > > 2.36.1
> > >
On Tue, May 31, 2022 at 2:16 PM Devulapalli, Raghuveer
<raghuveer.devulapalli@intel.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Noah Goldstein <goldstein.w.n@gmail.com>
> > Sent: Thursday, May 26, 2022 2:26 PM
> > To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> > Cc: GNU C Library <libc-alpha@sourceware.org>
> > Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
> >
> > On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha <libc-
> > alpha@sourceware.org> wrote:
> > >
> > > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> > >
> > > (1) We spend a few cycles at the begining to peek into the needle. We
> > > locate an edge in the needle (first occurance of 2 consequent distinct
> > > characters) and also store the first 64-bytes into a zmm register.
> > >
> > > (2) We search for the edge in the haystack by looking into one cache
> > > line of the haystack at a time. This avoids having to read past a page
> > > boundary which can cause a seg fault.
> > >
> > > (3) If an edge is found in the haystack we first compare the first
> > > 64-bytes of the needle (already stored in a zmm register) before we
> > > proceed with a full string compare performed byte by byte.
> > >
> > > Benchmarking data on ICX shows upto 2x speed up when compared to
> > > __strstr_sse2_unaligned (including partial benchtests data from
> > > bench-strstr.out):
> > >
> > > |---------------------------------+---------------+-----------------------|
> > > | | strstr_avx512 |
> > > | strstr_sse2_unaligned |
> > > |---------------------------------+---------------+-----------------------|
> > > | Length 16384/ 16, 1/11, found: | 1939.75 | 3458.44 |
> > > | Length 16384/ 16, 14/ 5, fail : | 1967.75 | 3541.12 |
> > > | Length 16384/ 32, 1/11, found: | 1540.38 | 2908.25 |
> > > | Length 16384/ 32, 14/ 5, fail : | 1345.94 | 2866.31 |
> > > | Length 16384/ 64, 1/11, found: | 1968.81 | 4327.56 |
> > > | Length 16384/ 64, 14/ 5, fail : | 1993.75 | 4215.69 |
> > > | Length 16384/128, 1/11, found: | 1535.44 | 3780.56 |
> > > | Length 16384/128, 14/ 5, fail : | 1414.75 | 3595.25 |
> > > | Length 16384/256, 1/11, found: | 2957.75 | 5501.44 |
> > > | Length 16384/256, 14/ 5, fail : | 2682.62 | 5099.88 |
> > > | Length 32768/ 16, 1/11, found: | 7820.19 | 11262.9 |
> > > | Length 32768/ 16, 14/ 5, fail : | 8196.88 | 10871.2 |
> > > | Length 32768/ 32, 1/11, found: | 5709.19 | 6611.56 |
> > > | Length 32768/ 32, 14/ 5, fail : | 5716.12 | 6647.06 |
> > > | Length 32768/ 64, 1/11, found: | 7160.44 | 10143.7 |
> > > | Length 32768/ 64, 14/ 5, fail : | 7021.38 | 10150.6 |
> > > | Length 32768/128, 1/11, found: | 4935.31 | 6756.56 |
> > > | Length 32768/128, 14/ 5, fail : | 4774.38 | 6746.19 |
> > > | Length 32768/256, 1/11, found: | 7933.19 | 12563.8 |
> > > | Length 32768/256, 14/ 5, fail : | 7975 | 12558.6 |
> > > | Length 65536/ 16, 1/11, found: | 9066.69 | 9419.62 |
> > > | Length 65536/ 16, 14/ 5, fail : | 8496 | 9384.75 |
> > > | Length 65536/ 32, 1/11, found: | 10258.8 | 11192.4 |
> > > | Length 65536/ 32, 14/ 5, fail : | 8712.12 | 11172.3 |
> > > | Length 65536/ 64, 1/11, found: | 11085.2 | 18162.1 |
> > > | Length 65536/ 64, 14/ 5, fail : | 11219.6 | 17921.5 |
> > > | Length 65536/128, 1/11, found: | 9753.56 | 18704.6 |
> > > | Length 65536/128, 14/ 5, fail : | 9588.81 | 18465.6 |
> > > | Length 65536/256, 1/11, found: | 18333.3 | 28505.2 |
> > > | Length 65536/256, 14/ 5, fail : | 18018.8 | 27990.8 |
> > > |---------------------------------+---------------+-----------------------|
> > > ---
> > > sysdeps/x86_64/multiarch/Makefile | 2 +
> > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
> > > sysdeps/x86_64/multiarch/strstr-avx512.c | 208
> > +++++++++++++++++++++
> > > sysdeps/x86_64/multiarch/strstr.c | 24 ++-
> > > 4 files changed, 236 insertions(+), 4 deletions(-) create mode
> > > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile
> > > b/sysdeps/x86_64/multiarch/Makefile
> > > index e7b413edad..6dc54a7265 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -126,6 +126,7 @@ sysdep_routines += \
> > > strrchr-sse2 \
> > > strspn-c \
> > > strspn-sse2 \
> > > + strstr-avx512 \
> > > strstr-sse2-unaligned \
> > > varshift \
> > > # sysdep_routines
> > > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c
> > +=
> > > -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4
> > > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -
> > mavx512bw
> > > +-mbmi -mbmi2 -O3
> > > endif
> > >
> > > ifeq ($(subdir),wcsmbs)
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index a594f4176e..cc9a7eaaa1 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct
> > > libc_ifunc_impl *array,
> > >
> > > /* Support sysdeps/x86_64/multiarch/strstr.c. */
> > > IFUNC_IMPL (i, name, strstr,
> > > + IFUNC_IMPL_ADD (array, i, strstr,
> > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > + && CPU_FEATURE_USABLE (AVX512BW)
> > > + && CPU_FEATURE_USABLE (AVX512DQ)
> > > + && CPU_FEATURE_USABLE (BMI2)),
> > > + __strstr_avx512)
> > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > new file mode 100644
> > > index 0000000000..4082a75a1b
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > @@ -0,0 +1,208 @@
> > > +/* strstr optimized with 512-bit AVX-512 instructions
> > > + Copyright (C) 2022 Free Software Foundation, Inc.
> > > + This file is part of the GNU C Library.
> > > +
> > > + The GNU C Library is free software; you can redistribute it and/or
> > > + modify it under the terms of the GNU Lesser General Public
> > > + License as published by the Free Software Foundation; either
> > > + version 2.1 of the License, or (at your option) any later version.
> > > +
> > > + The GNU C Library is distributed in the hope that it will be useful,
> > > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> > GNU
> > > + Lesser General Public License for more details.
> > > +
> > > + You should have received a copy of the GNU Lesser General Public
> > > + License along with the GNU C Library; if not, see
> > > + <https://www.gnu.org/licenses/>. */
> > > +
> > > +#include <immintrin.h>
> > > +#include <inttypes.h>
> > > +#include <stdbool.h>
> > > +#include <string.h>
> > > +
> > > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull
> > > +#define ZMM_SIZE_IN_BYTES 64
> > > +
> > > +/*
> > > + Returns the index of the first edge within the needle, returns 0 if
> > > +no edge is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > > + */
> > > +static inline size_t
> > > +find_edge_in_needle (const char *ned) {
> > > + size_t ind = 0;
> > > + while (ned[ind + 1] != '\0')
> > > + {
> > > + if (ned[ind] != ned[ind + 1])
> > > + return ind;
> > > + else
> > > + ind = ind + 1;
> > > + }
> > > + return 0;
> > > +}
> > > +
> > > +/*
> > > + Compare needle with haystack byte by byte at specified location */
> > > +static inline bool verify_string_match (const char *hay, const size_t
> > > +hay_index, const char *ned,
> > > + size_t ind)
> > > +{
> > > + while (ned[ind] != '\0')
> > > + {
> > strcmp? (you might be able to use memcmp which will be faster
> > but will need a bit of refactor to keep true nedlen and check for page
> > cross on hay)
>
> Wouldn't strcmp give you the wrong answer here? For ex: I would need it to return true when haystack is "abcdefg" and needle is "bcd"
Yeah, strcmp won't work directly. If you can refactor so that nedlen is known,
then you can use memcmp. Otherwise, can you manually implement at least the
first VEC_SIZE comparison with vectors (obviously, make sure it's a speedup)? That
should cover *most* needles.
If need be, you can throw the edge cases to plain byte comparison.
>
> > > + if (ned[ind] != hay[hay_index + ind])
> > > + return false;
> > > + ind = ind + 1;
> > > + }
> > > + return true;
> > > +}
> > > +
> > > +/*
> > > + Compare needle with haystack at specified location. The first 64
> > > +bytes are compared using a ZMM register.
> > > + */
> > > +static inline bool
> > > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > > + const char *ned, const __mmask64 ned_mask,
> > > + const __m512i ned_zmm) {
> > > + /* check first 64 bytes using zmm and then scalar */
> > > + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe
> > to
> > > +do so
> > > + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask,
> > hay_zmm,
> > > +ned_zmm);
> > > + if (match != 0x0) // failed the first few chars
> > > + return false;
> > > + else if (ned_mask == FULL_MMASK64)
> > > + return verify_string_match (hay, hay_index, ned,
> > > +ZMM_SIZE_IN_BYTES);
> > > + return true;
> > > +}
> > > +
> > > +char *
> > > +__strstr_avx512 (const char *haystack, const char *ned) {
> > > + char first = ned[0];
> > > + if (first == '\0')
> > > + return (char *)haystack;
> > > + if (ned[1] == '\0')
> > > + return (char *)strchr (haystack, ned[0]);
> > > +
> > > + size_t edge = find_edge_in_needle (ned);
> > > +
> > > + /* ensure haystack is as long as the pos of edge in needle */ for
> > > + (int ii = 0; ii < edge; ++ii)
> > > + {
> > strnlen
>
> Makes sense.
>
> > > + if (haystack[ii] == '\0')
> > > + return NULL;
> > > + }
> > > +
> > > + const __m512i null = _mm512_setzero_si512 (); // '\0'
> > > +
> > > + /*
> > > + Load 64 bytes of the needle and save it to a zmm register
> > > + Read one cache line at a time to avoid loading across a page boundary
> > > + */
> > > + __mmask64 ned_load_mask
> > > + = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
> > FULL_MMASK64 >> (((-(uintptr_t)ned) & 63));
>
> +1
>
> > > + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask,
> > ned);
> > Maybe conditional on highly unlike page cross this is very
> > expensive if causes page walk
>
> Elements on the next cache line are zero masked, shouldn't that prevent a cross page load?
Still does a page walk (and IIRC has non-intuitive rules for updating TLB so it
can get pretty ugly).
>
> > > + __mmask64 ned_nullmask
> > > + = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm,
> > null);
> > _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm,
> > ned_zmm)
> >
> > likewise at all other compares with null unless it breaks
> > microfusion more than once.
>
> The compiler was using vptestnmb, doesn't hurt to explicitly use it anyways.
>
> >
> > If you can replace all then get rid of null
> > > + if (__glibc_unlikely (ned_nullmask == 0x0))
> > > + {
> > > + ned_zmm = _mm512_loadu_si512 (ned);
> > > + ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > + if (ned_nullmask != 0x0)
> > > + ned_load_mask = ned_load_mask >> 1;
> > > + }
> > > + else
> > > + {
> > > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > + ned_load_mask = ned_load_mask >> 1;
> > I think you can get away with just ned_load_mask =
> > ned_nullmask - ONE_64BIT because you only use this after
> > checking haystack no null-term
>
> Without the >> 1, we will compare the null char of the needle to the haystack which will give you the wrong answer.
Thats only if you have the `xor` with the original. The xor with the
original is needed
to clear null hits out of range but because you already check that the haystack
has no null term is fine to leave those potential high bit 1s in place.
I may be wrong.
>
> > > + }
> > > + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); const __m512i
> > > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > > +
> > > + /*
> > > + Read the bytes of haystack in the current cache line
> > > + */
> > > + size_t hay_index = edge;
> > > + __mmask64 loadmask = _bzhi_u64 (
> > > + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> > > + /* First load is a partial cache line */ __m512i hay0 =
> > > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > + /* Search for NULL and compare only till null char */
> > > + __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask,
> > hay0,
> > > + null);
> > > + __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT); cmpmask
> > =
> > > + _kand_mask64 (cmpmask, loadmask);
> > nullmask ^ (nullmask - ONE_64BIT); codegen ends up actually
> > using kand_mask here. Since loadmask and nullmask both go through
> > GPR (nullmask for the blsmsk) you can do this explicitly in uint64_t
> > to help GCC out.
Doable?
> >
> > > + /* Search for the 2 charaters of needle */
> > > + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > + k1 = _kshiftri_mask64 (k1, 1);
> > > + /* k2 masks tell us if both chars from needle match */ uint64_t k2
> > > + = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> > cmpmask));
Doable?
> > > + /* For every match, search for the entire needle for a full match
> > > + */ while (k2)
> > > + {
> > > + uint64_t bitcount = _tzcnt_u64(k2);
> > > + k2 = _blsr_u64(k2);
> > > + size_t match_pos = hay_index + bitcount - edge;
> > > + if (nullmask == 0)
> > > + {
> > > + if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > + ned_load_mask, ned_zmm))
> > > + return (char *)haystack + match_pos;
> > > + }
> > > + else
> > > + {
> > > + if (verify_string_match (haystack, match_pos, ned, 0))
> > > + return (char *)haystack + match_pos;
> > > + }
> > > + }
> > > + /* We haven't checked for potential match at the last char yet */
> > > + hay_index += _mm_popcnt_u64 (loadmask) - 1;
> > hay_index = 0; haystay |= 63; You might want to check codegen and
> > ensure hay_index is being optimized out. AFAICT you just need a
> > pointer.
>
> AFAICT, looks like it does optimize it out.
Good. Can you switch to the `|=` though.
>
> > > +
> > > + /*
> > > + Loop over one cache line at a time to prevent reading over page
> > > + boundary
> > > + */
> > > + __m512i hay1;
> > > + while (nullmask == 0)
> > > + {
> > > + hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > + hay1 = _mm512_load_si512 (haystack + hay_index
> > > + + 1); // Always 64 byte aligned
> > Is this really faster than using kshiftri?
>
> Yes (assuming you mean using just one load and use mask shift operations to look for a match). A lot of instructions in this loop are stuck on port 5 and so is kshiftri. Using 2 loads which execute on Port 2 and 3 relieves that pressure.
>
Fair enough.
> > > + nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > > + /* Compare only till null char */
> > > + cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > + /* k2 masks tell us if both chars from needle match */
> > > + k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> > cmpmask));
Doable?
> > > + /* For every match, compare full strings for potential match */
> > > + while (k2)
> > > + {
> > > + uint64_t bitcount = _tzcnt_u64(k2);
> > > + k2 = _blsr_u64(k2);
> > > + size_t match_pos = hay_index + bitcount - edge;
> > > + if (nullmask == 0)
> > > + {
> > > + /*
> > > + Since the haystack doesn't terminate at the current cache
> > > + line, we can use zmm register to compare the first 64 bytes
> > > + */
> > > + if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > + ned_load_mask, ned_zmm))
> > > + return (char *)haystack + match_pos;
> > > + }
> > > + else
> > > + {
> > > + /* Compare byte by byte */
> > > + if (verify_string_match (haystack, match_pos, ned, 0))
> > > + return (char *)haystack + match_pos;
> > > + }
> > > + }
> > > + hay_index += ZMM_SIZE_IN_BYTES;
> > > + }
> > > + return NULL;
> > > +}
> > > diff --git a/sysdeps/x86_64/multiarch/strstr.c
> > > b/sysdeps/x86_64/multiarch/strstr.c
> > > index 95600a9de5..2fb8b169b6 100644
> > > --- a/sysdeps/x86_64/multiarch/strstr.c
> > > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > > @@ -35,16 +35,32 @@
> > >
> > > extern __typeof (__redirect_strstr) __strstr_sse2_unaligned
> > > attribute_hidden; extern __typeof (__redirect_strstr) __strstr_sse2
> > > attribute_hidden;
> > > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
> > >
> > > #include "init-arch.h"
> > >
> > > /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> > > ifunc symbol properly. */
> > > extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc
> > > (__libc_strstr,
> > > - HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > > - ? __strstr_sse2_unaligned
> > > - : __strstr_sse2)
> > >
> > > +static inline void *
> > > +IFUNC_SELECTOR (void)
> > > +{
> > > + const struct cpu_features *cpu_features = __get_cpu_features ();
> > > +
> > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > > + return __strstr_avx512;
> > > +
> > > + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > + return __strstr_sse2_unaligned;
> > > +
> > > + return __strstr_sse2;
> > > +}
> > > +
> > > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr,
> > > +IFUNC_SELECTOR ());
> > > #undef strstr
> > > strong_alias (__libc_strstr, strstr)
> > > --
> > > 2.36.1
> > >
> -----Original Message-----
> From: Noah Goldstein <goldstein.w.n@gmail.com>
> Sent: Tuesday, May 31, 2022 2:34 PM
> To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> Cc: GNU C Library <libc-alpha@sourceware.org>
> Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
>
> On Tue, May 31, 2022 at 2:16 PM Devulapalli, Raghuveer
> <raghuveer.devulapalli@intel.com> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Noah Goldstein <goldstein.w.n@gmail.com>
> > > Sent: Thursday, May 26, 2022 2:26 PM
> > > To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> > > Cc: GNU C Library <libc-alpha@sourceware.org>
> > > Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit
> > > EVEX
> > >
> > > On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha
> > > <libc- alpha@sourceware.org> wrote:
> > > >
> > > > Adding a 512-bit EVEX version of strstr. The algorithm works as
> follows:
> > > >
> > > > (1) We spend a few cycles at the begining to peek into the needle.
> > > > We locate an edge in the needle (first occurance of 2 consequent
> > > > distinct
> > > > characters) and also store the first 64-bytes into a zmm register.
> > > >
> > > > (2) We search for the edge in the haystack by looking into one
> > > > cache line of the haystack at a time. This avoids having to read
> > > > past a page boundary which can cause a seg fault.
> > > >
> > > > (3) If an edge is found in the haystack we first compare the first
> > > > 64-bytes of the needle (already stored in a zmm register) before
> > > > we proceed with a full string compare performed byte by byte.
> > > >
> > > > Benchmarking data on ICX shows upto 2x speed up when compared to
> > > > __strstr_sse2_unaligned (including partial benchtests data from
> > > > bench-strstr.out):
> > > >
> > > > |---------------------------------+---------------+-----------------------|
> > > > | | strstr_avx512 |
> > > > | strstr_sse2_unaligned |
> > > > |---------------------------------+---------------+-----------------------|
> > > > | Length 16384/ 16, 1/11, found: | 1939.75 | 3458.44 |
> > > > | Length 16384/ 16, 14/ 5, fail : | 1967.75 | 3541.12 |
> > > > | Length 16384/ 32, 1/11, found: | 1540.38 | 2908.25 |
> > > > | Length 16384/ 32, 14/ 5, fail : | 1345.94 | 2866.31 |
> > > > | Length 16384/ 64, 1/11, found: | 1968.81 | 4327.56 |
> > > > | Length 16384/ 64, 14/ 5, fail : | 1993.75 | 4215.69 |
> > > > | Length 16384/128, 1/11, found: | 1535.44 | 3780.56 |
> > > > | Length 16384/128, 14/ 5, fail : | 1414.75 | 3595.25 |
> > > > | Length 16384/256, 1/11, found: | 2957.75 | 5501.44 |
> > > > | Length 16384/256, 14/ 5, fail : | 2682.62 | 5099.88 |
> > > > | Length 32768/ 16, 1/11, found: | 7820.19 | 11262.9 |
> > > > | Length 32768/ 16, 14/ 5, fail : | 8196.88 | 10871.2 |
> > > > | Length 32768/ 32, 1/11, found: | 5709.19 | 6611.56 |
> > > > | Length 32768/ 32, 14/ 5, fail : | 5716.12 | 6647.06 |
> > > > | Length 32768/ 64, 1/11, found: | 7160.44 | 10143.7 |
> > > > | Length 32768/ 64, 14/ 5, fail : | 7021.38 | 10150.6 |
> > > > | Length 32768/128, 1/11, found: | 4935.31 | 6756.56 |
> > > > | Length 32768/128, 14/ 5, fail : | 4774.38 | 6746.19 |
> > > > | Length 32768/256, 1/11, found: | 7933.19 | 12563.8 |
> > > > | Length 32768/256, 14/ 5, fail : | 7975 | 12558.6 |
> > > > | Length 65536/ 16, 1/11, found: | 9066.69 | 9419.62 |
> > > > | Length 65536/ 16, 14/ 5, fail : | 8496 | 9384.75 |
> > > > | Length 65536/ 32, 1/11, found: | 10258.8 | 11192.4 |
> > > > | Length 65536/ 32, 14/ 5, fail : | 8712.12 | 11172.3 |
> > > > | Length 65536/ 64, 1/11, found: | 11085.2 | 18162.1 |
> > > > | Length 65536/ 64, 14/ 5, fail : | 11219.6 | 17921.5 |
> > > > | Length 65536/128, 1/11, found: | 9753.56 | 18704.6 |
> > > > | Length 65536/128, 14/ 5, fail : | 9588.81 | 18465.6 |
> > > > | Length 65536/256, 1/11, found: | 18333.3 | 28505.2 |
> > > > | Length 65536/256, 14/ 5, fail : | 18018.8 | 27990.8 |
> > > > |---------------------------------+---------------+-----------------------|
> > > > ---
> > > > sysdeps/x86_64/multiarch/Makefile | 2 +
> > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
> > > > sysdeps/x86_64/multiarch/strstr-avx512.c | 208
> > > +++++++++++++++++++++
> > > > sysdeps/x86_64/multiarch/strstr.c | 24 ++-
> > > > 4 files changed, 236 insertions(+), 4 deletions(-) create mode
> > > > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/Makefile
> > > > b/sysdeps/x86_64/multiarch/Makefile
> > > > index e7b413edad..6dc54a7265 100644
> > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > @@ -126,6 +126,7 @@ sysdep_routines += \
> > > > strrchr-sse2 \
> > > > strspn-c \
> > > > strspn-sse2 \
> > > > + strstr-avx512 \
> > > > strstr-sse2-unaligned \
> > > > varshift \
> > > > # sysdep_routines
> > > > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
> > > > CFLAGS-strcspn-c.c
> > > +=
> > > > -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4
> > > > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -
> > > mavx512bw
> > > > +-mbmi -mbmi2 -O3
> > > > endif
> > > >
> > > > ifeq ($(subdir),wcsmbs)
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > index a594f4176e..cc9a7eaaa1 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name,
> > > > struct libc_ifunc_impl *array,
> > > >
> > > > /* Support sysdeps/x86_64/multiarch/strstr.c. */
> > > > IFUNC_IMPL (i, name, strstr,
> > > > + IFUNC_IMPL_ADD (array, i, strstr,
> > > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > > + && CPU_FEATURE_USABLE (AVX512BW)
> > > > + && CPU_FEATURE_USABLE (AVX512DQ)
> > > > + && CPU_FEATURE_USABLE (BMI2)),
> > > > + __strstr_avx512)
> > > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> > > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > new file mode 100644
> > > > index 0000000000..4082a75a1b
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > @@ -0,0 +1,208 @@
> > > > +/* strstr optimized with 512-bit AVX-512 instructions
> > > > + Copyright (C) 2022 Free Software Foundation, Inc.
> > > > + This file is part of the GNU C Library.
> > > > +
> > > > + The GNU C Library is free software; you can redistribute it and/or
> > > > + modify it under the terms of the GNU Lesser General Public
> > > > + License as published by the Free Software Foundation; either
> > > > + version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > + The GNU C Library is distributed in the hope that it will be useful,
> > > > + but WITHOUT ANY WARRANTY; without even the implied warranty
> of
> > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
> the
> > > GNU
> > > > + Lesser General Public License for more details.
> > > > +
> > > > + You should have received a copy of the GNU Lesser General Public
> > > > + License along with the GNU C Library; if not, see
> > > > + <https://www.gnu.org/licenses/>. */
> > > > +
> > > > +#include <immintrin.h>
> > > > +#include <inttypes.h>
> > > > +#include <stdbool.h>
> > > > +#include <string.h>
> > > > +
> > > > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull
> > > > +#define ZMM_SIZE_IN_BYTES 64
> > > > +
> > > > +/*
> > > > + Returns the index of the first edge within the needle, returns 0
> > > > +if no edge is found. Example: 'ab' is the first edge in
> 'aaaaaaaaaabaarddg'
> > > > + */
> > > > +static inline size_t
> > > > +find_edge_in_needle (const char *ned) {
> > > > + size_t ind = 0;
> > > > + while (ned[ind + 1] != '\0')
> > > > + {
> > > > + if (ned[ind] != ned[ind + 1])
> > > > + return ind;
> > > > + else
> > > > + ind = ind + 1;
> > > > + }
> > > > + return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + Compare needle with haystack byte by byte at specified location
> > > > +*/ static inline bool verify_string_match (const char *hay, const
> > > > +size_t hay_index, const char *ned,
> > > > + size_t ind)
> > > > +{
> > > > + while (ned[ind] != '\0')
> > > > + {
> > > strcmp? (you might be able to use memcmp which will be faster
> > > but will need a bit of refactor to keep true nedlen and check for page
> > > cross on hay)
> >
> > Wouldn't strcmp give you the wrong answer here? For ex: I would need it
> to return true when haystack is "abcdefg" and needle is "bcd"
>
> Yeah strcmp won't work directly. If you can refactor so nedlen is
> known then you can use memcmp. Otherwise can you manually implement
> at least first VEC_SIZE comparison with vectors (obv make sure its a
> speedup). That should cover *most* needles.
>
> If need be you can throw the edge cases to just byte comparison.
This is exactly what the function verify_string_match_avx512 does.
It compares the first 64 bytes of the needle to 64 bytes of haystack
with vcmpb and then does a byte by byte comparison for the rest
of the data. The hot loop calls the verify_string_match_avx512 first
and then switches to byte by byte comparison which is kind of
inevitable to avoid cross page loads for both haystack and needle.
> >
> > > > + if (ned[ind] != hay[hay_index + ind])
> > > > + return false;
> > > > + ind = ind + 1;
> > > > + }
> > > > + return true;
> > > > +}
> > > > +
> > > > +/*
> > > > + Compare needle with haystack at specified location. The first 64
> > > > +bytes are compared using a ZMM register.
> > > > + */
> > > > +static inline bool
> > > > +verify_string_match_avx512 (const char *hay, const size_t
> hay_index,
> > > > + const char *ned, const __mmask64 ned_mask,
> > > > + const __m512i ned_zmm) {
> > > > + /* check first 64 bytes using zmm and then scalar */
> > > > + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); //
> safe
> > > to
> > > > +do so
> > > > + __mmask64 match = _mm512_mask_cmpneq_epi8_mask
> (ned_mask,
> > > hay_zmm,
> > > > +ned_zmm);
> > > > + if (match != 0x0) // failed the first few chars
> > > > + return false;
> > > > + else if (ned_mask == FULL_MMASK64)
> > > > + return verify_string_match (hay, hay_index, ned,
> > > > +ZMM_SIZE_IN_BYTES);
> > > > + return true;
> > > > +}
> > > > +
> > > > +char *
> > > > +__strstr_avx512 (const char *haystack, const char *ned) {
> > > > + char first = ned[0];
> > > > + if (first == '\0')
> > > > + return (char *)haystack;
> > > > + if (ned[1] == '\0')
> > > > + return (char *)strchr (haystack, ned[0]);
> > > > +
> > > > + size_t edge = find_edge_in_needle (ned);
> > > > +
> > > > + /* ensure haystack is as long as the pos of edge in needle */
> > > > + for (int ii = 0; ii < edge; ++ii)
> > > > + {
> > > strnlen
> >
> > Makes sense.
> >
> > > > + if (haystack[ii] == '\0')
> > > > + return NULL;
> > > > + }
> > > > +
> > > > + const __m512i null = _mm512_setzero_si512 (); // '\0'
> > > > +
> > > > + /*
> > > > + Load 64 bytes of the needle and save it to a zmm register
> > > > + Read one cache line at a time to avoid loading across a page
> boundary
> > > > + */
> > > > + __mmask64 ned_load_mask
> > > > + = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
> > > FULL_MMASK64 >> (((-(uintptr_t)ned) & 63));
> >
> > +1
> >
> > > > + __m512i ned_zmm = _mm512_maskz_loadu_epi8
> (ned_load_mask,
> > > ned);
> > > Maybe conditional on highly unlike page cross this is very
> > > expensive if causes page walk
> >
> > Elements on the next cache line are zero masked, shouldn't that prevent a
> cross page load?
>
> Still does a page walk (and IIRC has non-intuitive rules for updating TLB so it
> can get pretty ugly).
Ok, will add the conditional on page cross.
> >
> > > > + __mmask64 ned_nullmask
> > > > + = _mm512_mask_cmpeq_epi8_mask (ned_load_mask,
> ned_zmm,
> > > null);
> > > _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm,
> > > ned_zmm)
> > >
> > > likewise at all other compares with null unless it breaks
> > > microfusion more than once.
> >
> > The compiler was using vptestnmb, doesn't hurt to explicitly use it
> anyways.
> >
> > >
> > > If you can replace all then get rid of null
> > > > + if (__glibc_unlikely (ned_nullmask == 0x0))
> > > > + {
> > > > + ned_zmm = _mm512_loadu_si512 (ned);
> > > > + ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > > > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > > + if (ned_nullmask != 0x0)
> > > > + ned_load_mask = ned_load_mask >> 1;
> > > > + }
> > > > + else
> > > > + {
> > > > + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > > + ned_load_mask = ned_load_mask >> 1;
> > > I think you can get away with just ned_load_mask =
> > > ned_nullmask - ONE_64BIT because you only use this after
> > > checking haystack no null-term
> >
> > Without the >> 1, we will compare the null char of the needle to the
> haystack which will give you the wrong answer.
>
> Thats only if you have the `xor` with the original. The xor with the original is
> needed to clear null hits out of range but because you already check that the
> haystack has no null term is fine to leave those potential high bit 1s in place.
>
> I may be wrong.
The benchtests fail without the shift op. Pretty sure I need it.
> >
> > > > + }
> > > > + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); const
> > > > + __m512i
> > > > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > > > +
> > > > + /*
> > > > + Read the bytes of haystack in the current cache line
> > > > + */
> > > > + size_t hay_index = edge;
> > > > + __mmask64 loadmask = _bzhi_u64 (
> > > > + FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) &
> > > > + 63));
> > > > + /* First load is a partial cache line */ __m512i hay0 =
> > > > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > > + /* Search for NULL and compare only till null char */
> > > > + __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask
> (loadmask,
> > > hay0,
> > > > + null);
> > > > + __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> cmpmask
> > > =
> > > > + _kand_mask64 (cmpmask, loadmask);
> > > nullmask ^ (nullmask - ONE_64BIT); codegen ends up actually
> > > using kand_mask here. Since loadmask and nullmask both go through
> > > GPR (nullmask for the blsmsk) you can do this explicitly in uint64_t
> > > to help GCC out.
>
> Doable?
Should be fine. I assume you mean use type uint64_t for both nullmask and cmpmask.
> > >
> > > > + /* Search for the 2 charaters of needle */
> > > > + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > > + k1 = _kshiftri_mask64 (k1, 1);
> > > > + /* k2 masks tell us if both chars from needle match */
> > > > + uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0,
> > > > + k1),
> > > cmpmask));
>
> Doable?
Yup.
>
> > > > + /* For every match, search for the entire needle for a full
> > > > + match */ while (k2)
> > > > + {
> > > > + uint64_t bitcount = _tzcnt_u64(k2);
> > > > + k2 = _blsr_u64(k2);
> > > > + size_t match_pos = hay_index + bitcount - edge;
> > > > + if (nullmask == 0)
> > > > + {
> > > > + if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > > + ned_load_mask, ned_zmm))
> > > > + return (char *)haystack + match_pos;
> > > > + }
> > > > + else
> > > > + {
> > > > + if (verify_string_match (haystack, match_pos, ned, 0))
> > > > + return (char *)haystack + match_pos;
> > > > + }
> > > > + }
> > > > + /* We haven't checked for potential match at the last char yet
> > > > + */ hay_index += _mm_popcnt_u64 (loadmask) - 1;
> > > hay_index = 0; haystay |= 63; You might want to check codegen and
> > > ensure hay_index is being optimized out. AFAICT you just need a
> > > pointer.
> >
> > AFAICT, looks like it does optimize it out.
>
> Good. Can you switch to the `|=` though.
Not sure what you mean. Could you explain?
> >
> > > > +
> > > > + /*
> > > > + Loop over one cache line at a time to prevent reading over page
> > > > + boundary
> > > > + */
> > > > + __m512i hay1;
> > > > + while (nullmask == 0)
> > > > + {
> > > > + hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > > + hay1 = _mm512_load_si512 (haystack + hay_index
> > > > + + 1); // Always 64 byte aligned
> > > Is this really faster than using kshiftri?
> >
> > Yes (assuming you mean using just one load and use mask shift operations
> to look for a match). A lot of instructions in this loop are stuck on port 5 and
> so is kshiftri. Using 2 loads which execute on Port 2 and 3 relieves that
> pressure.
> >
>
> Fair enough.
> > > > + nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > > > + /* Compare only till null char */
> > > > + cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > + k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > > + /* k2 masks tell us if both chars from needle match */
> > > > + k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> > > cmpmask));
>
> Doable?
Yup.
> > > > + /* For every match, compare full strings for potential match */
> > > > + while (k2)
> > > > + {
> > > > + uint64_t bitcount = _tzcnt_u64(k2);
> > > > + k2 = _blsr_u64(k2);
> > > > + size_t match_pos = hay_index + bitcount - edge;
> > > > + if (nullmask == 0)
> > > > + {
> > > > + /*
> > > > + Since the haystack doesn't terminate at the current cache
> > > > + line, we can use zmm register to compare the first 64 bytes
> > > > + */
> > > > + if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > > + ned_load_mask, ned_zmm))
> > > > + return (char *)haystack + match_pos;
> > > > + }
> > > > + else
> > > > + {
> > > > + /* Compare byte by byte */
> > > > + if (verify_string_match (haystack, match_pos, ned, 0))
> > > > + return (char *)haystack + match_pos;
> > > > + }
> > > > + }
> > > > + hay_index += ZMM_SIZE_IN_BYTES;
> > > > + }
> > > > + return NULL;
> > > > +}
> > > > diff --git a/sysdeps/x86_64/multiarch/strstr.c
> > > > b/sysdeps/x86_64/multiarch/strstr.c
> > > > index 95600a9de5..2fb8b169b6 100644
> > > > --- a/sysdeps/x86_64/multiarch/strstr.c
> > > > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > > > @@ -35,16 +35,32 @@
> > > >
> > > > extern __typeof (__redirect_strstr) __strstr_sse2_unaligned
> > > > attribute_hidden; extern __typeof (__redirect_strstr)
> > > > __strstr_sse2 attribute_hidden;
> > > > +extern __typeof (__redirect_strstr) __strstr_avx512
> > > > +attribute_hidden;
> > > >
> > > > #include "init-arch.h"
> > > >
> > > > /* Avoid DWARF definition DIE on ifunc symbol so that GDB can
> handle
> > > > ifunc symbol properly. */
> > > > extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc
> > > > (__libc_strstr,
> > > > - HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > > > - ? __strstr_sse2_unaligned
> > > > - : __strstr_sse2)
> > > >
> > > > +static inline void *
> > > > +IFUNC_SELECTOR (void)
> > > > +{
> > > > + const struct cpu_features *cpu_features = __get_cpu_features
> > > > +();
> > > > +
> > > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > > > + && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > > > + return __strstr_avx512;
> > > > +
> > > > + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > > + return __strstr_sse2_unaligned;
> > > > +
> > > > + return __strstr_sse2;
> > > > +}
> > > > +
> > > > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr,
> > > > +IFUNC_SELECTOR ());
> > > > #undef strstr
> > > > strong_alias (__libc_strstr, strstr)
> > > > --
> > > > 2.36.1
> > > >
@@ -126,6 +126,7 @@ sysdep_routines += \
strrchr-sse2 \
strspn-c \
strspn-sse2 \
+ strstr-avx512 \
strstr-sse2-unaligned \
varshift \
# sysdep_routines
@@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
+CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
endif
ifeq ($(subdir),wcsmbs)
@@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strstr.c. */
IFUNC_IMPL (i, name, strstr,
+ IFUNC_IMPL_ADD (array, i, strstr,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (AVX512DQ)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strstr_avx512)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
new file mode 100644
@@ -0,0 +1,208 @@
+/* strstr optimized with 512-bit AVX-512 instructions
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define FULL_MMASK64 0xffffffffffffffff
+#define ONE_64BIT 0x1ull
+#define ZMM_SIZE_IN_BYTES 64
+
/*
  Return the index of the first "edge" in NED: the first position I such
  that NED[I] != NED[I + 1].  If the needle is a single character, or all
  of its characters are identical, return 0.
  Example: the first edge in "aaaaaaaaaabaarddg" is at index 9 ('a','b').
 */
static inline size_t
find_edge_in_needle (const char *ned)
{
  for (size_t pos = 0; ned[pos + 1] != '\0'; ++pos)
    if (ned[pos] != ned[pos + 1])
      return pos;
  return 0;
}
+
/*
  Byte-by-byte comparison of the needle against the haystack at a given
  candidate position.  Compares NED starting at offset IND against
  HAY + HAY_INDEX + IND; true iff NED matches through its terminator.
 */
static inline bool
verify_string_match (const char *hay, const size_t hay_index, const char *ned,
		     size_t ind)
{
  for (; ned[ind] != '\0'; ++ind)
    if (hay[hay_index + ind] != ned[ind])
      return false;
  return true;
}
+
+/*
+  Compare needle with haystack at specified location. The first 64 bytes are
+  compared using a ZMM register.
+
+  NED_ZMM holds the first (up to 64) bytes of the needle and NED_MASK
+  selects which of those bytes are valid (everything before the needle's
+  NUL, capped at 64 bytes).  If the needle is 64 bytes or longer
+  (NED_MASK == FULL_MMASK64), the remainder is verified byte by byte.
+
+  NOTE(review): the unmasked 64-byte load below assumes callers only use
+  this path when the haystack provably extends past the current cache
+  line (checked via nullmask == 0 at the call sites), so the load cannot
+  fault -- confirm this invariant holds for every caller.
+ */
+static inline bool
+verify_string_match_avx512 (const char *hay, const size_t hay_index,
+ const char *ned, const __mmask64 ned_mask,
+ const __m512i ned_zmm)
+{
+  /* check first 64 bytes using zmm and then scalar */
+ __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
+ __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
+ if (match != 0x0) // failed the first few chars
+ return false;
+ else if (ned_mask == FULL_MMASK64)
+   /* First 64 bytes matched and the needle continues: finish the
+      comparison byte by byte from offset 64 onwards.  */
+ return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
+ return true;
+}
+
+char *
+__strstr_avx512 (const char *haystack, const char *ned)
+{
+ char first = ned[0];
+ if (first == '\0')
+ return (char *)haystack;
+ if (ned[1] == '\0')
+ return (char *)strchr (haystack, ned[0]);
+
+ size_t edge = find_edge_in_needle (ned);
+
+ /* ensure haystack is as long as the pos of edge in needle */
+ for (int ii = 0; ii < edge; ++ii)
+ {
+ if (haystack[ii] == '\0')
+ return NULL;
+ }
+
+ const __m512i null = _mm512_setzero_si512 (); // '\0'
+
+ /*
+ Load 64 bytes of the needle and save it to a zmm register
+ Read one cache line at a time to avoid loading across a page boundary
+ */
+ __mmask64 ned_load_mask
+ = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
+ __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
+ __mmask64 ned_nullmask
+ = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm, null);
+ if (__glibc_unlikely (ned_nullmask == 0x0))
+ {
+ ned_zmm = _mm512_loadu_si512 (ned);
+ ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
+ ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+ if (ned_nullmask != 0x0)
+ ned_load_mask = ned_load_mask >> 1;
+ }
+ else
+ {
+ ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+ ned_load_mask = ned_load_mask >> 1;
+ }
+ const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
+ const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
+
+ /*
+ Read the bytes of haystack in the current cache line
+ */
+ size_t hay_index = edge;
+ __mmask64 loadmask = _bzhi_u64 (
+ FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
+ /* First load is a partial cache line */
+ __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
+ /* Search for NULL and compare only till null char */
+ __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask, hay0, null);
+ __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+ cmpmask = _kand_mask64 (cmpmask, loadmask);
+ /* Search for the 2 charaters of needle */
+ __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+ __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
+ k1 = _kshiftri_mask64 (k1, 1);
+ /* k2 masks tell us if both chars from needle match */
+ uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
+ /* For every match, search for the entire needle for a full match */
+ while (k2)
+ {
+ uint64_t bitcount = _tzcnt_u64(k2);
+ k2 = _blsr_u64(k2);
+ size_t match_pos = hay_index + bitcount - edge;
+ if (nullmask == 0)
+ {
+ if (verify_string_match_avx512 (haystack, match_pos, ned,
+ ned_load_mask, ned_zmm))
+ return (char *)haystack + match_pos;
+ }
+ else
+ {
+ if (verify_string_match (haystack, match_pos, ned, 0))
+ return (char *)haystack + match_pos;
+ }
+ }
+ /* We haven't checked for potential match at the last char yet */
+ hay_index += _mm_popcnt_u64 (loadmask) - 1;
+
+ /*
+ Loop over one cache line at a time to prevent reading over page
+ boundary
+ */
+ __m512i hay1;
+ while (nullmask == 0)
+ {
+ hay0 = _mm512_loadu_si512 (haystack + hay_index);
+ hay1 = _mm512_load_si512 (haystack + hay_index
+ + 1); // Always 64 byte aligned
+ nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
+ /* Compare only till null char */
+ cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+ k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+ k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
+ /* k2 masks tell us if both chars from needle match */
+ k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
+ /* For every match, compare full strings for potential match */
+ while (k2)
+ {
+ uint64_t bitcount = _tzcnt_u64(k2);
+ k2 = _blsr_u64(k2);
+ size_t match_pos = hay_index + bitcount - edge;
+ if (nullmask == 0)
+ {
+ /*
+ Since the haystack doesn't terminate at the current cache
+ line, we can use zmm register to compare the first 64 bytes
+ */
+ if (verify_string_match_avx512 (haystack, match_pos, ned,
+ ned_load_mask, ned_zmm))
+ return (char *)haystack + match_pos;
+ }
+ else
+ {
+ /* Compare byte by byte */
+ if (verify_string_match (haystack, match_pos, ned, 0))
+ return (char *)haystack + match_pos;
+ }
+ }
+ hay_index += ZMM_SIZE_IN_BYTES;
+ }
+ return NULL;
+}
@@ -35,16 +35,32 @@
extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
#include "init-arch.h"
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr,
- HAS_ARCH_FEATURE (Fast_Unaligned_Load)
- ? __strstr_sse2_unaligned
- : __strstr_sse2)
+/* Select the best strstr implementation for the running CPU; called
+   once at relocation time by the ifunc machinery.  */
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features *cpu_features = __get_cpu_features ();
+
+ /* Prefer the 512-bit EVEX version only when the platform does not ask
+    to avoid AVX-512 (Prefer_No_AVX512, e.g. to sidestep frequency
+    throttling) and all ISA extensions used by strstr-avx512.c are
+    usable.  */
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ return __strstr_avx512;
+
+ /* Otherwise keep the pre-existing selection logic.  */
+ if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+ return __strstr_sse2_unaligned;
+
+ return __strstr_sse2;
+}
+
+libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
#undef strstr
strong_alias (__libc_strstr, strstr)