[1/1] x86_64: Add strstr function with 512-bit EVEX

Message ID 20220526202209.1382238-1-raghuveer.devulapalli@intel.com
State Superseded
Headers
Series [1/1] x86_64: Add strstr function with 512-bit EVEX |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Raghuveer Devulapalli May 26, 2022, 8:22 p.m. UTC
  Adding a 512-bit EVEX version of strstr. The algorithm works as follows:

(1) We spend a few cycles at the beginning to peek into the needle. We
locate an edge in the needle (first occurrence of 2 consecutive distinct
characters) and also store the first 64 bytes into a zmm register.

(2) We search for the edge in the haystack by looking into one cache
line of the haystack at a time. This avoids having to read past a page
boundary which can cause a seg fault.

(3) If an edge is found in the haystack we first compare the first
64-bytes of the needle (already stored in a zmm register) before we
proceed with a full string compare performed byte by byte.

Benchmarking data on ICX shows up to 2x speedup when compared to
__strstr_sse2_unaligned (including partial benchtests data from
bench-strstr.out):

|---------------------------------+---------------+-----------------------|
|                                 | strstr_avx512 | strstr_sse2_unaligned |
|---------------------------------+---------------+-----------------------|
| Length 16384/ 16,  1/11, found: | 1939.75       | 3458.44               |
| Length 16384/ 16, 14/ 5, fail : | 1967.75       | 3541.12               |
| Length 16384/ 32,  1/11, found: | 1540.38       | 2908.25               |
| Length 16384/ 32, 14/ 5, fail : | 1345.94       | 2866.31               |
| Length 16384/ 64,  1/11, found: | 1968.81       | 4327.56               |
| Length 16384/ 64, 14/ 5, fail : | 1993.75       | 4215.69               |
| Length 16384/128,  1/11, found: | 1535.44       | 3780.56               |
| Length 16384/128, 14/ 5, fail : | 1414.75       | 3595.25               |
| Length 16384/256,  1/11, found: | 2957.75       | 5501.44               |
| Length 16384/256, 14/ 5, fail : | 2682.62       | 5099.88               |
| Length 32768/ 16,  1/11, found: | 7820.19       | 11262.9               |
| Length 32768/ 16, 14/ 5, fail : | 8196.88       | 10871.2               |
| Length 32768/ 32,  1/11, found: | 5709.19       | 6611.56               |
| Length 32768/ 32, 14/ 5, fail : | 5716.12       | 6647.06               |
| Length 32768/ 64,  1/11, found: | 7160.44       | 10143.7               |
| Length 32768/ 64, 14/ 5, fail : | 7021.38       | 10150.6               |
| Length 32768/128,  1/11, found: | 4935.31       | 6756.56               |
| Length 32768/128, 14/ 5, fail : | 4774.38       | 6746.19               |
| Length 32768/256,  1/11, found: | 7933.19       | 12563.8               |
| Length 32768/256, 14/ 5, fail : | 7975          | 12558.6               |
| Length 65536/ 16,  1/11, found: | 9066.69       | 9419.62               |
| Length 65536/ 16, 14/ 5, fail : | 8496          | 9384.75               |
| Length 65536/ 32,  1/11, found: | 10258.8       | 11192.4               |
| Length 65536/ 32, 14/ 5, fail : | 8712.12       | 11172.3               |
| Length 65536/ 64,  1/11, found: | 11085.2       | 18162.1               |
| Length 65536/ 64, 14/ 5, fail : | 11219.6       | 17921.5               |
| Length 65536/128,  1/11, found: | 9753.56       | 18704.6               |
| Length 65536/128, 14/ 5, fail : | 9588.81       | 18465.6               |
| Length 65536/256,  1/11, found: | 18333.3       | 28505.2               |
| Length 65536/256, 14/ 5, fail : | 18018.8       | 27990.8               |
|---------------------------------+---------------+-----------------------|
---
 sysdeps/x86_64/multiarch/Makefile          |   2 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
 sysdeps/x86_64/multiarch/strstr-avx512.c   | 208 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
 4 files changed, 236 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
  

Comments

Noah Goldstein May 26, 2022, 9:25 p.m. UTC | #1
On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
>
> (1) We spend a few cycles at the begining to peek into the needle. We
> locate an edge in the needle (first occurance of 2 consequent distinct
> characters) and also store the first 64-bytes into a zmm register.
>
> (2) We search for the edge in the haystack by looking into one cache
> line of the haystack at a time. This avoids having to read past a page
> boundary which can cause a seg fault.
>
> (3) If an edge is found in the haystack we first compare the first
> 64-bytes of the needle (already stored in a zmm register) before we
> proceed with a full string compare performed byte by byte.
>
> Benchmarking data on ICX shows upto 2x speed up when compared to
> __strstr_sse2_unaligned (including partial benchtests data from
> bench-strstr.out):
>
> |---------------------------------+---------------+-----------------------|
> |                                 | strstr_avx512 | strstr_sse2_unaligned |
> |---------------------------------+---------------+-----------------------|
> | Length 16384/ 16,  1/11, found: | 1939.75       | 3458.44               |
> | Length 16384/ 16, 14/ 5, fail : | 1967.75       | 3541.12               |
> | Length 16384/ 32,  1/11, found: | 1540.38       | 2908.25               |
> | Length 16384/ 32, 14/ 5, fail : | 1345.94       | 2866.31               |
> | Length 16384/ 64,  1/11, found: | 1968.81       | 4327.56               |
> | Length 16384/ 64, 14/ 5, fail : | 1993.75       | 4215.69               |
> | Length 16384/128,  1/11, found: | 1535.44       | 3780.56               |
> | Length 16384/128, 14/ 5, fail : | 1414.75       | 3595.25               |
> | Length 16384/256,  1/11, found: | 2957.75       | 5501.44               |
> | Length 16384/256, 14/ 5, fail : | 2682.62       | 5099.88               |
> | Length 32768/ 16,  1/11, found: | 7820.19       | 11262.9               |
> | Length 32768/ 16, 14/ 5, fail : | 8196.88       | 10871.2               |
> | Length 32768/ 32,  1/11, found: | 5709.19       | 6611.56               |
> | Length 32768/ 32, 14/ 5, fail : | 5716.12       | 6647.06               |
> | Length 32768/ 64,  1/11, found: | 7160.44       | 10143.7               |
> | Length 32768/ 64, 14/ 5, fail : | 7021.38       | 10150.6               |
> | Length 32768/128,  1/11, found: | 4935.31       | 6756.56               |
> | Length 32768/128, 14/ 5, fail : | 4774.38       | 6746.19               |
> | Length 32768/256,  1/11, found: | 7933.19       | 12563.8               |
> | Length 32768/256, 14/ 5, fail : | 7975          | 12558.6               |
> | Length 65536/ 16,  1/11, found: | 9066.69       | 9419.62               |
> | Length 65536/ 16, 14/ 5, fail : | 8496          | 9384.75               |
> | Length 65536/ 32,  1/11, found: | 10258.8       | 11192.4               |
> | Length 65536/ 32, 14/ 5, fail : | 8712.12       | 11172.3               |
> | Length 65536/ 64,  1/11, found: | 11085.2       | 18162.1               |
> | Length 65536/ 64, 14/ 5, fail : | 11219.6       | 17921.5               |
> | Length 65536/128,  1/11, found: | 9753.56       | 18704.6               |
> | Length 65536/128, 14/ 5, fail : | 9588.81       | 18465.6               |
> | Length 65536/256,  1/11, found: | 18333.3       | 28505.2               |
> | Length 65536/256, 14/ 5, fail : | 18018.8       | 27990.8               |
> |---------------------------------+---------------+-----------------------|
> ---
>  sysdeps/x86_64/multiarch/Makefile          |   2 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
>  sysdeps/x86_64/multiarch/strstr-avx512.c   | 208 +++++++++++++++++++++
>  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
>  4 files changed, 236 insertions(+), 4 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e7b413edad..6dc54a7265 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -126,6 +126,7 @@ sysdep_routines += \
>    strrchr-sse2 \
>    strspn-c \
>    strspn-sse2 \
> +  strstr-avx512 \
>    strstr-sse2-unaligned \
>    varshift \
>  # sysdep_routines
> @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
>  CFLAGS-strcspn-c.c += -msse4
>  CFLAGS-strpbrk-c.c += -msse4
>  CFLAGS-strspn-c.c += -msse4
> +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
>  endif
>
>  ifeq ($(subdir),wcsmbs)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a594f4176e..cc9a7eaaa1 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
>    IFUNC_IMPL (i, name, strstr,
> +              IFUNC_IMPL_ADD (array, i, strstr,
> +                              (CPU_FEATURE_USABLE (AVX512VL)
> +                               && CPU_FEATURE_USABLE (AVX512BW)
> +                               && CPU_FEATURE_USABLE (AVX512DQ)
> +                               && CPU_FEATURE_USABLE (BMI2)),
> +                              __strstr_avx512)
>               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
>               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
>
> diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> new file mode 100644
> index 0000000000..4082a75a1b
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> @@ -0,0 +1,208 @@
> +/* strstr optimized with 512-bit AVX-512 instructions
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <immintrin.h>
> +#include <inttypes.h>
> +#include <stdbool.h>
> +#include <string.h>
> +
> +#define FULL_MMASK64 0xffffffffffffffff
> +#define ONE_64BIT 0x1ull
> +#define ZMM_SIZE_IN_BYTES 64
> +
> +/*
> + Returns the index of the first edge within the needle, returns 0 if no edge
> + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> + */
> +static inline size_t
> +find_edge_in_needle (const char *ned)
> +{
> +  size_t ind = 0;
> +  while (ned[ind + 1] != '\0')
> +    {
> +      if (ned[ind] != ned[ind + 1])
> +        return ind;
> +      else
> +        ind = ind + 1;
> +    }
> +  return 0;
> +}
> +
> +/*
> + Compare needle with haystack byte by byte at specified location
> + */
> +static inline bool
> +verify_string_match (const char *hay, const size_t hay_index, const char *ned,
> +                     size_t ind)
> +{
> +  while (ned[ind] != '\0')
> +    {
      strcmp? (you might be able to use memcmp which will be faster
      but will need a bit of refactor to keep true nedlen and check for page
      cross on hay)
> +      if (ned[ind] != hay[hay_index + ind])
> +        return false;
> +      ind = ind + 1;
> +    }
> +  return true;
> +}
> +
> +/*
> + Compare needle with haystack at specified location. The first 64 bytes are
> + compared using a ZMM register.
> + */
> +static inline bool
> +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> +                            const char *ned, const __mmask64 ned_mask,
> +                            const __m512i ned_zmm)
> +{
> +  /* check first 64 bytes using zmm and then scalar */
> +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
> +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
> +  if (match != 0x0) // failed the first few chars
> +    return false;
> +  else if (ned_mask == FULL_MMASK64)
> +    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
> +  return true;
> +}
> +
> +char *
> +__strstr_avx512 (const char *haystack, const char *ned)
> +{
> +  char first = ned[0];
> +  if (first == '\0')
> +    return (char *)haystack;
> +  if (ned[1] == '\0')
> +    return (char *)strchr (haystack, ned[0]);
> +
> +  size_t edge = find_edge_in_needle (ned);
> +
> +  /* ensure haystack is as long as the pos of edge in needle */
> +  for (int ii = 0; ii < edge; ++ii)
> +    {
    strnlen
> +      if (haystack[ii] == '\0')
> +        return NULL;
> +    }
> +
> +  const __m512i null = _mm512_setzero_si512 (); // '\0'
> +
> +  /*
> +   Load 64 bytes of the needle and save it to a zmm register
> +   Read one cache line at a time to avoid loading across a page boundary
> +   */
> +  __mmask64 ned_load_mask
> +      = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
    FULL_MMASK64 >> (((-(uintptr_t)ned) & 63));
> +  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
    Maybe make this conditional on the (highly unlikely) page cross —
    the masked load is very expensive if it causes a page walk
> +  __mmask64 ned_nullmask
> +      = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm, null);
    _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm)

    likewise at all other compares with null unless it breaks
    microfusion more than once.

    If you can replace all then get rid of null
> +  if (__glibc_unlikely (ned_nullmask == 0x0))
> +    {
> +      ned_zmm = _mm512_loadu_si512 (ned);
> +      ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> +      if (ned_nullmask != 0x0)
> +        ned_load_mask = ned_load_mask >> 1;
> +    }
> +  else
> +    {
> +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> +      ned_load_mask = ned_load_mask >> 1;
      I think you can get away with just ned_load_mask =
      ned_nullmask - ONE_64BIT because you only use this after
      checking that the haystack has no null terminator
> +    }
> +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
> +  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> +
> +  /*
> +   Read the bytes of haystack in the current cache line
> +   */
> +  size_t hay_index = edge;
> +  __mmask64 loadmask = _bzhi_u64 (
> +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> +  /* First load is a partial cache line */
> +  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> +  /* Search for NULL and compare only till null char */
> +  __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask, hay0, null);
> +  __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> +  cmpmask = _kand_mask64 (cmpmask, loadmask);
  nullmask ^ (nullmask - ONE_64BIT); codegen ends up actually
  using kand_mask here. Since loadmask and nullmask both go through
  GPR (nullmask for the blsmsk) you can do this explicitly in uint64_t
  to help GCC out.

> +  /* Search for the 2 charaters of needle */
> +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> +  k1 = _kshiftri_mask64 (k1, 1);
> +  /* k2 masks tell us if both chars from needle match */
> +  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> +  /* For every match, search for the entire needle for a full match */
> +  while (k2)
> +    {
> +      uint64_t bitcount = _tzcnt_u64(k2);
> +      k2 = _blsr_u64(k2);
> +      size_t match_pos = hay_index + bitcount - edge;
> +      if (nullmask == 0)
> +        {
> +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> +                                          ned_load_mask, ned_zmm))
> +            return (char *)haystack + match_pos;
> +        }
> +      else
> +        {
> +          if (verify_string_match (haystack, match_pos, ned, 0))
> +            return (char *)haystack + match_pos;
> +        }
> +    }
> +  /* We haven't checked for potential match at the last char yet */
> +  hay_index += _mm_popcnt_u64 (loadmask) - 1;
  hay_index = 0; haystack |= 63; You might want to check codegen and
  ensure hay_index is being optimized out. AFAICT you just need a
  pointer.
> +
> +  /*
> +   Loop over one cache line at a time to prevent reading over page
> +   boundary
> +   */
> +  __m512i hay1;
> +  while (nullmask == 0)
> +    {
> +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> +      hay1 = _mm512_load_si512 (haystack + hay_index
> +                                + 1); // Always 64 byte aligned
    Is this really faster than using kshiftri?
> +      nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> +      /* Compare only till null char */
> +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> +      /* k2 masks tell us if both chars from needle match */
> +      k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> +      /* For every match, compare full strings for potential match */
> +      while (k2)
> +        {
> +          uint64_t bitcount = _tzcnt_u64(k2);
> +          k2 = _blsr_u64(k2);
> +          size_t match_pos = hay_index + bitcount - edge;
> +          if (nullmask == 0)
> +            {
> +              /*
> +               Since the haystack doesn't terminate at the current cache
> +               line, we can use zmm register to compare the first 64 bytes
> +               */
> +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> +                                              ned_load_mask, ned_zmm))
> +                return (char *)haystack + match_pos;
> +            }
> +          else
> +            {
> +              /* Compare byte by byte */
> +              if (verify_string_match (haystack, match_pos, ned, 0))
> +                return (char *)haystack + match_pos;
> +            }
> +        }
> +      hay_index += ZMM_SIZE_IN_BYTES;
> +    }
> +  return NULL;
> +}
> diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
> index 95600a9de5..2fb8b169b6 100644
> --- a/sysdeps/x86_64/multiarch/strstr.c
> +++ b/sysdeps/x86_64/multiarch/strstr.c
> @@ -35,16 +35,32 @@
>
>  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
>  extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
> +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
>
>  #include "init-arch.h"
>
>  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>     ifunc symbol properly.  */
>  extern __typeof (__redirect_strstr) __libc_strstr;
> -libc_ifunc (__libc_strstr,
> -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> -           ? __strstr_sse2_unaligned
> -           : __strstr_sse2)
>
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features *cpu_features = __get_cpu_features ();
> +
> +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> +    return __strstr_avx512;
> +
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> +    return __strstr_sse2_unaligned;
> +
> +  return __strstr_sse2;
> +}
> +
> +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
>  #undef strstr
>  strong_alias (__libc_strstr, strstr)
> --
> 2.36.1
>
  
Noah Goldstein May 26, 2022, 9:41 p.m. UTC | #2
On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
>
> (1) We spend a few cycles at the begining to peek into the needle. We
> locate an edge in the needle (first occurance of 2 consequent distinct
> characters) and also store the first 64-bytes into a zmm register.
>
> (2) We search for the edge in the haystack by looking into one cache
> line of the haystack at a time. This avoids having to read past a page
> boundary which can cause a seg fault.
>
> (3) If an edge is found in the haystack we first compare the first
> 64-bytes of the needle (already stored in a zmm register) before we
> proceed with a full string compare performed byte by byte.
>
> Benchmarking data on ICX shows upto 2x speed up when compared to
> __strstr_sse2_unaligned (including partial benchtests data from
> bench-strstr.out):
>
> |---------------------------------+---------------+-----------------------|
> |                                 | strstr_avx512 | strstr_sse2_unaligned |
> |---------------------------------+---------------+-----------------------|
> | Length 16384/ 16,  1/11, found: | 1939.75       | 3458.44               |
> | Length 16384/ 16, 14/ 5, fail : | 1967.75       | 3541.12               |
> | Length 16384/ 32,  1/11, found: | 1540.38       | 2908.25               |
> | Length 16384/ 32, 14/ 5, fail : | 1345.94       | 2866.31               |
> | Length 16384/ 64,  1/11, found: | 1968.81       | 4327.56               |
> | Length 16384/ 64, 14/ 5, fail : | 1993.75       | 4215.69               |
> | Length 16384/128,  1/11, found: | 1535.44       | 3780.56               |
> | Length 16384/128, 14/ 5, fail : | 1414.75       | 3595.25               |
> | Length 16384/256,  1/11, found: | 2957.75       | 5501.44               |
> | Length 16384/256, 14/ 5, fail : | 2682.62       | 5099.88               |
> | Length 32768/ 16,  1/11, found: | 7820.19       | 11262.9               |
> | Length 32768/ 16, 14/ 5, fail : | 8196.88       | 10871.2               |
> | Length 32768/ 32,  1/11, found: | 5709.19       | 6611.56               |
> | Length 32768/ 32, 14/ 5, fail : | 5716.12       | 6647.06               |
> | Length 32768/ 64,  1/11, found: | 7160.44       | 10143.7               |
> | Length 32768/ 64, 14/ 5, fail : | 7021.38       | 10150.6               |
> | Length 32768/128,  1/11, found: | 4935.31       | 6756.56               |
> | Length 32768/128, 14/ 5, fail : | 4774.38       | 6746.19               |
> | Length 32768/256,  1/11, found: | 7933.19       | 12563.8               |
> | Length 32768/256, 14/ 5, fail : | 7975          | 12558.6               |
> | Length 65536/ 16,  1/11, found: | 9066.69       | 9419.62               |
> | Length 65536/ 16, 14/ 5, fail : | 8496          | 9384.75               |
> | Length 65536/ 32,  1/11, found: | 10258.8       | 11192.4               |
> | Length 65536/ 32, 14/ 5, fail : | 8712.12       | 11172.3               |
> | Length 65536/ 64,  1/11, found: | 11085.2       | 18162.1               |
> | Length 65536/ 64, 14/ 5, fail : | 11219.6       | 17921.5               |
> | Length 65536/128,  1/11, found: | 9753.56       | 18704.6               |
> | Length 65536/128, 14/ 5, fail : | 9588.81       | 18465.6               |
> | Length 65536/256,  1/11, found: | 18333.3       | 28505.2               |
> | Length 65536/256, 14/ 5, fail : | 18018.8       | 27990.8               |
> |---------------------------------+---------------+-----------------------|

Can you add an aggregate geomean across all benchmarks, sse2 vs. evex512?

Also can you add all numbers to the email.
> ---
>  sysdeps/x86_64/multiarch/Makefile          |   2 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
>  sysdeps/x86_64/multiarch/strstr-avx512.c   | 208 +++++++++++++++++++++
>  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
>  4 files changed, 236 insertions(+), 4 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e7b413edad..6dc54a7265 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -126,6 +126,7 @@ sysdep_routines += \
>    strrchr-sse2 \
>    strspn-c \
>    strspn-sse2 \
> +  strstr-avx512 \
>    strstr-sse2-unaligned \
>    varshift \
>  # sysdep_routines
> @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
>  CFLAGS-strcspn-c.c += -msse4
>  CFLAGS-strpbrk-c.c += -msse4
>  CFLAGS-strspn-c.c += -msse4
> +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
>  endif
>
>  ifeq ($(subdir),wcsmbs)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a594f4176e..cc9a7eaaa1 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
>    IFUNC_IMPL (i, name, strstr,
> +              IFUNC_IMPL_ADD (array, i, strstr,
> +                              (CPU_FEATURE_USABLE (AVX512VL)
> +                               && CPU_FEATURE_USABLE (AVX512BW)
> +                               && CPU_FEATURE_USABLE (AVX512DQ)
> +                               && CPU_FEATURE_USABLE (BMI2)),
> +                              __strstr_avx512)
>               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
>               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
>
> diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> new file mode 100644
> index 0000000000..4082a75a1b
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> @@ -0,0 +1,208 @@
> +/* strstr optimized with 512-bit AVX-512 instructions
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <immintrin.h>
> +#include <inttypes.h>
> +#include <stdbool.h>
> +#include <string.h>
> +
> +#define FULL_MMASK64 0xffffffffffffffff
> +#define ONE_64BIT 0x1ull
> +#define ZMM_SIZE_IN_BYTES 64
> +
> +/*
> + Returns the index of the first edge within the needle, returns 0 if no edge
> + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> + */
> +static inline size_t
> +find_edge_in_needle (const char *ned)
> +{
> +  size_t ind = 0;
> +  while (ned[ind + 1] != '\0')
> +    {
> +      if (ned[ind] != ned[ind + 1])
> +        return ind;
> +      else
> +        ind = ind + 1;
> +    }
> +  return 0;
> +}
> +
> +/*
> + Compare needle with haystack byte by byte at specified location
> + */
> +static inline bool
> +verify_string_match (const char *hay, const size_t hay_index, const char *ned,
> +                     size_t ind)
> +{
> +  while (ned[ind] != '\0')
> +    {
> +      if (ned[ind] != hay[hay_index + ind])
> +        return false;
> +      ind = ind + 1;
> +    }
> +  return true;
> +}
> +
> +/*
> + Compare needle with haystack at specified location. The first 64 bytes are
> + compared using a ZMM register.
> + */
> +static inline bool
> +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> +                            const char *ned, const __mmask64 ned_mask,
> +                            const __m512i ned_zmm)
> +{
> +  /* check first 64 bytes using zmm and then scalar */
> +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
> +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
> +  if (match != 0x0) // failed the first few chars
> +    return false;
> +  else if (ned_mask == FULL_MMASK64)
> +    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
> +  return true;
> +}
> +
> +char *
> +__strstr_avx512 (const char *haystack, const char *ned)
> +{
> +  char first = ned[0];
> +  if (first == '\0')
> +    return (char *)haystack;
> +  if (ned[1] == '\0')
> +    return (char *)strchr (haystack, ned[0]);
> +
> +  size_t edge = find_edge_in_needle (ned);
> +
> +  /* ensure haystack is as long as the pos of edge in needle */
> +  for (int ii = 0; ii < edge; ++ii)
> +    {
> +      if (haystack[ii] == '\0')
> +        return NULL;
> +    }
> +
> +  const __m512i null = _mm512_setzero_si512 (); // '\0'
> +
> +  /*
> +   Load 64 bytes of the needle and save it to a zmm register
> +   Read one cache line at a time to avoid loading across a page boundary
> +   */
> +  __mmask64 ned_load_mask
> +      = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
> +  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
> +  __mmask64 ned_nullmask
> +      = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm, null);
> +  if (__glibc_unlikely (ned_nullmask == 0x0))
> +    {
> +      ned_zmm = _mm512_loadu_si512 (ned);
> +      ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> +      if (ned_nullmask != 0x0)
> +        ned_load_mask = ned_load_mask >> 1;
> +    }
> +  else
> +    {
> +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> +      ned_load_mask = ned_load_mask >> 1;
> +    }
> +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
> +  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> +
> +  /*
> +   Read the bytes of haystack in the current cache line
> +   */
> +  size_t hay_index = edge;
> +  __mmask64 loadmask = _bzhi_u64 (
> +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> +  /* First load is a partial cache line */
> +  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> +  /* Search for NULL and compare only till null char */
> +  __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask, hay0, null);
> +  __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> +  cmpmask = _kand_mask64 (cmpmask, loadmask);
> +  /* Search for the 2 charaters of needle */
> +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> +  k1 = _kshiftri_mask64 (k1, 1);
> +  /* k2 masks tell us if both chars from needle match */
> +  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> +  /* For every match, search for the entire needle for a full match */
> +  while (k2)
> +    {
> +      uint64_t bitcount = _tzcnt_u64(k2);
> +      k2 = _blsr_u64(k2);
> +      size_t match_pos = hay_index + bitcount - edge;
> +      if (nullmask == 0)
> +        {
> +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> +                                          ned_load_mask, ned_zmm))
> +            return (char *)haystack + match_pos;
> +        }
> +      else
> +        {
> +          if (verify_string_match (haystack, match_pos, ned, 0))
> +            return (char *)haystack + match_pos;
> +        }
> +    }
> +  /* We haven't checked for potential match at the last char yet */
> +  hay_index += _mm_popcnt_u64 (loadmask) - 1;
> +
> +  /*
> +   Loop over one cache line at a time to prevent reading over page
> +   boundary
> +   */
> +  __m512i hay1;
> +  while (nullmask == 0)
> +    {
> +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> +      hay1 = _mm512_load_si512 (haystack + hay_index
> +                                + 1); // Always 64 byte aligned
> +      nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> +      /* Compare only till null char */
> +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> +      /* k2 masks tell us if both chars from needle match */
> +      k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> +      /* For every match, compare full strings for potential match */
> +      while (k2)
> +        {
> +          uint64_t bitcount = _tzcnt_u64(k2);
> +          k2 = _blsr_u64(k2);
> +          size_t match_pos = hay_index + bitcount - edge;
> +          if (nullmask == 0)
> +            {
> +              /*
> +               Since the haystack doesn't terminate at the current cache
> +               line, we can use zmm register to compare the first 64 bytes
> +               */
> +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> +                                              ned_load_mask, ned_zmm))
> +                return (char *)haystack + match_pos;
> +            }
> +          else
> +            {
> +              /* Compare byte by byte */
> +              if (verify_string_match (haystack, match_pos, ned, 0))
> +                return (char *)haystack + match_pos;
> +            }
> +        }
> +      hay_index += ZMM_SIZE_IN_BYTES;
> +    }
> +  return NULL;
> +}
> diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
> index 95600a9de5..2fb8b169b6 100644
> --- a/sysdeps/x86_64/multiarch/strstr.c
> +++ b/sysdeps/x86_64/multiarch/strstr.c
> @@ -35,16 +35,32 @@
>
>  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
>  extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
> +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
>
>  #include "init-arch.h"
>
>  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>     ifunc symbol properly.  */
>  extern __typeof (__redirect_strstr) __libc_strstr;
> -libc_ifunc (__libc_strstr,
> -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> -           ? __strstr_sse2_unaligned
> -           : __strstr_sse2)
>
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features *cpu_features = __get_cpu_features ();
> +
> +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> +    return __strstr_avx512;
> +
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> +    return __strstr_sse2_unaligned;
> +
> +  return __strstr_sse2;
> +}
> +
> +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
>  #undef strstr
>  strong_alias (__libc_strstr, strstr)
> --
> 2.36.1
>
  
Noah Goldstein May 26, 2022, 10:26 p.m. UTC | #3
On Thu, May 26, 2022 at 4:41 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> >
> > (1) We spend a few cycles at the beginning to peek into the needle. We
> > locate an edge in the needle (first occurrence of 2 consecutive distinct
> > characters) and also store the first 64-bytes into a zmm register.
> >
> > (2) We search for the edge in the haystack by looking into one cache
> > line of the haystack at a time. This avoids having to read past a page
> > boundary which can cause a seg fault.
> >
> > (3) If an edge is found in the haystack we first compare the first
> > 64-bytes of the needle (already stored in a zmm register) before we
> > proceed with a full string compare performed byte by byte.
> >
> > Benchmarking data on ICX shows up to 2x speed up when compared to
> > __strstr_sse2_unaligned (including partial benchtests data from
> > bench-strstr.out):
> >
> > |---------------------------------+---------------+-----------------------|
> > |                                 | strstr_avx512 | strstr_sse2_unaligned |
> > |---------------------------------+---------------+-----------------------|
> > | Length 16384/ 16,  1/11, found: | 1939.75       | 3458.44               |
> > | Length 16384/ 16, 14/ 5, fail : | 1967.75       | 3541.12               |
> > | Length 16384/ 32,  1/11, found: | 1540.38       | 2908.25               |
> > | Length 16384/ 32, 14/ 5, fail : | 1345.94       | 2866.31               |
> > | Length 16384/ 64,  1/11, found: | 1968.81       | 4327.56               |
> > | Length 16384/ 64, 14/ 5, fail : | 1993.75       | 4215.69               |
> > | Length 16384/128,  1/11, found: | 1535.44       | 3780.56               |
> > | Length 16384/128, 14/ 5, fail : | 1414.75       | 3595.25               |
> > | Length 16384/256,  1/11, found: | 2957.75       | 5501.44               |
> > | Length 16384/256, 14/ 5, fail : | 2682.62       | 5099.88               |
> > | Length 32768/ 16,  1/11, found: | 7820.19       | 11262.9               |
> > | Length 32768/ 16, 14/ 5, fail : | 8196.88       | 10871.2               |
> > | Length 32768/ 32,  1/11, found: | 5709.19       | 6611.56               |
> > | Length 32768/ 32, 14/ 5, fail : | 5716.12       | 6647.06               |
> > | Length 32768/ 64,  1/11, found: | 7160.44       | 10143.7               |
> > | Length 32768/ 64, 14/ 5, fail : | 7021.38       | 10150.6               |
> > | Length 32768/128,  1/11, found: | 4935.31       | 6756.56               |
> > | Length 32768/128, 14/ 5, fail : | 4774.38       | 6746.19               |
> > | Length 32768/256,  1/11, found: | 7933.19       | 12563.8               |
> > | Length 32768/256, 14/ 5, fail : | 7975          | 12558.6               |
> > | Length 65536/ 16,  1/11, found: | 9066.69       | 9419.62               |
> > | Length 65536/ 16, 14/ 5, fail : | 8496          | 9384.75               |
> > | Length 65536/ 32,  1/11, found: | 10258.8       | 11192.4               |
> > | Length 65536/ 32, 14/ 5, fail : | 8712.12       | 11172.3               |
> > | Length 65536/ 64,  1/11, found: | 11085.2       | 18162.1               |
> > | Length 65536/ 64, 14/ 5, fail : | 11219.6       | 17921.5               |
> > | Length 65536/128,  1/11, found: | 9753.56       | 18704.6               |
> > | Length 65536/128, 14/ 5, fail : | 9588.81       | 18465.6               |
> > | Length 65536/256,  1/11, found: | 18333.3       | 28505.2               |
> > | Length 65536/256, 14/ 5, fail : | 18018.8       | 27990.8               |
> > |---------------------------------+---------------+-----------------------|
>
> Can you add aggregate geomean all benchmarks sse2 / evex512?
>
> Also can you add all numbers to the email.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile          |   2 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
> >  sysdeps/x86_64/multiarch/strstr-avx512.c   | 208 +++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
> >  4 files changed, 236 insertions(+), 4 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index e7b413edad..6dc54a7265 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -126,6 +126,7 @@ sysdep_routines += \
> >    strrchr-sse2 \
> >    strspn-c \
> >    strspn-sse2 \
> > +  strstr-avx512 \
> >    strstr-sse2-unaligned \
> >    varshift \
> >  # sysdep_routines
> > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
> >  CFLAGS-strcspn-c.c += -msse4
> >  CFLAGS-strpbrk-c.c += -msse4
> >  CFLAGS-strspn-c.c += -msse4
> > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
> >  endif
> >
> >  ifeq ($(subdir),wcsmbs)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a594f4176e..cc9a7eaaa1 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >
> >    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
> >    IFUNC_IMPL (i, name, strstr,
> > +              IFUNC_IMPL_ADD (array, i, strstr,
> > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > +                               && CPU_FEATURE_USABLE (AVX512DQ)
> > +                               && CPU_FEATURE_USABLE (BMI2)),
> > +                              __strstr_avx512)
> >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> >
> > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > new file mode 100644
> > index 0000000000..4082a75a1b
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > @@ -0,0 +1,208 @@
> > +/* strstr optimized with 512-bit AVX-512 instructions
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <immintrin.h>
> > +#include <inttypes.h>
> > +#include <stdbool.h>
> > +#include <string.h>
> > +
> > +#define FULL_MMASK64 0xffffffffffffffff
> > +#define ONE_64BIT 0x1ull
> > +#define ZMM_SIZE_IN_BYTES 64
> > +
> > +/*
> > + Returns the index of the first edge within the needle, returns 0 if no edge
> > + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > + */
> > +static inline size_t
> > +find_edge_in_needle (const char *ned)
> > +{
> > +  size_t ind = 0;
> > +  while (ned[ind + 1] != '\0')
> > +    {
> > +      if (ned[ind] != ned[ind + 1])
> > +        return ind;
> > +      else
> > +        ind = ind + 1;
> > +    }
> > +  return 0;
> > +}
> > +
> > +/*
> > + Compare needle with haystack byte by byte at specified location
> > + */
> > +static inline bool
> > +verify_string_match (const char *hay, const size_t hay_index, const char *ned,
> > +                     size_t ind)
> > +{
> > +  while (ned[ind] != '\0')
> > +    {
> > +      if (ned[ind] != hay[hay_index + ind])
> > +        return false;
> > +      ind = ind + 1;
> > +    }
> > +  return true;
> > +}
> > +
> > +/*
> > + Compare needle with haystack at specified location. The first 64 bytes are
> > + compared using a ZMM register.
> > + */
> > +static inline bool
> > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > +                            const char *ned, const __mmask64 ned_mask,
> > +                            const __m512i ned_zmm)
> > +{
> > +  /* check first 64 bytes using zmm and then scalar */
> > +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
> > +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
> > +  if (match != 0x0) // failed the first few chars
> > +    return false;
> > +  else if (ned_mask == FULL_MMASK64)
> > +    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
> > +  return true;
> > +}
> > +
> > +char *
> > +__strstr_avx512 (const char *haystack, const char *ned)
> > +{
> > +  char first = ned[0];
> > +  if (first == '\0')
> > +    return (char *)haystack;
> > +  if (ned[1] == '\0')
> > +    return (char *)strchr (haystack, ned[0]);
> > +
> > +  size_t edge = find_edge_in_needle (ned);
> > +
> > +  /* ensure haystack is as long as the pos of edge in needle */
> > +  for (int ii = 0; ii < edge; ++ii)
> > +    {
> > +      if (haystack[ii] == '\0')
> > +        return NULL;
> > +    }
> > +
> > +  const __m512i null = _mm512_setzero_si512 (); // '\0'
> > +
> > +  /*
> > +   Load 64 bytes of the needle and save it to a zmm register
> > +   Read one cache line at a time to avoid loading across a page boundary
> > +   */
> > +  __mmask64 ned_load_mask
> > +      = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
> > +  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
> > +  __mmask64 ned_nullmask
> > +      = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm, null);
> > +  if (__glibc_unlikely (ned_nullmask == 0x0))
> > +    {
> > +      ned_zmm = _mm512_loadu_si512 (ned);
> > +      ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > +      if (ned_nullmask != 0x0)
> > +        ned_load_mask = ned_load_mask >> 1;
> > +    }
> > +  else
> > +    {
> > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > +      ned_load_mask = ned_load_mask >> 1;
> > +    }
> > +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
> > +  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > +
> > +  /*
> > +   Read the bytes of haystack in the current cache line
> > +   */
> > +  size_t hay_index = edge;
> > +  __mmask64 loadmask = _bzhi_u64 (
> > +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> > +  /* First load is a partial cache line */
> > +  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > +  /* Search for NULL and compare only till null char */
> > +  __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask, hay0, null);
> > +  __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > +  cmpmask = _kand_mask64 (cmpmask, loadmask);
> > +  /* Search for the 2 characters of needle */
> > +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > +  k1 = _kshiftri_mask64 (k1, 1);
> > +  /* k2 masks tell us if both chars from needle match */
> > +  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> > +  /* For every match, search for the entire needle for a full match */
> > +  while (k2)
> > +    {
> > +      uint64_t bitcount = _tzcnt_u64(k2);
> > +      k2 = _blsr_u64(k2);
> > +      size_t match_pos = hay_index + bitcount - edge;
> > +      if (nullmask == 0)
> > +        {
> > +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> > +                                          ned_load_mask, ned_zmm))
> > +            return (char *)haystack + match_pos;
> > +        }
> > +      else
> > +        {
> > +          if (verify_string_match (haystack, match_pos, ned, 0))
> > +            return (char *)haystack + match_pos;
> > +        }
> > +    }
> > +  /* We haven't checked for potential match at the last char yet */
> > +  hay_index += _mm_popcnt_u64 (loadmask) - 1;
> > +
> > +  /*
> > +   Loop over one cache line at a time to prevent reading over page
> > +   boundary
> > +   */
> > +  __m512i hay1;
> > +  while (nullmask == 0)
> > +    {
> > +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > +      hay1 = _mm512_load_si512 (haystack + hay_index
> > +                                + 1); // Always 64 byte aligned
> > +      nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > +      /* Compare only till null char */
> > +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > +      /* k2 masks tell us if both chars from needle match */
> > +      k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> > +      /* For every match, compare full strings for potential match */
> > +      while (k2)
> > +        {
> > +          uint64_t bitcount = _tzcnt_u64(k2);
> > +          k2 = _blsr_u64(k2);
> > +          size_t match_pos = hay_index + bitcount - edge;
> > +          if (nullmask == 0)
> > +            {
> > +              /*
> > +               Since the haystack doesn't terminate at the current cache
> > +               line, we can use zmm register to compare the first 64 bytes
> > +               */
> > +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> > +                                              ned_load_mask, ned_zmm))
> > +                return (char *)haystack + match_pos;
> > +            }
> > +          else
> > +            {
> > +              /* Compare byte by byte */
> > +              if (verify_string_match (haystack, match_pos, ned, 0))
> > +                return (char *)haystack + match_pos;
> > +            }
> > +        }
> > +      hay_index += ZMM_SIZE_IN_BYTES;
> > +    }
> > +  return NULL;
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
> > index 95600a9de5..2fb8b169b6 100644
> > --- a/sysdeps/x86_64/multiarch/strstr.c
> > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > @@ -35,16 +35,32 @@
> >
> >  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
> >  extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
> > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
> >
> >  #include "init-arch.h"
> >
> >  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> >     ifunc symbol properly.  */
> >  extern __typeof (__redirect_strstr) __libc_strstr;
> > -libc_ifunc (__libc_strstr,
> > -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > -           ? __strstr_sse2_unaligned
> > -           : __strstr_sse2)
> >
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > +  const struct cpu_features *cpu_features = __get_cpu_features ();
> > +
> > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > +    return __strstr_avx512;
> > +
> > +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > +    return __strstr_sse2_unaligned;
> > +
> > +  return __strstr_sse2;
> > +}
> > +
> > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
> >  #undef strstr
> >  strong_alias (__libc_strstr, strstr)
> > --
> > 2.36.1
> >

Can you run clang-format on this where it makes sense.
  
develop--- via Libc-alpha May 27, 2022, 5:49 p.m. UTC | #4
Geometric mean per haystack size: 

|---------------+--------------+---------------+-----------------+-------------------------+---------------|
| haystack size | basic_strstr | twoway_strstr | __strstr_avx512 | __strstr_sse2_unaligned | __strstr_sse2 |
|---------------+--------------+---------------+-----------------+-------------------------+---------------|
| 64            | 102.24       | 227.66        | 43.99           | 45.85                   | 155.96        |
| 96            | 1460.31      | 1899.5        | 167.66          | 305.91                  | 547.13        |
| 128           | 177.45       | 305.07        | 42.51           | 70.85                   | 178.41        |
| 160           | 230.54       | 515.28        | 59.65           | 68.76                   | 229.79        |
| 192           | 2859.48      | 3345.92       | 389.53          | 557.72                  | 697.96        |
| 224           | 299.04       | 486.03        | 60.66           | 69.22                   | 222.73        |
| 256           | 344.44       | 384.04        | 64.46           | 91.64                   | 317.63        |
| 512           | 751.13       | 2032.34       | 119.33          | 186.86                  | 1063.78       |
| 1024          | 1452.07      | 2877.57       | 199.28          | 319.36                  | 1159.13       |
| 2048          | 3090.89      | 4387.19       | 303.44          | 559.34                  | 1410.79       |
| 4096          | 5691.89      | 7149.54       | 558.94          | 1141.53                 | 1800.8        |
| 8192          | 11240.29     | 13091.44      | 1479.94         | 2142.81                 | 2464.92       |
| 16384         | 22188.11     | 8634.67       | 1934.69         | 3929.45                 | 4400.37       |
| 32768         | 44837.56     | 56494.02      | 6724.21         | 9431.22                 | 8044.71       |
| 65536         | 88900.08     | 96978.61      | 11453.29        | 17091.89                | 13750.56      |
|---------------+--------------+---------------+-----------------+-------------------------+---------------|

Raw data from bench-strstr.out: 

                       	basic_strstr	twoway_strstr	__strstr_avx512	__strstr_sse2_unaligned	__strstr_sse2
Length   64/  1, alignment  1/ 3, found:	82.5625	47.1875	220.875	223.5	36.8125
Length   64/  1, alignment  0/ 9, fail :	86	22.875	17.3125	30.1875	21.3125
Length   64/  2, alignment  1/ 3, found:	90.125	281.562	116.625	25.9375	79.3125
Length   64/  2, alignment  0/ 9, fail :	88.3125	68.375	91.125	133.188	76.5
Length   64/  3, alignment  1/ 3, found:	89	143	78.5625	27.25	147.062
Length   64/  3, alignment  0/ 9, fail :	84.6875	112.875	34.9375	37.5	135.562
Length   64/  4, alignment  1/ 3, found:	98.0625	184.25	30.0625	26.9375	157.188
Length   64/  4, alignment  0/ 9, fail :	96.5625	173.438	32.3125	36.5625	144
Length   64/  5, alignment  1/ 3, found:	96.75	187.75	27.125	29.5	153.438
Length   64/  5, alignment  0/ 9, fail :	89.5	160.938	32.75	36.3125	147.75
Length   64/  6, alignment  1/ 3, found:	110	374.062	34.9375	40.9375	164.375
Length   64/  6, alignment  0/ 9, fail :	103.062	377.812	37.8125	45.6875	153.062
Length   64/  7, alignment  1/ 3, found:	120.625	367.75	34.9375	39.5	152.562
Length   64/  7, alignment  0/ 9, fail :	110.75	358.688	42.125	50.4375	150.125
Length   64/  8, alignment  1/ 3, found:	120.375	295.812	36.1875	37.5625	154.062
Length   64/  8, alignment  0/ 9, fail :	108.125	309.625	44.875	46.75	150.938
Length   64/  9, alignment  1/ 3, found:	139.75	162.188	33.375	44.3125	166.125
Length   64/  9, alignment  0/ 9, fail :	129.5	136.062	32.5	36.125	162.688
Length   64/ 10, alignment  1/ 3, found:	96.375	40.6875	26.5625	39.1875	48.0625
Length   64/ 10, alignment  0/ 9, fail :	83.5	26.8125	34.125	36.5	23.6875
Length   64/ 11, alignment  1/ 3, found:	106.938	186.562	26.625	31.6875	183.75
Length   64/ 11, alignment  0/ 9, fail :	99.0625	148.75	33.4375	38	175.375
Length   64/ 12, alignment  1/ 3, found:	101.062	214.062	28.5	34.0625	199
Length   64/ 12, alignment  0/ 9, fail :	88.875	175.938	32.75	35.9375	196.75
Length   64/ 13, alignment  1/ 3, found:	133.188	305.562	28.3125	36.0625	240.312
Length   64/ 13, alignment  0/ 9, fail :	96.0625	300.938	32.8125	36.625	233.625
Length   64/ 14, alignment  1/ 3, found:	111.125	309.562	26.875	37.5625	230.125
Length   64/ 14, alignment  0/ 9, fail :	89.8125	312.062	32.875	36.3125	221.5
Length   64/ 15, alignment  1/ 3, found:	117.25	387.688	26.5625	39.8125	224
Length   64/ 15, alignment  0/ 9, fail :	92.5625	380.875	32.375	35.8125	219.312
Length   64/ 16, alignment  1/ 3, found:	113.625	380.125	26.5	39.625	225.625
Length   64/ 16, alignment  0/ 9, fail :	98.625	351.125	40.8125	41.875	216.75
Length   96/  1, alignment  1/ 3, found:	121.188	30.1875	21.6875	32.9375	24.875
Length   96/  1, alignment  0/ 9, fail :	121	29.9375	18	27.0625	22.9375
Length   96/  2, alignment  1/ 3, found:	128.188	317.938	44.8125	43.5625	262.5
Length   96/  2, alignment  0/ 9, fail :	127.812	296.438	33.5	39.5625	221.688
Length   96/  3, alignment  1/ 3, found:	148.188	151.25	58.0625	50.5	208.688
Length   96/  3, alignment  0/ 9, fail :	143.062	131.062	45.25	41.5	199.188
Length   96/  4, alignment  1/ 3, found:	131.875	143.75	56.5	43.875	199.938
Length   96/  4, alignment  0/ 9, fail :	126.375	119.25	41.3125	36	181
Length   96/  5, alignment  1/ 3, found:	131.812	321.688	61.0625	52	174.688
Length   96/  5, alignment  0/ 9, fail :	125.938	310.062	42.75	42.6875	158.812
Length   96/  6, alignment  1/ 3, found:	150.875	516.688	55.5	45.6875	179.562
Length   96/  6, alignment  0/ 9, fail :	143.938	524.438	33.5	35.75	164.562
Length   96/  7, alignment  1/ 3, found:	132.688	430.938	48.0625	48.9375	168.188
Length   96/  7, alignment  0/ 9, fail :	127.438	364.438	32.6875	36	165.062
Length   96/  8, alignment  1/ 3, found:	130.875	46.0625	52.875	47.4375	49.6875
Length   96/  8, alignment  0/ 9, fail :	117.125	22.125	32.3125	36.25	25.1875
Length   96/  9, alignment  1/ 3, found:	201.75	197.625	63.5625	55.75	179.188
Length   96/  9, alignment  0/ 9, fail :	190.562	143.5	39.1875	44.125	176.062
Length   96/ 10, alignment  1/ 3, found:	141.312	400.875	54.25	49.0625	208.875
Length   96/ 10, alignment  0/ 9, fail :	128.938	422.25	33.125	36.625	199.812
Length   96/ 11, alignment  1/ 3, found:	170.062	468.75	65.375	62.5	207.438
Length   96/ 11, alignment  0/ 9, fail :	148.375	474.625	38.5625	42.5	192.125
Length   96/ 12, alignment  1/ 3, found:	135.688	466.375	54.9375	55.25	192.5
Length   96/ 12, alignment  0/ 9, fail :	121.625	480.938	32.25	37.9375	192.438
Length   96/ 13, alignment  1/ 3, found:	129.562	39.5625	57.9375	53.5	47.875
Length   96/ 13, alignment  0/ 9, fail :	112.438	23.3125	32.5	36.25	25.125
Length   96/ 14, alignment  1/ 3, found:	140	453.312	72.375	66.0625	207.75
Length   96/ 14, alignment  0/ 9, fail :	126.812	463.438	39.4375	45	208
Length   96/ 15, alignment  1/ 3, found:	176	238.75	63.25	56.375	230.688
Length   96/ 15, alignment  0/ 9, fail :	140.688	192.5	31.9375	36.1875	231.312
Length   96/ 16, alignment  1/ 3, found:	130.25	38.5625	63.625	59.375	42.3125
Length   96/ 16, alignment  0/ 9, fail :	111.562	22.75	32	36.8125	23.5
Length  128/  1, alignment  1/ 3, found:	152.938	29.0625	22	37.5	23.25
Length  128/  1, alignment  0/ 9, fail :	150.125	26.1875	22.25	36.3125	24.5625
Length  128/  2, alignment  1/ 3, found:	195.5	156.125	46.9375	50.875	303.5
Length  128/  2, alignment  0/ 9, fail :	194.562	134.562	42.1875	48.125	299.125
Length  128/  3, alignment  1/ 3, found:	164.875	312.25	57.3125	46.1875	268.312
Length  128/  3, alignment  0/ 9, fail :	158.688	282.562	45.6875	48.5	262.188
Length  128/  4, alignment  1/ 3, found:	180.375	171.75	56.0625	57.0625	233.125
Length  128/  4, alignment  0/ 9, fail :	167.125	151.938	54.375	661.688	210.25
Length  128/  5, alignment  1/ 3, found:	191.688	646	45.6875	51.875	188.438
Length  128/  5, alignment  0/ 9, fail :	184.938	644	45.25	63.0625	177.625
Length  128/  6, alignment  1/ 3, found:	185.812	208	48.6875	57	181.938
Length  128/  6, alignment  0/ 9, fail :	171.75	196.25	51.125	66.25	177.25
Length  128/  7, alignment  1/ 3, found:	172.938	134.75	46.6875	55.5625	147.625
Length  128/  7, alignment  0/ 9, fail :	157.875	104.688	44.6875	65.5	146.625
Length  128/  8, alignment  1/ 3, found:	250.375	298.75	39.6875	50.8125	184.25
Length  128/  8, alignment  0/ 9, fail :	248.25	265.562	41.1875	47.6875	179.562
Length  128/  9, alignment  1/ 3, found:	152	41.875	37.6875	46.9375	41.75
Length  128/  9, alignment  0/ 9, fail :	152.125	36.8125	39.5	46.375	36.1875
Length  128/ 10, alignment  1/ 3, found:	178.625	449.75	49.25	58.25	196.25
Length  128/ 10, alignment  0/ 9, fail :	164.812	468.625	47.75	52.375	188.062
Length  128/ 11, alignment  1/ 3, found:	184.75	608.375	35.4375	51.1875	199.125
Length  128/ 11, alignment  0/ 9, fail :	180.625	619.188	41.9375	47.3125	198.438
Length  128/ 12, alignment  1/ 3, found:	158.438	41.75	36.0625	55.8125	43.8125
Length  128/ 12, alignment  0/ 9, fail :	151.5	34.3125	40.3125	45.1875	38.75
Length  128/ 13, alignment  1/ 3, found:	190.062	571.062	36.25	52.5	210.625
Length  128/ 13, alignment  0/ 9, fail :	162.75	585.938	40.75	45.6875	199.188
Length  128/ 14, alignment  1/ 3, found:	172.188	498.625	42.375	61.6875	219.75
Length  128/ 14, alignment  0/ 9, fail :	158.938	509.188	46.875	51.5625	216.062
Length  128/ 15, alignment  1/ 3, found:	215.312	591.875	37.25	59.25	223.812
Length  128/ 15, alignment  0/ 9, fail :	180.312	594.812	40.9375	44.8125	218.438
Length  128/ 16, alignment  1/ 3, found:	184.875	185.438	36.9375	58.75	236.5
Length  128/ 16, alignment  0/ 9, fail :	163.125	162.062	41.3125	45.625	234.812
Length  160/  1, alignment  1/ 3, found:	261.125	42.125	27.25	33.75	37.125
Length  160/  1, alignment  0/ 9, fail :	227.062	35.9375	27.9375	33.625	30.0625
Length  160/  2, alignment  1/ 3, found:	202.938	334.062	48	51.1875	226.938
Length  160/  2, alignment  0/ 9, fail :	192.625	296.875	41.625	47.25	216
Length  160/  3, alignment  1/ 3, found:	256.688	201.188	70.3125	74.5625	396.875
Length  160/  3, alignment  0/ 9, fail :	226.312	170.688	53.25	67.5625	347
Length  160/  4, alignment  1/ 3, found:	240.375	402.375	80.375	82	287.625
Length  160/  4, alignment  0/ 9, fail :	216.5	365.125	65.8125	70.75	271.125
Length  160/  5, alignment  1/ 3, found:	230	242.25	59.6875	71.9375	242.688
Length  160/  5, alignment  0/ 9, fail :	211.812	213.938	46.1875	59.375	235.25
Length  160/  6, alignment  1/ 3, found:	214.75	359.125	60.3125	71.8125	233.375
Length  160/  6, alignment  0/ 9, fail :	212.062	315.562	45.375	67.0625	225.625
Length  160/  7, alignment  1/ 3, found:	230	504.375	59.4375	71.1875	226.938
Length  160/  7, alignment  0/ 9, fail :	214.5	477.062	43.25	62.0625	222.812
Length  160/  8, alignment  1/ 3, found:	243.875	693	74.625	78.25	240.625
Length  160/  8, alignment  0/ 9, fail :	220.562	653.062	52.6875	61.5625	221.812
Length  160/  9, alignment  1/ 3, found:	230	658.938	68.0625	69.5	237.312
Length  160/  9, alignment  0/ 9, fail :	208.188	617.688	48.125	50	229.625
Length  160/ 10, alignment  1/ 3, found:	244.188	807.312	76.5	82.125	245.75
Length  160/ 10, alignment  0/ 9, fail :	214.188	740.188	58	67.5	230.062
Length  160/ 11, alignment  1/ 3, found:	239.375	734.75	71.8125	64.4375	215
Length  160/ 11, alignment  0/ 9, fail :	216.812	662.625	41.875	47	210.125
Length  160/ 12, alignment  1/ 3, found:	194	570.625	72.625	62.1875	212.625
Length  160/ 12, alignment  0/ 9, fail :	189.5	500.125	43.625	48.375	207.875
Length  160/ 13, alignment  1/ 3, found:	312.875	790	84.0625	107.812	228.125
Length  160/ 13, alignment  0/ 9, fail :	285.5	745.375	55.8125	77.125	218.25
Length  160/ 14, alignment  1/ 3, found:	227.875	705.938	81.5625	96	230.688
Length  160/ 14, alignment  0/ 9, fail :	221.688	700.062	55.4375	70.4375	231.188
Length  160/ 15, alignment  1/ 3, found:	252.312	768.375	85.4375	96.875	253.5
Length  160/ 15, alignment  0/ 9, fail :	231.625	768.25	52.125	72.8125	244.312
Length  160/ 16, alignment  1/ 3, found:	275.688	720.75	95.6875	106.438	250.375
Length  160/ 16, alignment  0/ 9, fail :	232.25	691.25	62	77.875	246.438
Length  192/  1, alignment  1/ 3, found:	220.25	35.125	30.8125	40	34.3125
Length  192/  1, alignment  0/ 9, fail :	217.312	34.0625	29.0625	41.25	33.125
Length  192/  2, alignment  1/ 3, found:	212.438	53.125	55.6875	54.875	43
Length  192/  2, alignment  0/ 9, fail :	207.312	34.5	49.375	54.5	35.5625
Length  192/  3, alignment  1/ 3, found:	215.375	48.5	48.8125	52.25	44.5
Length  192/  3, alignment  0/ 9, fail :	207.188	28.8125	48.375	56.1875	32.6875
Length  192/  4, alignment  1/ 3, found:	244	369.438	45.375	52.3125	260.562
Length  192/  4, alignment  0/ 9, fail :	230.75	341.938	46.3125	55.8125	243
Length  192/  5, alignment  1/ 3, found:	240.688	676.812	44.8125	54.8125	226.875
Length  192/  5, alignment  0/ 9, fail :	224.5	655.438	46.0625	54.8125	218.875
Length  192/  6, alignment  1/ 3, found:	275.875	784.312	55.125	64.875	215
Length  192/  6, alignment  0/ 9, fail :	259.062	770.5	51.5625	62.1875	208.062
Length  192/  7, alignment  1/ 3, found:	270.688	826.625	69.5	85.875	236.25
Length  192/  7, alignment  0/ 9, fail :	248.312	773.25	67.1875	82.875	216.25
Length  192/  8, alignment  1/ 3, found:	334.375	888.25	55.8125	70.1875	257.312
Length  192/  8, alignment  0/ 9, fail :	330.438	856.25	53.1875	75.625	235.625
Length  192/  9, alignment  1/ 3, found:	228	162.375	49.875	68.4375	178.688
Length  192/  9, alignment  0/ 9, fail :	222.25	131.5	52.0625	70.4375	170.75
Length  192/ 10, alignment  1/ 3, found:	233.188	168.688	48.5	64.625	187.5
Length  192/ 10, alignment  0/ 9, fail :	224.312	146.625	48.3125	67.125	178
Length  192/ 11, alignment  1/ 3, found:	222.188	47.3125	45.5	68.3125	51.8125
Length  192/ 11, alignment  0/ 9, fail :	221.812	139.875	48.25	68.8125	184.062
Length  192/ 12, alignment  1/ 3, found:	335.062	481.938	50.6875	78.375	249.5
Length  192/ 12, alignment  0/ 9, fail :	317.562	466.938	53.6875	69.75	239.688
Length  192/ 13, alignment  1/ 3, found:	235.75	234.438	44.125	67.375	222.875
Length  192/ 13, alignment  0/ 9, fail :	213.938	230.312	45.375	53.9375	215.688
Length  192/ 14, alignment  1/ 3, found:	225.375	45.0625	46.3125	69.6875	51.1875
Length  192/ 14, alignment  0/ 9, fail :	214.5	29.875	51.25	53.4375	33
Length  192/ 15, alignment  1/ 3, found:	223.5	44.6875	46.4375	65.0625	47.6875
Length  192/ 15, alignment  0/ 9, fail :	207.562	31.125	44.9375	54.3125	33.25
Length  192/ 16, alignment  1/ 3, found:	224.188	45.9375	43.6875	64.8125	49.0625
Length  192/ 16, alignment  0/ 9, fail :	207.438	30.625	44.6875	53.125	31.5
Length  224/  1, alignment  1/ 3, found:	302.625	36.4375	31.1875	41.375	37.1875
Length  224/  1, alignment  0/ 9, fail :	273.625	33.125	31.1875	38.3125	32.75
Length  224/  2, alignment  1/ 3, found:	300.875	387.188	53.625	57.5625	560.875
Length  224/  2, alignment  0/ 9, fail :	296.812	356.938	47.0625	55.4375	515.25
Length  224/  3, alignment  1/ 3, found:	292.562	714.875	76.4375	90.125	493.625
Length  224/  3, alignment  0/ 9, fail :	278.625	714.438	64.1875	87.0625	455
Length  224/  4, alignment  1/ 3, found:	314.625	552.438	55.6875	67.5625	303.812
Length  224/  4, alignment  0/ 9, fail :	293.938	514.812	46.3125	51.1875	303.312
Length  224/  5, alignment  1/ 3, found:	265.5	309.812	61.75	62.875	192.812
Length  224/  5, alignment  0/ 9, fail :	258.438	275.75	47.0625	52.875	186.25
Length  224/  6, alignment  1/ 3, found:	320.125	567.688	83.5	104.438	241.25
Length  224/  6, alignment  0/ 9, fail :	295.125	559.062	72.4375	90.4375	229.062
Length  224/  7, alignment  1/ 3, found:	300.875	140.812	73.875	76.1875	222.75
Length  224/  7, alignment  0/ 9, fail :	292.812	130	55.5625	62.3125	214.188
Length  224/  8, alignment  1/ 3, found:	302.812	139.438	68.125	77.625	231.062
Length  224/  8, alignment  0/ 9, fail :	286.438	123.125	50.625	65.0625	220
Length  224/  9, alignment  1/ 3, found:	297.5	826.562	71.0625	89.9375	237.125
Length  224/  9, alignment  0/ 9, fail :	271.25	840.25	59.25	72.5	215.938
Length  224/ 10, alignment  1/ 3, found:	289.438	1008.06	70	70.75	235.875
Length  224/ 10, alignment  0/ 9, fail :	274.688	1017.12	46.5	55.0625	221.312
Length  224/ 11, alignment  1/ 3, found:	325.812	1010.56	78.9375	93.875	238.438
Length  224/ 11, alignment  0/ 9, fail :	308.062	1004.31	56.6875	70.625	233.875
Length  224/ 12, alignment  1/ 3, found:	281.062	983.875	70.3125	71.875	250.75
Length  224/ 12, alignment  0/ 9, fail :	259.938	983.75	46.4375	55.1875	243.188
Length  224/ 13, alignment  1/ 3, found:	305.438	55.0625	72.3125	70.1875	52.75
Length  224/ 13, alignment  0/ 9, fail :	281.938	45.8125	46.625	55.125	45.25
Length  224/ 14, alignment  1/ 3, found:	299	49.5625	74.9375	70.4375	51.625
Length  224/ 14, alignment  0/ 9, fail :	284.812	32.8125	46.375	52.6875	33
Length  224/ 15, alignment  1/ 3, found:	311.188	48.5625	78.4375	72.1875	47.4375
Length  224/ 15, alignment  0/ 9, fail :	279.688	30.625	46.5	55	29.875
Length  224/ 16, alignment  1/ 3, found:	429	1053.5	94.8125	105	282.75
Length  224/ 16, alignment  0/ 9, fail :	394.5	1006.56	63.375	74.125	269.125
Length  256/  1, alignment  1/ 3, found:	344.938	36.5	34.5	43.875	36.6875
Length  256/  1, alignment  0/ 9, fail :	336.938	40.3125	29.3125	45.4375	38.875
Length  256/  2, alignment  1/ 3, found:	317.688	158.125	66.25	70.8125	666.375
Length  256/  2, alignment  0/ 9, fail :	303.125	139.062	55.6875	62.75	630.938
Length  256/  3, alignment  1/ 3, found:	301.812	243	66.3125	84.125	235.188
Length  256/  3, alignment  0/ 9, fail :	289.5	230.938	69.125	89.5625	227.562
Length  256/  4, alignment  1/ 3, found:	431.562	278.5	59.5	70.9375	313.875
Length  256/  4, alignment  0/ 9, fail :	415.375	259.125	58.5	71.25	289.5
Length  256/  5, alignment  1/ 3, found:	353.438	1133.94	66.625	81.375	335.125
Length  256/  5, alignment  0/ 9, fail :	335.812	1082.75	60.375	75.6875	316.625
Length  256/  6, alignment  1/ 3, found:	321.812	251.125	58.375	72.6875	304.875
Length  256/  6, alignment  0/ 9, fail :	302.312	208.625	60.3125	72.75	284.312
Length  256/  7, alignment  1/ 3, found:	463.25	232.875	78.5	97.625	270
Length  256/  7, alignment  0/ 9, fail :	445.375	210	69.5	89.1875	257.375
Length  256/  8, alignment  1/ 3, found:	371.062	166.625	69.375	94.0625	266.875
Length  256/  8, alignment  0/ 9, fail :	355	136.188	66.1875	89.75	251.688
Length  256/  9, alignment  1/ 3, found:	321.062	164.938	61.0625	80.375	255.375
Length  256/  9, alignment  0/ 9, fail :	306.625	137.875	59.875	76.6875	242.562
Length  256/ 10, alignment  1/ 3, found:	323.688	166.25	57.4375	73.4375	253.062
Length  256/ 10, alignment  0/ 9, fail :	298.688	138.75	59.625	68.625	250.312
Length  256/ 11, alignment  1/ 3, found:	318.438	161.312	54.8125	80.9375	272.188
Length  256/ 11, alignment  0/ 9, fail :	298.875	132.188	55.4375	68.875	263.812
Length  256/ 12, alignment  1/ 3, found:	325.938	161.875	61.0625	87.6875	255.312
Length  256/ 12, alignment  0/ 9, fail :	301.812	135.125	65	80.1875	250.125
Length  256/ 13, alignment  1/ 3, found:	337.188	401.938	50.6875	73.375	271.438
Length  256/ 13, alignment  0/ 9, fail :	308.438	356.562	49.25	63.8125	258.438
Length  256/ 14, alignment  1/ 3, found:	312.5	242.75	51.125	73.0625	242
Length  256/ 14, alignment  0/ 9, fail :	287.625	206.125	51.125	60.9375	238.5
Length  256/ 15, alignment  1/ 3, found:	293.25	244.188	48.875	73.0625	249.312
Length  256/ 15, alignment  0/ 9, fail :	288	205.188	49.8125	60.3125	239.5
Length  256/ 16, alignment  1/ 3, found:	343.75	51.625	47.9375	75.5625	50.625
Length  256/ 16, alignment  0/ 9, fail :	322.875	35.3125	49.3125	59.9375	33.9375
Length  256/ 16, alignment  1/11, found:	340.938	46.9375	48.625	73.8125	49.4375
Length  256/ 16, alignment 14/ 5, fail :	323.062	32.5625	49.8125	59.625	33.875
Length  256/ 32, alignment  1/11, found:	306.75	894.188	49.6875	105.938	408.625
Length  256/ 32, alignment 14/ 5, fail :	280.5	781.875	49.5	60.5	395.75
Length  256/ 64, alignment  1/11, found:	459	1323.25	75.875	170.438	701.812
Length  256/ 64, alignment 14/ 5, fail :	413.25	1125.06	68.0625	89.5625	687.5
Length  256/128, alignment  1/11, found:	431.188	2134.38	139.688	205.125	1295.88
Length  256/128, alignment 14/ 5, fail :	356.188	1900.06	55.5	61.5	1276.75
Length  256/256, alignment  1/11, found:	548.438	81.125	264	574.625	75.9375
Length  256/256, alignment 14/ 5, fail :	329.5	60.375	65.8125	78.9375	62.3125
Length  512/ 16, alignment  1/11, found:	798.5	1669.25	81.4375	105.75	421.812
Length  512/ 16, alignment 14/ 5, fail :	767.062	1608.88	74.0625	100.25	395.688
Length  512/ 32, alignment  1/11, found:	745.938	1202.19	91.125	180.25	435.625
Length  512/ 32, alignment 14/ 5, fail :	671.5	1104.69	86.875	122.875	419.25
Length  512/ 64, alignment  1/11, found:	1000.75	1475	96.875	229.562	707
Length  512/ 64, alignment 14/ 5, fail :	597.438	1275.12	94.875	143	696.812
Length  512/128, alignment  1/11, found:	713.812	2273.5	170.625	248.25	1316.31
Length  512/128, alignment 14/ 5, fail :	614.688	1924.94	84.4375	116.125	1280.94
Length  512/256, alignment  1/11, found:	914.938	4177.12	299.5	445.812	2491.88
Length  512/256, alignment 14/ 5, fail :	686.625	3612.75	113.5	176.75	2472.44
Length 1024/ 16, alignment  1/11, found:	1423.75	4580.5	184.938	305.062	530
Length 1024/ 16, alignment 14/ 5, fail :	1264.88	4479.94	176.562	277.938	487.5
Length 1024/ 32, alignment  1/11, found:	1894.75	1585.88	137.562	275.562	620.562
Length 1024/ 32, alignment 14/ 5, fail :	1645.44	1470.44	133.938	227.125	599.25
Length 1024/ 64, alignment  1/11, found:	1476.81	1970.31	243	425.375	826.375
Length 1024/ 64, alignment 14/ 5, fail :	1336.94	1771.75	247.125	363.562	799.25
Length 1024/128, alignment  1/11, found:	1439.56	2714.44	213.438	334.062	1363.44
Length 1024/128, alignment 14/ 5, fail :	1202.56	2473.94	128.188	198.938	1326.44
Length 1024/256, alignment  1/11, found:	1598.38	4090.94	373.062	544.312	2544.19
Length 1024/256, alignment 14/ 5, fail :	1237.62	3637.56	154.938	241.625	2494.31
Length 2048/ 16, alignment  1/11, found:	3643.25	9594.62	244.438	427.625	967.125
Length 2048/ 16, alignment 14/ 5, fail :	3444.25	9472.19	244.75	424.375	900.25
Length 2048/ 32, alignment  1/11, found:	3669.06	2153.62	211.312	449.5	843.5
Length 2048/ 32, alignment 14/ 5, fail :	3619.38	2015.75	235.812	394.812	800.312
Length 2048/ 64, alignment  1/11, found:	2559.75	2449.56	241.75	494.125	1088.69
Length 2048/ 64, alignment 14/ 5, fail :	2377.44	2210.19	236.688	423.25	1032.62
Length 2048/128, alignment  1/11, found:	2987.94	3316.69	420.75	794.938	1585.12
Length 2048/128, alignment 14/ 5, fail :	2705.75	3118.12	340.938	648.625	1537.62
Length 2048/256, alignment  1/11, found:	3170.81	5005.38	530.188	893.875	2705
Length 2048/256, alignment 14/ 5, fail :	2731.31	4535.81	327.75	642.312	2647.69
Length 4096/ 16, alignment  1/11, found:	5343.5	19144.8	536.25	1089.5	1553.5
Length 4096/ 16, alignment 14/ 5, fail :	5217.44	19170.2	493.438	993.25	1456.69
Length 4096/ 32, alignment  1/11, found:	7072.5	4249.25	492.188	1066.38	1455.69
Length 4096/ 32, alignment 14/ 5, fail :	6993.19	4061.94	466.812	985.125	1393.56
Length 4096/ 64, alignment  1/11, found:	5117.25	3570.31	465.188	1093.12	1370.81
Length 4096/ 64, alignment 14/ 5, fail :	5051.31	3372.44	653.625	1009	1323.5
Length 4096/128, alignment  1/11, found:	5459.12	3780.44	434.812	954.125	1661.44
Length 4096/128, alignment 14/ 5, fail :	5142.81	3402.81	327.875	809.25	1619.19
Length 4096/256, alignment  1/11, found:	5971.44	5554.25	957.562	1919.31	3112.19
Length 4096/256, alignment 14/ 5, fail :	5550.38	5189	761.625	1496.19	3061.38
Length 8192/ 16, alignment  1/11, found:	9681.88	40017.5	3486.19	1857.81	2314.12
Length 8192/ 16, alignment 14/ 5, fail :	9640.31	39833.3	3078.38	1816.25	2244.5
Length 8192/ 32, alignment  1/11, found:	10085.7	7358.25	744.312	1513	1976.62
Length 8192/ 32, alignment 14/ 5, fail :	9982.5	7094.75	767.312	1469.19	1899.62
Length 8192/ 64, alignment  1/11, found:	10161.9	5016.19	924.062	2150.12	1796.06
Length 8192/ 64, alignment 14/ 5, fail :	9865	4756.69	939.125	2012.38	1726.5
Length 8192/128, alignment  1/11, found:	14496.5	6407.75	873.625	2102.81	2431.75
Length 8192/128, alignment 14/ 5, fail :	14451.4	6055.88	786.938	1988.5	2540.62
Length 8192/256, alignment  1/11, found:	12171.9	7560.44	1695.62	3455.44	3896.44
Length 8192/256, alignment 14/ 5, fail :	11865.8	6813.69	1503.81	3062.56	3823
Length 16384/ 16, alignment  1/11, found:	26405.8	2771	1939.75	3458.44	4665.75
Length 16384/ 16, alignment 14/ 5, fail :	26365.4	2703.5	1967.75	3541.12	4614.38
Length 16384/ 32, alignment  1/11, found:	21014.2	12941.8	1540.38	2908.25	3856.19
Length 16384/ 32, alignment 14/ 5, fail :	20926.2	12508	1345.94	2866.31	3699.19
Length 16384/ 64, alignment  1/11, found:	21047.8	9763.06	1968.81	4327.56	3937.94
Length 16384/ 64, alignment 14/ 5, fail :	20831.2	9370.19	1993.75	4215.69	3806.38
Length 16384/128, alignment  1/11, found:	20901.2	8829	1535.44	3780.56	4409.81
Length 16384/128, alignment 14/ 5, fail :	20554.6	8484.38	1414.75	3595.25	4334.81
Length 16384/256, alignment  1/11, found:	22051.7	9850.38	2957.75	5501.44	5395.38
Length 16384/256, alignment 14/ 5, fail :	21783	9125.38	2682.62	5099.88	5283.88
Length 32768/ 16, alignment  1/11, found:	44759.6	172535	7820.19	11262.9	10746.4
Length 32768/ 16, alignment 14/ 5, fail :	44607.6	165730	8196.88	10871.2	10649.1
Length 32768/ 32, alignment  1/11, found:	44149.3	32756.8	5709.19	6611.56	8476.81
Length 32768/ 32, alignment 14/ 5, fail :	44038.1	33773.8	5716.12	6647.06	8526.75
Length 32768/ 64, alignment  1/11, found:	40456.6	29023.6	7160.44	10143.7	5885.75
Length 32768/ 64, alignment 14/ 5, fail :	40481.6	28606.3	7021.38	10150.6	5737.75
Length 32768/128, alignment  1/11, found:	47779	29011.2	4935.31	6756.56	7652.69
Length 32768/128, alignment 14/ 5, fail :	47737	28753.4	4774.38	6746.19	7519.75
Length 32768/256, alignment  1/11, found:	47323.5	23237.3	7933.19	12563.8	7731.69
Length 32768/256, alignment 14/ 5, fail :	47043.3	21512.8	7975	12558.6	7520.44
Length 65536/ 16, alignment  1/11, found:	83641.9	326590	9066.69	9419.62	22823.9
Length 65536/ 16, alignment 14/ 5, fail :	83851.6	326332	8496	9384.75	22353.8
Length 65536/ 32, alignment  1/11, found:	79622.9	54849.7	10258.8	11192.4	11908.4
Length 65536/ 32, alignment 14/ 5, fail :	79978.1	54414.6	8712.12	11172.3	11775.9
Length 65536/ 64, alignment  1/11, found:	84139.6	49233.6	11085.2	18162.1	12065.2
Length 65536/ 64, alignment 14/ 5, fail :	84550.4	48807	11219.6	17921.5	12006.4
Length 65536/128, alignment  1/11, found:	82355.9	28967.7	9753.56	18704.6	12264.6
Length 65536/128, alignment 14/ 5, fail :	82424.4	27971.3	9588.81	18465.6	12060.8
Length 65536/256, alignment  1/11, found:	114307	26580.4	18333.3	28505.2	10345.7
Length 65536/256, alignment 14/ 5, fail :	114129	26039.8	18018.8	27990.8	9900.94
Length 65536/ 64, complex needle 1:	3.74241e+06	46785.3	16544	866685	862864
Length 65536/ 64, complex needle 2:	4.31153e+06	304627	11677.8	990792	986300
Length 65536/ 64, complex needle 3:	989750	1.57276e+06	97158.6	446779	18592.6
Length 65536/256, complex needle 1:	1.20009e+07	32962.1	47821.9	1.0267e+06	1.00796e+06
Length 65536/256, complex needle 2:	1.79424e+07	275658	64927.6	1.06521e+06	1.03903e+06
Length 65536/256, complex needle 3:	991491	1.5706e+06	105750	459469	8755.81
Length 65536/1024, complex needle 1:	6.14575e+07	24270.9	71510.6	216958	25173.1
Length 65536/1024, complex needle 2:	6.84464e+07	276943	78251.6	442606	341867
Length 65536/1024, complex needle 3:	988035	1.56111e+06	97955.1	429106	1.56998e+06

-----Original Message-----
From: Noah Goldstein <goldstein.w.n@gmail.com> 
Sent: Thursday, May 26, 2022 3:27 PM
To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
Cc: GNU C Library <libc-alpha@sourceware.org>
Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX

On Thu, May 26, 2022 at 4:41 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha 
> <libc-alpha@sourceware.org> wrote:
> >
> > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> >
> > (1) We spend a few cycles at the beginning to peek into the needle. 
> > We locate an edge in the needle (first occurrence of 2 consecutive 
> > distinct
> > characters) and also store the first 64-bytes into a zmm register.
> >
> > (2) We search for the edge in the haystack by looking into one cache 
> > line of the haystack at a time. This avoids having to read past a 
> > page boundary which can cause a seg fault.
> >
> > (3) If an edge is found in the haystack we first compare the first 
> > 64-bytes of the needle (already stored in a zmm register) before we 
> > proceed with a full string compare performed byte by byte.
> >
> > Benchmarking data on ICX shows up to 2x speed up when compared to 
> > __strstr_sse2_unaligned (including partial benchtests data from
> > bench-strstr.out):
> >
> > |---------------------------------+---------------+-----------------------|
> > |                                 | strstr_avx512 | 
> > | strstr_sse2_unaligned |
> > |---------------------------------+---------------+-----------------------|
> > | Length 16384/ 16,  1/11, found: | 1939.75       | 3458.44               |
> > | Length 16384/ 16, 14/ 5, fail : | 1967.75       | 3541.12               |
> > | Length 16384/ 32,  1/11, found: | 1540.38       | 2908.25               |
> > | Length 16384/ 32, 14/ 5, fail : | 1345.94       | 2866.31               |
> > | Length 16384/ 64,  1/11, found: | 1968.81       | 4327.56               |
> > | Length 16384/ 64, 14/ 5, fail : | 1993.75       | 4215.69               |
> > | Length 16384/128,  1/11, found: | 1535.44       | 3780.56               |
> > | Length 16384/128, 14/ 5, fail : | 1414.75       | 3595.25               |
> > | Length 16384/256,  1/11, found: | 2957.75       | 5501.44               |
> > | Length 16384/256, 14/ 5, fail : | 2682.62       | 5099.88               |
> > | Length 32768/ 16,  1/11, found: | 7820.19       | 11262.9               |
> > | Length 32768/ 16, 14/ 5, fail : | 8196.88       | 10871.2               |
> > | Length 32768/ 32,  1/11, found: | 5709.19       | 6611.56               |
> > | Length 32768/ 32, 14/ 5, fail : | 5716.12       | 6647.06               |
> > | Length 32768/ 64,  1/11, found: | 7160.44       | 10143.7               |
> > | Length 32768/ 64, 14/ 5, fail : | 7021.38       | 10150.6               |
> > | Length 32768/128,  1/11, found: | 4935.31       | 6756.56               |
> > | Length 32768/128, 14/ 5, fail : | 4774.38       | 6746.19               |
> > | Length 32768/256,  1/11, found: | 7933.19       | 12563.8               |
> > | Length 32768/256, 14/ 5, fail : | 7975          | 12558.6               |
> > | Length 65536/ 16,  1/11, found: | 9066.69       | 9419.62               |
> > | Length 65536/ 16, 14/ 5, fail : | 8496          | 9384.75               |
> > | Length 65536/ 32,  1/11, found: | 10258.8       | 11192.4               |
> > | Length 65536/ 32, 14/ 5, fail : | 8712.12       | 11172.3               |
> > | Length 65536/ 64,  1/11, found: | 11085.2       | 18162.1               |
> > | Length 65536/ 64, 14/ 5, fail : | 11219.6       | 17921.5               |
> > | Length 65536/128,  1/11, found: | 9753.56       | 18704.6               |
> > | Length 65536/128, 14/ 5, fail : | 9588.81       | 18465.6               |
> > | Length 65536/256,  1/11, found: | 18333.3       | 28505.2               |
> > | Length 65536/256, 14/ 5, fail : | 18018.8       | 27990.8               |
> > |---------------------------------+---------------+-----------------------|
>
> Can you add aggregate geomean all benchmarks sse2 / evex512?
>
> Also can you add all numbers to the email.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile          |   2 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
> >  sysdeps/x86_64/multiarch/strstr-avx512.c   | 208 +++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
> >  4 files changed, 236 insertions(+), 4 deletions(-)  create mode 
> > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile 
> > b/sysdeps/x86_64/multiarch/Makefile
> > index e7b413edad..6dc54a7265 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -126,6 +126,7 @@ sysdep_routines += \
> >    strrchr-sse2 \
> >    strspn-c \
> >    strspn-sse2 \
> > +  strstr-avx512 \
> >    strstr-sse2-unaligned \
> >    varshift \
> >  # sysdep_routines
> > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4  CFLAGS-strcspn-c.c 
> > += -msse4  CFLAGS-strpbrk-c.c += -msse4  CFLAGS-strspn-c.c += -msse4
> > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq 
> > +-mavx512bw -mbmi -mbmi2 -O3
> >  endif
> >
> >  ifeq ($(subdir),wcsmbs)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c 
> > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a594f4176e..cc9a7eaaa1 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, 
> > struct libc_ifunc_impl *array,
> >
> >    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
> >    IFUNC_IMPL (i, name, strstr,
> > +              IFUNC_IMPL_ADD (array, i, strstr,
> > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > +                               && CPU_FEATURE_USABLE (AVX512DQ)
> > +                               && CPU_FEATURE_USABLE (BMI2)),
> > +                              __strstr_avx512)
> >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> >
> > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c 
> > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > new file mode 100644
> > index 0000000000..4082a75a1b
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > @@ -0,0 +1,208 @@
> > +/* strstr optimized with 512-bit AVX-512 instructions
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <immintrin.h>
> > +#include <inttypes.h>
> > +#include <stdbool.h>
> > +#include <string.h>
> > +
> > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull 
> > +#define ZMM_SIZE_IN_BYTES 64
> > +
> > +/*
> > + Returns the index of the first edge within the needle, returns 0 
> > +if no edge  is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > + */
> > +static inline size_t
> > +find_edge_in_needle (const char *ned) {
> > +  size_t ind = 0;
> > +  while (ned[ind + 1] != '\0')
> > +    {
> > +      if (ned[ind] != ned[ind + 1])
> > +        return ind;
> > +      else
> > +        ind = ind + 1;
> > +    }
> > +  return 0;
> > +}
> > +
> > +/*
> > + Compare needle with haystack byte by byte at specified location  
> > +*/ static inline bool verify_string_match (const char *hay, const 
> > +size_t hay_index, const char *ned,
> > +                     size_t ind)
> > +{
> > +  while (ned[ind] != '\0')
> > +    {
> > +      if (ned[ind] != hay[hay_index + ind])
> > +        return false;
> > +      ind = ind + 1;
> > +    }
> > +  return true;
> > +}
> > +
> > +/*
> > + Compare needle with haystack at specified location. The first 64 
> > +bytes are  compared using a ZMM register.
> > + */
> > +static inline bool
> > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > +                            const char *ned, const __mmask64 ned_mask,
> > +                            const __m512i ned_zmm) {
> > +  /* check first 64 bytes using zmm and then scalar */
> > +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe 
> > +to do so
> > +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, 
> > +hay_zmm, ned_zmm);
> > +  if (match != 0x0) // failed the first few chars
> > +    return false;
> > +  else if (ned_mask == FULL_MMASK64)
> > +    return verify_string_match (hay, hay_index, ned, 
> > +ZMM_SIZE_IN_BYTES);
> > +  return true;
> > +}
> > +
> > +char *
> > +__strstr_avx512 (const char *haystack, const char *ned) {
> > +  char first = ned[0];
> > +  if (first == '\0')
> > +    return (char *)haystack;
> > +  if (ned[1] == '\0')
> > +    return (char *)strchr (haystack, ned[0]);
> > +
> > +  size_t edge = find_edge_in_needle (ned);
> > +
> > +  /* ensure haystack is as long as the pos of edge in needle */  
> > + for (int ii = 0; ii < edge; ++ii)
> > +    {
> > +      if (haystack[ii] == '\0')
> > +        return NULL;
> > +    }
> > +
> > +  const __m512i null = _mm512_setzero_si512 (); // '\0'
> > +
> > +  /*
> > +   Load 64 bytes of the needle and save it to a zmm register
> > +   Read one cache line at a time to avoid loading across a page boundary
> > +   */
> > +  __mmask64 ned_load_mask
> > +      = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));  
> > + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
> > +  __mmask64 ned_nullmask
> > +      = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm, null);  
> > + if (__glibc_unlikely (ned_nullmask == 0x0))
> > +    {
> > +      ned_zmm = _mm512_loadu_si512 (ned);
> > +      ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > +      if (ned_nullmask != 0x0)
> > +        ned_load_mask = ned_load_mask >> 1;
> > +    }
> > +  else
> > +    {
> > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > +      ned_load_mask = ned_load_mask >> 1;
> > +    }
> > +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);  const __m512i 
> > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > +
> > +  /*
> > +   Read the bytes of haystack in the current cache line
> > +   */
> > +  size_t hay_index = edge;
> > +  __mmask64 loadmask = _bzhi_u64 (
> > +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 
> > + 63));
> > +  /* First load is a partial cache line */  __m512i hay0 = 
> > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > +  /* Search for NULL and compare only till null char */
> > +  __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask, hay0, 
> > + null);
> > +  __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);  cmpmask = 
> > + _kand_mask64 (cmpmask, loadmask);
> > +  /* Search for the 2 characters of needle */
> > +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > +  k1 = _kshiftri_mask64 (k1, 1);
> > +  /* k2 masks tell us if both chars from needle match */  uint64_t 
> > + k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), 
> > + cmpmask));
> > +  /* For every match, search for the entire needle for a full match 
> > + */  while (k2)
> > +    {
> > +      uint64_t bitcount = _tzcnt_u64(k2);
> > +      k2 = _blsr_u64(k2);
> > +      size_t match_pos = hay_index + bitcount - edge;
> > +      if (nullmask == 0)
> > +        {
> > +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> > +                                          ned_load_mask, ned_zmm))
> > +            return (char *)haystack + match_pos;
> > +        }
> > +      else
> > +        {
> > +          if (verify_string_match (haystack, match_pos, ned, 0))
> > +            return (char *)haystack + match_pos;
> > +        }
> > +    }
> > +  /* We haven't checked for potential match at the last char yet */  
> > + hay_index += _mm_popcnt_u64 (loadmask) - 1;
> > +
> > +  /*
> > +   Loop over one cache line at a time to prevent reading over page
> > +   boundary
> > +   */
> > +  __m512i hay1;
> > +  while (nullmask == 0)
> > +    {
> > +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > +      hay1 = _mm512_load_si512 (haystack + hay_index
> > +                                + 1); // Always 64 byte aligned
> > +      nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > +      /* Compare only till null char */
> > +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > +      /* k2 masks tell us if both chars from needle match */
> > +      k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
> > +      /* For every match, compare full strings for potential match */
> > +      while (k2)
> > +        {
> > +          uint64_t bitcount = _tzcnt_u64(k2);
> > +          k2 = _blsr_u64(k2);
> > +          size_t match_pos = hay_index + bitcount - edge;
> > +          if (nullmask == 0)
> > +            {
> > +              /*
> > +               Since the haystack doesn't terminate at the current cache
> > +               line, we can use zmm register to compare the first 64 bytes
> > +               */
> > +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> > +                                              ned_load_mask, ned_zmm))
> > +                return (char *)haystack + match_pos;
> > +            }
> > +          else
> > +            {
> > +              /* Compare byte by byte */
> > +              if (verify_string_match (haystack, match_pos, ned, 0))
> > +                return (char *)haystack + match_pos;
> > +            }
> > +        }
> > +      hay_index += ZMM_SIZE_IN_BYTES;
> > +    }
> > +  return NULL;
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strstr.c 
> > b/sysdeps/x86_64/multiarch/strstr.c
> > index 95600a9de5..2fb8b169b6 100644
> > --- a/sysdeps/x86_64/multiarch/strstr.c
> > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > @@ -35,16 +35,32 @@
> >
> >  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned 
> > attribute_hidden;  extern __typeof (__redirect_strstr) __strstr_sse2 
> > attribute_hidden;
> > +extern __typeof (__redirect_strstr) __strstr_avx512 
> > +attribute_hidden;
> >
> >  #include "init-arch.h"
> >
> >  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> >     ifunc symbol properly.  */
> >  extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc 
> > (__libc_strstr,
> > -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > -           ? __strstr_sse2_unaligned
> > -           : __strstr_sse2)
> >
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > +  const struct cpu_features *cpu_features = __get_cpu_features ();
> > +
> > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > +    return __strstr_avx512;
> > +
> > +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > +    return __strstr_sse2_unaligned;
> > +
> > +  return __strstr_sse2;
> > +}
> > +
> > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, 
> > +IFUNC_SELECTOR ());
> >  #undef strstr
> >  strong_alias (__libc_strstr, strstr)
> > --
> > 2.36.1
> >

Can you run clang-format on this where it makes sense.
  
develop--- via Libc-alpha May 31, 2022, 7:16 p.m. UTC | #5
> -----Original Message-----
> From: Noah Goldstein <goldstein.w.n@gmail.com>
> Sent: Thursday, May 26, 2022 2:26 PM
> To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> Cc: GNU C Library <libc-alpha@sourceware.org>
> Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
> 
> On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha <libc-
> alpha@sourceware.org> wrote:
> >
> > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> >
> > (1) We spend a few cycles at the beginning to peek into the needle. We
> > locate an edge in the needle (first occurrence of 2 consecutive distinct
> > characters) and also store the first 64 bytes into a zmm register.
> >
> > (2) We search for the edge in the haystack by looking into one cache
> > line of the haystack at a time. This avoids having to read past a page
> > boundary which can cause a seg fault.
> >
> > (3) If an edge is found in the haystack we first compare the first
> > 64-bytes of the needle (already stored in a zmm register) before we
> > proceed with a full string compare performed byte by byte.
> >
> > Benchmarking data on ICX shows up to 2x speed up when compared to
> > __strstr_sse2_unaligned (including partial benchtests data from
> > bench-strstr.out):
> >
> > |---------------------------------+---------------+-----------------------|
> > |                                 | strstr_avx512 |
> > | strstr_sse2_unaligned |
> > |---------------------------------+---------------+-----------------------|
> > | Length 16384/ 16,  1/11, found: | 1939.75       | 3458.44               |
> > | Length 16384/ 16, 14/ 5, fail : | 1967.75       | 3541.12               |
> > | Length 16384/ 32,  1/11, found: | 1540.38       | 2908.25               |
> > | Length 16384/ 32, 14/ 5, fail : | 1345.94       | 2866.31               |
> > | Length 16384/ 64,  1/11, found: | 1968.81       | 4327.56               |
> > | Length 16384/ 64, 14/ 5, fail : | 1993.75       | 4215.69               |
> > | Length 16384/128,  1/11, found: | 1535.44       | 3780.56               |
> > | Length 16384/128, 14/ 5, fail : | 1414.75       | 3595.25               |
> > | Length 16384/256,  1/11, found: | 2957.75       | 5501.44               |
> > | Length 16384/256, 14/ 5, fail : | 2682.62       | 5099.88               |
> > | Length 32768/ 16,  1/11, found: | 7820.19       | 11262.9               |
> > | Length 32768/ 16, 14/ 5, fail : | 8196.88       | 10871.2               |
> > | Length 32768/ 32,  1/11, found: | 5709.19       | 6611.56               |
> > | Length 32768/ 32, 14/ 5, fail : | 5716.12       | 6647.06               |
> > | Length 32768/ 64,  1/11, found: | 7160.44       | 10143.7               |
> > | Length 32768/ 64, 14/ 5, fail : | 7021.38       | 10150.6               |
> > | Length 32768/128,  1/11, found: | 4935.31       | 6756.56               |
> > | Length 32768/128, 14/ 5, fail : | 4774.38       | 6746.19               |
> > | Length 32768/256,  1/11, found: | 7933.19       | 12563.8               |
> > | Length 32768/256, 14/ 5, fail : | 7975          | 12558.6               |
> > | Length 65536/ 16,  1/11, found: | 9066.69       | 9419.62               |
> > | Length 65536/ 16, 14/ 5, fail : | 8496          | 9384.75               |
> > | Length 65536/ 32,  1/11, found: | 10258.8       | 11192.4               |
> > | Length 65536/ 32, 14/ 5, fail : | 8712.12       | 11172.3               |
> > | Length 65536/ 64,  1/11, found: | 11085.2       | 18162.1               |
> > | Length 65536/ 64, 14/ 5, fail : | 11219.6       | 17921.5               |
> > | Length 65536/128,  1/11, found: | 9753.56       | 18704.6               |
> > | Length 65536/128, 14/ 5, fail : | 9588.81       | 18465.6               |
> > | Length 65536/256,  1/11, found: | 18333.3       | 28505.2               |
> > | Length 65536/256, 14/ 5, fail : | 18018.8       | 27990.8               |
> > |---------------------------------+---------------+-----------------------|
> > ---
> >  sysdeps/x86_64/multiarch/Makefile          |   2 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
> >  sysdeps/x86_64/multiarch/strstr-avx512.c   | 208
> +++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
> >  4 files changed, 236 insertions(+), 4 deletions(-)  create mode
> > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile
> > b/sysdeps/x86_64/multiarch/Makefile
> > index e7b413edad..6dc54a7265 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -126,6 +126,7 @@ sysdep_routines += \
> >    strrchr-sse2 \
> >    strspn-c \
> >    strspn-sse2 \
> > +  strstr-avx512 \
> >    strstr-sse2-unaligned \
> >    varshift \
> >  # sysdep_routines
> > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4  CFLAGS-strcspn-c.c
> +=
> > -msse4  CFLAGS-strpbrk-c.c += -msse4  CFLAGS-strspn-c.c += -msse4
> > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -
> mavx512bw
> > +-mbmi -mbmi2 -O3
> >  endif
> >
> >  ifeq ($(subdir),wcsmbs)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a594f4176e..cc9a7eaaa1 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct
> > libc_ifunc_impl *array,
> >
> >    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
> >    IFUNC_IMPL (i, name, strstr,
> > +              IFUNC_IMPL_ADD (array, i, strstr,
> > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > +                               && CPU_FEATURE_USABLE (AVX512DQ)
> > +                               && CPU_FEATURE_USABLE (BMI2)),
> > +                              __strstr_avx512)
> >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> >
> > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > new file mode 100644
> > index 0000000000..4082a75a1b
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > @@ -0,0 +1,208 @@
> > +/* strstr optimized with 512-bit AVX-512 instructions
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <immintrin.h>
> > +#include <inttypes.h>
> > +#include <stdbool.h>
> > +#include <string.h>
> > +
> > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull
> > +#define ZMM_SIZE_IN_BYTES 64
> > +
> > +/*
> > + Returns the index of the first edge within the needle, returns 0 if
> > +no edge  is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > + */
> > +static inline size_t
> > +find_edge_in_needle (const char *ned) {
> > +  size_t ind = 0;
> > +  while (ned[ind + 1] != '\0')
> > +    {
> > +      if (ned[ind] != ned[ind + 1])
> > +        return ind;
> > +      else
> > +        ind = ind + 1;
> > +    }
> > +  return 0;
> > +}
> > +
> > +/*
> > + Compare needle with haystack byte by byte at specified location  */
> > +static inline bool verify_string_match (const char *hay, const size_t
> > +hay_index, const char *ned,
> > +                     size_t ind)
> > +{
> > +  while (ned[ind] != '\0')
> > +    {
>       strcmp? (you might be able to use memcmp which will be faster
>       but will need a bit of refactor to keep true nedlen and check for page
>       cross on hay)

Wouldn't strcmp give you the wrong answer here? For example, I would need it to return true when the haystack is "abcdefg" and the needle is "bcd".

> > +      if (ned[ind] != hay[hay_index + ind])
> > +        return false;
> > +      ind = ind + 1;
> > +    }
> > +  return true;
> > +}
> > +
> > +/*
> > + Compare needle with haystack at specified location. The first 64
> > +bytes are  compared using a ZMM register.
> > + */
> > +static inline bool
> > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > +                            const char *ned, const __mmask64 ned_mask,
> > +                            const __m512i ned_zmm) {
> > +  /* check first 64 bytes using zmm and then scalar */
> > +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe
> to
> > +do so
> > +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask,
> hay_zmm,
> > +ned_zmm);
> > +  if (match != 0x0) // failed the first few chars
> > +    return false;
> > +  else if (ned_mask == FULL_MMASK64)
> > +    return verify_string_match (hay, hay_index, ned,
> > +ZMM_SIZE_IN_BYTES);
> > +  return true;
> > +}
> > +
> > +char *
> > +__strstr_avx512 (const char *haystack, const char *ned) {
> > +  char first = ned[0];
> > +  if (first == '\0')
> > +    return (char *)haystack;
> > +  if (ned[1] == '\0')
> > +    return (char *)strchr (haystack, ned[0]);
> > +
> > +  size_t edge = find_edge_in_needle (ned);
> > +
> > +  /* ensure haystack is as long as the pos of edge in needle */  for
> > + (int ii = 0; ii < edge; ++ii)
> > +    {
>     strnlen

Makes sense. 

> > +      if (haystack[ii] == '\0')
> > +        return NULL;
> > +    }
> > +
> > +  const __m512i null = _mm512_setzero_si512 (); // '\0'
> > +
> > +  /*
> > +   Load 64 bytes of the needle and save it to a zmm register
> > +   Read one cache line at a time to avoid loading across a page boundary
> > +   */
> > +  __mmask64 ned_load_mask
> > +      = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
>     FULL_MMASK64 >> (((-(uintptr_t)ned) & 63));

+1

> > +  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask,
> ned);
>     Maybe conditional on highly unlike page cross this is very
>     expensive if causes page walk

Elements on the next cache line are zero masked, shouldn't that prevent a cross page load?  

> > +  __mmask64 ned_nullmask
> > +      = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm,
> null);
>     _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm,
> ned_zmm)
> 
>     likewise at all other compares with null unless it breaks
>     microfusion more than once.

The compiler was already generating vptestnmb; it doesn't hurt to use it explicitly anyway.

> 
>     If you can replace all then get rid of null
> > +  if (__glibc_unlikely (ned_nullmask == 0x0))
> > +    {
> > +      ned_zmm = _mm512_loadu_si512 (ned);
> > +      ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > +      if (ned_nullmask != 0x0)
> > +        ned_load_mask = ned_load_mask >> 1;
> > +    }
> > +  else
> > +    {
> > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > +      ned_load_mask = ned_load_mask >> 1;
>       I think you can get away with just ned_load_mask =
>       ned_nullmask - ONE_64BIT because you only use this after
>       checking haystack no null-term

Without the >> 1, we will compare the null char of the needle to the haystack, which will give you the wrong answer.

> > +    }
> > +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);  const __m512i
> > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > +
> > +  /*
> > +   Read the bytes of haystack in the current cache line
> > +   */
> > +  size_t hay_index = edge;
> > +  __mmask64 loadmask = _bzhi_u64 (
> > +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> > +  /* First load is a partial cache line */  __m512i hay0 =
> > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > +  /* Search for NULL and compare only till null char */
> > +  __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask,
> hay0,
> > + null);
> > +  __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);  cmpmask
> =
> > + _kand_mask64 (cmpmask, loadmask);
>   nullmask ^ (nullmask - ONE_64BIT); codegen ends up actually
>   using kand_mask here. Since loadmask and nullmask both go through
>   GPR (nullmask for the blsmsk) you can do this explicitly in uint64_t
>   to help GCC out.
> 
> > +  /* Search for the 2 characters of needle */
> > +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > +  k1 = _kshiftri_mask64 (k1, 1);
> > +  /* k2 masks tell us if both chars from needle match */  uint64_t k2
> > + = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> cmpmask));
> > +  /* For every match, search for the entire needle for a full match
> > + */  while (k2)
> > +    {
> > +      uint64_t bitcount = _tzcnt_u64(k2);
> > +      k2 = _blsr_u64(k2);
> > +      size_t match_pos = hay_index + bitcount - edge;
> > +      if (nullmask == 0)
> > +        {
> > +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> > +                                          ned_load_mask, ned_zmm))
> > +            return (char *)haystack + match_pos;
> > +        }
> > +      else
> > +        {
> > +          if (verify_string_match (haystack, match_pos, ned, 0))
> > +            return (char *)haystack + match_pos;
> > +        }
> > +    }
> > +  /* We haven't checked for potential match at the last char yet */
> > + hay_index += _mm_popcnt_u64 (loadmask) - 1;
>   hay_index = 0; haystay |= 63; You might want to check codegen and
>   ensure hay_index is being optimized out. AFAICT you just need a
>   pointer.

AFAICT, looks like it does optimize it out. 

> > +
> > +  /*
> > +   Loop over one cache line at a time to prevent reading over page
> > +   boundary
> > +   */
> > +  __m512i hay1;
> > +  while (nullmask == 0)
> > +    {
> > +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > +      hay1 = _mm512_load_si512 (haystack + hay_index
> > +                                + 1); // Always 64 byte aligned
>     Is this really faster than using kshiftri?

Yes (assuming you mean using just one load and mask shift operations to look for a match). A lot of instructions in this loop are stuck on port 5, and so is kshiftri. Using 2 loads, which execute on ports 2 and 3, relieves that pressure.

> > +      nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > +      /* Compare only till null char */
> > +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > +      /* k2 masks tell us if both chars from needle match */
> > +      k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> cmpmask));
> > +      /* For every match, compare full strings for potential match */
> > +      while (k2)
> > +        {
> > +          uint64_t bitcount = _tzcnt_u64(k2);
> > +          k2 = _blsr_u64(k2);
> > +          size_t match_pos = hay_index + bitcount - edge;
> > +          if (nullmask == 0)
> > +            {
> > +              /*
> > +               Since the haystack doesn't terminate at the current cache
> > +               line, we can use zmm register to compare the first 64 bytes
> > +               */
> > +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> > +                                              ned_load_mask, ned_zmm))
> > +                return (char *)haystack + match_pos;
> > +            }
> > +          else
> > +            {
> > +              /* Compare byte by byte */
> > +              if (verify_string_match (haystack, match_pos, ned, 0))
> > +                return (char *)haystack + match_pos;
> > +            }
> > +        }
> > +      hay_index += ZMM_SIZE_IN_BYTES;
> > +    }
> > +  return NULL;
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strstr.c
> > b/sysdeps/x86_64/multiarch/strstr.c
> > index 95600a9de5..2fb8b169b6 100644
> > --- a/sysdeps/x86_64/multiarch/strstr.c
> > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > @@ -35,16 +35,32 @@
> >
> >  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned
> > attribute_hidden;  extern __typeof (__redirect_strstr) __strstr_sse2
> > attribute_hidden;
> > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
> >
> >  #include "init-arch.h"
> >
> >  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> >     ifunc symbol properly.  */
> >  extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc
> > (__libc_strstr,
> > -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > -           ? __strstr_sse2_unaligned
> > -           : __strstr_sse2)
> >
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > +  const struct cpu_features *cpu_features = __get_cpu_features ();
> > +
> > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > +    return __strstr_avx512;
> > +
> > +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > +    return __strstr_sse2_unaligned;
> > +
> > +  return __strstr_sse2;
> > +}
> > +
> > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr,
> > +IFUNC_SELECTOR ());
> >  #undef strstr
> >  strong_alias (__libc_strstr, strstr)
> > --
> > 2.36.1
> >
  
develop--- via Libc-alpha May 31, 2022, 7:36 p.m. UTC | #6
> -----Original Message-----
> From: Devulapalli, Raghuveer
> Sent: Tuesday, May 31, 2022 12:17 PM
> To: Noah Goldstein <goldstein.w.n@gmail.com>
> Cc: GNU C Library <libc-alpha@sourceware.org>
> Subject: RE: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
> 
> 
> 
> > -----Original Message-----
> > From: Noah Goldstein <goldstein.w.n@gmail.com>
> > Sent: Thursday, May 26, 2022 2:26 PM
> > To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> > Cc: GNU C Library <libc-alpha@sourceware.org>
> > Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
> >
> > On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha
> > <libc- alpha@sourceware.org> wrote:
> > >
> > > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> > >
> > > (1) We spend a few cycles at the beginning to peek into the needle.
> > > We locate an edge in the needle (first occurrence of 2 consecutive
> > > distinct
> > > characters) and also store the first 64 bytes into a zmm register.
> > >
> > > (2) We search for the edge in the haystack by looking into one cache
> > > line of the haystack at a time. This avoids having to read past a
> > > page boundary which can cause a seg fault.
> > >
> > > (3) If an edge is found in the haystack we first compare the first
> > > 64-bytes of the needle (already stored in a zmm register) before we
> > > proceed with a full string compare performed byte by byte.
> > >
> > > Benchmarking data on ICX shows up to 2x speed up when compared to
> > > __strstr_sse2_unaligned (including partial benchtests data from
> > > bench-strstr.out):
> > >
> > > |---------------------------------+---------------+-----------------------|
> > > |                                 | strstr_avx512 |
> > > | strstr_sse2_unaligned |
> > > |---------------------------------+---------------+-----------------------|
> > > | Length 16384/ 16,  1/11, found: | 1939.75       | 3458.44               |
> > > | Length 16384/ 16, 14/ 5, fail : | 1967.75       | 3541.12               |
> > > | Length 16384/ 32,  1/11, found: | 1540.38       | 2908.25               |
> > > | Length 16384/ 32, 14/ 5, fail : | 1345.94       | 2866.31               |
> > > | Length 16384/ 64,  1/11, found: | 1968.81       | 4327.56               |
> > > | Length 16384/ 64, 14/ 5, fail : | 1993.75       | 4215.69               |
> > > | Length 16384/128,  1/11, found: | 1535.44       | 3780.56               |
> > > | Length 16384/128, 14/ 5, fail : | 1414.75       | 3595.25               |
> > > | Length 16384/256,  1/11, found: | 2957.75       | 5501.44               |
> > > | Length 16384/256, 14/ 5, fail : | 2682.62       | 5099.88               |
> > > | Length 32768/ 16,  1/11, found: | 7820.19       | 11262.9               |
> > > | Length 32768/ 16, 14/ 5, fail : | 8196.88       | 10871.2               |
> > > | Length 32768/ 32,  1/11, found: | 5709.19       | 6611.56               |
> > > | Length 32768/ 32, 14/ 5, fail : | 5716.12       | 6647.06               |
> > > | Length 32768/ 64,  1/11, found: | 7160.44       | 10143.7               |
> > > | Length 32768/ 64, 14/ 5, fail : | 7021.38       | 10150.6               |
> > > | Length 32768/128,  1/11, found: | 4935.31       | 6756.56               |
> > > | Length 32768/128, 14/ 5, fail : | 4774.38       | 6746.19               |
> > > | Length 32768/256,  1/11, found: | 7933.19       | 12563.8               |
> > > | Length 32768/256, 14/ 5, fail : | 7975          | 12558.6               |
> > > | Length 65536/ 16,  1/11, found: | 9066.69       | 9419.62               |
> > > | Length 65536/ 16, 14/ 5, fail : | 8496          | 9384.75               |
> > > | Length 65536/ 32,  1/11, found: | 10258.8       | 11192.4               |
> > > | Length 65536/ 32, 14/ 5, fail : | 8712.12       | 11172.3               |
> > > | Length 65536/ 64,  1/11, found: | 11085.2       | 18162.1               |
> > > | Length 65536/ 64, 14/ 5, fail : | 11219.6       | 17921.5               |
> > > | Length 65536/128,  1/11, found: | 9753.56       | 18704.6               |
> > > | Length 65536/128, 14/ 5, fail : | 9588.81       | 18465.6               |
> > > | Length 65536/256,  1/11, found: | 18333.3       | 28505.2               |
> > > | Length 65536/256, 14/ 5, fail : | 18018.8       | 27990.8               |
> > > |---------------------------------+---------------+-----------------------|
> > > ---
> > >  sysdeps/x86_64/multiarch/Makefile          |   2 +
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
> > >  sysdeps/x86_64/multiarch/strstr-avx512.c   | 208
> > +++++++++++++++++++++
> > >  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
> > >  4 files changed, 236 insertions(+), 4 deletions(-)  create mode
> > > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile
> > > b/sysdeps/x86_64/multiarch/Makefile
> > > index e7b413edad..6dc54a7265 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -126,6 +126,7 @@ sysdep_routines += \
> > >    strrchr-sse2 \
> > >    strspn-c \
> > >    strspn-sse2 \
> > > +  strstr-avx512 \
> > >    strstr-sse2-unaligned \
> > >    varshift \
> > >  # sysdep_routines
> > > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4  CFLAGS-strcspn-c.c
> > +=
> > > -msse4  CFLAGS-strpbrk-c.c += -msse4  CFLAGS-strspn-c.c += -msse4
> > > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -
> > mavx512bw
> > > +-mbmi -mbmi2 -O3
> > >  endif
> > >
> > >  ifeq ($(subdir),wcsmbs)
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index a594f4176e..cc9a7eaaa1 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name,
> > > struct libc_ifunc_impl *array,
> > >
> > >    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
> > >    IFUNC_IMPL (i, name, strstr,
> > > +              IFUNC_IMPL_ADD (array, i, strstr,
> > > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > > +                               && CPU_FEATURE_USABLE (AVX512DQ)
> > > +                               && CPU_FEATURE_USABLE (BMI2)),
> > > +                              __strstr_avx512)
> > >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> > >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > new file mode 100644
> > > index 0000000000..4082a75a1b
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > @@ -0,0 +1,208 @@
> > > +/* strstr optimized with 512-bit AVX-512 instructions
> > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#include <immintrin.h>
> > > +#include <inttypes.h>
> > > +#include <stdbool.h>
> > > +#include <string.h>
> > > +
> > > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull
> > > +#define ZMM_SIZE_IN_BYTES 64
> > > +
> > > +/*
> > > + Returns the index of the first edge within the needle, returns 0
> > > +if no edge  is found. Example: 'ab' is the first edge in
> 'aaaaaaaaaabaarddg'
> > > + */
> > > +static inline size_t
> > > +find_edge_in_needle (const char *ned) {
> > > +  size_t ind = 0;
> > > +  while (ned[ind + 1] != '\0')
> > > +    {
> > > +      if (ned[ind] != ned[ind + 1])
> > > +        return ind;
> > > +      else
> > > +        ind = ind + 1;
> > > +    }
> > > +  return 0;
> > > +}
> > > +
> > > +/*
> > > + Compare needle with haystack byte by byte at specified location
> > > +*/ static inline bool verify_string_match (const char *hay, const
> > > +size_t hay_index, const char *ned,
> > > +                     size_t ind)
> > > +{
> > > +  while (ned[ind] != '\0')
> > > +    {
> >       strcmp? (you might be able to use memcmp which will be faster
> >       but will need a bit of refactor to keep true nedlen and check for page
> >       cross on hay)
> 
> Wouldn't strcmp give you the wrong answer here? For ex: I would need it to
> return true when haystack is "abcdefg" and needle is "bcd"
> 
> > > +      if (ned[ind] != hay[hay_index + ind])
> > > +        return false;
> > > +      ind = ind + 1;
> > > +    }
> > > +  return true;
> > > +}
> > > +
> > > +/*
> > > + Compare needle with haystack at specified location. The first 64
> > > +bytes are  compared using a ZMM register.
> > > + */
> > > +static inline bool
> > > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > > +                            const char *ned, const __mmask64 ned_mask,
> > > +                            const __m512i ned_zmm) {
> > > +  /* check first 64 bytes using zmm and then scalar */
> > > +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); //
> safe
> > to
> > > +do so
> > > +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask
> (ned_mask,
> > hay_zmm,
> > > +ned_zmm);
> > > +  if (match != 0x0) // failed the first few chars
> > > +    return false;
> > > +  else if (ned_mask == FULL_MMASK64)
> > > +    return verify_string_match (hay, hay_index, ned,
> > > +ZMM_SIZE_IN_BYTES);
> > > +  return true;
> > > +}
> > > +
> > > +char *
> > > +__strstr_avx512 (const char *haystack, const char *ned) {
> > > +  char first = ned[0];
> > > +  if (first == '\0')
> > > +    return (char *)haystack;
> > > +  if (ned[1] == '\0')
> > > +    return (char *)strchr (haystack, ned[0]);
> > > +
> > > +  size_t edge = find_edge_in_needle (ned);
> > > +
> > > +  /* ensure haystack is as long as the pos of edge in needle */
> > > + for (int ii = 0; ii < edge; ++ii)
> > > +    {
> >     strnlen
> 
> Makes sense.

The function call here negatively affects performance for small-sized haystacks. I will keep this as-is.

> 
> > > +      if (haystack[ii] == '\0')
> > > +        return NULL;
> > > +    }
> > > +
> > > +  const __m512i null = _mm512_setzero_si512 (); // '\0'
> > > +
> > > +  /*
> > > +   Load 64 bytes of the needle and save it to a zmm register
> > > +   Read one cache line at a time to avoid loading across a page boundary
> > > +   */
> > > +  __mmask64 ned_load_mask
> > > +      = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
> >     FULL_MMASK64 >> (((-(uintptr_t)ned) & 63));
> 
> +1
> 
> > > +  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask,
> > ned);
> >     Maybe conditional on highly unlike page cross this is very
> >     expensive if causes page walk
> 
> Elements on the next cache line are zero masked, shouldn't that prevent a
> cross page load?
> 
> > > +  __mmask64 ned_nullmask
> > > +      = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm,
> > null);
> >     _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm,
> > ned_zmm)
> >
> >     likewise at all other compares with null unless it breaks
> >     microfusion more than once.
> 
> The compiler was using vptestnmb, doesn't hurt to explicitly use it anyways.
> 
> >
> >     If you can replace all then get rid of null
> > > +  if (__glibc_unlikely (ned_nullmask == 0x0))
> > > +    {
> > > +      ned_zmm = _mm512_loadu_si512 (ned);
> > > +      ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > +      if (ned_nullmask != 0x0)
> > > +        ned_load_mask = ned_load_mask >> 1;
> > > +    }
> > > +  else
> > > +    {
> > > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > +      ned_load_mask = ned_load_mask >> 1;
> >       I think you can get away with just ned_load_mask =
> >       ned_nullmask - ONE_64BIT because you only use this after
> >       checking haystack no null-term
> 
> Without the >> 1, we will compare the null char of the needle to the
> haystack which will give you the wrong answer.
> 
> > > +    }
> > > +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);  const
> __m512i
> > > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > > +
> > > +  /*
> > > +   Read the bytes of haystack in the current cache line
> > > +   */
> > > +  size_t hay_index = edge;
> > > +  __mmask64 loadmask = _bzhi_u64 (
> > > +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) &
> > > + 63));
> > > +  /* First load is a partial cache line */  __m512i hay0 =
> > > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > +  /* Search for NULL and compare only till null char */
> > > +  __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask
> (loadmask,
> > hay0,
> > > + null);
> > > +  __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> cmpmask
> > =
> > > + _kand_mask64 (cmpmask, loadmask);
> >   nullmask ^ (nullmask - ONE_64BIT); codegen ends up actually
> >   using kand_mask here. Since loadmask and nullmask both go through
> >   GPR (nullmask for the blsmsk) you can do this explicitly in uint64_t
> >   to help GCC out.
> >
> > > +  /* Search for the 2 characters of needle */  uint64_t
> > > +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > +  k1 = _kshiftri_mask64 (k1, 1);
> > > +  /* k2 masks tell us if both chars from needle match */  uint64_t
> > > + k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> > cmpmask));
> > > +  /* For every match, search for the entire needle for a full match
> > > + */  while (k2)
> > > +    {
> > > +      uint64_t bitcount = _tzcnt_u64(k2);
> > > +      k2 = _blsr_u64(k2);
> > > +      size_t match_pos = hay_index + bitcount - edge;
> > > +      if (nullmask == 0)
> > > +        {
> > > +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > +                                          ned_load_mask, ned_zmm))
> > > +            return (char *)haystack + match_pos;
> > > +        }
> > > +      else
> > > +        {
> > > +          if (verify_string_match (haystack, match_pos, ned, 0))
> > > +            return (char *)haystack + match_pos;
> > > +        }
> > > +    }
> > > +  /* We haven't checked for potential match at the last char yet */
> > > + hay_index += _mm_popcnt_u64 (loadmask) - 1;
> >   hay_index = 0; haystay |= 63; You might want to check codegen and
> >   ensure hay_index is being optimized out. AFAICT you just need a
> >   pointer.
> 
> AFAICT, looks like it does optimize it out.
> 
> > > +
> > > +  /*
> > > +   Loop over one cache line at a time to prevent reading over page
> > > +   boundary
> > > +   */
> > > +  __m512i hay1;
> > > +  while (nullmask == 0)
> > > +    {
> > > +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > +      hay1 = _mm512_load_si512 (haystack + hay_index
> > > +                                + 1); // Always 64 byte aligned
> >     Is this really faster than using kshiftri?
> 
> Yes (assuming you mean using just one load and use mask shift operations
> to look for a match). A lot of instructions in this loop are stuck on port 5 and
> > so is kshiftri. Using 2 loads which execute on Port 2 and 3 relieves that
> pressure.
> 
> > > +      nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > > +      /* Compare only till null char */
> > > +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > +      /* k2 masks tell us if both chars from needle match */
> > > +      k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> > cmpmask));
> > > +      /* For every match, compare full strings for potential match */
> > > +      while (k2)
> > > +        {
> > > +          uint64_t bitcount = _tzcnt_u64(k2);
> > > +          k2 = _blsr_u64(k2);
> > > +          size_t match_pos = hay_index + bitcount - edge;
> > > +          if (nullmask == 0)
> > > +            {
> > > +              /*
> > > +               Since the haystack doesn't terminate at the current cache
> > > +               line, we can use zmm register to compare the first 64 bytes
> > > +               */
> > > +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > +                                              ned_load_mask, ned_zmm))
> > > +                return (char *)haystack + match_pos;
> > > +            }
> > > +          else
> > > +            {
> > > +              /* Compare byte by byte */
> > > +              if (verify_string_match (haystack, match_pos, ned, 0))
> > > +                return (char *)haystack + match_pos;
> > > +            }
> > > +        }
> > > +      hay_index += ZMM_SIZE_IN_BYTES;
> > > +    }
> > > +  return NULL;
> > > +}
> > > diff --git a/sysdeps/x86_64/multiarch/strstr.c
> > > b/sysdeps/x86_64/multiarch/strstr.c
> > > index 95600a9de5..2fb8b169b6 100644
> > > --- a/sysdeps/x86_64/multiarch/strstr.c
> > > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > > @@ -35,16 +35,32 @@
> > >
> > >  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned
> > > attribute_hidden;  extern __typeof (__redirect_strstr) __strstr_sse2
> > > attribute_hidden;
> > > +extern __typeof (__redirect_strstr) __strstr_avx512
> > > +attribute_hidden;
> > >
> > >  #include "init-arch.h"
> > >
> > >  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> > >     ifunc symbol properly.  */
> > >  extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc
> > > (__libc_strstr,
> > > -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > > -           ? __strstr_sse2_unaligned
> > > -           : __strstr_sse2)
> > >
> > > +static inline void *
> > > +IFUNC_SELECTOR (void)
> > > +{
> > > +  const struct cpu_features *cpu_features = __get_cpu_features ();
> > > +
> > > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > > +    return __strstr_avx512;
> > > +
> > > +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > +    return __strstr_sse2_unaligned;
> > > +
> > > +  return __strstr_sse2;
> > > +}
> > > +
> > > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr,
> > > +IFUNC_SELECTOR ());
> > >  #undef strstr
> > >  strong_alias (__libc_strstr, strstr)
> > > --
> > > 2.36.1
> > >
  
Noah Goldstein May 31, 2022, 9:33 p.m. UTC | #7
On Tue, May 31, 2022 at 2:16 PM Devulapalli, Raghuveer
<raghuveer.devulapalli@intel.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Noah Goldstein <goldstein.w.n@gmail.com>
> > Sent: Thursday, May 26, 2022 2:26 PM
> > To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> > Cc: GNU C Library <libc-alpha@sourceware.org>
> > Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
> >
> > On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha <libc-
> > alpha@sourceware.org> wrote:
> > >
> > > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> > >
> > > (1) We spend a few cycles at the beginning to peek into the needle. We
> > > locate an edge in the needle (first occurrence of 2 consecutive distinct
> > > characters) and also store the first 64-bytes into a zmm register.
> > >
> > > (2) We search for the edge in the haystack by looking into one cache
> > > line of the haystack at a time. This avoids having to read past a page
> > > boundary which can cause a seg fault.
> > >
> > > (3) If an edge is found in the haystack we first compare the first
> > > 64-bytes of the needle (already stored in a zmm register) before we
> > > proceed with a full string compare performed byte by byte.
> > >
> > > Benchmarking data on ICX shows up to 2x speedup when compared to
> > > __strstr_sse2_unaligned (including partial benchtests data from
> > > bench-strstr.out):
> > >
> > > |---------------------------------+---------------+-----------------------|
> > > |                                 | strstr_avx512 |
> > > | strstr_sse2_unaligned |
> > > |---------------------------------+---------------+-----------------------|
> > > | Length 16384/ 16,  1/11, found: | 1939.75       | 3458.44               |
> > > | Length 16384/ 16, 14/ 5, fail : | 1967.75       | 3541.12               |
> > > | Length 16384/ 32,  1/11, found: | 1540.38       | 2908.25               |
> > > | Length 16384/ 32, 14/ 5, fail : | 1345.94       | 2866.31               |
> > > | Length 16384/ 64,  1/11, found: | 1968.81       | 4327.56               |
> > > | Length 16384/ 64, 14/ 5, fail : | 1993.75       | 4215.69               |
> > > | Length 16384/128,  1/11, found: | 1535.44       | 3780.56               |
> > > | Length 16384/128, 14/ 5, fail : | 1414.75       | 3595.25               |
> > > | Length 16384/256,  1/11, found: | 2957.75       | 5501.44               |
> > > | Length 16384/256, 14/ 5, fail : | 2682.62       | 5099.88               |
> > > | Length 32768/ 16,  1/11, found: | 7820.19       | 11262.9               |
> > > | Length 32768/ 16, 14/ 5, fail : | 8196.88       | 10871.2               |
> > > | Length 32768/ 32,  1/11, found: | 5709.19       | 6611.56               |
> > > | Length 32768/ 32, 14/ 5, fail : | 5716.12       | 6647.06               |
> > > | Length 32768/ 64,  1/11, found: | 7160.44       | 10143.7               |
> > > | Length 32768/ 64, 14/ 5, fail : | 7021.38       | 10150.6               |
> > > | Length 32768/128,  1/11, found: | 4935.31       | 6756.56               |
> > > | Length 32768/128, 14/ 5, fail : | 4774.38       | 6746.19               |
> > > | Length 32768/256,  1/11, found: | 7933.19       | 12563.8               |
> > > | Length 32768/256, 14/ 5, fail : | 7975          | 12558.6               |
> > > | Length 65536/ 16,  1/11, found: | 9066.69       | 9419.62               |
> > > | Length 65536/ 16, 14/ 5, fail : | 8496          | 9384.75               |
> > > | Length 65536/ 32,  1/11, found: | 10258.8       | 11192.4               |
> > > | Length 65536/ 32, 14/ 5, fail : | 8712.12       | 11172.3               |
> > > | Length 65536/ 64,  1/11, found: | 11085.2       | 18162.1               |
> > > | Length 65536/ 64, 14/ 5, fail : | 11219.6       | 17921.5               |
> > > | Length 65536/128,  1/11, found: | 9753.56       | 18704.6               |
> > > | Length 65536/128, 14/ 5, fail : | 9588.81       | 18465.6               |
> > > | Length 65536/256,  1/11, found: | 18333.3       | 28505.2               |
> > > | Length 65536/256, 14/ 5, fail : | 18018.8       | 27990.8               |
> > > |---------------------------------+---------------+-----------------------|
> > > ---
> > >  sysdeps/x86_64/multiarch/Makefile          |   2 +
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
> > >  sysdeps/x86_64/multiarch/strstr-avx512.c   | 208
> > +++++++++++++++++++++
> > >  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
> > >  4 files changed, 236 insertions(+), 4 deletions(-)  create mode
> > > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile
> > > b/sysdeps/x86_64/multiarch/Makefile
> > > index e7b413edad..6dc54a7265 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -126,6 +126,7 @@ sysdep_routines += \
> > >    strrchr-sse2 \
> > >    strspn-c \
> > >    strspn-sse2 \
> > > +  strstr-avx512 \
> > >    strstr-sse2-unaligned \
> > >    varshift \
> > >  # sysdep_routines
> > > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4  CFLAGS-strcspn-c.c
> > +=
> > > -msse4  CFLAGS-strpbrk-c.c += -msse4  CFLAGS-strspn-c.c += -msse4
> > > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -
> > mavx512bw
> > > +-mbmi -mbmi2 -O3
> > >  endif
> > >
> > >  ifeq ($(subdir),wcsmbs)
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index a594f4176e..cc9a7eaaa1 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct
> > > libc_ifunc_impl *array,
> > >
> > >    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
> > >    IFUNC_IMPL (i, name, strstr,
> > > +              IFUNC_IMPL_ADD (array, i, strstr,
> > > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > > +                               && CPU_FEATURE_USABLE (AVX512DQ)
> > > +                               && CPU_FEATURE_USABLE (BMI2)),
> > > +                              __strstr_avx512)
> > >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> > >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > new file mode 100644
> > > index 0000000000..4082a75a1b
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > @@ -0,0 +1,208 @@
> > > +/* strstr optimized with 512-bit AVX-512 instructions
> > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#include <immintrin.h>
> > > +#include <inttypes.h>
> > > +#include <stdbool.h>
> > > +#include <string.h>
> > > +
> > > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull
> > > +#define ZMM_SIZE_IN_BYTES 64
> > > +
> > > +/*
> > > + Returns the index of the first edge within the needle, returns 0 if
> > > +no edge  is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > > + */
> > > +static inline size_t
> > > +find_edge_in_needle (const char *ned) {
> > > +  size_t ind = 0;
> > > +  while (ned[ind + 1] != '\0')
> > > +    {
> > > +      if (ned[ind] != ned[ind + 1])
> > > +        return ind;
> > > +      else
> > > +        ind = ind + 1;
> > > +    }
> > > +  return 0;
> > > +}
> > > +
> > > +/*
> > > + Compare needle with haystack byte by byte at specified location  */
> > > +static inline bool verify_string_match (const char *hay, const size_t
> > > +hay_index, const char *ned,
> > > +                     size_t ind)
> > > +{
> > > +  while (ned[ind] != '\0')
> > > +    {
> >       strcmp? (you might be able to use memcmp which will be faster
> >       but will need a bit of refactor to keep true nedlen and check for page
> >       cross on hay)
>
> Wouldn't strcmp give you the wrong answer here? For ex: I would need it to return true when haystack is "abcdefg" and needle is "bcd"

Yeah strcmp won't work directly. If you can refactor so nedlen is known
then you can use memcmp. Otherwise can you manually implement at least
first VEC_SIZE comparison with vectors (obv make sure its a speedup). That
should cover *most* needles.

If need be you can throw the edge cases to just byte comparison.
>
> > > +      if (ned[ind] != hay[hay_index + ind])
> > > +        return false;
> > > +      ind = ind + 1;
> > > +    }
> > > +  return true;
> > > +}
> > > +
> > > +/*
> > > + Compare needle with haystack at specified location. The first 64
> > > +bytes are  compared using a ZMM register.
> > > + */
> > > +static inline bool
> > > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > > +                            const char *ned, const __mmask64 ned_mask,
> > > +                            const __m512i ned_zmm) {
> > > +  /* check first 64 bytes using zmm and then scalar */
> > > +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe
> > to
> > > +do so
> > > +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask,
> > hay_zmm,
> > > +ned_zmm);
> > > +  if (match != 0x0) // failed the first few chars
> > > +    return false;
> > > +  else if (ned_mask == FULL_MMASK64)
> > > +    return verify_string_match (hay, hay_index, ned,
> > > +ZMM_SIZE_IN_BYTES);
> > > +  return true;
> > > +}
> > > +
> > > +char *
> > > +__strstr_avx512 (const char *haystack, const char *ned) {
> > > +  char first = ned[0];
> > > +  if (first == '\0')
> > > +    return (char *)haystack;
> > > +  if (ned[1] == '\0')
> > > +    return (char *)strchr (haystack, ned[0]);
> > > +
> > > +  size_t edge = find_edge_in_needle (ned);
> > > +
> > > +  /* ensure haystack is as long as the pos of edge in needle */  for
> > > + (int ii = 0; ii < edge; ++ii)
> > > +    {
> >     strnlen
>
> Makes sense.
>
> > > +      if (haystack[ii] == '\0')
> > > +        return NULL;
> > > +    }
> > > +
> > > +  const __m512i null = _mm512_setzero_si512 (); // '\0'
> > > +
> > > +  /*
> > > +   Load 64 bytes of the needle and save it to a zmm register
> > > +   Read one cache line at a time to avoid loading across a page boundary
> > > +   */
> > > +  __mmask64 ned_load_mask
> > > +      = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
> >     FULL_MMASK64 >> (((-(uintptr_t)ned) & 63));
>
> +1
>
> > > +  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask,
> > ned);
> >     Maybe conditional on highly unlike page cross this is very
> >     expensive if causes page walk
>
> Elements on the next cache line are zero masked, shouldn't that prevent a cross page load?

Still does a page walk (and IIRC has non-intuitive rules for updating TLB so it
can get pretty ugly).
>
> > > +  __mmask64 ned_nullmask
> > > +      = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm,
> > null);
> >     _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm,
> > ned_zmm)
> >
> >     likewise at all other compares with null unless it breaks
> >     microfusion more than once.
>
> The compiler was using vptestnmb, doesn't hurt to explicitly use it anyways.
>
> >
> >     If you can replace all then get rid of null
> > > +  if (__glibc_unlikely (ned_nullmask == 0x0))
> > > +    {
> > > +      ned_zmm = _mm512_loadu_si512 (ned);
> > > +      ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > +      if (ned_nullmask != 0x0)
> > > +        ned_load_mask = ned_load_mask >> 1;
> > > +    }
> > > +  else
> > > +    {
> > > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > +      ned_load_mask = ned_load_mask >> 1;
> >       I think you can get away with just ned_load_mask =
> >       ned_nullmask - ONE_64BIT because you only use this after
> >       checking haystack no null-term
>
> Without the >> 1, we will compare the null char of the needle to the haystack which will give you the wrong answer.

Thats only if you have the `xor` with the original. The xor with the
original is needed
to clear null hits out of range but because you already check that the haystack
has no null term is fine to leave those potential high bit 1s in place.

I may be wrong.
>
> > > +    }
> > > +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);  const __m512i
> > > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > > +
> > > +  /*
> > > +   Read the bytes of haystack in the current cache line
> > > +   */
> > > +  size_t hay_index = edge;
> > > +  __mmask64 loadmask = _bzhi_u64 (
> > > +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> > > +  /* First load is a partial cache line */  __m512i hay0 =
> > > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > +  /* Search for NULL and compare only till null char */
> > > +  __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask,
> > hay0,
> > > + null);
> > > +  __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);  cmpmask
> > =
> > > + _kand_mask64 (cmpmask, loadmask);
> >   nullmask ^ (nullmask - ONE_64BIT); codegen ends up actually
> >   using kand_mask here. Since loadmask and nullmask both go through
> >   GPR (nullmask for the blsmsk) you can do this explicitly in uint64_t
> >   to help GCC out.

Doable?
> >
> > > +  /* Search for the 2 charaters of needle */
> > > +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > +  k1 = _kshiftri_mask64 (k1, 1);
> > > +  /* k2 masks tell us if both chars from needle match */  uint64_t k2
> > > + = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> > cmpmask));

Doable?

> > > +  /* For every match, search for the entire needle for a full match
> > > + */  while (k2)
> > > +    {
> > > +      uint64_t bitcount = _tzcnt_u64(k2);
> > > +      k2 = _blsr_u64(k2);
> > > +      size_t match_pos = hay_index + bitcount - edge;
> > > +      if (nullmask == 0)
> > > +        {
> > > +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > +                                          ned_load_mask, ned_zmm))
> > > +            return (char *)haystack + match_pos;
> > > +        }
> > > +      else
> > > +        {
> > > +          if (verify_string_match (haystack, match_pos, ned, 0))
> > > +            return (char *)haystack + match_pos;
> > > +        }
> > > +    }
> > > +  /* We haven't checked for potential match at the last char yet */
> > > + hay_index += _mm_popcnt_u64 (loadmask) - 1;
> >   hay_index = 0; haystay |= 63; You might want to check codegen and
> >   ensure hay_index is being optimized out. AFAICT you just need a
> >   pointer.
>
> AFAICT, looks like it does optimize it out.

Good. Can you switch to the `|=` though.
>
> > > +
> > > +  /*
> > > +   Loop over one cache line at a time to prevent reading over page
> > > +   boundary
> > > +   */
> > > +  __m512i hay1;
> > > +  while (nullmask == 0)
> > > +    {
> > > +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > +      hay1 = _mm512_load_si512 (haystack + hay_index
> > > +                                + 1); // Always 64 byte aligned
> >     Is this really faster than using kshiftri?
>
> Yes (assuming you mean using just one load and use mask shift operations to look for a match). A lot of instructions in this loop are stuck on port 5 and so is kshiftri. Using 2 loads which execute on Port 2 and 3 relieves that pressure.
>

Fair enough.
> > > +      nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > > +      /* Compare only till null char */
> > > +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > +      /* k2 masks tell us if both chars from needle match */
> > > +      k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> > cmpmask));

Doable?
> > > +      /* For every match, compare full strings for potential match */
> > > +      while (k2)
> > > +        {
> > > +          uint64_t bitcount = _tzcnt_u64(k2);
> > > +          k2 = _blsr_u64(k2);
> > > +          size_t match_pos = hay_index + bitcount - edge;
> > > +          if (nullmask == 0)
> > > +            {
> > > +              /*
> > > +               Since the haystack doesn't terminate at the current cache
> > > +               line, we can use zmm register to compare the first 64 bytes
> > > +               */
> > > +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > +                                              ned_load_mask, ned_zmm))
> > > +                return (char *)haystack + match_pos;
> > > +            }
> > > +          else
> > > +            {
> > > +              /* Compare byte by byte */
> > > +              if (verify_string_match (haystack, match_pos, ned, 0))
> > > +                return (char *)haystack + match_pos;
> > > +            }
> > > +        }
> > > +      hay_index += ZMM_SIZE_IN_BYTES;
> > > +    }
> > > +  return NULL;
> > > +}
> > > diff --git a/sysdeps/x86_64/multiarch/strstr.c
> > > b/sysdeps/x86_64/multiarch/strstr.c
> > > index 95600a9de5..2fb8b169b6 100644
> > > --- a/sysdeps/x86_64/multiarch/strstr.c
> > > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > > @@ -35,16 +35,32 @@
> > >
> > >  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned
> > > attribute_hidden;  extern __typeof (__redirect_strstr) __strstr_sse2
> > > attribute_hidden;
> > > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
> > >
> > >  #include "init-arch.h"
> > >
> > >  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> > >     ifunc symbol properly.  */
> > >  extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc
> > > (__libc_strstr,
> > > -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > > -           ? __strstr_sse2_unaligned
> > > -           : __strstr_sse2)
> > >
> > > +static inline void *
> > > +IFUNC_SELECTOR (void)
> > > +{
> > > +  const struct cpu_features *cpu_features = __get_cpu_features ();
> > > +
> > > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > > +    return __strstr_avx512;
> > > +
> > > +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > +    return __strstr_sse2_unaligned;
> > > +
> > > +  return __strstr_sse2;
> > > +}
> > > +
> > > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr,
> > > +IFUNC_SELECTOR ());
> > >  #undef strstr
> > >  strong_alias (__libc_strstr, strstr)
> > > --
> > > 2.36.1
> > >
  
develop--- via Libc-alpha June 1, 2022, 4:13 a.m. UTC | #8
> -----Original Message-----
> From: Noah Goldstein <goldstein.w.n@gmail.com>
> Sent: Tuesday, May 31, 2022 2:34 PM
> To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> Cc: GNU C Library <libc-alpha@sourceware.org>
> Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
> 
> On Tue, May 31, 2022 at 2:16 PM Devulapalli, Raghuveer
> <raghuveer.devulapalli@intel.com> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Noah Goldstein <goldstein.w.n@gmail.com>
> > > Sent: Thursday, May 26, 2022 2:26 PM
> > > To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> > > Cc: GNU C Library <libc-alpha@sourceware.org>
> > > Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit
> > > EVEX
> > >
> > > On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha
> > > <libc- alpha@sourceware.org> wrote:
> > > >
> > > > Adding a 512-bit EVEX version of strstr. The algorithm works as
> follows:
> > > >
> > > > (1) We spend a few cycles at the beginning to peek into the needle.
> > > > We locate an edge in the needle (first occurrence of 2 consecutive
> > > > distinct
> > > > characters) and also store the first 64-bytes into a zmm register.
> > > >
> > > > (2) We search for the edge in the haystack by looking into one
> > > > cache line of the haystack at a time. This avoids having to read
> > > > past a page boundary which can cause a seg fault.
> > > >
> > > > (3) If an edge is found in the haystack we first compare the first
> > > > 64-bytes of the needle (already stored in a zmm register) before
> > > > we proceed with a full string compare performed byte by byte.
> > > >
> > > > Benchmarking data on ICX shows up to 2x speedup when compared to
> > > > __strstr_sse2_unaligned (including partial benchtests data from
> > > > bench-strstr.out):
> > > >
> > > > |---------------------------------+---------------+-----------------------|
> > > > |                                 | strstr_avx512 |
> > > > | strstr_sse2_unaligned |
> > > > |---------------------------------+---------------+-----------------------|
> > > > | Length 16384/ 16,  1/11, found: | 1939.75       | 3458.44               |
> > > > | Length 16384/ 16, 14/ 5, fail : | 1967.75       | 3541.12               |
> > > > | Length 16384/ 32,  1/11, found: | 1540.38       | 2908.25               |
> > > > | Length 16384/ 32, 14/ 5, fail : | 1345.94       | 2866.31               |
> > > > | Length 16384/ 64,  1/11, found: | 1968.81       | 4327.56               |
> > > > | Length 16384/ 64, 14/ 5, fail : | 1993.75       | 4215.69               |
> > > > | Length 16384/128,  1/11, found: | 1535.44       | 3780.56               |
> > > > | Length 16384/128, 14/ 5, fail : | 1414.75       | 3595.25               |
> > > > | Length 16384/256,  1/11, found: | 2957.75       | 5501.44               |
> > > > | Length 16384/256, 14/ 5, fail : | 2682.62       | 5099.88               |
> > > > | Length 32768/ 16,  1/11, found: | 7820.19       | 11262.9               |
> > > > | Length 32768/ 16, 14/ 5, fail : | 8196.88       | 10871.2               |
> > > > | Length 32768/ 32,  1/11, found: | 5709.19       | 6611.56               |
> > > > | Length 32768/ 32, 14/ 5, fail : | 5716.12       | 6647.06               |
> > > > | Length 32768/ 64,  1/11, found: | 7160.44       | 10143.7               |
> > > > | Length 32768/ 64, 14/ 5, fail : | 7021.38       | 10150.6               |
> > > > | Length 32768/128,  1/11, found: | 4935.31       | 6756.56               |
> > > > | Length 32768/128, 14/ 5, fail : | 4774.38       | 6746.19               |
> > > > | Length 32768/256,  1/11, found: | 7933.19       | 12563.8               |
> > > > | Length 32768/256, 14/ 5, fail : | 7975          | 12558.6               |
> > > > | Length 65536/ 16,  1/11, found: | 9066.69       | 9419.62               |
> > > > | Length 65536/ 16, 14/ 5, fail : | 8496          | 9384.75               |
> > > > | Length 65536/ 32,  1/11, found: | 10258.8       | 11192.4               |
> > > > | Length 65536/ 32, 14/ 5, fail : | 8712.12       | 11172.3               |
> > > > | Length 65536/ 64,  1/11, found: | 11085.2       | 18162.1               |
> > > > | Length 65536/ 64, 14/ 5, fail : | 11219.6       | 17921.5               |
> > > > | Length 65536/128,  1/11, found: | 9753.56       | 18704.6               |
> > > > | Length 65536/128, 14/ 5, fail : | 9588.81       | 18465.6               |
> > > > | Length 65536/256,  1/11, found: | 18333.3       | 28505.2               |
> > > > | Length 65536/256, 14/ 5, fail : | 18018.8       | 27990.8               |
> > > > |---------------------------------+---------------+-----------------------|
> > > > ---
> > > >  sysdeps/x86_64/multiarch/Makefile          |   2 +
> > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
> > > >  sysdeps/x86_64/multiarch/strstr-avx512.c   | 208
> > > +++++++++++++++++++++
> > > >  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
> > > >  4 files changed, 236 insertions(+), 4 deletions(-)  create mode
> > > > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/Makefile
> > > > b/sysdeps/x86_64/multiarch/Makefile
> > > > index e7b413edad..6dc54a7265 100644
> > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > @@ -126,6 +126,7 @@ sysdep_routines += \
> > > >    strrchr-sse2 \
> > > >    strspn-c \
> > > >    strspn-sse2 \
> > > > +  strstr-avx512 \
> > > >    strstr-sse2-unaligned \
> > > >    varshift \
> > > >  # sysdep_routines
> > > > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
> > > > CFLAGS-strcspn-c.c
> > > +=
> > > > -msse4  CFLAGS-strpbrk-c.c += -msse4  CFLAGS-strspn-c.c += -msse4
> > > > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -
> > > mavx512bw
> > > > +-mbmi -mbmi2 -O3
> > > >  endif
> > > >
> > > >  ifeq ($(subdir),wcsmbs)
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > index a594f4176e..cc9a7eaaa1 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name,
> > > > struct libc_ifunc_impl *array,
> > > >
> > > >    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
> > > >    IFUNC_IMPL (i, name, strstr,
> > > > +              IFUNC_IMPL_ADD (array, i, strstr,
> > > > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > > > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > > > +                               && CPU_FEATURE_USABLE (AVX512DQ)
> > > > +                               && CPU_FEATURE_USABLE (BMI2)),
> > > > +                              __strstr_avx512)
> > > >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> > > >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > new file mode 100644
> > > > index 0000000000..4082a75a1b
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > @@ -0,0 +1,208 @@
> > > > +/* strstr optimized with 512-bit AVX-512 instructions
> > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty
> of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
> the
> > > GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#include <immintrin.h>
> > > > +#include <inttypes.h>
> > > > +#include <stdbool.h>
> > > > +#include <string.h>
> > > > +
> > > > +#define FULL_MMASK64 0xffffffffffffffff
> > > > +#define ONE_64BIT 0x1ull
> > > > +#define ZMM_SIZE_IN_BYTES 64
> > > > +
> > > > +/*
> > > > + Returns the index of the first edge within the needle, returns 0
> > > > +if no edge  is found. Example: 'ab' is the first edge in
> 'aaaaaaaaaabaarddg'
> > > > + */
> > > > +static inline size_t
> > > > +find_edge_in_needle (const char *ned) {
> > > > +  size_t ind = 0;
> > > > +  while (ned[ind + 1] != '\0')
> > > > +    {
> > > > +      if (ned[ind] != ned[ind + 1])
> > > > +        return ind;
> > > > +      else
> > > > +        ind = ind + 1;
> > > > +    }
> > > > +  return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + Compare needle with haystack byte by byte at specified location
> > > > + */
> > > > +static inline bool
> > > > +verify_string_match (const char *hay, const size_t hay_index, const char *ned,
> > > > +                     size_t ind)
> > > > +{
> > > > +  while (ned[ind] != '\0')
> > > > +    {
> > >       strcmp? (you might be able to use memcmp which will be faster
> > >       but will need a bit of refactor to keep true nedlen and check for page
> > >       cross on hay)
> >
> > Wouldn't strcmp give you the wrong answer here? For ex: I would need it
> to return true when haystack is "abcdefg" and needle is "bcd"
> 
> Yeah strcmp directly wont work directly. If you can refactor so neelen is
> known then you can use memcmp. Otherwise can you manually implement
> at least first VEC_SIZE comparison with vectors (obv make sure its a
> speedup). That should cover *most* needles.
> 
> If need be you can throw the edge cases to just byte comparison.

This is exactly what the function verify_string_match_avx512 does. 
It compares the first 64 bytes of the needle to 64 bytes of haystack 
with vcmpb and then does a byte by byte comparison for the rest 
of the data. The hot loop calls the verify_string_match_avx512 first 
and then switches to byte by byte comparison which is kind of
inevitable to avoid cross page loads for both haystack and needle. 

> >
> > > > +      if (ned[ind] != hay[hay_index + ind])
> > > > +        return false;
> > > > +      ind = ind + 1;
> > > > +    }
> > > > +  return true;
> > > > +}
> > > > +
> > > > +/*
> > > > + Compare needle with haystack at specified location. The first 64
> > > > +bytes are  compared using a ZMM register.
> > > > + */
> > > > +static inline bool
> > > > +verify_string_match_avx512 (const char *hay, const size_t
> hay_index,
> > > > +                            const char *ned, const __mmask64 ned_mask,
> > > > +                            const __m512i ned_zmm) {
> > > > +  /* check first 64 bytes using zmm and then scalar */
> > > > +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); //
> safe
> > > to
> > > > +do so
> > > > +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask
> (ned_mask,
> > > hay_zmm,
> > > > +ned_zmm);
> > > > +  if (match != 0x0) // failed the first few chars
> > > > +    return false;
> > > > +  else if (ned_mask == FULL_MMASK64)
> > > > +    return verify_string_match (hay, hay_index, ned,
> > > > +ZMM_SIZE_IN_BYTES);
> > > > +  return true;
> > > > +}
> > > > +
> > > > +char *
> > > > +__strstr_avx512 (const char *haystack, const char *ned) {
> > > > +  char first = ned[0];
> > > > +  if (first == '\0')
> > > > +    return (char *)haystack;
> > > > +  if (ned[1] == '\0')
> > > > +    return (char *)strchr (haystack, ned[0]);
> > > > +
> > > > +  size_t edge = find_edge_in_needle (ned);
> > > > +
> > > > +  /* ensure haystack is as long as the pos of edge in needle */
> > > > + for (int ii = 0; ii < edge; ++ii)
> > > > +    {
> > >     strnlen
> >
> > Makes sense.
> >
> > > > +      if (haystack[ii] == '\0')
> > > > +        return NULL;
> > > > +    }
> > > > +
> > > > +  const __m512i null = _mm512_setzero_si512 (); // '\0'
> > > > +
> > > > +  /*
> > > > +   Load 64 bytes of the needle and save it to a zmm register
> > > > +   Read one cache line at a time to avoid loading across a page
> boundary
> > > > +   */
> > > > +  __mmask64 ned_load_mask
> > > > +      = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
> > >     FULL_MMASK64 >> (((-(uintptr_t)ned) & 63));
> >
> > +1
> >
> > > > +  __m512i ned_zmm = _mm512_maskz_loadu_epi8
> (ned_load_mask,
> > > ned);
> > >     Maybe conditional on highly unlike page cross this is very
> > >     expensive if causes page walk
> >
> > Elements on the next cache line are zero masked, shouldn't that prevent a
> cross page load?
> 
> Still does a page walk (and IIRC has non-intuitive rules for updating TLB so it
> can get pretty ugly).

Ok, will add the conditional on page cross. 

> >
> > > > +  __mmask64 ned_nullmask
> > > > +      = _mm512_mask_cmpeq_epi8_mask (ned_load_mask,
> ned_zmm,
> > > null);
> > >     _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm,
> > > ned_zmm)
> > >
> > >     likewise at all other compares with null unless it breaks
> > >     microfusion more than once.
> >
> > The compiler was using vptestnmb, doesn't hurt to explicitly use it
> anyways.
> >
> > >
> > >     If you can replace all then get rid of null
> > > > +  if (__glibc_unlikely (ned_nullmask == 0x0))
> > > > +    {
> > > > +      ned_zmm = _mm512_loadu_si512 (ned);
> > > > +      ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > > > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > > +      if (ned_nullmask != 0x0)
> > > > +        ned_load_mask = ned_load_mask >> 1;
> > > > +    }
> > > > +  else
> > > > +    {
> > > > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > > > +      ned_load_mask = ned_load_mask >> 1;
> > >       I think you can get away with just ned_load_mask =
> > >       ned_nullmask - ONE_64BIT because you only use this after
> > >       checking haystack no null-term
> >
> > Without the >> 1, we will compare the null char of the needle to the
> haystack which will give you the wrong answer.
> 
> Thats only if you have the `xor` with the original. The xor with the original is
> needed to clear null hits out of range but because you already check that the
> haystack has no null term is fine to leave those potential high bit 1s in place.
> 
> I may be wrong.

The benchtests fail without the shift op. Pretty sure I need it. 

> >
> > > > +    }
> > > > +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);  const
> > > > + __m512i
> > > > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > > > +
> > > > +  /*
> > > > +   Read the bytes of haystack in the current cache line
> > > > +   */
> > > > +  size_t hay_index = edge;
> > > > +  __mmask64 loadmask = _bzhi_u64 (
> > > > +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) &
> > > > + 63));
> > > > +  /* First load is a partial cache line */  __m512i hay0 =
> > > > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > > +  /* Search for NULL and compare only till null char */
> > > > +  __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask
> (loadmask,
> > > hay0,
> > > > + null);
> > > > +  __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> cmpmask
> > > =
> > > > + _kand_mask64 (cmpmask, loadmask);
> > >   nullmask ^ (nullmask - ONE_64BIT); codegen ends up actually
> > >   using kand_mask here. Since loadmask and nullmask both go through
> > >   GPR (nullmask for the blsmsk) you can do this explicitly in uint64_t
> > >   to help GCC out.
> 
> Doable?

Should be fine. I assume you mean use type uint64_t for both nullmask and cmpmask.

> > >
> > > > +  /* Search for the 2 charaters of needle */
> > > > +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > > +  k1 = _kshiftri_mask64 (k1, 1);
> > > > +  /* k2 masks tell us if both chars from needle match */
> > > > + uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0,
> > > > + k1),
> > > cmpmask));
> 
> Doable?

Yup. 

> 
> > > > +  /* For every match, search for the entire needle for a full
> > > > + match */  while (k2)
> > > > +    {
> > > > +      uint64_t bitcount = _tzcnt_u64(k2);
> > > > +      k2 = _blsr_u64(k2);
> > > > +      size_t match_pos = hay_index + bitcount - edge;
> > > > +      if (nullmask == 0)
> > > > +        {
> > > > +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > > +                                          ned_load_mask, ned_zmm))
> > > > +            return (char *)haystack + match_pos;
> > > > +        }
> > > > +      else
> > > > +        {
> > > > +          if (verify_string_match (haystack, match_pos, ned, 0))
> > > > +            return (char *)haystack + match_pos;
> > > > +        }
> > > > +    }
> > > > +  /* We haven't checked for potential match at the last char yet
> > > > + */ hay_index += _mm_popcnt_u64 (loadmask) - 1;
> > >   hay_index = 0; haystay |= 63; You might want to check codegen and
> > >   ensure hay_index is being optimized out. AFAICT you just need a
> > >   pointer.
> >
> > AFAICT, looks like it does optimize it out.
> 
> Good. Can you switch to the `|=` though.

Not sure what you mean. Could you explain? 

> >
> > > > +
> > > > +  /*
> > > > +   Loop over one cache line at a time to prevent reading over page
> > > > +   boundary
> > > > +   */
> > > > +  __m512i hay1;
> > > > +  while (nullmask == 0)
> > > > +    {
> > > > +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > > +      hay1 = _mm512_load_si512 (haystack + hay_index
> > > > +                                + 1); // Always 64 byte aligned
> > >     Is this really faster than using kshiftri?
> >
> > Yes (assuming you mean using just one load and use mask shift operations
> to look for a match). A lot of instructions in this loop are stuck on port 5 and
> so is kshiftri. Using 2 loads which execute on Port 2 and 3 relives that
> pressure.
> >
> 
> Fair enough.
> > > > +      nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > > > +      /* Compare only till null char */
> > > > +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > > +      /* k2 masks tell us if both chars from needle match */
> > > > +      k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> > > cmpmask));
> 
> Doable?

Yup. 

> > > > +      /* For every match, compare full strings for potential match */
> > > > +      while (k2)
> > > > +        {
> > > > +          uint64_t bitcount = _tzcnt_u64(k2);
> > > > +          k2 = _blsr_u64(k2);
> > > > +          size_t match_pos = hay_index + bitcount - edge;
> > > > +          if (nullmask == 0)
> > > > +            {
> > > > +              /*
> > > > +               Since the haystack doesn't terminate at the current cache
> > > > +               line, we can use zmm register to compare the first 64 bytes
> > > > +               */
> > > > +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> > > > +                                              ned_load_mask, ned_zmm))
> > > > +                return (char *)haystack + match_pos;
> > > > +            }
> > > > +          else
> > > > +            {
> > > > +              /* Compare byte by byte */
> > > > +              if (verify_string_match (haystack, match_pos, ned, 0))
> > > > +                return (char *)haystack + match_pos;
> > > > +            }
> > > > +        }
> > > > +      hay_index += ZMM_SIZE_IN_BYTES;
> > > > +    }
> > > > +  return NULL;
> > > > +}
> > > > diff --git a/sysdeps/x86_64/multiarch/strstr.c
> > > > b/sysdeps/x86_64/multiarch/strstr.c
> > > > index 95600a9de5..2fb8b169b6 100644
> > > > --- a/sysdeps/x86_64/multiarch/strstr.c
> > > > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > > > @@ -35,16 +35,32 @@
> > > >
> > > >  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned
> > > > attribute_hidden;  extern __typeof (__redirect_strstr)
> > > > __strstr_sse2 attribute_hidden;
> > > > +extern __typeof (__redirect_strstr) __strstr_avx512
> > > > +attribute_hidden;
> > > >
> > > >  #include "init-arch.h"
> > > >
> > > >  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can
> handle
> > > >     ifunc symbol properly.  */
> > > >  extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc
> > > > (__libc_strstr,
> > > > -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > > > -           ? __strstr_sse2_unaligned
> > > > -           : __strstr_sse2)
> > > >
> > > > +static inline void *
> > > > +IFUNC_SELECTOR (void)
> > > > +{
> > > > +  const struct cpu_features *cpu_features = __get_cpu_features
> > > > +();
> > > > +
> > > > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > > > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > > > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > > > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > > > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > > > +    return __strstr_avx512;
> > > > +
> > > > +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > > > +    return __strstr_sse2_unaligned;
> > > > +
> > > > +  return __strstr_sse2;
> > > > +}
> > > > +
> > > > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr,
> > > > +IFUNC_SELECTOR ());
> > > >  #undef strstr
> > > >  strong_alias (__libc_strstr, strstr)
> > > > --
> > > > 2.36.1
> > > >
  

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..6dc54a7265 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -126,6 +126,7 @@  sysdep_routines += \
   strrchr-sse2 \
   strspn-c \
   strspn-sse2 \
+  strstr-avx512 \
   strstr-sse2-unaligned \
   varshift \
 # sysdep_routines
@@ -133,6 +134,7 @@  CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
+CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
 endif
 
 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..cc9a7eaaa1 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -653,6 +653,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strstr.c.  */
   IFUNC_IMPL (i, name, strstr,
+              IFUNC_IMPL_ADD (array, i, strstr,
+                              (CPU_FEATURE_USABLE (AVX512VL)
+                               && CPU_FEATURE_USABLE (AVX512BW)
+                               && CPU_FEATURE_USABLE (AVX512DQ)
+                               && CPU_FEATURE_USABLE (BMI2)),
+                              __strstr_avx512)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
 
diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
new file mode 100644
index 0000000000..4082a75a1b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
@@ -0,0 +1,208 @@ 
+/* strstr optimized with 512-bit AVX-512 instructions
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define FULL_MMASK64 0xffffffffffffffff
+#define ONE_64BIT 0x1ull
+#define ZMM_SIZE_IN_BYTES 64
+
+/*
+ Returns the index of the first edge within the needle, returns 0 if no edge
+ is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
+ */
+static inline size_t
+find_edge_in_needle (const char *ned)
+{
+  size_t ind = 0;
+  /* Walk adjacent character pairs; the scan stops at the terminating
+     null of the needle.  */
+  while (ned[ind + 1] != '\0')
+    {
+      if (ned[ind] != ned[ind + 1])
+        return ind; /* First pair of distinct adjacent characters.  */
+      else
+        ind = ind + 1;
+    }
+  /* No edge found: the needle is one repeated character (e.g. "aaa").  */
+  return 0;
+}
+
+/*
+ Compare needle with haystack byte by byte at specified location
+ */
+static inline bool
+verify_string_match (const char *hay, const size_t hay_index, const char *ned,
+                     size_t ind)
+{
+  /* Compare ned[ind..] with hay[hay_index + ind ..] up to the needle's
+     terminating null.  IND lets callers skip bytes that were already
+     verified (e.g. the first 64 bytes checked with a ZMM compare).  */
+  while (ned[ind] != '\0')
+    {
+      if (ned[ind] != hay[hay_index + ind])
+        return false;
+      ind = ind + 1;
+    }
+  return true; /* Reached the needle's null terminator: full match.  */
+}
+
+/*
+ Compare needle with haystack at specified location. The first 64 bytes are
+ compared using a ZMM register.
+ */
+static inline bool
+verify_string_match_avx512 (const char *hay, const size_t hay_index,
+                            const char *ned, const __mmask64 ned_mask,
+                            const __m512i ned_zmm)
+{
+  /* Check the first min (64, nedlen) bytes with one masked vector
+     compare, then fall back to the scalar loop for any remainder.
+     Callers only use this when the haystack does not terminate in the
+     current cache line, so the 64-byte load below is safe.  */
+  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
+  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
+  if (match != 0x0) // failed the first few chars
+    return false;
+  else if (ned_mask == FULL_MMASK64)
+    /* Needle is longer than 64 bytes: verify the tail byte by byte.  */
+    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
+  return true;
+}
+
+/* Return a pointer to the first occurrence of NED in HAYSTACK, or NULL.
+   Strategy: broadcast the two "edge" characters of the needle, scan the
+   haystack one cache line (64 bytes) at a time, and run a full string
+   verification only at candidate positions.  */
+char *
+__strstr_avx512 (const char *haystack, const char *ned)
+{
+  char first = ned[0];
+  if (first == '\0')
+    return (char *)haystack; /* Empty needle matches at the start.  */
+  if (ned[1] == '\0')
+    return (char *)strchr (haystack, ned[0]); /* 1-char needle: strchr.  */
+
+  /* Index of the first pair of distinct adjacent needle characters;
+     0 if the needle is a single repeated character.  */
+  size_t edge = find_edge_in_needle (ned);
+
+  /* ensure haystack is as long as the pos of edge in needle */
+  /* NOTE(review): II is int while EDGE is size_t (signed/unsigned
+     comparison); strnlen was suggested upstream as a replacement.  */
+  for (int ii = 0; ii < edge; ++ii)
+    {
+      if (haystack[ii] == '\0')
+        return NULL;
+    }
+
+  const __m512i null = _mm512_setzero_si512 (); // '\0'
+
+  /*
+   Load 64 bytes of the needle and save it to a zmm register
+   Read one cache line at a time to avoid loading across a page boundary
+   */
+  /* Mask covering NED from its start to the end of its cache line.  */
+  __mmask64 ned_load_mask
+      = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
+  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
+  __mmask64 ned_nullmask
+      = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm, null);
+  if (__glibc_unlikely (ned_nullmask == 0x0))
+    {
+      /* Needle continues past this cache line: reload a full unmasked
+         64 bytes.  NOTE(review): this unconditional reload may touch the
+         next page; upstream review asked for a page-cross check here.  */
+      ned_zmm = _mm512_loadu_si512 (ned);
+      ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
+      /* blsmsk: set all bits up to and including the first null hit.  */
+      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+      if (ned_nullmask != 0x0)
+        /* Drop the null byte itself from the compare mask.  */
+        ned_load_mask = ned_load_mask >> 1;
+    }
+  else
+    {
+      /* Needle ends within this cache line: keep only the bytes before
+         the terminating null.  */
+      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+      ned_load_mask = ned_load_mask >> 1;
+    }
+  /* Broadcast the two edge characters used for the candidate scan.  */
+  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
+  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
+
+  /*
+   Read the bytes of haystack in the current cache line
+   */
+  size_t hay_index = edge;
+  __mmask64 loadmask = _bzhi_u64 (
+      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
+  /* First load is a partial cache line */
+  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
+  /* Search for NULL and compare only till null char */
+  __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask, hay0, null);
+  __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+  cmpmask = _kand_mask64 (cmpmask, loadmask);
+  /* Search for the 2 characters of needle */
+  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
+  /* Shift so bit i of k1 reports a ned[edge + 1] hit at position i + 1,
+     lining it up with the ned[edge] hits in k0.  */
+  k1 = _kshiftri_mask64 (k1, 1);
+  /* k2 masks tell us if both chars from needle match */
+  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
+  /* For every match, search for the entire needle for a full match */
+  while (k2)
+    {
+      uint64_t bitcount = _tzcnt_u64(k2);
+      k2 = _blsr_u64(k2); /* Clear the candidate bit just examined.  */
+      size_t match_pos = hay_index + bitcount - edge;
+      if (nullmask == 0)
+        {
+          /* Haystack does not end in this cache line, so the 64-byte
+             vector verification is safe.  */
+          if (verify_string_match_avx512 (haystack, match_pos, ned,
+                                          ned_load_mask, ned_zmm))
+            return (char *)haystack + match_pos;
+        }
+      else
+        {
+          if (verify_string_match (haystack, match_pos, ned, 0))
+            return (char *)haystack + match_pos;
+        }
+    }
+  /* We haven't checked for potential match at the last char yet */
+  /* Advance to the last byte of this cache line so it is re-examined
+     by the first iteration of the main loop below.  */
+  hay_index += _mm_popcnt_u64 (loadmask) - 1;
+
+  /*
+   Loop over one cache line at a time to prevent reading over page
+   boundary
+   */
+  __m512i hay1;
+  while (nullmask == 0)
+    {
+      hay0 = _mm512_loadu_si512 (haystack + hay_index);
+      hay1 = _mm512_load_si512 (haystack + hay_index
+                                + 1); // Always 64 byte aligned
+      nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
+      /* Compare only till null char */
+      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
+      /* k2 masks tell us if both chars from needle match */
+      k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1), cmpmask));
+      /* For every match, compare full strings for potential match */
+      while (k2)
+        {
+          uint64_t bitcount = _tzcnt_u64(k2);
+          k2 = _blsr_u64(k2);
+          size_t match_pos = hay_index + bitcount - edge;
+          if (nullmask == 0)
+            {
+              /*
+               Since the haystack doesn't terminate at the current cache
+               line, we can use zmm register to compare the first 64 bytes
+               */
+              if (verify_string_match_avx512 (haystack, match_pos, ned,
+                                              ned_load_mask, ned_zmm))
+                return (char *)haystack + match_pos;
+            }
+          else
+            {
+              /* Compare byte by byte */
+              if (verify_string_match (haystack, match_pos, ned, 0))
+                return (char *)haystack + match_pos;
+            }
+        }
+      hay_index += ZMM_SIZE_IN_BYTES;
+    }
+  return NULL; /* Haystack terminated without a full match.  */
+}
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index 95600a9de5..2fb8b169b6 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -35,16 +35,32 @@ 
 
 extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
 extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
 
 #include "init-arch.h"
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr,
-	    HAS_ARCH_FEATURE (Fast_Unaligned_Load)
-	    ? __strstr_sse2_unaligned
-	    : __strstr_sse2)
 
+/* Pick the best strstr implementation for the running CPU.  */
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  /* Use the 512-bit EVEX version when all required AVX-512 subsets and
+     BMI2 are usable and the platform does not prefer to avoid AVX-512
+     (e.g. because of frequency-scaling concerns).  */
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+    return __strstr_avx512;
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+    return __strstr_sse2_unaligned;
+
+  /* Baseline SSE2 implementation.  */
+  return __strstr_sse2;
+}
+
+libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
 #undef strstr
 strong_alias (__libc_strstr, strstr)