x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr

Message ID: 20220922002731.4039897-1-skpgkp2@gmail.com
State: Superseded
Series: x86_64: Implement evex512 version of memchr, rawmemchr and wmemchr

Checks

Context                Check     Description
dj/TryBot-apply_patch  success   Patch applied to master at the time it was sent
dj/TryBot-32bit        success   Build for i686

Commit Message

Sunil Pandey Sept. 22, 2022, 12:27 a.m. UTC
This patch implements the following evex512 versions of string functions.
The evex512 versions take up to 30% fewer cycles than their evex
counterparts, depending on length and alignment.

- memchr function using 512-bit vectors.
- rawmemchr function using 512-bit vectors.
- wmemchr function using 512-bit vectors.

Code size data:

memchr-evex.o		762 bytes
memchr-evex512.o	570 bytes (-25%)

rawmemchr-evex.o	461 bytes
rawmemchr-evex512.o	413 bytes (-10%)

wmemchr-evex.o		794 bytes
wmemchr-evex512.o	568 bytes (-28%)

These are placeholder functions, not used by any processor at the moment.
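
For orientation, the core of all three variants is a single masked 512-bit
probe: broadcast the target [w]char, compare one full vector against it, and
take the first set bit of the resulting mask. The C sketch below is an
editor's illustration of that probe, not part of the patch; the helper name
is hypothetical, and the real assembly additionally handles the zero-length
check, page crossing, and the 4-vector unrolled loop.

    /* Illustrative sketch only, not part of the patch.  Build with
       -mavx512bw -mbmi.  Assumes the unaligned 64-byte load cannot fault;
       the real code guarantees this by aligning loads near page ends.  */
    #include <immintrin.h>
    #include <stddef.h>

    static const char *
    probe_one_vec (const char *p, int c)
    {
      __m512i needle = _mm512_set1_epi8 ((char) c);        /* VPBROADCASTB */
      __m512i data = _mm512_loadu_si512 ((const void *) p);
      __mmask64 k = _mm512_cmpeq_epi8_mask (data, needle); /* VPCMPB $0 */
      if (k == 0)
        return NULL;               /* no match in these 64 bytes */
      return p + _tzcnt_u64 (k);   /* BSF: offset of first match */
    }
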
---
 sysdeps/x86_64/multiarch/Makefile            |   3 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  15 +
 sysdeps/x86_64/multiarch/memchr-evex-base.S  | 306 +++++++++++++++++++
 sysdeps/x86_64/multiarch/memchr-evex512.S    |   7 +
 sysdeps/x86_64/multiarch/rawmemchr-evex512.S |   7 +
 sysdeps/x86_64/multiarch/wmemchr-evex512.S   |   8 +
 6 files changed, 346 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S
 create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S
  

Comments

Noah Goldstein Sept. 22, 2022, 12:50 a.m. UTC | #1
On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements following evex512 version of string functions.
> evex512 version takes up to 30% less cycle as compared to evex,
> depending on length and alignment.

Please attach benchmark numbers.
>
> - memchr function using 512 bit vectors.
> - rawmemchr function using 512 bit vectors.
> - wmemchr function using 512 bit vectors.
>
> Code size data:
>
> memchr-evex.o           762 byte
> memchr-evex512.o        570 byte (-25%)
>
> rawmemchr-evex.o        461 byte
> rawmemchr-evex512.o     413 byte (-10%)
>
> wmemchr-evex.o          794 byte
> wmemchr-evex512.o       568 byte (-28%)
>
> Placeholder function, not used by any processor at the moment.
> ---
>  sysdeps/x86_64/multiarch/Makefile            |   3 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  15 +
>  sysdeps/x86_64/multiarch/memchr-evex-base.S  | 306 +++++++++++++++++++
>  sysdeps/x86_64/multiarch/memchr-evex512.S    |   7 +
>  sysdeps/x86_64/multiarch/rawmemchr-evex512.S |   7 +
>  sysdeps/x86_64/multiarch/wmemchr-evex512.S   |   8 +
>  6 files changed, 346 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S
>  create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index df4601c294..e974b1ad97 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -4,6 +4,7 @@ sysdep_routines += \
>    memchr-avx2 \
>    memchr-avx2-rtm \
>    memchr-evex \
> +  memchr-evex512 \
>    memchr-evex-rtm \
>    memchr-sse2 \
>    memcmp-avx2-movbe \
> @@ -36,6 +37,7 @@ sysdep_routines += \
>    rawmemchr-avx2 \
>    rawmemchr-avx2-rtm \
>    rawmemchr-evex \
> +  rawmemchr-evex512 \
>    rawmemchr-evex-rtm \
>    rawmemchr-sse2 \
>    stpcpy-avx2 \
> @@ -156,6 +158,7 @@ sysdep_routines += \
>    wmemchr-avx2 \
>    wmemchr-avx2-rtm \
>    wmemchr-evex \
> +  wmemchr-evex512 \
>    wmemchr-evex-rtm \
>    wmemchr-sse2 \
>    wmemcmp-avx2-movbe \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a71444eccb..17f770318d 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                       && CPU_FEATURE_USABLE (AVX512BW)
>                                       && CPU_FEATURE_USABLE (BMI2)),
>                                      __memchr_evex)
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __memchr_evex512)
>               X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
>                                      (CPU_FEATURE_USABLE (AVX512VL)
>                                       && CPU_FEATURE_USABLE (AVX512BW)
> @@ -329,6 +334,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                       && CPU_FEATURE_USABLE (AVX512BW)
>                                       && CPU_FEATURE_USABLE (BMI2)),
>                                      __rawmemchr_evex)
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __rawmemchr_evex512)
>               X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
>                                      (CPU_FEATURE_USABLE (AVX512VL)
>                                       && CPU_FEATURE_USABLE (AVX512BW)
> @@ -903,6 +913,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                       && CPU_FEATURE_USABLE (AVX512BW)
>                                       && CPU_FEATURE_USABLE (BMI2)),
>                                      __wmemchr_evex)
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wmemchr_evex512)
>               X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
>                                      (CPU_FEATURE_USABLE (AVX512VL)
>                                       && CPU_FEATURE_USABLE (AVX512BW)
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S
> new file mode 100644
> index 0000000000..524f0809b5
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S
> @@ -0,0 +1,306 @@
> +/* Placeholder function, not used by any processor at the moment.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* UNUSED.  Exists purely as a reference implementation.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# include <sysdep.h>
> +
> +# ifdef USE_AS_WMEMCHR
> +#  define CHAR_SIZE    4
> +#  define VPBROADCAST   vpbroadcastd
> +#  define VPCMP                vpcmpd
> +# else
> +#  define CHAR_SIZE    1
> +#  define VPBROADCAST   vpbroadcastb
> +#  define VPCMP                vpcmpb
> +# endif
> +
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +# define XMM1           xmm17
> +
> +# if VEC_SIZE == 64
> +#  define KMOV         kmovq
> +#  define KOR          korq
> +#  define KORTEST      kortestq
> +#  define RAX          rax
> +#  define RCX          rcx
> +#  define SHR          shrq
> +#  define SARX         sarxq
> +#  define TEXTSUFFIX   evex512
> +#  define VMM0         zmm16
> +# elif VEC_SIZE == 32
> +/* Currently Unused.  */
> +#  define KMOV         kmovd
> +#  define KOR          kord
> +#  define KORTEST      kortestd
> +#  define RAX          eax
> +#  define RCX          ecx
> +#  define SHR          shrl
> +#  define SARX         sarxl
> +#  define TEXTSUFFIX   evex256
> +#  define VMM0         ymm16
> +# endif
> +
> +       .section .text.TEXTSUFFIX, "ax", @progbits
> +/* Aligning the entry point to 64 bytes gives better performance for
> +   strings that fit within one vector.  */
> +ENTRY_P2ALIGN (MEMCHR, 6)
> +# ifndef USE_AS_RAWMEMCHR
> +       /* Check for zero length.  */
> +       test    %RDX_LP, %RDX_LP
> +       jz      L(zero)
> +
> +#  ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %edx, %edx
> +#  endif
> +# endif
> +
> +       /* Broadcast CHAR to VMM0.  */
> +       VPBROADCAST %esi, %VMM0
> +       movl    %edi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(page_cross)
> +
> +       /* Compare [w]char against CHAR; mask bits are set for matches.  */
> +       VPCMP   $0, (%rdi), %VMM0, %k0
> +
> +       KMOV    %k0, %RAX
> +# ifndef USE_AS_RAWMEMCHR
> +       bsf     %RAX, %RCX
> +       jz      L(align_more)
> +       xor     %eax, %eax
> +#  ifdef USE_AS_WMEMCHR
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +       addq    %rcx, %rdi
> +#  endif
> +       cmp     %rcx, %rdx
> +       cmova   %rdi, %rax
> +# else
> +       bsf     %RAX, %RAX
> +       jz      L(align_more)
> +       add     %rdi, %rax
> +# endif
> +       ret
> +
> +# ifndef USE_AS_RAWMEMCHR
> +L(zero):
> +       xorl    %eax, %eax
> +       ret
> +# endif
> +
> +       .p2align 5,,5
> +L(page_cross):
> +       movq    %rdi, %rcx
> +       andq    $-VEC_SIZE, %rcx
> +
> +       VPCMP   $0, (%rcx), %VMM0, %k0
> +       KMOV    %k0, %RCX
> +       SARX    %RAX, %RCX, %RAX
> +# ifndef USE_AS_RAWMEMCHR
> +       bsf     %RAX, %RCX
> +       jz      L(align_more)
> +       xor     %eax, %eax
> +#  ifdef USE_AS_WMEMCHR
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +       addq    %rcx, %rdi
> +#  endif
> +       cmp     %rcx, %rdx
> +       cmovae  %rdi, %rax
> +
> +# else
> +       bsf     %rax, %rax
> +       jz      L(align_more)
> +       add     %rdi, %rax
> +# endif
> +       ret
> +
> +L(ret_vec_x2):
> +       subq    $-VEC_SIZE, %rdi
> +L(ret_vec_x1):
> +       bsf     %RAX, %RAX
> +# ifndef USE_AS_RAWMEMCHR
> +       jz      L(zero)
> +       cmp     %rax, %rdx
> +       jbe     L(zero)
> +# endif
> +# ifdef USE_AS_WMEMCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       add     %rdi, %rax
> +# endif
> +       ret
> +
> +       .p2align 5,,10
> +L(align_more):
> +# ifndef USE_AS_RAWMEMCHR
> +       xor     %eax, %eax
> +       subq    %rdi, %rax
> +# endif
> +
> +       subq    $-VEC_SIZE, %rdi
> +       /* Align rdi to VEC_SIZE.  */
> +       andq    $-VEC_SIZE, %rdi
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       addq    %rdi, %rax
> +#  ifdef USE_AS_WMEMCHR
> +       sarl    $2, %eax
> +#  endif
> +       subq    %rax, %rdx
> +       jbe     L(zero)
> +# endif
> +
> +       /* Check the next four vectors one at a time before entering the
> +          4-vector loop.  */
> +       VPCMP   $0, (%rdi), %VMM0, %k0
> +
> +       KMOV    %k0, %RAX
> +       test    %RAX, %RAX
> +       jnz     L(ret_vec_x1)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero)
> +# endif
> +
> +       VPCMP   $0, VEC_SIZE(%rdi), %VMM0, %k0
> +
> +       KMOV    %k0, %RAX
> +       test    %RAX, %RAX
> +       jnz     L(ret_vec_x2)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero)
> +# endif
> +
> +       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %VMM0, %k0
> +
> +       KMOV    %k0, %RAX
> +       test    %RAX, %RAX
> +       jnz     L(ret_vec_x3)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero)
> +# endif
> +
> +       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %VMM0, %k0
> +
> +       KMOV    %k0, %RAX
> +       test    %RAX, %RAX
> +       jnz     L(ret_vec_x4)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero)
> +       /* Save pointer to find alignment adjustment.  */
> +       movq    %rdi, %rax
> +# endif
> +       /* Align address to VEC_SIZE * 4 for loop.  */
> +       andq    $-(VEC_SIZE * 4), %rdi
> +
> +       /* Add alignment difference to rdx.  */
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    %rdi, %rax
> +#  ifdef USE_AS_WMEMCHR
> +       SHR     $2, %RAX
> +#  endif
> +       addq    %rax, %rdx
> +       jmp     L(loop_entry)
> +# endif
> +
> +       /* 4 vector loop.  */
> +       .p2align 5,,11
> +L(loop):
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $(CHAR_PER_VEC * 4), %rdx
> +       jbe     L(zero)
> +L(loop_entry):
> +# endif
> +       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %VMM0, %k1
> +       VPCMP   $0, (VEC_SIZE * 5)(%rdi), %VMM0, %k2
> +       VPCMP   $0, (VEC_SIZE * 6)(%rdi), %VMM0, %k3
> +       VPCMP   $0, (VEC_SIZE * 7)(%rdi), %VMM0, %k4
> +       KOR     %k1, %k2, %k5
> +       KOR     %k3, %k4, %k6
> +
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       KORTEST %k5, %k6
> +       jz      L(loop)
> +
> +       KMOV    %k1, %RAX
> +       test    %RAX, %RAX
> +       jnz     L(ret_vec_x1)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero)
> +# endif
> +
> +       KMOV    %k2, %RAX
> +       test    %RAX, %RAX
> +       jnz     L(ret_vec_x2)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero)
> +# endif
> +
> +       KMOV    %k3, %RAX
> +       test    %RAX, %RAX
> +       jnz     L(ret_vec_x3)
> +
> +# ifndef USE_AS_RAWMEMCHR
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(zero)
> +# endif
> +
> +       /* At this point the match must be in the fourth vector, so there
> +          is no need to test the mask for zero.  */
> +       KMOV    %k4, %RAX
> +
> +L(ret_vec_x4):
> +       bsf     %RAX, %RAX
> +# ifndef USE_AS_RAWMEMCHR
> +       cmp     %rax, %rdx
> +       jbe     L(zero)
> +# endif
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +       .p2align 5,,5
> +L(ret_vec_x3):
> +       bsf     %RAX, %RAX
> +# ifndef USE_AS_RAWMEMCHR
> +       cmp     %rax, %rdx
> +       jbe     L(zero)
> +# endif
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +END (MEMCHR)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S
> new file mode 100644
> index 0000000000..47349d817a
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S
> @@ -0,0 +1,7 @@
> +# ifndef MEMCHR
> +#  define MEMCHR       __memchr_evex512
> +# endif
> +
> +#define VEC_SIZE        64
> +
> +#include "memchr-evex-base.S"
> diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S
> new file mode 100644
> index 0000000000..302d3cb055
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S
> @@ -0,0 +1,7 @@
> +#ifndef RAWMEMCHR
> +# define RAWMEMCHR     __rawmemchr_evex512
> +#endif
> +#define USE_AS_RAWMEMCHR       1
> +#define MEMCHR RAWMEMCHR
> +
> +#include "memchr-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S
> new file mode 100644
> index 0000000000..f45ed1db75
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S
> @@ -0,0 +1,8 @@
> +#ifndef WMEMCHR
> +# define WMEMCHR       __wmemchr_evex512
> +#endif
> +
> +#define MEMCHR WMEMCHR
> +#define USE_AS_WMEMCHR 1
> +
> +#include "memchr-evex512.S"
> --
> 2.36.1
>
  
Sunil Pandey Sept. 23, 2022, 3:57 a.m. UTC | #2
Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz


On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch implements following evex512 version of string functions.
> > evex512 version takes up to 30% less cycle as compared to evex,
> > depending on length and alignment.
>
> Please attach benchmark numbers.
> > [...]
Function: wmemchr
Variant: 
                                    __wmemchr_evex	__wmemchr_evex512
========================================================================================================================
            len=256, align=1, pos=64:        22.16	       15.79 ( 28.77%)	
            len=256, align=1, pos=64:        20.04	       13.88 ( 30.77%)	
            len=256, align=2, pos=64:        18.07	       12.92 ( 28.51%)	
            len=256, align=2, pos=64:        17.02	       12.13 ( 28.72%)	
            len=256, align=3, pos=64:        16.10	       11.33 ( 29.66%)	
            len=256, align=3, pos=64:        15.18	       11.36 ( 25.16%)	
            len=256, align=4, pos=64:        15.20	       11.11 ( 26.86%)	
            len=256, align=4, pos=64:        15.16	       11.15 ( 26.46%)	
            len=256, align=5, pos=64:        15.19	       11.10 ( 26.89%)	
            len=256, align=5, pos=64:        15.20	       11.19 ( 26.37%)	
            len=256, align=6, pos=64:        15.19	       11.10 ( 26.95%)	
            len=256, align=6, pos=64:        15.20	       11.17 ( 26.51%)	
            len=256, align=7, pos=64:        15.16	       11.19 ( 26.19%)	
            len=256, align=7, pos=64:        15.07	       11.11 ( 26.27%)	
            len=192, align=1, pos=32:         9.40	        9.11 (  3.10%)	
            len=192, align=1, pos=32:         9.33	        9.18 (  1.67%)	
            len=256, align=1, pos=32:         9.38	        9.12 (  2.70%)	
            len=256, align=1, pos=32:         9.32	        9.14 (  1.87%)	
            len=512, align=1, pos=32:         9.36	        9.12 (  2.51%)	
            len=512, align=1, pos=32:         9.39	        9.12 (  2.91%)	
            len=192, align=2, pos=64:        15.20	       11.15 ( 26.66%)	
            len=192, align=2, pos=64:        15.22	       11.14 ( 26.82%)	
            len=256, align=2, pos=64:        15.20	       11.11 ( 26.87%)	
            len=256, align=2, pos=64:        15.20	       11.14 ( 26.71%)	
            len=512, align=2, pos=64:        15.17	       11.15 ( 26.47%)	
            len=512, align=2, pos=64:        15.21	       11.16 ( 26.59%)	
            len=192, align=3, pos=96:        16.78	       15.82 (  5.73%)	
            len=192, align=3, pos=96:        16.48	       15.82 (  3.98%)	
            len=256, align=3, pos=96:        16.36	       15.82 (  3.27%)	
            len=256, align=3, pos=96:        16.51	       15.79 (  4.33%)	
            len=512, align=3, pos=96:        16.49	       15.82 (  4.07%)	
            len=512, align=3, pos=96:        16.50	       15.82 (  4.13%)	
           len=192, align=4, pos=128:        17.88	       18.49 ( -3.41%)	
           len=192, align=4, pos=128:        17.79	       18.47 ( -3.84%)	
           len=256, align=4, pos=128:        17.93	       18.60 ( -3.76%)	
           len=256, align=4, pos=128:        17.79	       18.51 ( -4.05%)	
           len=512, align=4, pos=128:        17.79	       18.47 ( -3.78%)	
           len=512, align=4, pos=128:        17.69	       18.48 ( -4.50%)	
           len=192, align=5, pos=160:        19.44	       17.83 (  8.28%)	
           len=192, align=5, pos=160:        19.07	       17.83 (  6.51%)	
           len=256, align=5, pos=160:        19.10	       17.84 (  6.61%)	
           len=256, align=5, pos=160:        19.09	       17.83 (  6.58%)	
           len=512, align=5, pos=160:        19.07	       17.79 (  6.70%)	
           len=512, align=5, pos=160:        19.07	       17.84 (  6.47%)	
           len=192, align=6, pos=192:        20.28	       19.26 (  5.05%)	
           len=192, align=6, pos=192:        20.39	       19.83 (  2.76%)	
           len=256, align=6, pos=192:        20.40	       20.49 ( -0.44%)	
           len=256, align=6, pos=192:        20.40	       20.46 ( -0.29%)	
           len=512, align=6, pos=192:        20.40	       20.45 ( -0.28%)	
           len=512, align=6, pos=192:        20.37	       20.47 ( -0.49%)	
           len=192, align=7, pos=224:        20.27	       19.80 (  2.32%)	
           len=192, align=7, pos=224:        20.37	       19.15 (  6.02%)	
           len=256, align=7, pos=224:        22.02	       19.83 (  9.95%)	
           len=256, align=7, pos=224:        25.78	       19.80 ( 23.18%)	
           len=512, align=7, pos=224:        22.02	       19.80 ( 10.09%)	
           len=512, align=7, pos=224:        23.96	       19.80 ( 17.35%)	
               len=2, align=0, pos=1:         4.58	        5.71 (-24.58%)	
               len=2, align=0, pos=1:         4.67	        5.34 (-14.49%)	
               len=2, align=1, pos=1:         4.67	        5.66 (-21.20%)	
               len=2, align=1, pos=1:         4.67	        5.61 (-20.28%)	
               len=0, align=0, pos=1:         4.00	        4.00 ( -0.01%)	
               len=0, align=0, pos=1:         4.00	        4.00 ( -0.00%)	
               len=0, align=1, pos=1:         4.22	        4.00 (  5.12%)	
               len=0, align=1, pos=1:         4.00	        4.04 ( -1.03%)	
               len=3, align=0, pos=2:         4.68	        5.34 (-14.08%)	
               len=3, align=0, pos=2:         4.67	        5.34 (-14.44%)	
               len=3, align=2, pos=2:         4.67	        5.66 (-21.18%)	
               len=3, align=2, pos=2:         4.94	        6.08 (-22.93%)	
               len=1, align=0, pos=2:         4.67	        5.34 (-14.50%)	
               len=1, align=0, pos=2:         4.67	        5.30 (-13.55%)	
               len=1, align=2, pos=2:         4.67	        5.61 (-20.25%)	
               len=1, align=2, pos=2:         4.67	        5.62 (-20.34%)	
               len=4, align=0, pos=3:         4.67	        5.30 (-13.60%)	
               len=4, align=0, pos=3:         4.67	        5.30 (-13.59%)	
               len=4, align=3, pos=3:         4.67	        5.62 (-20.32%)	
               len=4, align=3, pos=3:         4.67	        5.62 (-20.36%)	
               len=2, align=0, pos=3:         4.71	        5.30 (-12.54%)	
               len=2, align=0, pos=3:         4.71	        5.30 (-12.55%)	
               len=2, align=3, pos=3:         4.71	        5.62 (-19.33%)	
               len=2, align=3, pos=3:         4.67	        5.62 (-20.31%)	
               len=5, align=0, pos=4:         4.67	        5.30 (-13.59%)	
               len=5, align=0, pos=4:         4.67	        5.30 (-13.57%)	
               len=5, align=4, pos=4:         4.67	        5.62 (-20.31%)	
               len=5, align=4, pos=4:         4.67	        5.66 (-21.20%)	
               len=3, align=0, pos=4:         4.67	        5.34 (-14.42%)	
               len=3, align=0, pos=4:         4.96	        5.72 (-15.42%)	
               len=3, align=4, pos=4:         4.71	        5.62 (-19.32%)	
               len=3, align=4, pos=4:         4.93	        6.03 (-22.22%)	
               len=6, align=0, pos=5:         4.67	        5.30 (-13.54%)	
               len=6, align=0, pos=5:         4.67	        5.34 (-14.40%)	
               len=6, align=5, pos=5:         4.67	        6.15 (-31.82%)	
               len=6, align=5, pos=5:         4.67	        5.62 (-20.38%)	
               len=4, align=0, pos=5:         4.67	        5.30 (-13.56%)	
               len=4, align=0, pos=5:         4.94	        5.67 (-14.80%)	
               len=4, align=5, pos=5:         4.67	        5.65 (-21.16%)	
               len=4, align=5, pos=5:         4.82	        6.03 (-25.22%)	
               len=7, align=0, pos=6:         4.71	        5.30 (-12.54%)	
               len=7, align=0, pos=6:         4.67	        5.30 (-13.60%)	
               len=7, align=6, pos=6:         4.67	        5.61 (-20.30%)	
               len=7, align=6, pos=6:         4.67	        5.66 (-21.21%)	
               len=5, align=0, pos=6:         4.72	        5.30 (-12.09%)	
               len=5, align=0, pos=6:         4.67	        5.30 (-13.56%)	
               len=5, align=6, pos=6:         4.67	        5.65 (-21.02%)	
               len=5, align=6, pos=6:         4.87	        6.04 (-23.99%)	
               len=8, align=0, pos=7:         4.67	        5.30 (-13.55%)	
               len=8, align=0, pos=7:         4.67	        5.34 (-14.44%)	
               len=8, align=7, pos=7:         5.23	        6.02 (-15.06%)	
               len=8, align=7, pos=7:         4.90	        5.62 (-14.58%)	
               len=6, align=0, pos=7:         4.67	        5.34 (-14.42%)	
               len=6, align=0, pos=7:         4.71	        5.30 (-12.53%)	
               len=6, align=7, pos=7:         4.90	        5.61 (-14.41%)	
               len=6, align=7, pos=7:         4.91	        5.62 (-14.45%)	
               len=9, align=0, pos=8:         8.70	        5.72 ( 34.24%)	
               len=9, align=0, pos=8:         8.82	        5.70 ( 35.31%)	
               len=9, align=8, pos=8:         8.71	        5.62 ( 35.49%)	
               len=9, align=8, pos=8:         8.67	        5.66 ( 34.72%)	
               len=7, align=0, pos=8:         4.67	        5.30 (-13.50%)	
               len=7, align=0, pos=8:         4.67	        5.34 (-14.44%)	
               len=7, align=8, pos=8:         4.91	        5.61 (-14.44%)	
               len=7, align=8, pos=8:         4.90	        5.66 (-15.40%)	
              len=10, align=0, pos=9:         8.87	        5.73 ( 35.43%)	
              len=10, align=0, pos=9:         8.67	        5.72 ( 34.08%)	
              len=10, align=9, pos=9:         8.66	        6.04 ( 30.17%)	
              len=10, align=9, pos=9:         8.70	        6.04 ( 30.60%)	
               len=8, align=0, pos=9:         4.96	        5.70 (-15.04%)	
               len=8, align=0, pos=9:         4.67	        5.30 (-13.61%)	
               len=8, align=9, pos=9:         5.26	        5.66 ( -7.57%)	
               len=8, align=9, pos=9:         5.55	        6.04 ( -8.77%)	
             len=11, align=0, pos=10:         8.67	        5.34 ( 38.39%)	
             len=11, align=0, pos=10:         8.67	        5.34 ( 38.37%)	
            len=11, align=10, pos=10:         8.67	        5.66 ( 34.74%)	
            len=11, align=10, pos=10:         8.67	        5.62 ( 35.17%)	
              len=9, align=0, pos=10:         7.75	        5.72 ( 26.17%)	
              len=9, align=0, pos=10:         7.78	        5.71 ( 26.68%)	
             len=9, align=10, pos=10:         7.71	        5.62 ( 27.13%)	
             len=9, align=10, pos=10:         7.67	        5.66 ( 26.23%)	
             len=12, align=0, pos=11:         8.67	        5.34 ( 38.42%)	
             len=12, align=0, pos=11:         8.67	        5.34 ( 38.38%)	
            len=12, align=11, pos=11:         8.67	        5.65 ( 34.76%)	
            len=12, align=11, pos=11:         8.67	        6.04 ( 30.29%)	
             len=10, align=0, pos=11:         7.79	        5.73 ( 26.47%)	
             len=10, align=0, pos=11:         7.76	        5.72 ( 26.23%)	
            len=10, align=11, pos=11:         7.76	        6.03 ( 22.28%)	
            len=10, align=11, pos=11:         7.71	        5.61 ( 27.17%)	
             len=13, align=0, pos=12:         8.71	        5.30 ( 39.14%)	
             len=13, align=0, pos=12:         8.67	        5.34 ( 38.37%)	
            len=13, align=12, pos=12:         8.00	        5.66 ( 29.32%)	
            len=13, align=12, pos=12:         8.00	        5.66 ( 29.26%)	
             len=11, align=0, pos=12:         7.69	        5.30 ( 31.08%)	
             len=11, align=0, pos=12:         7.69	        5.30 ( 31.06%)	
            len=11, align=12, pos=12:         7.67	        5.62 ( 26.76%)	
            len=11, align=12, pos=12:         7.71	        6.04 ( 21.66%)	
             len=14, align=0, pos=13:         8.67	        5.73 ( 33.86%)	
             len=14, align=0, pos=13:         8.69	        5.73 ( 34.06%)	
            len=14, align=13, pos=13:         8.10	        6.01 ( 25.73%)	
            len=14, align=13, pos=13:         8.04	        6.03 ( 25.03%)	
             len=12, align=0, pos=13:         7.72	        5.74 ( 25.70%)	
             len=12, align=0, pos=13:         7.78	        5.74 ( 26.26%)	
            len=12, align=13, pos=13:         8.29	        6.04 ( 27.18%)	
            len=12, align=13, pos=13:         8.26	        6.03 ( 26.91%)	
             len=15, align=0, pos=14:         8.68	        5.83 ( 32.82%)	
             len=15, align=0, pos=14:         8.65	        5.72 ( 33.87%)	
            len=15, align=14, pos=14:         8.03	        6.04 ( 24.84%)	
            len=15, align=14, pos=14:         8.00	        5.66 ( 29.29%)	
             len=13, align=0, pos=14:         8.14	        5.83 ( 28.35%)	
             len=13, align=0, pos=14:         7.69	        5.34 ( 30.54%)	
            len=13, align=14, pos=14:         8.22	        5.66 ( 31.17%)	
            len=13, align=14, pos=14:         9.33	        5.65 ( 39.48%)	
             len=16, align=0, pos=15:         8.67	        5.30 ( 38.86%)	
             len=16, align=0, pos=15:         8.67	        5.34 ( 38.36%)	
            len=16, align=15, pos=15:         8.67	        5.65 ( 34.78%)	
            len=16, align=15, pos=15:         8.09	        6.02 ( 25.65%)	
             len=14, align=0, pos=15:         7.76	        5.71 ( 26.48%)	
             len=14, align=0, pos=15:         7.76	        5.71 ( 26.44%)	
            len=14, align=15, pos=15:         8.29	        6.04 ( 27.15%)	
            len=14, align=15, pos=15:         8.25	        6.02 ( 27.07%)	
             len=17, align=0, pos=16:         8.10	        7.16 ( 11.55%)	
             len=17, align=0, pos=16:         8.03	        6.71 ( 16.45%)	
            len=17, align=16, pos=16:         8.11	        7.12 ( 12.19%)	
            len=17, align=16, pos=16:         8.52	        7.51 ( 11.88%)	
             len=15, align=0, pos=16:         8.14	        5.68 ( 30.26%)	
             len=15, align=0, pos=16:         8.14	        5.67 ( 30.40%)	
            len=15, align=16, pos=16:         8.19	        4.70 ( 42.58%)	
            len=15, align=16, pos=16:         7.76	        4.44 ( 42.86%)	
             len=18, align=0, pos=17:         8.11	        7.10 ( 12.41%)	
             len=18, align=0, pos=17:         8.49	        7.67 (  9.69%)	
            len=18, align=17, pos=17:         8.00	        6.71 ( 16.18%)	
            len=18, align=17, pos=17:         8.10	        7.12 ( 12.04%)	
             len=16, align=0, pos=17:         8.02	        5.77 ( 28.09%)	
             len=16, align=0, pos=17:         7.40	        5.77 ( 21.93%)	
            len=16, align=17, pos=17:         8.25	        8.42 ( -2.06%)	
            len=16, align=17, pos=17:         8.22	        8.03 (  2.29%)	
             len=19, align=0, pos=18:         8.10	        7.11 ( 12.13%)	
             len=19, align=0, pos=18:         8.03	        7.19 ( 10.47%)	
            len=19, align=18, pos=18:         8.00	        6.71 ( 16.15%)	
            len=19, align=18, pos=18:         8.03	        7.13 ( 11.16%)	
             len=17, align=0, pos=18:         8.22	        7.41 (  9.91%)	
             len=17, align=0, pos=18:         8.31	        8.42 ( -1.31%)	
            len=17, align=18, pos=18:         8.27	        7.96 (  3.69%)	
            len=17, align=18, pos=18:         8.22	        7.48 (  9.01%)	
             len=20, align=0, pos=19:         8.10	        7.13 ( 11.97%)	
             len=20, align=0, pos=19:         8.02	        7.16 ( 10.73%)	
            len=20, align=19, pos=19:         8.00	        6.67 ( 16.66%)	
            len=20, align=19, pos=19:         8.06	        7.11 ( 11.74%)	
             len=18, align=0, pos=19:         8.28	        8.43 ( -1.88%)	
             len=18, align=0, pos=19:         8.27	        8.48 ( -2.51%)	
            len=18, align=19, pos=19:         9.35	        8.00 ( 14.47%)	
            len=18, align=19, pos=19:         8.48	        8.28 (  2.36%)	
             len=21, align=0, pos=20:         8.09	        7.11 ( 12.06%)	
             len=21, align=0, pos=20:         8.03	        6.67 ( 17.02%)	
            len=21, align=20, pos=20:         9.96	        7.15 ( 28.24%)	
            len=21, align=20, pos=20:        10.02	        7.11 ( 29.04%)	
             len=19, align=0, pos=20:         8.45	        8.43 (  0.27%)	
             len=19, align=0, pos=20:         8.27	        7.82 (  5.42%)	
            len=19, align=20, pos=20:         9.33	        8.04 ( 13.89%)	
            len=19, align=20, pos=20:         8.27	        8.43 ( -1.95%)	
             len=22, align=0, pos=21:         8.09	        7.14 ( 11.66%)	
             len=22, align=0, pos=21:         8.09	        7.10 ( 12.20%)	
            len=22, align=21, pos=21:        10.16	        7.11 ( 30.07%)	
            len=22, align=21, pos=21:        10.55	        7.12 ( 32.53%)	
             len=20, align=0, pos=21:         8.27	        8.46 ( -2.29%)	
             len=20, align=0, pos=21:         8.46	        8.44 (  0.18%)	
            len=20, align=21, pos=21:        10.00	        8.45 ( 15.45%)	
            len=20, align=21, pos=21:        10.00	        8.66 ( 13.41%)	
             len=23, align=0, pos=22:         8.11	        7.12 ( 12.20%)	
             len=23, align=0, pos=22:         8.10	        7.12 ( 12.02%)	
            len=23, align=22, pos=22:        10.02	        7.24 ( 27.76%)	
            len=23, align=22, pos=22:         9.98	        7.14 ( 28.46%)	
             len=21, align=0, pos=22:         8.26	        8.46 ( -2.49%)	
             len=21, align=0, pos=22:         8.30	        8.45 ( -1.83%)	
            len=21, align=22, pos=22:        10.04	        8.00 ( 20.32%)	
            len=21, align=22, pos=22:        10.00	        7.79 ( 22.13%)	
             len=24, align=0, pos=23:         8.03	        7.17 ( 10.71%)	
             len=24, align=0, pos=23:         8.05	        7.12 ( 11.56%)	
            len=24, align=23, pos=23:        10.01	        7.12 ( 28.80%)	
            len=24, align=23, pos=23:        10.00	        7.12 ( 28.81%)	
             len=22, align=0, pos=23:         8.28	        8.45 ( -2.05%)	
             len=22, align=0, pos=23:         8.24	        8.45 ( -2.60%)	
            len=22, align=23, pos=23:         9.98	        8.94 ( 10.46%)	
            len=22, align=23, pos=23:         9.98	        8.04 ( 19.47%)	
             len=25, align=0, pos=24:         9.98	        7.14 ( 28.43%)	
             len=25, align=0, pos=24:         9.98	        7.17 ( 28.14%)	
            len=25, align=24, pos=24:         9.98	        9.28 (  7.01%)	
            len=25, align=24, pos=24:         9.98	        9.14 (  8.36%)	
             len=23, align=0, pos=24:         8.25	        8.46 ( -2.62%)	
             len=23, align=0, pos=24:         8.27	        8.42 ( -1.90%)	
            len=23, align=24, pos=24:         9.17	        7.79 ( 15.08%)	
            len=23, align=24, pos=24:         8.30	        8.44 ( -1.66%)	
             len=26, align=0, pos=25:        10.00	        7.13 ( 28.68%)	
             len=26, align=0, pos=25:         9.99	        7.17 ( 28.22%)	
            len=26, align=25, pos=25:        10.01	        9.13 (  8.81%)	
            len=26, align=25, pos=25:        10.00	        9.14 (  8.58%)	
             len=24, align=0, pos=25:         8.26	        8.10 (  1.98%)	
             len=24, align=0, pos=25:         8.24	        8.46 ( -2.63%)	
            len=24, align=25, pos=25:         9.99	        9.13 (  8.59%)	
            len=24, align=25, pos=25:        10.04	        9.10 (  9.34%)	
             len=27, align=0, pos=26:        10.65	        7.13 ( 33.08%)	
             len=27, align=0, pos=26:        10.04	        7.13 ( 29.01%)	
            len=27, align=26, pos=26:        10.00	        9.12 (  8.76%)	
            len=27, align=26, pos=26:         9.97	        9.17 (  8.04%)	
             len=25, align=0, pos=26:        10.03	        8.45 ( 15.70%)	
             len=25, align=0, pos=26:         9.97	        8.45 ( 15.29%)	
            len=25, align=26, pos=26:         9.98	        9.18 (  8.02%)	
            len=25, align=26, pos=26:        10.63	        8.70 ( 18.13%)	
             len=28, align=0, pos=27:         9.83	        7.18 ( 27.00%)	
             len=28, align=0, pos=27:         9.99	        7.14 ( 28.49%)	
            len=28, align=27, pos=27:        10.11	        9.10 (  9.97%)	
            len=28, align=27, pos=27:        10.02	        9.12 (  9.01%)	
             len=26, align=0, pos=27:        10.04	        8.43 ( 15.99%)	
             len=26, align=0, pos=27:        10.04	        8.44 ( 15.98%)	
            len=26, align=27, pos=27:         9.99	        9.19 (  8.05%)	
            len=26, align=27, pos=27:         9.97	        9.17 (  8.01%)	
             len=29, align=0, pos=28:         9.98	        7.15 ( 28.32%)	
             len=29, align=0, pos=28:         9.97	        7.18 ( 27.93%)	
            len=29, align=28, pos=28:        10.61	        9.16 ( 13.66%)	
            len=29, align=28, pos=28:        10.67	        9.12 ( 14.52%)	
             len=27, align=0, pos=28:        10.03	        7.80 ( 22.23%)	
             len=27, align=0, pos=28:         9.98	        8.50 ( 14.81%)	
            len=27, align=28, pos=28:         9.97	        9.17 (  8.03%)	
            len=27, align=28, pos=28:         9.96	        9.15 (  8.10%)	
             len=30, align=0, pos=29:         9.95	        7.16 ( 28.11%)	
             len=30, align=0, pos=29:        10.01	        7.13 ( 28.79%)	
            len=30, align=29, pos=29:        10.66	        9.11 ( 14.58%)	
            len=30, align=29, pos=29:        10.64	        9.17 ( 13.74%)	
             len=28, align=0, pos=29:        10.03	        8.42 ( 16.02%)	
             len=28, align=0, pos=29:        10.02	        7.79 ( 22.31%)	
            len=28, align=29, pos=29:        10.65	        9.14 ( 14.20%)	
            len=28, align=29, pos=29:        10.65	        9.17 ( 13.88%)	
             len=31, align=0, pos=30:        10.47	        7.11 ( 32.07%)	
             len=31, align=0, pos=30:        10.02	        7.13 ( 28.80%)	
            len=31, align=30, pos=30:        10.63	        9.15 ( 13.90%)	
            len=31, align=30, pos=30:        10.68	        9.10 ( 14.76%)	
             len=29, align=0, pos=30:        10.03	        8.43 ( 15.96%)	
             len=29, align=0, pos=30:        10.24	        8.48 ( 17.19%)	
            len=29, align=30, pos=30:        10.64	        9.16 ( 13.84%)	
            len=29, align=30, pos=30:        10.65	        9.10 ( 14.57%)	
             len=32, align=0, pos=31:         9.97	        7.16 ( 28.24%)	
             len=32, align=0, pos=31:         9.96	        7.16 ( 28.07%)	
            len=32, align=31, pos=31:        10.65	        9.18 ( 13.76%)	
            len=32, align=31, pos=31:        10.61	        9.16 ( 13.63%)	
             len=30, align=0, pos=31:        10.02	        7.91 ( 21.08%)	
             len=30, align=0, pos=31:        10.03	        7.79 ( 22.29%)	
            len=30, align=31, pos=31:        10.67	        9.12 ( 14.50%)	
            len=30, align=31, pos=31:        10.65	        9.19 ( 13.70%)
Function: memchr
Variant: 
                                    __memchr_evex	__memchr_evex512
========================================================================================================================
           len=2048, align=0, pos=32:         7.03	        4.78 ( 32.03%)	
            len=256, align=1, pos=64:         7.79	        6.73 ( 13.61%)	
           len=2048, align=0, pos=32:         7.09	        4.75 ( 32.98%)	
            len=256, align=1, pos=64:         7.77	        6.48 ( 16.61%)	
         len=256, align=4081, pos=64:         7.79	        7.58 (  2.74%)	
           len=2048, align=0, pos=64:         7.80	        6.46 ( 17.13%)	
            len=256, align=2, pos=64:         7.45	        6.12 ( 17.82%)	
           len=2048, align=0, pos=64:         7.40	        6.18 ( 16.48%)	
            len=256, align=2, pos=64:         7.37	        6.18 ( 16.10%)	
         len=256, align=4081, pos=64:         7.39	        7.17 (  2.96%)	
          len=2048, align=0, pos=128:         8.52	        7.18 ( 15.73%)	
            len=256, align=3, pos=64:         7.45	        6.13 ( 17.68%)	
          len=2048, align=0, pos=128:         8.71	        7.14 ( 18.00%)	
            len=256, align=3, pos=64:         7.37	        6.18 ( 16.10%)	
         len=256, align=4081, pos=64:         7.37	        7.14 (  3.19%)	
          len=2048, align=0, pos=256:        15.14	       13.16 ( 13.13%)	
            len=256, align=4, pos=64:         7.37	        6.15 ( 16.54%)	
          len=2048, align=0, pos=256:        15.21	       13.14 ( 13.61%)	
            len=256, align=4, pos=64:         7.42	        6.14 ( 17.31%)	
         len=256, align=4081, pos=64:         7.42	        7.11 (  4.19%)	
          len=2048, align=0, pos=512:        18.34	       18.22 (  0.66%)	
            len=256, align=5, pos=64:         7.39	        6.12 ( 17.24%)	
          len=2048, align=0, pos=512:        17.79	       18.90 ( -6.21%)	
            len=256, align=5, pos=64:         7.37	        6.17 ( 16.24%)	
         len=256, align=4081, pos=64:         7.40	        7.18 (  2.94%)	
         len=2048, align=0, pos=1024:        25.57	       21.68 ( 15.22%)	
            len=256, align=6, pos=64:         7.37	        6.19 ( 15.97%)	
         len=2048, align=0, pos=1024:        25.54	       21.72 ( 14.94%)	
            len=256, align=6, pos=64:         7.43	        6.13 ( 17.50%)	
         len=256, align=4081, pos=64:         7.44	        7.11 (  4.42%)	
         len=2048, align=0, pos=2048:        38.03	       29.22 ( 23.17%)	
            len=256, align=7, pos=64:         7.46	        6.14 ( 17.68%)	
         len=2048, align=0, pos=2048:        38.17	       29.19 ( 23.52%)	
            len=256, align=7, pos=64:         7.37	        6.17 ( 16.26%)	
         len=256, align=4081, pos=64:         7.41	        7.13 (  3.75%)	
            len=192, align=1, pos=32:         6.79	        4.45 ( 34.41%)	
            len=192, align=1, pos=32:         6.74	        4.46 ( 33.84%)	
            len=256, align=1, pos=32:         6.79	        4.59 ( 32.39%)	
            len=256, align=1, pos=32:         6.72	        4.44 ( 33.89%)	
            len=512, align=1, pos=32:         6.79	        4.46 ( 34.27%)	
            len=512, align=1, pos=32:         6.72	        4.45 ( 33.80%)	
         len=256, align=4081, pos=32:         6.79	        7.13 ( -4.96%)	
            len=192, align=2, pos=64:         7.37	        6.18 ( 16.17%)	
            len=192, align=2, pos=64:         7.38	        6.18 ( 16.30%)	
            len=256, align=2, pos=64:         7.43	        6.17 ( 16.89%)	
            len=256, align=2, pos=64:         7.40	        6.15 ( 16.89%)	
            len=512, align=2, pos=64:         7.40	        6.14 ( 17.03%)	
            len=512, align=2, pos=64:         7.44	        6.13 ( 17.68%)	
         len=256, align=4081, pos=64:         7.43	        7.13 (  4.01%)	
            len=192, align=3, pos=96:         8.05	        6.18 ( 23.22%)	
            len=192, align=3, pos=96:         8.05	        6.17 ( 23.31%)	
            len=256, align=3, pos=96:         7.91	        6.13 ( 22.55%)	
            len=256, align=3, pos=96:         8.10	        6.12 ( 24.35%)	
            len=512, align=3, pos=96:         8.06	        6.14 ( 23.84%)	
            len=512, align=3, pos=96:         8.03	        6.18 ( 23.04%)	
         len=256, align=4081, pos=96:         8.05	        7.82 (  2.85%)	
           len=192, align=4, pos=128:         8.75	        7.13 ( 18.50%)	
           len=192, align=4, pos=128:         8.69	        7.15 ( 17.66%)	
           len=256, align=4, pos=128:         8.69	        7.15 ( 17.81%)	
           len=256, align=4, pos=128:         8.71	        7.15 ( 17.88%)	
           len=512, align=4, pos=128:         8.69	        7.12 ( 18.06%)	
           len=512, align=4, pos=128:         8.71	        7.13 ( 18.11%)	
        len=256, align=4081, pos=128:         8.68	        7.83 (  9.75%)	
           len=192, align=5, pos=160:         9.90	        7.20 ( 27.27%)	
           len=192, align=5, pos=160:        10.00	        7.20 ( 27.99%)	
           len=256, align=5, pos=160:        10.03	        7.13 ( 28.92%)	
           len=256, align=5, pos=160:        10.00	        7.16 ( 28.32%)	
           len=512, align=5, pos=160:        12.62	        7.17 ( 43.22%)	
           len=512, align=5, pos=160:        12.58	        7.20 ( 42.80%)	
        len=256, align=4081, pos=160:        10.03	        9.81 (  2.23%)	
           len=192, align=6, pos=192:        11.02	        9.79 ( 11.18%)	
           len=192, align=6, pos=192:        10.62	        9.83 (  7.53%)	
           len=256, align=6, pos=192:        11.17	       10.25 (  8.21%)	
           len=256, align=6, pos=192:        11.32	        9.78 ( 13.64%)	
           len=512, align=6, pos=192:        12.61	        9.76 ( 22.57%)	
           len=512, align=6, pos=192:        12.61	        9.84 ( 21.96%)	
        len=256, align=4081, pos=192:        11.29	        9.82 ( 13.04%)	
           len=192, align=7, pos=224:        10.81	        9.79 (  9.42%)	
           len=192, align=7, pos=224:        10.65	        9.84 (  7.58%)	
           len=256, align=7, pos=224:        12.59	        9.84 ( 21.81%)	
           len=256, align=7, pos=224:        12.49	        9.77 ( 21.79%)	
           len=512, align=7, pos=224:        12.62	        9.79 ( 22.43%)	
           len=512, align=7, pos=224:        12.60	        9.82 ( 22.11%)	
        len=256, align=4081, pos=224:        12.62	       13.14 ( -4.06%)	
               len=2, align=0, pos=1:         4.59	        4.47 (  2.55%)	
               len=2, align=0, pos=1:         4.67	        4.05 ( 13.18%)	
               len=2, align=1, pos=1:         4.94	        4.24 ( 14.28%)	
               len=2, align=1, pos=1:         4.67	        4.00 ( 14.28%)	
               len=0, align=0, pos=1:         5.60	        5.56 (  0.80%)	
               len=0, align=0, pos=1:         5.33	        5.60 ( -4.99%)	
               len=0, align=1, pos=1:         5.33	        5.59 ( -4.89%)	
               len=0, align=1, pos=1:         5.33	        5.56 ( -4.17%)	
            len=2, align=2048, pos=1:         4.67	        4.00 ( 14.27%)	
            len=2, align=2048, pos=1:         4.71	        4.00 ( 15.05%)	
            len=2, align=2049, pos=1:         4.67	        4.04 ( 13.36%)	
            len=2, align=2049, pos=1:         4.67	        4.00 ( 14.28%)	
            len=0, align=2048, pos=1:         5.56	        5.56 ( -0.00%)	
            len=0, align=2048, pos=1:         5.60	        5.56 (  0.73%)	
            len=0, align=2049, pos=1:         5.33	        5.60 ( -4.95%)	
            len=0, align=2049, pos=1:         5.33	        5.56 ( -4.17%)	
            len=0, align=4081, pos=1:         5.56	        5.56 ( -0.01%)	
            len=0, align=4081, pos=1:         5.50	        5.56 ( -1.03%)	
            len=2, align=4081, pos=1:         6.04	        5.56 (  7.97%)	
            len=2, align=4081, pos=1:         6.04	        5.56 (  8.04%)	
               len=3, align=0, pos=2:         4.67	        4.00 ( 14.29%)	
               len=3, align=0, pos=2:         4.67	        4.00 ( 14.26%)	
               len=3, align=2, pos=2:         4.67	        4.00 ( 14.28%)	
               len=3, align=2, pos=2:         4.67	        4.00 ( 14.26%)	
               len=1, align=0, pos=2:         4.71	        4.00 ( 15.10%)	
               len=1, align=0, pos=2:         4.71	        4.00 ( 15.05%)	
               len=1, align=2, pos=2:         4.67	        4.00 ( 14.28%)	
               len=1, align=2, pos=2:         4.67	        4.00 ( 14.28%)	
            len=3, align=2048, pos=2:         4.67	        4.00 ( 14.28%)	
            len=3, align=2048, pos=2:         4.67	        4.00 ( 14.28%)	
            len=3, align=2050, pos=2:         4.67	        4.00 ( 14.27%)	
            len=3, align=2050, pos=2:         4.67	        4.00 ( 14.28%)	
            len=1, align=2048, pos=2:         4.67	        4.00 ( 14.28%)	
            len=1, align=2048, pos=2:         4.77	        4.00 ( 16.13%)	
            len=1, align=2050, pos=2:         4.67	        4.04 ( 13.38%)	
            len=1, align=2050, pos=2:         4.67	        4.00 ( 14.28%)	
            len=1, align=4081, pos=2:         6.00	        5.56 (  7.40%)	
            len=1, align=4081, pos=2:         6.00	        5.56 (  7.40%)	
            len=3, align=4081, pos=2:         6.00	        5.60 (  6.73%)	
            len=3, align=4081, pos=2:         6.30	        5.99 (  4.98%)	
               len=4, align=0, pos=3:         4.67	        4.00 ( 14.28%)	
               len=4, align=0, pos=3:         4.67	        4.00 ( 14.28%)	
               len=4, align=3, pos=3:         4.67	        4.00 ( 14.27%)	
               len=4, align=3, pos=3:         4.67	        4.00 ( 14.28%)	
               len=2, align=0, pos=3:         4.71	        4.00 ( 15.02%)	
               len=2, align=0, pos=3:         4.67	        4.04 ( 13.38%)	
               len=2, align=3, pos=3:         4.67	        4.00 ( 14.28%)	
               len=2, align=3, pos=3:         4.67	        4.00 ( 14.26%)	
            len=4, align=2048, pos=3:         4.67	        4.00 ( 14.28%)	
            len=4, align=2048, pos=3:         4.67	        4.00 ( 14.28%)	
            len=4, align=2051, pos=3:         4.67	        4.00 ( 14.28%)	
            len=4, align=2051, pos=3:         4.67	        4.00 ( 14.26%)	
            len=2, align=2048, pos=3:         4.71	        4.00 ( 15.04%)	
            len=2, align=2048, pos=3:         4.71	        4.00 ( 15.03%)	
            len=2, align=2051, pos=3:         4.67	        4.04 ( 13.35%)	
            len=2, align=2051, pos=3:         4.67	        4.00 ( 14.28%)	
            len=2, align=4081, pos=3:         6.00	        5.56 (  7.40%)	
            len=2, align=4081, pos=3:         6.00	        5.56 (  7.41%)	
            len=4, align=4081, pos=3:         6.09	        5.97 (  2.06%)	
            len=4, align=4081, pos=3:         6.00	        5.56 (  7.41%)	
               len=5, align=0, pos=4:         4.67	        4.04 ( 13.52%)	
               len=5, align=0, pos=4:         4.91	        4.50 (  8.32%)	
               len=5, align=4, pos=4:         4.83	        4.47 (  7.47%)	
               len=5, align=4, pos=4:         4.82	        4.44 (  7.79%)	
               len=3, align=0, pos=4:         4.71	        4.00 ( 15.02%)	
               len=3, align=0, pos=4:         4.71	        4.00 ( 15.03%)	
               len=3, align=4, pos=4:         4.67	        4.04 ( 13.39%)	
               len=3, align=4, pos=4:         4.67	        4.05 ( 13.31%)	
            len=5, align=2048, pos=4:         4.82	        4.46 (  7.51%)	
            len=5, align=2048, pos=4:         4.82	        4.46 (  7.54%)	
            len=5, align=2052, pos=4:         4.80	        4.44 (  7.49%)	
            len=5, align=2052, pos=4:         4.67	        4.00 ( 14.28%)	
            len=3, align=2048, pos=4:         4.71	        4.00 ( 15.04%)	
            len=3, align=2048, pos=4:         4.71	        4.00 ( 15.02%)	
            len=3, align=2052, pos=4:         4.71	        4.00 ( 15.03%)	
            len=3, align=2052, pos=4:         4.71	        4.00 ( 15.04%)	
            len=3, align=4081, pos=4:         6.04	        5.33 ( 11.71%)	
            len=3, align=4081, pos=4:         6.04	        5.56 (  8.06%)	
            len=5, align=4081, pos=4:         6.00	        5.60 (  6.72%)	
            len=5, align=4081, pos=4:         6.12	        5.96 (  2.62%)	
               len=6, align=0, pos=5:         4.67	        4.00 ( 14.28%)	
               len=6, align=0, pos=5:         4.67	        4.00 ( 14.27%)	
               len=6, align=5, pos=5:         4.67	        4.00 ( 14.28%)	
               len=6, align=5, pos=5:         4.67	        4.00 ( 14.28%)	
               len=4, align=0, pos=5:         4.67	        4.00 ( 14.26%)	
               len=4, align=0, pos=5:         4.67	        4.00 ( 14.28%)	
               len=4, align=5, pos=5:         4.71	        4.00 ( 15.06%)	
               len=4, align=5, pos=5:         4.71	        4.00 ( 15.02%)	
            len=6, align=2048, pos=5:         4.82	        4.48 (  7.07%)	
            len=6, align=2048, pos=5:         4.67	        4.00 ( 14.28%)	
            len=6, align=2053, pos=5:         4.67	        4.00 ( 14.26%)	
            len=6, align=2053, pos=5:         4.67	        4.00 ( 14.26%)	
            len=4, align=2048, pos=5:         4.67	        4.00 ( 14.26%)	
            len=4, align=2048, pos=5:         4.67	        4.00 ( 14.28%)	
            len=4, align=2053, pos=5:         4.67	        4.00 ( 14.28%)	
            len=4, align=2053, pos=5:         4.67	        4.00 ( 14.28%)	
            len=4, align=4081, pos=5:         6.04	        5.56 (  8.01%)	
            len=4, align=4081, pos=5:         6.00	        5.33 ( 11.11%)	
            len=6, align=4081, pos=5:         6.35	        5.99 (  5.76%)	
            len=6, align=4081, pos=5:         6.00	        5.56 (  7.40%)	
               len=7, align=0, pos=6:         4.67	        4.00 ( 14.28%)	
               len=7, align=0, pos=6:         4.67	        4.00 ( 14.26%)	
               len=7, align=6, pos=6:         4.67	        4.00 ( 14.26%)	
               len=7, align=6, pos=6:         4.67	        4.00 ( 14.26%)	
               len=5, align=0, pos=6:         4.67	        4.00 ( 14.28%)	
               len=5, align=0, pos=6:         4.67	        4.00 ( 14.26%)	
               len=5, align=6, pos=6:         4.67	        4.00 ( 14.28%)	
               len=5, align=6, pos=6:         4.71	        4.00 ( 15.04%)	
            len=7, align=2048, pos=6:         4.71	        4.00 ( 15.03%)	
            len=7, align=2048, pos=6:         4.71	        4.00 ( 15.03%)	
            len=7, align=2054, pos=6:         4.71	        4.00 ( 15.02%)	
            len=7, align=2054, pos=6:         4.70	        4.00 ( 14.92%)	
            len=5, align=2048, pos=6:         4.67	        4.04 ( 13.36%)	
            len=5, align=2048, pos=6:         4.67	        4.04 ( 13.38%)	
            len=5, align=2054, pos=6:         4.67	        4.00 ( 14.28%)	
            len=5, align=2054, pos=6:         4.67	        4.00 ( 14.28%)	
            len=5, align=4081, pos=6:         6.00	        5.60 (  6.73%)	
            len=5, align=4081, pos=6:         6.00	        5.60 (  6.69%)	
            len=7, align=4081, pos=6:         6.04	        5.56 (  8.07%)	
            len=7, align=4081, pos=6:         6.00	        5.56 (  7.41%)	
               len=8, align=0, pos=7:         4.67	        4.04 ( 13.39%)	
               len=8, align=0, pos=7:         4.95	        4.49 (  9.22%)	
               len=8, align=7, pos=7:         4.94	        4.47 (  9.54%)	
               len=8, align=7, pos=7:         4.98	        4.47 ( 10.41%)	
               len=6, align=0, pos=7:         4.67	        4.04 ( 13.37%)	
               len=6, align=0, pos=7:         4.67	        4.04 ( 13.39%)	
               len=6, align=7, pos=7:         4.67	        4.04 ( 13.37%)	
               len=6, align=7, pos=7:         4.67	        4.00 ( 14.26%)	
            len=8, align=2048, pos=7:         4.67	        4.04 ( 13.39%)	
            len=8, align=2048, pos=7:         4.67	        4.68 ( -0.21%)	
            len=8, align=2055, pos=7:         4.82	        4.47 (  7.18%)	
            len=8, align=2055, pos=7:         4.98	        4.44 ( 10.76%)	
            len=6, align=2048, pos=7:         4.67	        4.04 ( 13.39%)	
            len=6, align=2048, pos=7:         4.67	        4.04 ( 13.37%)	
            len=6, align=2055, pos=7:         4.67	        4.04 ( 13.39%)	
            len=6, align=2055, pos=7:         4.67	        4.04 ( 13.33%)	
            len=6, align=4081, pos=7:         6.00	        5.60 (  6.73%)	
            len=6, align=4081, pos=7:         6.04	        5.56 (  8.04%)	
            len=8, align=4081, pos=7:         6.00	        5.33 ( 11.11%)	
            len=8, align=4081, pos=7:         6.00	        5.60 (  6.73%)	
               len=9, align=0, pos=8:         4.80	        4.47 (  6.84%)	
               len=9, align=0, pos=8:         4.67	        4.00 ( 14.28%)	
               len=9, align=8, pos=8:         4.67	        4.00 ( 14.27%)	
               len=9, align=8, pos=8:         4.67	        4.00 ( 14.28%)	
               len=7, align=0, pos=8:         4.67	        4.00 ( 14.28%)	
               len=7, align=0, pos=8:         4.67	        4.00 ( 14.26%)	
               len=7, align=8, pos=8:         4.67	        4.00 ( 14.28%)	
               len=7, align=8, pos=8:         4.94	        4.24 ( 14.28%)	
            len=9, align=2048, pos=8:         5.03	        4.70 (  6.48%)	
            len=9, align=2048, pos=8:         4.94	        4.24 ( 14.28%)	
            len=9, align=2056, pos=8:         4.94	        4.24 ( 14.26%)	
            len=9, align=2056, pos=8:         4.94	        4.29 ( 13.20%)	
            len=7, align=2048, pos=8:         4.94	        4.28 ( 13.29%)	
            len=7, align=2048, pos=8:         5.26	        4.70 ( 10.54%)	
            len=7, align=2056, pos=8:         4.94	        4.24 ( 14.28%)	
            len=7, align=2056, pos=8:         4.94	        4.24 ( 14.28%)	
            len=7, align=4081, pos=8:         6.00	        5.33 ( 11.10%)	
            len=7, align=4081, pos=8:         6.00	        5.60 (  6.67%)	
            len=9, align=4081, pos=8:         5.84	        5.96 ( -2.12%)	
            len=9, align=4081, pos=8:         6.47	        6.50 ( -0.52%)	
              len=10, align=0, pos=9:         4.67	        4.00 ( 14.28%)	
              len=10, align=0, pos=9:         4.67	        4.00 ( 14.28%)	
              len=10, align=9, pos=9:         4.67	        4.00 ( 14.26%)	
              len=10, align=9, pos=9:         4.67	        4.00 ( 14.26%)	
               len=8, align=0, pos=9:         4.67	        4.00 ( 14.27%)	
               len=8, align=0, pos=9:         4.67	        4.04 ( 13.38%)	
               len=8, align=9, pos=9:         4.67	        4.04 ( 13.38%)	
               len=8, align=9, pos=9:         4.67	        4.04 ( 13.37%)	
           len=10, align=2048, pos=9:         4.82	        4.45 (  7.57%)	
           len=10, align=2048, pos=9:         4.83	        4.45 (  7.74%)	
           len=10, align=2057, pos=9:         4.77	        4.44 (  6.90%)	
           len=10, align=2057, pos=9:         4.67	        4.00 ( 14.28%)	
            len=8, align=2048, pos=9:         4.67	        4.00 ( 14.28%)	
            len=8, align=2048, pos=9:         4.67	        4.00 ( 14.28%)	
            len=8, align=2057, pos=9:         4.67	        4.00 ( 14.28%)	
            len=8, align=2057, pos=9:         4.67	        4.00 ( 14.28%)	
            len=8, align=4081, pos=9:         6.00	        5.56 (  7.40%)	
            len=8, align=4081, pos=9:         6.00	        5.58 (  7.00%)	
           len=10, align=4081, pos=9:         6.00	        5.56 (  7.41%)	
           len=10, align=4081, pos=9:         6.11	        5.97 (  2.36%)	
             len=11, align=0, pos=10:         4.67	        4.00 ( 14.28%)	
             len=11, align=0, pos=10:         4.67	        4.00 ( 14.26%)	
            len=11, align=10, pos=10:         4.67	        4.00 ( 14.28%)	
            len=11, align=10, pos=10:         4.67	        4.04 ( 13.38%)	
              len=9, align=0, pos=10:         4.67	        4.05 ( 13.29%)	
              len=9, align=0, pos=10:         4.67	        4.04 ( 13.39%)	
             len=9, align=10, pos=10:         4.81	        4.45 (  7.50%)	
             len=9, align=10, pos=10:         4.79	        4.49 (  6.28%)	
          len=11, align=2048, pos=10:         4.67	        4.00 ( 14.28%)	
          len=11, align=2048, pos=10:         4.67	        4.00 ( 14.28%)	
          len=11, align=2058, pos=10:         4.67	        4.00 ( 14.28%)	
          len=11, align=2058, pos=10:         4.67	        4.04 ( 13.39%)	
           len=9, align=2048, pos=10:         4.67	        4.04 ( 13.39%)	
           len=9, align=2048, pos=10:         4.81	        4.45 (  7.52%)	
           len=9, align=2058, pos=10:         4.71	        4.00 ( 15.03%)	
           len=9, align=2058, pos=10:         4.71	        4.00 ( 15.03%)	
           len=9, align=4081, pos=10:         6.04	        5.56 (  8.04%)	
           len=9, align=4081, pos=10:         6.00	        5.33 ( 11.11%)	
          len=11, align=4081, pos=10:         6.11	        5.97 (  2.29%)	
          len=11, align=4081, pos=10:         6.00	        5.56 (  7.41%)	
             len=12, align=0, pos=11:         4.85	        4.45 (  8.27%)	
             len=12, align=0, pos=11:         4.96	        4.45 ( 10.25%)	
            len=12, align=11, pos=11:         4.67	        4.00 ( 14.28%)	
            len=12, align=11, pos=11:         4.67	        4.00 ( 14.28%)	
             len=10, align=0, pos=11:         4.67	        4.04 ( 13.40%)	
             len=10, align=0, pos=11:         4.67	        4.04 ( 13.40%)	
            len=10, align=11, pos=11:         4.95	        4.50 (  9.16%)	
            len=10, align=11, pos=11:         4.79	        4.45 (  6.99%)	
          len=12, align=2048, pos=11:         4.67	        4.00 ( 14.27%)	
          len=12, align=2048, pos=11:         4.67	        4.00 ( 14.28%)	
          len=12, align=2059, pos=11:         4.67	        4.04 ( 13.39%)	
          len=12, align=2059, pos=11:         4.84	        4.44 (  8.31%)	
          len=10, align=2048, pos=11:         4.94	        4.45 ( 10.05%)	
          len=10, align=2048, pos=11:         5.00	        4.45 ( 11.04%)	
          len=10, align=2059, pos=11:         4.67	        4.00 ( 14.28%)	
          len=10, align=2059, pos=11:         4.67	        4.00 ( 14.26%)	
          len=10, align=4081, pos=11:         6.00	        5.60 (  6.72%)	
          len=10, align=4081, pos=11:         6.27	        5.98 (  4.56%)	
          len=12, align=4081, pos=11:         6.00	        5.41 (  9.85%)	
          len=12, align=4081, pos=11:         6.10	        5.99 (  1.77%)	
             len=13, align=0, pos=12:         4.67	        4.00 ( 14.28%)	
             len=13, align=0, pos=12:         4.94	        4.50 (  8.91%)	
            len=13, align=12, pos=12:         4.84	        4.44 (  8.23%)	
            len=13, align=12, pos=12:         4.81	        4.46 (  7.19%)	
             len=11, align=0, pos=12:         4.67	        4.00 ( 14.26%)	
             len=11, align=0, pos=12:         4.67	        4.00 ( 14.28%)	
            len=11, align=12, pos=12:         4.67	        4.04 ( 13.38%)	
            len=11, align=12, pos=12:         4.83	        4.47 (  7.53%)	
          len=13, align=2048, pos=12:         4.94	        4.46 (  9.71%)	
          len=13, align=2048, pos=12:         4.67	        4.00 ( 14.28%)	
          len=13, align=2060, pos=12:         4.67	        4.00 ( 14.28%)	
          len=13, align=2060, pos=12:         4.77	        4.51 (  5.41%)	
          len=11, align=2048, pos=12:         4.82	        4.47 (  7.29%)	
          len=11, align=2048, pos=12:         4.86	        4.47 (  7.97%)	
          len=11, align=2060, pos=12:         4.67	        4.00 ( 14.28%)	
          len=11, align=2060, pos=12:         4.67	        4.00 ( 14.28%)	
          len=11, align=4081, pos=12:         6.00	        5.41 (  9.81%)	
          len=11, align=4081, pos=12:         6.11	        5.98 (  2.13%)	
          len=13, align=4081, pos=12:         6.00	        5.60 (  6.72%)	
          len=13, align=4081, pos=12:         6.06	        6.14 ( -1.28%)	
             len=14, align=0, pos=13:         4.90	        4.47 (  8.85%)	
             len=14, align=0, pos=13:         4.80	        4.47 (  6.92%)	
            len=14, align=13, pos=13:         4.67	        4.00 ( 14.28%)	
            len=14, align=13, pos=13:         4.67	        4.00 ( 14.26%)	
             len=12, align=0, pos=13:         4.67	        4.04 ( 13.38%)	
             len=12, align=0, pos=13:         4.82	        4.45 (  7.65%)	
            len=12, align=13, pos=13:         5.04	        4.24 ( 15.98%)	
            len=12, align=13, pos=13:         4.67	        4.00 ( 14.26%)	
          len=14, align=2048, pos=13:         4.77	        4.48 (  6.20%)	
          len=14, align=2048, pos=13:         4.82	        4.45 (  7.63%)	
          len=14, align=2061, pos=13:         4.67	        4.00 ( 14.28%)	
          len=14, align=2061, pos=13:         4.67	        4.04 ( 13.39%)	
          len=12, align=2048, pos=13:         4.67	        4.04 ( 13.38%)	
          len=12, align=2048, pos=13:         4.82	        4.63 (  3.81%)	
          len=12, align=2061, pos=13:         4.80	        4.45 (  7.21%)	
          len=12, align=2061, pos=13:         4.67	        4.00 ( 14.28%)	
          len=12, align=4081, pos=13:         6.00	        5.60 (  6.73%)	
          len=12, align=4081, pos=13:         6.11	        6.00 (  1.81%)	
          len=14, align=4081, pos=13:         6.06	        5.97 (  1.59%)	
          len=14, align=4081, pos=13:         6.00	        5.60 (  6.63%)	
             len=15, align=0, pos=14:         4.79	        4.44 (  7.25%)	
             len=15, align=0, pos=14:         4.67	        4.04 ( 13.37%)	
            len=15, align=14, pos=14:         4.81	        4.45 (  7.50%)	
            len=15, align=14, pos=14:         4.79	        4.44 (  7.28%)	
             len=13, align=0, pos=14:         4.67	        4.00 ( 14.28%)	
             len=13, align=0, pos=14:         4.67	        4.04 ( 13.40%)	
            len=13, align=14, pos=14:         4.81	        4.45 (  7.55%)	
            len=13, align=14, pos=14:         4.79	        4.48 (  6.44%)	
          len=15, align=2048, pos=14:         4.67	        4.00 ( 14.28%)	
          len=15, align=2048, pos=14:         4.80	        4.49 (  6.39%)	
          len=15, align=2062, pos=14:         4.67	        4.46 (  4.50%)	
          len=15, align=2062, pos=14:         4.67	        4.00 ( 14.28%)	
          len=13, align=2048, pos=14:         4.67	        4.00 ( 14.28%)	
          len=13, align=2048, pos=14:         4.78	        4.47 (  6.52%)	
          len=13, align=2062, pos=14:         4.81	        4.46 (  7.21%)	
          len=13, align=2062, pos=14:         4.67	        4.00 ( 14.28%)	
          len=13, align=4081, pos=14:         6.00	        5.60 (  6.73%)	
          len=13, align=4081, pos=14:         6.06	        5.99 (  1.26%)	
          len=15, align=4081, pos=14:         6.11	        5.99 (  2.06%)	
          len=15, align=4081, pos=14:         6.00	        5.60 (  6.73%)	
             len=16, align=0, pos=15:         4.81	        4.45 (  7.36%)	
             len=16, align=0, pos=15:         4.67	        4.00 ( 14.27%)	
            len=16, align=15, pos=15:         4.83	        4.47 (  7.50%)	
            len=16, align=15, pos=15:         4.80	        4.47 (  6.94%)	
             len=14, align=0, pos=15:         4.67	        4.00 ( 14.28%)	
             len=14, align=0, pos=15:         4.67	        4.06 ( 12.99%)	
            len=14, align=15, pos=15:         4.82	        4.45 (  7.54%)	
            len=14, align=15, pos=15:         4.67	        4.00 ( 14.28%)	
          len=16, align=2048, pos=15:         4.67	        4.04 ( 13.40%)	
          len=16, align=2048, pos=15:         4.82	        4.45 (  7.69%)	
          len=16, align=2063, pos=15:         4.80	        4.44 (  7.48%)	
          len=16, align=2063, pos=15:         4.67	        4.04 ( 13.38%)	
          len=14, align=2048, pos=15:         4.83	        4.46 (  7.80%)	
          len=14, align=2048, pos=15:         4.79	        4.46 (  6.99%)	
          len=14, align=2063, pos=15:         4.67	        4.00 ( 14.26%)	
          len=14, align=2063, pos=15:         4.84	        4.44 (  8.15%)	
          len=14, align=4081, pos=15:         6.09	        5.97 (  1.87%)	
          len=14, align=4081, pos=15:         6.08	        5.99 (  1.58%)	
          len=16, align=4081, pos=15:         7.67	        6.71 ( 12.50%)	
          len=16, align=4081, pos=15:         8.37	        7.13 ( 14.83%)	
             len=17, align=0, pos=16:         4.79	        4.46 (  6.91%)	
             len=17, align=0, pos=16:         4.77	        4.49 (  5.77%)	
            len=17, align=16, pos=16:         4.80	        4.46 (  7.21%)	
            len=17, align=16, pos=16:         4.82	        4.46 (  7.43%)	
             len=15, align=0, pos=16:         4.67	        4.00 ( 14.28%)	
             len=15, align=0, pos=16:         4.67	        4.04 ( 13.40%)	
            len=15, align=16, pos=16:         4.86	        4.47 (  8.15%)	
            len=15, align=16, pos=16:         4.67	        4.00 ( 14.26%)	
          len=17, align=2048, pos=16:         4.82	        4.43 (  7.97%)	
          len=17, align=2048, pos=16:         4.67	        4.00 ( 14.28%)	
          len=17, align=2064, pos=16:         4.84	        4.45 (  8.11%)	
          len=17, align=2064, pos=16:         4.67	        4.00 ( 14.28%)	
          len=15, align=2048, pos=16:         5.41	        4.46 ( 17.64%)	
          len=15, align=2048, pos=16:         4.67	        4.00 ( 14.28%)	
          len=15, align=2064, pos=16:         4.67	        4.04 ( 13.36%)	
          len=15, align=2064, pos=16:         4.82	        4.45 (  7.59%)	
          len=15, align=4081, pos=16:         6.00	        6.71 (-11.79%)	
          len=15, align=4081, pos=16:         6.06	        7.09 (-17.01%)	
          len=17, align=4081, pos=16:         7.73	        7.12 (  7.85%)	
          len=17, align=4081, pos=16:         7.72	        7.13 (  7.61%)	
             len=18, align=0, pos=17:         4.67	        4.00 ( 14.28%)	
             len=18, align=0, pos=17:         4.79	        4.45 (  7.09%)	
            len=18, align=17, pos=17:         4.67	        4.04 ( 13.39%)	
            len=18, align=17, pos=17:         4.78	        4.44 (  6.97%)	
             len=16, align=0, pos=17:         4.67	        4.04 ( 13.36%)	
             len=16, align=0, pos=17:         4.82	        4.46 (  7.43%)	
            len=16, align=17, pos=17:         4.80	        4.46 (  7.18%)	
            len=16, align=17, pos=17:         4.95	        4.49 (  9.36%)	
          len=18, align=2048, pos=17:         4.78	        4.45 (  6.89%)	
          len=18, align=2048, pos=17:         4.79	        4.49 (  6.33%)	
          len=18, align=2065, pos=17:         4.82	        4.46 (  7.47%)	
          len=18, align=2065, pos=17:         4.79	        4.45 (  7.09%)	
          len=16, align=2048, pos=17:         4.67	        4.00 ( 14.28%)	
          len=16, align=2048, pos=17:         4.67	        4.04 ( 13.51%)	
          len=16, align=2065, pos=17:         4.82	        4.46 (  7.57%)	
          len=16, align=2065, pos=17:         4.78	        4.45 (  6.97%)	
          len=16, align=4081, pos=17:         8.37	        7.33 ( 12.42%)	
          len=16, align=4081, pos=17:         8.41	        7.79 (  7.38%)	
          len=18, align=4081, pos=17:         7.67	        6.71 ( 12.50%)	
          len=18, align=4081, pos=17:         7.73	        7.11 (  7.99%)	
             len=19, align=0, pos=18:         4.77	        4.45 (  6.78%)	
             len=19, align=0, pos=18:         4.77	        4.44 (  6.87%)	
            len=19, align=18, pos=18:         4.79	        4.46 (  6.98%)	
            len=19, align=18, pos=18:         4.81	        4.45 (  7.45%)	
             len=17, align=0, pos=18:         4.79	        4.44 (  7.38%)	
             len=17, align=0, pos=18:         4.67	        4.04 ( 13.40%)	
            len=17, align=18, pos=18:         4.80	        4.44 (  7.50%)	
            len=17, align=18, pos=18:         4.67	        4.04 ( 13.39%)	
          len=19, align=2048, pos=18:         4.77	        4.46 (  6.59%)	
          len=19, align=2048, pos=18:         4.84	        4.44 (  8.21%)	
          len=19, align=2066, pos=18:         4.67	        4.04 ( 13.40%)	
          len=19, align=2066, pos=18:         4.67	        4.47 (  4.32%)	
          len=17, align=2048, pos=18:         4.77	        4.49 (  5.80%)	
          len=17, align=2048, pos=18:         4.80	        4.46 (  7.20%)	
          len=17, align=2066, pos=18:         4.82	        4.48 (  6.99%)	
          len=17, align=2066, pos=18:         4.67	        4.00 ( 14.28%)	
          len=17, align=4081, pos=18:         8.54	        7.76 (  9.13%)	
          len=17, align=4081, pos=18:         8.33	        7.39 ( 11.29%)	
          len=19, align=4081, pos=18:         7.77	        7.11 (  8.52%)	
          len=19, align=4081, pos=18:         7.39	        7.17 (  2.95%)	
             len=20, align=0, pos=19:         4.67	        4.05 ( 13.21%)	
             len=20, align=0, pos=19:         4.80	        4.45 (  7.42%)	
            len=20, align=19, pos=19:         4.84	        4.44 (  8.39%)	
            len=20, align=19, pos=19:         4.67	        4.04 ( 13.40%)	
             len=18, align=0, pos=19:         4.81	        4.48 (  6.73%)	
             len=18, align=0, pos=19:         4.77	        4.47 (  6.27%)	
            len=18, align=19, pos=19:         4.77	        4.46 (  6.59%)	
            len=18, align=19, pos=19:         4.81	        4.44 (  7.66%)	
          len=20, align=2048, pos=19:         4.80	        4.48 (  6.72%)	
          len=20, align=2048, pos=19:         4.79	        4.47 (  6.79%)	
          len=20, align=2067, pos=19:         4.78	        4.43 (  7.27%)	
          len=20, align=2067, pos=19:         4.79	        4.49 (  6.28%)	
          len=18, align=2048, pos=19:         4.77	        4.45 (  6.72%)	
          len=18, align=2048, pos=19:         4.81	        4.44 (  7.76%)	
          len=18, align=2067, pos=19:         4.79	        4.51 (  6.02%)	
          len=18, align=2067, pos=19:         4.67	        4.00 ( 14.28%)	
          len=18, align=4081, pos=19:         8.37	        7.81 (  6.63%)	
          len=18, align=4081, pos=19:         8.69	        7.33 ( 15.62%)	
          len=20, align=4081, pos=19:         7.76	        7.12 (  8.20%)	
          len=20, align=4081, pos=19:         7.33	        6.72 (  8.33%)	
             len=21, align=0, pos=20:         4.84	        4.45 (  8.07%)	
             len=21, align=0, pos=20:         4.95	        4.51 (  8.90%)	
            len=21, align=20, pos=20:         4.79	        4.44 (  7.41%)	
            len=21, align=20, pos=20:         4.78	        4.45 (  6.96%)	
             len=19, align=0, pos=20:         4.85	        4.45 (  8.14%)	
             len=19, align=0, pos=20:         4.67	        4.00 ( 14.27%)	
            len=19, align=20, pos=20:         4.81	        4.44 (  7.72%)	
            len=19, align=20, pos=20:         4.84	        4.78 (  1.23%)	
          len=21, align=2048, pos=20:         4.81	        4.45 (  7.58%)	
          len=21, align=2048, pos=20:         4.78	        4.47 (  6.61%)	
          len=21, align=2068, pos=20:         4.79	        4.45 (  7.09%)	
          len=21, align=2068, pos=20:         4.85	        4.43 (  8.52%)	
          len=19, align=2048, pos=20:         4.67	        4.00 ( 14.28%)	
          len=19, align=2048, pos=20:         4.79	        4.45 (  7.24%)	
          len=19, align=2068, pos=20:         4.84	        4.46 (  7.90%)	
          len=19, align=2068, pos=20:         4.67	        4.00 ( 14.27%)	
          len=19, align=4081, pos=20:         8.55	        7.84 (  8.37%)	
          len=19, align=4081, pos=20:         8.63	        7.33 ( 15.04%)	
          len=21, align=4081, pos=20:         7.73	        7.17 (  7.24%)	
          len=21, align=4081, pos=20:         8.17	        7.15 ( 12.45%)	
             len=22, align=0, pos=21:         4.80	        4.44 (  7.52%)	
             len=22, align=0, pos=21:         4.85	        4.64 (  4.31%)	
            len=22, align=21, pos=21:         4.82	        4.46 (  7.34%)	
            len=22, align=21, pos=21:         4.67	        4.04 ( 13.39%)	
             len=20, align=0, pos=21:         4.79	        4.46 (  6.91%)	
             len=20, align=0, pos=21:         4.85	        4.47 (  7.81%)	
            len=20, align=21, pos=21:         4.67	        4.04 ( 13.47%)	
            len=20, align=21, pos=21:         4.77	        4.48 (  6.15%)	
          len=22, align=2048, pos=21:         4.79	        4.46 (  6.80%)	
          len=22, align=2048, pos=21:         4.86	        4.45 (  8.56%)	
          len=22, align=2069, pos=21:         4.82	        4.51 (  6.36%)	
          len=22, align=2069, pos=21:         4.79	        4.46 (  6.97%)	
          len=20, align=2048, pos=21:         4.78	        4.46 (  6.78%)	
          len=20, align=2048, pos=21:         4.89	        4.47 (  8.57%)	
          len=20, align=2069, pos=21:         4.67	        4.04 ( 13.34%)	
          len=20, align=2069, pos=21:         4.80	        4.45 (  7.14%)	
          len=20, align=4081, pos=21:         9.03	        7.33 ( 18.80%)	
          len=20, align=4081, pos=21:         8.65	        7.33 ( 15.21%)	
          len=22, align=4081, pos=21:         7.69	        7.50 (  2.51%)	
          len=22, align=4081, pos=21:         7.74	        7.14 (  7.81%)	
             len=23, align=0, pos=22:         4.78	        4.43 (  7.30%)	
             len=23, align=0, pos=22:         4.82	        4.45 (  7.65%)	
            len=23, align=22, pos=22:         4.82	        4.45 (  7.59%)	
            len=23, align=22, pos=22:         4.82	        5.17 ( -7.26%)	
             len=21, align=0, pos=22:         4.68	        4.04 ( 13.65%)	
             len=21, align=0, pos=22:         4.80	        4.44 (  7.37%)	
            len=21, align=22, pos=22:         4.81	        4.45 (  7.46%)	
            len=21, align=22, pos=22:         4.82	        4.46 (  7.53%)	
          len=23, align=2048, pos=22:         4.67	        4.04 ( 13.39%)	
          len=23, align=2048, pos=22:         4.80	        4.67 (  2.67%)	
          len=23, align=2070, pos=22:         4.79	        4.46 (  6.89%)	
          len=23, align=2070, pos=22:         4.81	        4.46 (  7.29%)	
          len=21, align=2048, pos=22:         4.67	        4.04 ( 13.39%)	
          len=21, align=2048, pos=22:         4.80	        4.44 (  7.41%)	
          len=21, align=2070, pos=22:         4.80	        4.46 (  7.19%)	
          len=21, align=2070, pos=22:         4.82	        4.45 (  7.69%)	
          len=21, align=4081, pos=22:         8.50	        7.79 (  8.34%)	
          len=21, align=4081, pos=22:         8.53	        8.50 (  0.36%)	
          len=23, align=4081, pos=22:         7.73	        7.10 (  8.09%)	
          len=23, align=4081, pos=22:         7.71	        7.16 (  7.09%)	
             len=24, align=0, pos=23:         4.80	        4.45 (  7.22%)	
             len=24, align=0, pos=23:         4.80	        4.46 (  6.96%)	
            len=24, align=23, pos=23:         4.83	        4.47 (  7.38%)	
            len=24, align=23, pos=23:         4.77	        4.49 (  5.85%)	
             len=22, align=0, pos=23:         4.81	        4.47 (  7.05%)	
             len=22, align=0, pos=23:         4.79	        4.44 (  7.36%)	
            len=22, align=23, pos=23:         4.83	        4.45 (  7.79%)	
            len=22, align=23, pos=23:         4.78	        4.50 (  5.96%)	
          len=24, align=2048, pos=23:         4.77	        4.48 (  6.09%)	
          len=24, align=2048, pos=23:         4.79	        4.45 (  7.06%)	
          len=24, align=2071, pos=23:         4.79	        4.45 (  7.05%)	
          len=24, align=2071, pos=23:         4.77	        4.46 (  6.48%)	
          len=22, align=2048, pos=23:         4.90	        4.48 (  8.46%)	
          len=22, align=2048, pos=23:         4.67	        4.00 ( 14.26%)	
          len=22, align=2071, pos=23:         4.81	        4.46 (  7.30%)	
          len=22, align=2071, pos=23:         4.83	        4.47 (  7.39%)	
          len=22, align=4081, pos=23:         8.37	        7.33 ( 12.35%)	
          len=22, align=4081, pos=23:         8.36	        7.82 (  6.39%)	
          len=24, align=4081, pos=23:         7.75	        7.13 (  8.01%)	
          len=24, align=4081, pos=23:         7.73	        7.15 (  7.40%)	
             len=25, align=0, pos=24:         4.78	        4.45 (  6.94%)	
             len=25, align=0, pos=24:         4.81	        4.48 (  6.80%)	
            len=25, align=24, pos=24:         4.82	        4.44 (  7.89%)	
            len=25, align=24, pos=24:         4.91	        4.48 (  8.74%)	
             len=23, align=0, pos=24:         4.80	        4.45 (  7.39%)	
             len=23, align=0, pos=24:         4.80	        4.46 (  7.05%)	
            len=23, align=24, pos=24:         4.86	        4.44 (  8.57%)	
            len=23, align=24, pos=24:         4.82	        4.46 (  7.49%)	
          len=25, align=2048, pos=24:         4.85	        4.47 (  7.81%)	
          len=25, align=2048, pos=24:         4.83	        4.46 (  7.55%)	
          len=25, align=2072, pos=24:         4.84	        4.46 (  7.83%)	
          len=25, align=2072, pos=24:         4.87	        4.45 (  8.60%)	
          len=23, align=2048, pos=24:         4.83	        4.45 (  7.72%)	
          len=23, align=2048, pos=24:         4.67	        4.04 ( 13.40%)	
          len=23, align=2072, pos=24:         4.79	        4.44 (  7.28%)	
          len=23, align=2072, pos=24:         4.79	        4.45 (  7.10%)	
          len=23, align=4081, pos=24:         8.40	        7.78 (  7.41%)	
          len=23, align=4081, pos=24:         8.37	        7.81 (  6.61%)	
          len=25, align=4081, pos=24:         7.72	        7.17 (  7.21%)	
          len=25, align=4081, pos=24:         7.76	        7.12 (  8.35%)	
             len=26, align=0, pos=25:         4.79	        4.46 (  6.95%)	
             len=26, align=0, pos=25:         4.80	        4.47 (  6.94%)	
            len=26, align=25, pos=25:         4.79	        4.46 (  6.84%)	
            len=26, align=25, pos=25:         4.80	        4.46 (  7.02%)	
             len=24, align=0, pos=25:         4.84	        4.45 (  8.02%)	
             len=24, align=0, pos=25:         4.77	        4.46 (  6.61%)	
            len=24, align=25, pos=25:         4.82	        4.44 (  7.83%)	
            len=24, align=25, pos=25:         4.82	        4.44 (  8.01%)	
          len=26, align=2048, pos=25:         4.83	        4.66 (  3.62%)	
          len=26, align=2048, pos=25:         4.81	        4.46 (  7.37%)	
          len=26, align=2073, pos=25:         4.85	        4.45 (  8.28%)	
          len=26, align=2073, pos=25:         4.83	        4.45 (  7.79%)	
          len=24, align=2048, pos=25:         4.67	        4.04 ( 13.39%)	
          len=24, align=2048, pos=25:         4.82	        4.47 (  7.27%)	
          len=24, align=2073, pos=25:         4.77	        4.46 (  6.51%)	
          len=24, align=2073, pos=25:         4.78	        4.45 (  6.91%)	
          len=24, align=4081, pos=25:         8.42	        7.78 (  7.58%)	
          len=24, align=4081, pos=25:         8.46	        7.77 (  8.17%)	
          len=26, align=4081, pos=25:         7.76	        7.11 (  8.34%)	
          len=26, align=4081, pos=25:         7.71	        7.17 (  7.01%)	
             len=27, align=0, pos=26:         4.78	        4.44 (  7.15%)	
             len=27, align=0, pos=26:         4.81	        4.45 (  7.44%)	
            len=27, align=26, pos=26:         4.81	        4.45 (  7.52%)	
            len=27, align=26, pos=26:         4.80	        4.45 (  7.28%)	
             len=25, align=0, pos=26:         5.22	        4.44 ( 14.86%)	
             len=25, align=0, pos=26:         4.81	        4.44 (  7.64%)	
            len=25, align=26, pos=26:         4.80	        4.45 (  7.16%)	
            len=25, align=26, pos=26:         4.78	        4.46 (  6.84%)	
          len=27, align=2048, pos=26:         4.80	        4.46 (  7.06%)	
          len=27, align=2048, pos=26:         4.82	        4.48 (  7.00%)	
          len=27, align=2074, pos=26:         4.80	        4.45 (  7.31%)	
          len=27, align=2074, pos=26:         4.81	        4.46 (  7.29%)	
          len=25, align=2048, pos=26:         4.78	        4.44 (  7.25%)	
          len=25, align=2048, pos=26:         4.81	        4.47 (  7.16%)	
          len=25, align=2074, pos=26:         4.81	        4.46 (  7.20%)	
          len=25, align=2074, pos=26:         4.82	        4.46 (  7.51%)	
          len=25, align=4081, pos=26:         8.53	        7.78 (  8.73%)	
          len=25, align=4081, pos=26:         8.56	        7.78 (  9.15%)	
          len=27, align=4081, pos=26:         7.90	        7.11 (  9.99%)	
          len=27, align=4081, pos=26:         7.69	        7.18 (  6.68%)	
             len=28, align=0, pos=27:         4.79	        4.46 (  7.05%)	
             len=28, align=0, pos=27:         4.78	        4.44 (  7.20%)	
            len=28, align=27, pos=27:         4.81	        4.46 (  7.26%)	
            len=28, align=27, pos=27:         4.78	        4.50 (  6.00%)	
             len=26, align=0, pos=27:         4.82	        4.45 (  7.54%)	
             len=26, align=0, pos=27:         4.79	        4.45 (  7.00%)	
            len=26, align=27, pos=27:         4.90	        4.69 (  4.32%)	
            len=26, align=27, pos=27:         4.81	        4.51 (  6.21%)	
          len=28, align=2048, pos=27:         4.79	        4.51 (  5.89%)	
          len=28, align=2048, pos=27:         4.85	        4.47 (  7.90%)	
          len=28, align=2075, pos=27:         4.83	        4.54 (  6.04%)	
          len=28, align=2075, pos=27:         4.80	        4.63 (  3.52%)	
          len=26, align=2048, pos=27:         4.80	        4.45 (  7.39%)	
          len=26, align=2048, pos=27:         4.80	        4.46 (  7.20%)	
          len=26, align=2075, pos=27:         4.79	        4.49 (  6.27%)	
          len=26, align=2075, pos=27:         4.78	        4.45 (  6.76%)	
          len=26, align=4081, pos=27:         8.36	        7.84 (  6.19%)	
          len=26, align=4081, pos=27:         8.54	        7.77 (  9.07%)	
          len=28, align=4081, pos=27:         7.75	        7.13 (  8.07%)	
          len=28, align=4081, pos=27:         7.72	        7.17 (  7.06%)	
             len=29, align=0, pos=28:         4.78	        4.45 (  7.02%)	
             len=29, align=0, pos=28:         4.79	        4.45 (  7.10%)	
            len=29, align=28, pos=28:         4.80	        4.45 (  7.37%)	
            len=29, align=28, pos=28:         4.86	        4.44 (  8.77%)	
             len=27, align=0, pos=28:         4.78	        4.44 (  7.16%)	
             len=27, align=0, pos=28:         4.78	        4.45 (  7.00%)	
            len=27, align=28, pos=28:         4.81	        4.45 (  7.44%)	
            len=27, align=28, pos=28:         4.81	        4.45 (  7.38%)	
          len=29, align=2048, pos=28:         4.81	        4.49 (  6.72%)	
          len=29, align=2048, pos=28:         4.81	        4.45 (  7.41%)	
          len=29, align=2076, pos=28:         4.84	        4.45 (  8.02%)	
          len=29, align=2076, pos=28:         4.81	        4.48 (  6.71%)	
          len=27, align=2048, pos=28:         4.78	        4.50 (  6.00%)	
          len=27, align=2048, pos=28:         4.78	        4.48 (  6.35%)	
          len=27, align=2076, pos=28:         4.80	        4.50 (  6.40%)	
          len=27, align=2076, pos=28:         4.80	        4.44 (  7.50%)	
          len=27, align=4081, pos=28:         8.34	        7.82 (  6.23%)	
          len=27, align=4081, pos=28:         8.38	        7.81 (  6.78%)	
          len=29, align=4081, pos=28:         7.71	        7.14 (  7.39%)	
          len=29, align=4081, pos=28:         7.72	        7.15 (  7.32%)	
             len=30, align=0, pos=29:         4.81	        4.45 (  7.60%)	
             len=30, align=0, pos=29:         4.82	        4.45 (  7.54%)	
            len=30, align=29, pos=29:         4.84	        4.46 (  7.82%)	
            len=30, align=29, pos=29:         4.80	        4.49 (  6.47%)	
             len=28, align=0, pos=29:         4.80	        4.47 (  6.90%)	
             len=28, align=0, pos=29:         4.78	        4.46 (  6.67%)	
            len=28, align=29, pos=29:         4.80	        4.52 (  5.71%)	
            len=28, align=29, pos=29:         4.83	        4.62 (  4.31%)	
          len=30, align=2048, pos=29:         4.80	        4.48 (  6.81%)	
          len=30, align=2048, pos=29:         4.82	        4.45 (  7.57%)	
          len=30, align=2077, pos=29:         4.80	        4.44 (  7.50%)	
          len=30, align=2077, pos=29:         4.78	        4.46 (  6.87%)	
          len=28, align=2048, pos=29:         4.80	        4.48 (  6.64%)	
          len=28, align=2048, pos=29:         4.80	        4.48 (  6.67%)	
          len=28, align=2077, pos=29:         4.79	        4.49 (  6.33%)	
          len=28, align=2077, pos=29:         4.84	        4.47 (  7.75%)	
          len=28, align=4081, pos=29:         8.39	        7.79 (  7.20%)	
          len=28, align=4081, pos=29:         8.40	        7.80 (  7.17%)	
          len=30, align=4081, pos=29:         7.89	        7.12 (  9.76%)	
          len=30, align=4081, pos=29:         7.74	        7.11 (  8.14%)	
             len=31, align=0, pos=30:         4.81	        4.46 (  7.31%)	
             len=31, align=0, pos=30:         5.07	        4.73 (  6.69%)	
            len=31, align=30, pos=30:         5.05	        4.72 (  6.46%)	
            len=31, align=30, pos=30:         4.82	        4.45 (  7.69%)	
             len=29, align=0, pos=30:         4.80	        4.51 (  5.94%)	
             len=29, align=0, pos=30:         4.77	        4.49 (  5.94%)	
            len=29, align=30, pos=30:         4.83	        4.44 (  7.99%)	
            len=29, align=30, pos=30:         4.77	        4.46 (  6.56%)	
          len=31, align=2048, pos=30:         4.80	        4.46 (  7.05%)	
          len=31, align=2048, pos=30:         4.79	        4.44 (  7.44%)	
          len=31, align=2078, pos=30:         4.80	        4.48 (  6.58%)	
          len=31, align=2078, pos=30:         4.80	        4.49 (  6.49%)	
          len=29, align=2048, pos=30:         4.83	        4.45 (  7.93%)	
          len=29, align=2048, pos=30:         4.84	        4.46 (  7.85%)	
          len=29, align=2078, pos=30:         4.81	        4.46 (  7.29%)	
          len=29, align=2078, pos=30:         4.77	        4.44 (  6.99%)	
          len=29, align=4081, pos=30:         8.35	        7.80 (  6.59%)	
          len=29, align=4081, pos=30:         8.36	        7.92 (  5.35%)	
          len=31, align=4081, pos=30:         7.71	        7.13 (  7.56%)	
          len=31, align=4081, pos=30:         7.70	        7.14 (  7.27%)	
             len=32, align=0, pos=31:         4.77	        4.47 (  6.45%)	
             len=32, align=0, pos=31:         4.79	        4.46 (  6.87%)	
            len=32, align=31, pos=31:         5.06	        4.74 (  6.37%)	
            len=32, align=31, pos=31:         5.06	        4.69 (  7.32%)	
             len=30, align=0, pos=31:         5.07	        4.69 (  7.39%)	
             len=30, align=0, pos=31:         5.07	        4.73 (  6.76%)	
            len=30, align=31, pos=31:         5.10	        4.68 (  8.17%)	
            len=30, align=31, pos=31:         4.78	        4.44 (  7.19%)	
          len=32, align=2048, pos=31:         4.81	        4.47 (  7.05%)	
          len=32, align=2048, pos=31:         4.86	        4.48 (  7.94%)	
          len=32, align=2079, pos=31:         4.82	        4.46 (  7.42%)	
          len=32, align=2079, pos=31:         4.86	        4.46 (  8.17%)	
          len=30, align=2048, pos=31:         4.77	        4.47 (  6.38%)	
          len=30, align=2048, pos=31:         4.80	        4.45 (  7.30%)	
          len=30, align=2079, pos=31:         4.80	        4.49 (  6.51%)	
          len=30, align=2079, pos=31:         4.80	        4.47 (  6.90%)	
          len=30, align=4081, pos=31:         8.46	        7.77 (  8.22%)	
          len=30, align=4081, pos=31:         8.39	        7.79 (  7.21%)	
          len=32, align=4081, pos=31:         7.71	        7.14 (  7.45%)	
          len=32, align=4081, pos=31:         7.74	        7.12 (  8.01%)
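
For context on what these numbers measure: the evex512 variants compare 64
bytes per iteration with a masked compare, so a set bit in the resulting
64-bit mask marks a matching byte and tzcnt gives the offset of the first
match.  Below is a rough C sketch of the bounded memchr loop.  It is
illustrative only; the function name and the scalar tail are mine, not the
actual memchr-evex512.S, which does the equivalent in assembly and handles
heads and tails with mask registers rather than a byte loop.

#include <immintrin.h>
#include <stddef.h>

/* Illustrative sketch of a 512-bit memchr core loop (requires
   AVX512BW).  Not the real implementation.  */
static const char *
memchr_evex512_sketch (const char *s, int c, size_t n)
{
  const __m512i needle = _mm512_set1_epi8 ((char) c);
  size_t i = 0;
  for (; i + 64 <= n; i += 64)
    {
      __m512i chunk = _mm512_loadu_si512 ((const void *) (s + i));
      /* One bit per byte: bit k set iff s[i + k] == c.  */
      __mmask64 m = _mm512_cmpeq_epi8_mask (chunk, needle);
      if (m != 0)
        return s + i + _tzcnt_u64 (m);
    }
  /* Scalar tail for the final < 64 bytes; the assembly versions use
     masked vector operations here instead.  */
  for (; i < n; i++)
    if (s[i] == (char) c)
      return s + i;
  return NULL;
}
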
Function: rawmemchr
Variant: 
                                    __rawmemchr_evex	__rawmemchr_evex512
========================================================================================================================
              length=32, alignment=0:         8.06	        5.11 ( 36.55%)	
              length=64, alignment=1:         8.61	        6.84 ( 20.48%)	
              length=32, alignment=0:         6.23	        4.02 ( 35.56%)	
              length=64, alignment=1:         7.19	        5.78 ( 19.67%)	
              length=64, alignment=0:         6.68	        5.43 ( 18.75%)	
              length=64, alignment=2:         6.51	        5.45 ( 16.20%)	
              length=64, alignment=0:         6.51	        5.47 ( 15.97%)	
              length=64, alignment=2:         6.51	        5.46 ( 16.01%)	
             length=128, alignment=0:         6.53	        6.12 (  6.29%)	
              length=64, alignment=3:         6.50	        5.45 ( 16.07%)	
             length=128, alignment=0:         7.07	        6.45 (  8.75%)	
              length=64, alignment=3:         6.60	        5.46 ( 17.21%)	
             length=256, alignment=0:        11.06	        7.46 ( 32.56%)	
              length=64, alignment=4:         6.50	        5.48 ( 15.72%)	
             length=256, alignment=0:        11.05	        7.46 ( 32.46%)	
              length=64, alignment=4:         6.50	        5.46 ( 16.03%)	
             length=512, alignment=0:        14.23	       14.13 (  0.67%)	
              length=64, alignment=5:         6.50	        5.46 ( 15.93%)	
             length=512, alignment=0:        14.37	       14.14 (  1.64%)	
              length=64, alignment=5:         6.50	        5.47 ( 15.79%)	
            length=1024, alignment=0:        21.36	       18.36 ( 14.04%)	
              length=64, alignment=6:         6.51	        5.47 ( 15.99%)	
            length=1024, alignment=0:        21.30	       18.48 ( 13.23%)	
              length=64, alignment=6:         6.62	        5.46 ( 17.55%)	
               length=1, alignment=0:         3.90	        3.46 ( 11.43%)	
               length=1, alignment=0:         3.90	        3.46 ( 11.38%)	
               length=2, alignment=0:         3.90	        3.45 ( 11.53%)	
               length=2, alignment=0:         3.90	        3.45 ( 11.51%)	
               length=3, alignment=0:         3.90	        3.46 ( 11.32%)	
               length=3, alignment=0:         3.90	        3.45 ( 11.46%)	
               length=4, alignment=0:         4.01	        3.34 ( 16.66%)	
               length=4, alignment=0:         3.90	        3.45 ( 11.48%)	
               length=5, alignment=0:         3.90	        3.46 ( 11.44%)	
               length=5, alignment=0:         3.90	        3.45 ( 11.51%)	
               length=6, alignment=0:         3.90	        3.44 ( 11.72%)	
               length=6, alignment=0:         3.90	        3.46 ( 11.34%)	
               length=7, alignment=0:         3.90	        3.46 ( 11.37%)	
               length=7, alignment=0:         3.90	        3.46 ( 11.45%)	
               length=8, alignment=0:         3.90	        3.45 ( 11.61%)	
               length=8, alignment=0:         3.90	        3.45 ( 11.70%)	
               length=9, alignment=0:         3.89	        3.46 ( 11.22%)	
               length=9, alignment=0:         4.04	        3.45 ( 14.83%)	
              length=10, alignment=0:         3.90	        3.45 ( 11.66%)	
              length=10, alignment=0:         3.90	        3.45 ( 11.60%)	
              length=11, alignment=0:         3.89	        3.46 ( 11.17%)	
              length=11, alignment=0:         3.90	        3.44 ( 11.75%)	
              length=12, alignment=0:         3.89	        3.46 ( 11.18%)	
              length=12, alignment=0:         3.90	        3.45 ( 11.55%)	
              length=13, alignment=0:         3.89	        3.45 ( 11.25%)	
              length=13, alignment=0:         3.90	        3.45 ( 11.59%)	
              length=14, alignment=0:         3.89	        3.46 ( 11.17%)	
              length=14, alignment=0:         3.90	        3.45 ( 11.69%)	
              length=15, alignment=0:         3.90	        3.46 ( 11.43%)	
              length=15, alignment=0:         3.90	        3.45 ( 11.59%)	
              length=16, alignment=0:         3.89	        3.45 ( 11.27%)	
              length=16, alignment=0:         3.90	        3.44 ( 11.74%)	
              length=17, alignment=0:         3.90	        3.45 ( 11.66%)	
              length=17, alignment=0:         3.90	        3.61 (  7.42%)	
              length=18, alignment=0:         3.90	        3.45 ( 11.50%)	
              length=18, alignment=0:         3.90	        3.45 ( 11.51%)	
              length=19, alignment=0:         3.90	        3.46 ( 11.36%)	
              length=19, alignment=0:         3.89	        3.46 ( 11.11%)	
              length=20, alignment=0:         3.89	        3.45 ( 11.34%)	
              length=20, alignment=0:         3.90	        3.46 ( 11.30%)	
              length=21, alignment=0:         3.90	        3.46 ( 11.41%)	
              length=21, alignment=0:         3.90	        3.45 ( 11.52%)	
              length=22, alignment=0:         3.90	        3.46 ( 11.36%)	
              length=22, alignment=0:         3.90	        3.46 ( 11.43%)	
              length=23, alignment=0:         3.90	        3.46 ( 11.38%)	
              length=23, alignment=0:         3.89	        3.45 ( 11.27%)	
              length=24, alignment=0:         3.90	        3.44 ( 11.74%)	
              length=24, alignment=0:         3.90	        3.46 ( 11.50%)	
              length=25, alignment=0:         3.90	        3.50 ( 10.25%)	
              length=25, alignment=0:         3.90	        3.45 ( 11.48%)	
              length=26, alignment=0:         3.90	        3.46 ( 11.39%)	
              length=26, alignment=0:         3.90	        3.46 ( 11.20%)	
              length=27, alignment=0:         3.89	        3.45 ( 11.31%)	
              length=27, alignment=0:         3.90	        3.45 ( 11.64%)	
              length=28, alignment=0:         3.90	        3.45 ( 11.64%)	
              length=28, alignment=0:         3.90	        3.46 ( 11.27%)	
              length=29, alignment=0:         3.90	        3.46 ( 11.36%)	
              length=29, alignment=0:         3.90	        3.47 ( 11.04%)	
              length=30, alignment=0:         3.90	        3.45 ( 11.47%)	
              length=30, alignment=0:         3.89	        3.70 (  5.03%)	
              length=31, alignment=0:         4.25	        3.55 ( 16.53%)	
              length=31, alignment=0:         4.13	        3.66 ( 11.37%)
  
Sunil Pandey Sept. 29, 2022, 3:42 a.m. UTC | #3
On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
>
>
> On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > This patch implements following evex512 version of string functions.
> > > evex512 version takes up to 30% less cycle as compared to evex,
> > > depending on length and alignment.
> >
> > Please attach benchmark numbers.
> > > [...]

ping
  
Noah Goldstein Sept. 29, 2022, 4:07 a.m. UTC | #4
On Wed, Sep 28, 2022 at 8:43 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
> >
> >
> > On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > This patch implements following evex512 version of string functions.
> > > > evex512 version takes up to 30% less cycle as compared to evex,
> > > > depending on length and alignment.
> > >
> > > Please attach benchmark numbers.
> > > > [...]
>
> ping
See my reply to strrchr.
  
Noah Goldstein Oct. 3, 2022, 6:33 p.m. UTC | #5
On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> [...]
> +L(page_cross):
> +       movq    %rdi, %rcx
> +       andq    $-VEC_SIZE, %rcx
> +
> +       VPCMP   $0, (%rcx), %VMM0, %k0
> +       KMOV    %k0, %RCX
> +       SARX    %RAX, %RCX, %RAX
> +# ifndef USE_AS_RAWMEMCHR
> +       bsf     %RAX, %RCX
> +       jz      L(align_more)
> +       xor     %eax, %eax
> +#  ifdef USE_AS_WMEMCHR
> +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> +#  else
> +       addq    %rcx, %rdi
> +#  endif
> +       cmp     %rcx, %rdx
> +       cmovae  %rdi, %rax

Irrespective of other concerns, this is buggy. It needs to be cmova.
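
To spell out the boundary: after the bsf, %rcx holds the index of the
first match and %rdx still holds the length, so "cmp %rcx, %rdx" sets
flags for rdx - rcx, and cmova keeps the match pointer only when
length > index. cmovae also keeps it when index == length, i.e. when
the first match sits one byte past the range the caller allowed. A
minimal C model of the two predicates, with names invented here purely
for illustration:

/* Model of the flag test after "cmp %rcx, %rdx" (flags from rdx - rcx).
   index = first-match position from bsf, len = the length argument.  */
#include <stddef.h>

static const char *
select_cmova (const char *match, size_t index, size_t len)
{
  /* cmova transfers only when len > index: match strictly in bounds.  */
  return len > index ? match : NULL;
}

static const char *
select_cmovae (const char *match, size_t index, size_t len)
{
  /* cmovae also transfers when len == index: one past the valid range.  */
  return len >= index ? match : NULL;
}

With index == len only the cmova form returns NULL, which is what
memchr must do.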
> [...]
  
H.J. Lu Oct. 3, 2022, 7 p.m. UTC | #6
On Mon, Oct 3, 2022 at 11:33 AM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch implements following evex512 version of string functions.
> > evex512 version takes up to 30% less cycle as compared to evex,
> > depending on length and alignment.
> >
> > - memchr function using 512 bit vectors.
> > - rawmemchr function using 512 bit vectors.
> > - wmemchr function using 512 bit vectors.
> >
> > Code size data:
> >
> > memchr-evex.o           762 byte
> > memchr-evex512.o        570 byte (-25%)
> >
> > rawmemchr-evex.o        461 byte
> > rawmemchr-evex512.o     413 byte (-10%)
> >
> > wmemchr-evex.o          794 byte
> > wmemchr-evex512.o       568 byte (-28%)
> >
> > Placeholder function, not used by any processor at the moment.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile            |   3 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  15 +
> >  sysdeps/x86_64/multiarch/memchr-evex-base.S  | 306 +++++++++++++++++++
> >  sysdeps/x86_64/multiarch/memchr-evex512.S    |   7 +
> >  sysdeps/x86_64/multiarch/rawmemchr-evex512.S |   7 +
> >  sysdeps/x86_64/multiarch/wmemchr-evex512.S   |   8 +
> >  6 files changed, 346 insertions(+)
> >  create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S
> >  create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index df4601c294..e974b1ad97 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -4,6 +4,7 @@ sysdep_routines += \
> >    memchr-avx2 \
> >    memchr-avx2-rtm \
> >    memchr-evex \
> > +  memchr-evex512 \
> >    memchr-evex-rtm \
> >    memchr-sse2 \
> >    memcmp-avx2-movbe \
> > @@ -36,6 +37,7 @@ sysdep_routines += \
> >    rawmemchr-avx2 \
> >    rawmemchr-avx2-rtm \
> >    rawmemchr-evex \
> > +  rawmemchr-evex512 \
> >    rawmemchr-evex-rtm \
> >    rawmemchr-sse2 \
> >    stpcpy-avx2 \
> > @@ -156,6 +158,7 @@ sysdep_routines += \
> >    wmemchr-avx2 \
> >    wmemchr-avx2-rtm \
> >    wmemchr-evex \
> > +  wmemchr-evex512 \
> >    wmemchr-evex-rtm \
> >    wmemchr-sse2 \
> >    wmemcmp-avx2-movbe \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a71444eccb..17f770318d 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                       && CPU_FEATURE_USABLE (AVX512BW)
> >                                       && CPU_FEATURE_USABLE (BMI2)),
> >                                      __memchr_evex)
> > +             X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
> > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __memchr_evex512)
> >               X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
> >                                      (CPU_FEATURE_USABLE (AVX512VL)
> >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > @@ -329,6 +334,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                       && CPU_FEATURE_USABLE (AVX512BW)
> >                                       && CPU_FEATURE_USABLE (BMI2)),
> >                                      __rawmemchr_evex)
> > +             X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
> > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __rawmemchr_evex512)
> >               X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
> >                                      (CPU_FEATURE_USABLE (AVX512VL)
> >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > @@ -903,6 +913,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                       && CPU_FEATURE_USABLE (AVX512BW)
> >                                       && CPU_FEATURE_USABLE (BMI2)),
> >                                      __wmemchr_evex)
> > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
> > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wmemchr_evex512)
> >               X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
> >                                      (CPU_FEATURE_USABLE (AVX512VL)
> >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S
> > new file mode 100644
> > index 0000000000..524f0809b5
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S
> > @@ -0,0 +1,306 @@
> > +/* Placeholder function, not used by any processor at the moment.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* UNUSED.  Exists purely as a reference implementation.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if ISA_SHOULD_BUILD (4)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifdef USE_AS_WMEMCHR
> > +#  define CHAR_SIZE    4
> > +#  define VPBROADCAST   vpbroadcastd
> > +#  define VPCMP                vpcmpd
> > +# else
> > +#  define CHAR_SIZE    1
> > +#  define VPBROADCAST   vpbroadcastb
> > +#  define VPCMP                vpcmpb
> > +# endif
> > +
> > +# define PAGE_SIZE     4096
> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > +# define XMM1           xmm17
> > +
> > +# if VEC_SIZE == 64
> > +#  define KMOV         kmovq
> > +#  define KOR          korq
> > +#  define KORTEST      kortestq
> > +#  define RAX          rax
> > +#  define RCX          rcx
> > +#  define SHR          shrq
> > +#  define SARX         sarxq
> > +#  define TEXTSUFFIX   evex512
> > +#  define VMM0         zmm16
> > +# elif VEC_SIZE == 32
> > +/* Currently Unused.  */
> > +#  define KMOV         kmovd
> > +#  define KOR          kord
> > +#  define KORTEST      kortestd
> > +#  define RAX          eax
> > +#  define RCX          ecx
> > +#  define SHR          shrl
> > +#  define SARX         sarxl
> > +#  define TEXTSUFFIX   evex256
> > +#  define VMM0         ymm16
> > +# endif
> > +
> > +       .section .text.TEXTSUFFIX, "ax", @progbits
> > +/* Aligning the entry point to 64 bytes provides better performance
> > +   for strings of up to one vector length.  */
> > +ENTRY_P2ALIGN (MEMCHR, 6)
> > +# ifndef USE_AS_RAWMEMCHR
> > +       /* Check for zero length.  */
> > +       test    %RDX_LP, %RDX_LP
> > +       jz      L(zero)
> > +
> > +#  ifdef __ILP32__
> > +       /* Clear the upper 32 bits.  */
> > +       movl    %edx, %edx
> > +#  endif
> > +# endif
> > +
> > +       /* Broadcast CHAR to VMM0.  */
> > +       VPBROADCAST %esi, %VMM0
> > +       movl    %edi, %eax
> > +       andl    $(PAGE_SIZE - 1), %eax
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +       ja      L(page_cross)
> > +
> > +	/* Compare each [w]char with CHAR; the mask bit is set on a match.  */
> > +       VPCMP   $0, (%rdi), %VMM0, %k0
> > +
> > +       KMOV    %k0, %RAX
> > +# ifndef USE_AS_RAWMEMCHR
> > +       bsf     %RAX, %RCX
> > +       jz      L(align_more)
> > +       xor     %eax, %eax
> > +#  ifdef USE_AS_WMEMCHR
> > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +#  else
> > +       addq    %rcx, %rdi
> > +#  endif
> > +       cmp     %rcx, %rdx
> > +       cmova   %rdi, %rax
> > +# else
> > +       bsf     %RAX, %RAX
> > +       jz      L(align_more)
> > +       add     %rdi, %rax
> > +# endif
> > +       ret
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +L(zero):
> > +       xorl    %eax, %eax
> > +       ret
> > +# endif
> > +
> > +       .p2align 5,,5
> > +L(page_cross):
> > +       movq    %rdi, %rcx
> > +       andq    $-VEC_SIZE, %rcx
> > +
> > +       VPCMP   $0, (%rcx), %VMM0, %k0
> > +       KMOV    %k0, %RCX
> > +       SARX    %RAX, %RCX, %RAX
> > +# ifndef USE_AS_RAWMEMCHR
> > +       bsf     %RAX, %RCX
> > +       jz      L(align_more)
> > +       xor     %eax, %eax
> > +#  ifdef USE_AS_WMEMCHR
> > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > +#  else
> > +       addq    %rcx, %rdi
> > +#  endif
> > +       cmp     %rcx, %rdx
> > +       cmovae  %rdi, %rax
>
> Regardless of other concerns, this is buggy.  It needs to be
> cmova.

A testcase?

> > +
> > +# else
> > +       bsf     %rax, %rax
> > +       jz      L(align_more)
> > +       add     %rdi, %rax
> > +# endif
> > +       ret
> > +
> > +L(ret_vec_x2):
> > +       subq    $-VEC_SIZE, %rdi
> > +L(ret_vec_x1):
> > +       bsf     %RAX, %RAX
> > +# ifndef USE_AS_RAWMEMCHR
> > +       jz      L(zero)
> > +       cmp     %rax, %rdx
> > +       jbe     L(zero)
> > +# endif
> > +# ifdef USE_AS_WMEMCHR
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +       add     %rdi, %rax
> > +# endif
> > +       ret
> > +
> > +       .p2align 5,,10
> > +L(align_more):
> > +# ifndef USE_AS_RAWMEMCHR
> > +       xor     %eax, %eax
> > +       subq    %rdi, %rax
> > +# endif
> > +
> > +       subq    $-VEC_SIZE, %rdi
> > +       /* Align rdi to VEC_SIZE.  */
> > +       andq    $-VEC_SIZE, %rdi
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +       addq    %rdi, %rax
> > +#  ifdef USE_AS_WMEMCHR
> > +       sarl    $2, %eax
> > +#  endif
> > +       subq    %rax, %rdx
> > +       jbe     L(zero)
> > +# endif
> > +
> > +	/* Loop unrolled 4 times for the 4-vector loop.  */
> > +       VPCMP   $0, (%rdi), %VMM0, %k0
> > +
> > +       KMOV    %k0, %RAX
> > +       test    %RAX, %RAX
> > +       jnz     L(ret_vec_x1)
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(zero)
> > +# endif
> > +
> > +       VPCMP   $0, VEC_SIZE(%rdi), %VMM0, %k0
> > +
> > +       KMOV    %k0, %RAX
> > +       test    %RAX, %RAX
> > +       jnz     L(ret_vec_x2)
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(zero)
> > +# endif
> > +
> > +       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %VMM0, %k0
> > +
> > +       KMOV    %k0, %RAX
> > +       test    %RAX, %RAX
> > +       jnz     L(ret_vec_x3)
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(zero)
> > +# endif
> > +
> > +       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %VMM0, %k0
> > +
> > +       KMOV    %k0, %RAX
> > +       test    %RAX, %RAX
> > +       jnz     L(ret_vec_x4)
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(zero)
> > +       /* Save pointer to find alignment adjustment.  */
> > +       movq    %rdi, %rax
> > +# endif
> > +       /* Align address to VEC_SIZE * 4 for loop.  */
> > +       andq    $-(VEC_SIZE * 4), %rdi
> > +
> > +       /* Add alignment difference to rdx.  */
> > +# ifndef USE_AS_RAWMEMCHR
> > +       subq    %rdi, %rax
> > +#  ifdef USE_AS_WMEMCHR
> > +       SHR     $2, %RAX
> > +#  endif
> > +       addq    %rax, %rdx
> > +       jmp     L(loop_entry)
> > +# endif
> > +
> > +       /* 4 vector loop.  */
> > +       .p2align 5,,11
> > +L(loop):
> > +# ifndef USE_AS_RAWMEMCHR
> > +       subq    $(CHAR_PER_VEC * 4), %rdx
> > +       jbe     L(zero)
> > +L(loop_entry):
> > +# endif
> > +       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %VMM0, %k1
> > +       VPCMP   $0, (VEC_SIZE * 5)(%rdi), %VMM0, %k2
> > +       VPCMP   $0, (VEC_SIZE * 6)(%rdi), %VMM0, %k3
> > +       VPCMP   $0, (VEC_SIZE * 7)(%rdi), %VMM0, %k4
> > +       KOR     %k1, %k2, %k5
> > +       KOR     %k3, %k4, %k6
> > +
> > +       subq    $-(VEC_SIZE * 4), %rdi
> > +       KORTEST %k5, %k6
> > +       jz      L(loop)
> > +
> > +       KMOV    %k1, %RAX
> > +       test    %RAX, %RAX
> > +       jnz     L(ret_vec_x1)
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(zero)
> > +# endif
> > +
> > +       KMOV    %k2, %RAX
> > +       test    %RAX, %RAX
> > +       jnz     L(ret_vec_x2)
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(zero)
> > +# endif
> > +
> > +       KMOV    %k3, %RAX
> > +       test    %RAX, %RAX
> > +       jnz     L(ret_vec_x3)
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(zero)
> > +# endif
> > +
> > +	/* At this point the matching [w]char must be in the fourth
> > +	   vector, so no need to check.  */
> > +       KMOV    %k4, %RAX
> > +
> > +L(ret_vec_x4):
> > +       bsf     %RAX, %RAX
> > +# ifndef USE_AS_RAWMEMCHR
> > +       cmp     %rax, %rdx
> > +       jbe     L(zero)
> > +# endif
> > +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> > +       .p2align 5,,5
> > +L(ret_vec_x3):
> > +       bsf     %RAX, %RAX
> > +# ifndef USE_AS_RAWMEMCHR
> > +       cmp     %rax, %rdx
> > +       jbe     L(zero)
> > +# endif
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> > +END (MEMCHR)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S
> > new file mode 100644
> > index 0000000000..47349d817a
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S
> > @@ -0,0 +1,7 @@
> > +# ifndef MEMCHR
> > +#  define MEMCHR       __memchr_evex512
> > +# endif
> > +
> > +#define VEC_SIZE        64
> > +
> > +#include "memchr-evex-base.S"
> > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S
> > new file mode 100644
> > index 0000000000..302d3cb055
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S
> > @@ -0,0 +1,7 @@
> > +#ifndef RAWMEMCHR
> > +# define RAWMEMCHR     __rawmemchr_evex512
> > +#endif
> > +#define USE_AS_RAWMEMCHR       1
> > +#define MEMCHR RAWMEMCHR
> > +
> > +#include "memchr-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S
> > new file mode 100644
> > index 0000000000..f45ed1db75
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S
> > @@ -0,0 +1,8 @@
> > +#ifndef WMEMCHR
> > +# define WMEMCHR       __wmemchr_evex512
> > +#endif
> > +
> > +#define MEMCHR WMEMCHR
> > +#define USE_AS_WMEMCHR 1
> > +
> > +#include "memchr-evex512.S"
> > --
> > 2.36.1
> >
  
Noah Goldstein Oct. 3, 2022, 7:12 p.m. UTC | #7
On Mon, Oct 3, 2022 at 12:01 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Oct 3, 2022 at 11:33 AM Noah Goldstein via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > On Wed, Sep 21, 2022 at 5:27 PM Sunil K Pandey via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > This patch implements following evex512 version of string functions.
> > > evex512 version takes up to 30% less cycle as compared to evex,
> > > depending on length and alignment.
> > >
> > > - memchr function using 512 bit vectors.
> > > - rawmemchr function using 512 bit vectors.
> > > - wmemchr function using 512 bit vectors.
> > >
> > > Code size data:
> > >
> > > memchr-evex.o           762 byte
> > > memchr-evex512.o        570 byte (-25%)
> > >
> > > rawmemchr-evex.o        461 byte
> > > rawmemchr-evex512.o     413 byte (-10%)
> > >
> > > wmemchr-evex.o          794 byte
> > > wmemchr-evex512.o       568 byte (-28%)
> > >
> > > Placeholder function, not used by any processor at the moment.
> > > ---
> > >  sysdeps/x86_64/multiarch/Makefile            |   3 +
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  15 +
> > >  sysdeps/x86_64/multiarch/memchr-evex-base.S  | 306 +++++++++++++++++++
> > >  sysdeps/x86_64/multiarch/memchr-evex512.S    |   7 +
> > >  sysdeps/x86_64/multiarch/rawmemchr-evex512.S |   7 +
> > >  sysdeps/x86_64/multiarch/wmemchr-evex512.S   |   8 +
> > >  6 files changed, 346 insertions(+)
> > >  create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-base.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/memchr-evex512.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex512.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex512.S
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > index df4601c294..e974b1ad97 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -4,6 +4,7 @@ sysdep_routines += \
> > >    memchr-avx2 \
> > >    memchr-avx2-rtm \
> > >    memchr-evex \
> > > +  memchr-evex512 \
> > >    memchr-evex-rtm \
> > >    memchr-sse2 \
> > >    memcmp-avx2-movbe \
> > > @@ -36,6 +37,7 @@ sysdep_routines += \
> > >    rawmemchr-avx2 \
> > >    rawmemchr-avx2-rtm \
> > >    rawmemchr-evex \
> > > +  rawmemchr-evex512 \
> > >    rawmemchr-evex-rtm \
> > >    rawmemchr-sse2 \
> > >    stpcpy-avx2 \
> > > @@ -156,6 +158,7 @@ sysdep_routines += \
> > >    wmemchr-avx2 \
> > >    wmemchr-avx2-rtm \
> > >    wmemchr-evex \
> > > +  wmemchr-evex512 \
> > >    wmemchr-evex-rtm \
> > >    wmemchr-sse2 \
> > >    wmemcmp-avx2-movbe \
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index a71444eccb..17f770318d 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -63,6 +63,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > >                                      __memchr_evex)
> > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
> > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > > +                                    __memchr_evex512)
> > >               X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
> > >                                      (CPU_FEATURE_USABLE (AVX512VL)
> > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > > @@ -329,6 +334,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > >                                      __rawmemchr_evex)
> > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
> > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > > +                                    __rawmemchr_evex512)
> > >               X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
> > >                                      (CPU_FEATURE_USABLE (AVX512VL)
> > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > > @@ -903,6 +913,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > >                                      __wmemchr_evex)
> > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
> > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > > +                                    __wmemchr_evex512)
> > >               X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
> > >                                      (CPU_FEATURE_USABLE (AVX512VL)
> > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S
> > > new file mode 100644
> > > index 0000000000..524f0809b5
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S
> > > @@ -0,0 +1,306 @@
> > > +/* Placeholder function, not used by any processor at the moment.
> > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +/* UNUSED.  Exists purely as a reference implementation.  */
> > > +
> > > +#include <isa-level.h>
> > > +
> > > +#if ISA_SHOULD_BUILD (4)
> > > +
> > > +# include <sysdep.h>
> > > +
> > > +# ifdef USE_AS_WMEMCHR
> > > +#  define CHAR_SIZE    4
> > > +#  define VPBROADCAST   vpbroadcastd
> > > +#  define VPCMP                vpcmpd
> > > +# else
> > > +#  define CHAR_SIZE    1
> > > +#  define VPBROADCAST   vpbroadcastb
> > > +#  define VPCMP                vpcmpb
> > > +# endif
> > > +
> > > +# define PAGE_SIZE     4096
> > > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > > +# define XMM1           xmm17
> > > +
> > > +# if VEC_SIZE == 64
> > > +#  define KMOV         kmovq
> > > +#  define KOR          korq
> > > +#  define KORTEST      kortestq
> > > +#  define RAX          rax
> > > +#  define RCX          rcx
> > > +#  define SHR          shrq
> > > +#  define SARX         sarxq
> > > +#  define TEXTSUFFIX   evex512
> > > +#  define VMM0         zmm16
> > > +# elif VEC_SIZE == 32
> > > +/* Currently Unused.  */
> > > +#  define KMOV         kmovd
> > > +#  define KOR          kord
> > > +#  define KORTEST      kortestd
> > > +#  define RAX          eax
> > > +#  define RCX          ecx
> > > +#  define SHR          shrl
> > > +#  define SARX         sarxl
> > > +#  define TEXTSUFFIX   evex256
> > > +#  define VMM0         ymm16
> > > +# endif
> > > +
> > > +       .section .text.TEXTSUFFIX, "ax", @progbits
> > > +/* Aligning the entry point to 64 bytes provides better performance
> > > +   for strings of up to one vector length.  */
> > > +ENTRY_P2ALIGN (MEMCHR, 6)
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       /* Check for zero length.  */
> > > +       test    %RDX_LP, %RDX_LP
> > > +       jz      L(zero)
> > > +
> > > +#  ifdef __ILP32__
> > > +       /* Clear the upper 32 bits.  */
> > > +       movl    %edx, %edx
> > > +#  endif
> > > +# endif
> > > +
> > > +       /* Broadcast CHAR to VMM0.  */
> > > +       VPBROADCAST %esi, %VMM0
> > > +       movl    %edi, %eax
> > > +       andl    $(PAGE_SIZE - 1), %eax
> > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > +       ja      L(page_cross)
> > > +
> > > +	/* Compare each [w]char with CHAR; the mask bit is set on a match.  */
> > > +       VPCMP   $0, (%rdi), %VMM0, %k0
> > > +
> > > +       KMOV    %k0, %RAX
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       bsf     %RAX, %RCX
> > > +       jz      L(align_more)
> > > +       xor     %eax, %eax
> > > +#  ifdef USE_AS_WMEMCHR
> > > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > +#  else
> > > +       addq    %rcx, %rdi
> > > +#  endif
> > > +       cmp     %rcx, %rdx
> > > +       cmova   %rdi, %rax
> > > +# else
> > > +       bsf     %RAX, %RAX
> > > +       jz      L(align_more)
> > > +       add     %rdi, %rax
> > > +# endif
> > > +       ret
> > > +
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +L(zero):
> > > +       xorl    %eax, %eax
> > > +       ret
> > > +# endif
> > > +
> > > +       .p2align 5,,5
> > > +L(page_cross):
> > > +       movq    %rdi, %rcx
> > > +       andq    $-VEC_SIZE, %rcx
> > > +
> > > +       VPCMP   $0, (%rcx), %VMM0, %k0
> > > +       KMOV    %k0, %RCX
> > > +       SARX    %RAX, %RCX, %RAX
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       bsf     %RAX, %RCX
> > > +       jz      L(align_more)
> > > +       xor     %eax, %eax
> > > +#  ifdef USE_AS_WMEMCHR
> > > +       leaq    (%rdi, %rcx, CHAR_SIZE), %rdi
> > > +#  else
> > > +       addq    %rcx, %rdi
> > > +#  endif
> > > +       cmp     %rcx, %rdx
> > > +       cmovae  %rdi, %rax
> >
> > Regardless of other concerns, this is buggy.  It needs to be
> > cmova.
>
> A testcase?

alignment % PAGE_SIZE == 4094
match at position = 1
len = 1.
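
For reference, an untested C sketch of such a reproducer.  Note the
evex512 variant is a placeholder the ifunc never selects, so this only
exercises the bug if __memchr_evex512 is the implementation actually
dispatched (e.g. by pointing glibc's test-memchr machinery at it):

/* Sketch: p % PAGE_SIZE == 4094 forces the page-cross path, the match
   sits at index 1, and len == 1 puts it out of range, so memchr must
   return NULL; with cmovae it wrongly returns p + 1.  */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

int
main (void)
{
  char *page;
  if (posix_memalign ((void **) &page, PAGE_SIZE, 2 * PAGE_SIZE) != 0)
    return 1;
  memset (page, 'a', 2 * PAGE_SIZE);

  char *p = page + PAGE_SIZE - 2;	/* p % PAGE_SIZE == 4094.  */
  p[1] = 'x';				/* Match one past the range.  */

  assert (memchr (p, 'x', 1) == NULL);	/* Fails with cmovae.  */
  assert (memchr (p, 'x', 2) == p + 1);	/* Sanity: in-range match.  */

  free (page);
  return 0;
}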
>
> > > +
> > > +# else
> > > +       bsf     %rax, %rax
> > > +       jz      L(align_more)
> > > +       add     %rdi, %rax
> > > +# endif
> > > +       ret
> > > +
> > > +L(ret_vec_x2):
> > > +       subq    $-VEC_SIZE, %rdi
> > > +L(ret_vec_x1):
> > > +       bsf     %RAX, %RAX
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       jz      L(zero)
> > > +       cmp     %rax, %rdx
> > > +       jbe     L(zero)
> > > +# endif
> > > +# ifdef USE_AS_WMEMCHR
> > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > +# else
> > > +       add     %rdi, %rax
> > > +# endif
> > > +       ret
> > > +
> > > +       .p2align 5,,10
> > > +L(align_more):
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       xor     %eax, %eax
> > > +       subq    %rdi, %rax
> > > +# endif
> > > +
> > > +       subq    $-VEC_SIZE, %rdi
> > > +       /* Align rdi to VEC_SIZE.  */
> > > +       andq    $-VEC_SIZE, %rdi
> > > +
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       addq    %rdi, %rax
> > > +#  ifdef USE_AS_WMEMCHR
> > > +       sarl    $2, %eax
> > > +#  endif
> > > +       subq    %rax, %rdx
> > > +       jbe     L(zero)
> > > +# endif
> > > +
> > > +	/* Loop unrolled 4 times for the 4-vector loop.  */
> > > +       VPCMP   $0, (%rdi), %VMM0, %k0
> > > +
> > > +       KMOV    %k0, %RAX
> > > +       test    %RAX, %RAX
> > > +       jnz     L(ret_vec_x1)
> > > +
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(zero)
> > > +# endif
> > > +
> > > +       VPCMP   $0, VEC_SIZE(%rdi), %VMM0, %k0
> > > +
> > > +       KMOV    %k0, %RAX
> > > +       test    %RAX, %RAX
> > > +       jnz     L(ret_vec_x2)
> > > +
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(zero)
> > > +# endif
> > > +
> > > +       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %VMM0, %k0
> > > +
> > > +       KMOV    %k0, %RAX
> > > +       test    %RAX, %RAX
> > > +       jnz     L(ret_vec_x3)
> > > +
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(zero)
> > > +# endif
> > > +
> > > +       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %VMM0, %k0
> > > +
> > > +       KMOV    %k0, %RAX
> > > +       test    %RAX, %RAX
> > > +       jnz     L(ret_vec_x4)
> > > +
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(zero)
> > > +       /* Save pointer to find alignment adjustment.  */
> > > +       movq    %rdi, %rax
> > > +# endif
> > > +       /* Align address to VEC_SIZE * 4 for loop.  */
> > > +       andq    $-(VEC_SIZE * 4), %rdi
> > > +
> > > +       /* Add alignment difference to rdx.  */
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       subq    %rdi, %rax
> > > +#  ifdef USE_AS_WMEMCHR
> > > +       SHR     $2, %RAX
> > > +#  endif
> > > +       addq    %rax, %rdx
> > > +       jmp     L(loop_entry)
> > > +# endif
> > > +
> > > +       /* 4 vector loop.  */
> > > +       .p2align 5,,11
> > > +L(loop):
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       subq    $(CHAR_PER_VEC * 4), %rdx
> > > +       jbe     L(zero)
> > > +L(loop_entry):
> > > +# endif
> > > +       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %VMM0, %k1
> > > +       VPCMP   $0, (VEC_SIZE * 5)(%rdi), %VMM0, %k2
> > > +       VPCMP   $0, (VEC_SIZE * 6)(%rdi), %VMM0, %k3
> > > +       VPCMP   $0, (VEC_SIZE * 7)(%rdi), %VMM0, %k4
> > > +       KOR     %k1, %k2, %k5
> > > +       KOR     %k3, %k4, %k6
> > > +
> > > +       subq    $-(VEC_SIZE * 4), %rdi
> > > +       KORTEST %k5, %k6
> > > +       jz      L(loop)
> > > +
> > > +       KMOV    %k1, %RAX
> > > +       test    %RAX, %RAX
> > > +       jnz     L(ret_vec_x1)
> > > +
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(zero)
> > > +# endif
> > > +
> > > +       KMOV    %k2, %RAX
> > > +       test    %RAX, %RAX
> > > +       jnz     L(ret_vec_x2)
> > > +
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(zero)
> > > +# endif
> > > +
> > > +       KMOV    %k3, %RAX
> > > +       test    %RAX, %RAX
> > > +       jnz     L(ret_vec_x3)
> > > +
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(zero)
> > > +# endif
> > > +
> > > +	/* At this point the matching [w]char must be in the fourth
> > > +	   vector, so no need to check.  */
> > > +       KMOV    %k4, %RAX
> > > +
> > > +L(ret_vec_x4):
> > > +       bsf     %RAX, %RAX
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       cmp     %rax, %rdx
> > > +       jbe     L(zero)
> > > +# endif
> > > +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > > +       ret
> > > +
> > > +       .p2align 5,,5
> > > +L(ret_vec_x3):
> > > +       bsf     %RAX, %RAX
> > > +# ifndef USE_AS_RAWMEMCHR
> > > +       cmp     %rax, %rdx
> > > +       jbe     L(zero)
> > > +# endif
> > > +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > > +       ret
> > > +
> > > +END (MEMCHR)
> > > +#endif
> > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S
> > > new file mode 100644
> > > index 0000000000..47349d817a
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/memchr-evex512.S
> > > @@ -0,0 +1,7 @@
> > > +# ifndef MEMCHR
> > > +#  define MEMCHR       __memchr_evex512
> > > +# endif
> > > +
> > > +#define VEC_SIZE        64
> > > +
> > > +#include "memchr-evex-base.S"
> > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S
> > > new file mode 100644
> > > index 0000000000..302d3cb055
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S
> > > @@ -0,0 +1,7 @@
> > > +#ifndef RAWMEMCHR
> > > +# define RAWMEMCHR     __rawmemchr_evex512
> > > +#endif
> > > +#define USE_AS_RAWMEMCHR       1
> > > +#define MEMCHR RAWMEMCHR
> > > +
> > > +#include "memchr-evex512.S"
> > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S
> > > new file mode 100644
> > > index 0000000000..f45ed1db75
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S
> > > @@ -0,0 +1,8 @@
> > > +#ifndef WMEMCHR
> > > +# define WMEMCHR       __wmemchr_evex512
> > > +#endif
> > > +
> > > +#define MEMCHR WMEMCHR
> > > +#define USE_AS_WMEMCHR 1
> > > +
> > > +#include "memchr-evex512.S"
> > > --
> > > 2.36.1
> > >
>
>
>
> --
> H.J.
  

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index df4601c294..e974b1ad97 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -4,6 +4,7 @@  sysdep_routines += \
   memchr-avx2 \
   memchr-avx2-rtm \
   memchr-evex \
+  memchr-evex512 \
   memchr-evex-rtm \
   memchr-sse2 \
   memcmp-avx2-movbe \
@@ -36,6 +37,7 @@  sysdep_routines += \
   rawmemchr-avx2 \
   rawmemchr-avx2-rtm \
   rawmemchr-evex \
+  rawmemchr-evex512 \
   rawmemchr-evex-rtm \
   rawmemchr-sse2 \
   stpcpy-avx2 \
@@ -156,6 +158,7 @@  sysdep_routines += \
   wmemchr-avx2 \
   wmemchr-avx2-rtm \
   wmemchr-evex \
+  wmemchr-evex512 \
   wmemchr-evex-rtm \
   wmemchr-sse2 \
   wmemcmp-avx2-movbe \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a71444eccb..17f770318d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -63,6 +63,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __memchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __memchr_evex512)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, memchr,
 				     (CPU_FEATURE_USABLE (AVX512VL)
 				      && CPU_FEATURE_USABLE (AVX512BW)
@@ -329,6 +334,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __rawmemchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __rawmemchr_evex512)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr,
 				     (CPU_FEATURE_USABLE (AVX512VL)
 				      && CPU_FEATURE_USABLE (AVX512BW)
@@ -903,6 +913,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wmemchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wmemchr_evex512)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr,
 				     (CPU_FEATURE_USABLE (AVX512VL)
 				      && CPU_FEATURE_USABLE (AVX512BW)
diff --git a/sysdeps/x86_64/multiarch/memchr-evex-base.S b/sysdeps/x86_64/multiarch/memchr-evex-base.S
new file mode 100644
index 0000000000..524f0809b5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex-base.S
@@ -0,0 +1,306 @@ 
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* UNUSED.  Exists purely as a reference implementation.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WMEMCHR
+#  define CHAR_SIZE	4
+#  define VPBROADCAST   vpbroadcastd
+#  define VPCMP		vpcmpd
+# else
+#  define CHAR_SIZE	1
+#  define VPBROADCAST   vpbroadcastb
+#  define VPCMP		vpcmpb
+# endif
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+# define XMM1           xmm17
+
+# if VEC_SIZE == 64
+#  define KMOV		kmovq
+#  define KOR		korq
+#  define KORTEST	kortestq
+#  define RAX		rax
+#  define RCX		rcx
+#  define SHR		shrq
+#  define SARX		sarxq
+#  define TEXTSUFFIX	evex512
+#  define VMM0		zmm16
+# elif VEC_SIZE == 32
+/* Currently Unused.  */
+#  define KMOV		kmovd
+#  define KOR		kord
+#  define KORTEST	kortestd
+#  define RAX		eax
+#  define RCX		ecx
+#  define SHR		shrl
+#  define SARX		sarxl
+#  define TEXTSUFFIX	evex256
+#  define VMM0		ymm16
+# endif
+
+	.section .text.TEXTSUFFIX, "ax", @progbits
+/* Aligning the entry point to 64 bytes provides better performance
+   for strings of up to one vector length.  */
+ENTRY_P2ALIGN (MEMCHR, 6)
+# ifndef USE_AS_RAWMEMCHR
+	/* Check for zero length.  */
+	test	%RDX_LP, %RDX_LP
+	jz	L(zero)
+
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#  endif
+# endif
+
+	/* Broadcast CHAR to VMM0.  */
+	VPBROADCAST %esi, %VMM0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+	/* Compare each [w]char with CHAR; the mask bit is set on a match.  */
+	VPCMP	$0, (%rdi), %VMM0, %k0
+
+	KMOV	%k0, %RAX
+# ifndef USE_AS_RAWMEMCHR
+	bsf	%RAX, %RCX
+	jz	L(align_more)
+	xor	%eax, %eax
+#  ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+	cmp	%rcx, %rdx
+	cmova	%rdi, %rax
+# else
+	bsf     %RAX, %RAX
+	jz	L(align_more)
+	add	%rdi, %rax
+# endif
+	ret
+
+# ifndef USE_AS_RAWMEMCHR
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+	.p2align 5,,5
+L(page_cross):
+	movq	%rdi, %rcx
+	andq	$-VEC_SIZE, %rcx
+
+	VPCMP   $0, (%rcx), %VMM0, %k0
+	KMOV    %k0, %RCX
+	SARX	%RAX, %RCX, %RAX
+# ifndef USE_AS_RAWMEMCHR
+	bsf	%RAX, %RCX
+	jz	L(align_more)
+	xor	%eax, %eax
+#  ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+	cmp	%rcx, %rdx
+	cmovae	%rdi, %rax
+
+# else
+	bsf	%rax, %rax
+	jz	L(align_more)
+	add	%rdi, %rax
+# endif
+	ret
+
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rdi
+L(ret_vec_x1):
+	bsf     %RAX, %RAX
+# ifndef USE_AS_RAWMEMCHR
+	jz	L(zero)
+	cmp	%rax, %rdx
+	jbe	L(zero)
+# endif
+# ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	add	%rdi, %rax
+# endif
+	ret
+
+	.p2align 5,,10
+L(align_more):
+# ifndef USE_AS_RAWMEMCHR
+	xor	%eax, %eax
+	subq    %rdi, %rax
+# endif
+
+	subq	$-VEC_SIZE, %rdi
+	/* Align rdi to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+	addq	%rdi, %rax
+#  ifdef USE_AS_WMEMCHR
+	sarl	$2, %eax
+#  endif
+	subq	%rax, %rdx
+	jbe	L(zero)
+# endif
+
+	/* Loop unrolled 4 times for the 4-vector loop.  */
+	VPCMP	$0, (%rdi), %VMM0, %k0
+
+	KMOV	%k0, %RAX
+	test	%RAX, %RAX
+	jnz	L(ret_vec_x1)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero)
+# endif
+
+	VPCMP	$0, VEC_SIZE(%rdi), %VMM0, %k0
+
+	KMOV	%k0, %RAX
+	test	%RAX, %RAX
+	jnz	L(ret_vec_x2)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero)
+# endif
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %VMM0, %k0
+
+	KMOV	%k0, %RAX
+	test	%RAX, %RAX
+	jnz	L(ret_vec_x3)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero)
+# endif
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %VMM0, %k0
+
+	KMOV	%k0, %RAX
+	test	%RAX, %RAX
+	jnz	L(ret_vec_x4)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero)
+	/* Save pointer to find alignment adjustment.  */
+	movq	%rdi, %rax
+# endif
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rdi
+
+	/* Add alignment difference to rdx.  */
+# ifndef USE_AS_RAWMEMCHR
+	subq	%rdi, %rax
+#  ifdef USE_AS_WMEMCHR
+	SHR	$2, %RAX
+#  endif
+	addq	%rax, %rdx
+	jmp	L(loop_entry)
+# endif
+
+	/* 4 vector loop.  */
+	.p2align 5,,11
+L(loop):
+# ifndef USE_AS_RAWMEMCHR
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(zero)
+L(loop_entry):
+# endif
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %VMM0, %k1
+	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %VMM0, %k2
+	VPCMP	$0, (VEC_SIZE * 6)(%rdi), %VMM0, %k3
+	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %VMM0, %k4
+	KOR	%k1, %k2, %k5
+	KOR	%k3, %k4, %k6
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	KORTEST	%k5, %k6
+	jz	L(loop)
+
+	KMOV	%k1, %RAX
+	test	%RAX, %RAX
+	jnz	L(ret_vec_x1)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero)
+# endif
+
+	KMOV	%k2, %RAX
+	test	%RAX, %RAX
+	jnz	L(ret_vec_x2)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero)
+# endif
+
+	KMOV	%k3, %RAX
+	test	%RAX, %RAX
+	jnz	L(ret_vec_x3)
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(zero)
+# endif
+
+	/* At this point the matching [w]char must be in the fourth
+	   vector, so no need to check.  */
+	KMOV	%k4, %RAX
+
+L(ret_vec_x4):
+	bsf	%RAX, %RAX
+# ifndef USE_AS_RAWMEMCHR
+	cmp	%rax, %rdx
+	jbe	L(zero)
+# endif
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 5,,5
+L(ret_vec_x3):
+	bsf	%RAX, %RAX
+# ifndef USE_AS_RAWMEMCHR
+	cmp	%rax, %rdx
+	jbe	L(zero)
+# endif
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+END (MEMCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memchr-evex512.S b/sysdeps/x86_64/multiarch/memchr-evex512.S
new file mode 100644
index 0000000000..47349d817a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex512.S
@@ -0,0 +1,7 @@ 
+# ifndef MEMCHR
+#  define MEMCHR       __memchr_evex512
+# endif
+
+#define VEC_SIZE        64
+
+#include "memchr-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex512.S b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S
new file mode 100644
index 0000000000..302d3cb055
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex512.S
@@ -0,0 +1,7 @@ 
+#ifndef RAWMEMCHR
+# define RAWMEMCHR	__rawmemchr_evex512
+#endif
+#define USE_AS_RAWMEMCHR	1
+#define MEMCHR	RAWMEMCHR
+
+#include "memchr-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex512.S b/sysdeps/x86_64/multiarch/wmemchr-evex512.S
new file mode 100644
index 0000000000..f45ed1db75
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex512.S
@@ -0,0 +1,8 @@ 
+#ifndef WMEMCHR
+# define WMEMCHR	__wmemchr_evex512
+#endif
+
+#define MEMCHR	WMEMCHR
+#define USE_AS_WMEMCHR	1
+
+#include "memchr-evex512.S"