[v2] x86: Remove __mmask intrinsics in strstr-avx512.c

Message ID 20220711223841.2041611-1-goldstein.w.n@gmail.com
State Committed
Commit f2698954ff9c2f9626d4bcb5a30eb5729714e0b0
Headers
Series [v2] x86: Remove __mmask intrinsics in strstr-avx512.c |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Noah Goldstein July 11, 2022, 10:38 p.m. UTC
  Using standard operators generates the same code, and the __mmask
intrinsics are not available before GCC7.

Removed:
    _cvtmask64_u64
    _kshiftri_mask64
    _kand_mask64
---
 sysdeps/x86_64/multiarch/strstr-avx512.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)
  

Comments

Noah Goldstein July 11, 2022, 10:52 p.m. UTC | #1
On Mon, Jul 11, 2022 at 3:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Using standard operators generates and the same code and __mmask
> instrinsics are not available before GCC7.
>
> Removed:
>     _cvtmask64_u64
>     _kshiftri_mask64
>     _kand_mask64
> ---
>  sysdeps/x86_64/multiarch/strstr-avx512.c | 16 ++++++++++------
>  1 file changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> index 2ab9e96db8..e44c1a05dc 100644
> --- a/sysdeps/x86_64/multiarch/strstr-avx512.c
> +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> @@ -26,6 +26,10 @@
>  #define ZMM_SIZE_IN_BYTES 64
>  #define PAGESIZE 4096
>
> +#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
> +#define kshiftri_mask64(x, y) ((x) >> (y))
> +#define kand_mask64(x, y) ((x) & (y))
> +
>  /*
>   Returns the index of the first edge within the needle, returns 0 if no edge
>   is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> @@ -133,15 +137,15 @@ __strstr_avx512 (const char *haystack, const char *ned)
>    __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
>    /* Search for NULL and compare only till null char */
>    uint64_t nullmask
> -      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> +      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
>    uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> -  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> +  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
>    /* Search for the 2 charaters of needle */
>    __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
>    __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> -  k1 = _kshiftri_mask64 (k1, 1);
> +  k1 = kshiftri_mask64 (k1, 1);
>    /* k2 masks tell us if both chars from needle match */
> -  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> +  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
>    /* For every match, search for the entire needle for a full match */
>    while (k2)
>      {
> @@ -178,13 +182,13 @@ __strstr_avx512 (const char *haystack, const char *ned)
>        hay0 = _mm512_loadu_si512 (haystack + hay_index);
>        hay1 = _mm512_load_si512 (haystack + hay_index
>                                  + 1); // Always 64 byte aligned
> -      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> +      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
>        /* Compare only till null char */
>        cmpmask = nullmask ^ (nullmask - ONE_64BIT);
>        k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
>        k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
>        /* k2 masks tell us if both chars from needle match */
> -      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> +      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
>        /* For every match, compare full strings for potential match */
>        while (k2)
>          {
> --
> 2.34.1
>
Sunil,
Does this work? (v2 didn't get chained because I changed the commit msg.)
  
Sunil Pandey July 11, 2022, 11:32 p.m. UTC | #2
On Mon, Jul 11, 2022 at 3:38 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Using standard operators generates and the same code and __mmask
> instrinsics are not available before GCC7.
>
> Removed:
>     _cvtmask64_u64
>     _kshiftri_mask64
>     _kand_mask64
> ---
>  sysdeps/x86_64/multiarch/strstr-avx512.c | 16 ++++++++++------
>  1 file changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> index 2ab9e96db8..e44c1a05dc 100644
> --- a/sysdeps/x86_64/multiarch/strstr-avx512.c
> +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> @@ -26,6 +26,10 @@
>  #define ZMM_SIZE_IN_BYTES 64
>  #define PAGESIZE 4096
>
> +#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
> +#define kshiftri_mask64(x, y) ((x) >> (y))
> +#define kand_mask64(x, y) ((x) & (y))
> +
>  /*
>   Returns the index of the first edge within the needle, returns 0 if no edge
>   is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> @@ -133,15 +137,15 @@ __strstr_avx512 (const char *haystack, const char *ned)
>    __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
>    /* Search for NULL and compare only till null char */
>    uint64_t nullmask
> -      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> +      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
>    uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> -  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> +  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
>    /* Search for the 2 charaters of needle */
>    __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
>    __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> -  k1 = _kshiftri_mask64 (k1, 1);
> +  k1 = kshiftri_mask64 (k1, 1);
>    /* k2 masks tell us if both chars from needle match */
> -  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> +  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
>    /* For every match, search for the entire needle for a full match */
>    while (k2)
>      {
> @@ -178,13 +182,13 @@ __strstr_avx512 (const char *haystack, const char *ned)
>        hay0 = _mm512_loadu_si512 (haystack + hay_index);
>        hay1 = _mm512_load_si512 (haystack + hay_index
>                                  + 1); // Always 64 byte aligned
> -      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> +      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
>        /* Compare only till null char */
>        cmpmask = nullmask ^ (nullmask - ONE_64BIT);
>        k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
>        k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
>        /* k2 masks tell us if both chars from needle match */
> -      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> +      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
>        /* For every match, compare full strings for potential match */
>        while (k2)
>          {
> --
> 2.34.1
>

LGTM

Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
  
Sunil Pandey July 11, 2022, 11:33 p.m. UTC | #3
On Mon, Jul 11, 2022 at 3:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Jul 11, 2022 at 3:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Using standard operators generates and the same code and __mmask
> > instrinsics are not available before GCC7.
> >
> > Removed:
> >     _cvtmask64_u64
> >     _kshiftri_mask64
> >     _kand_mask64
> > ---
> >  sysdeps/x86_64/multiarch/strstr-avx512.c | 16 ++++++++++------
> >  1 file changed, 10 insertions(+), 6 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > index 2ab9e96db8..e44c1a05dc 100644
> > --- a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > @@ -26,6 +26,10 @@
> >  #define ZMM_SIZE_IN_BYTES 64
> >  #define PAGESIZE 4096
> >
> > +#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
> > +#define kshiftri_mask64(x, y) ((x) >> (y))
> > +#define kand_mask64(x, y) ((x) & (y))
> > +
> >  /*
> >   Returns the index of the first edge within the needle, returns 0 if no edge
> >   is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > @@ -133,15 +137,15 @@ __strstr_avx512 (const char *haystack, const char *ned)
> >    __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> >    /* Search for NULL and compare only till null char */
> >    uint64_t nullmask
> > -      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > +      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> >    uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > -  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> > +  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
> >    /* Search for the 2 charaters of needle */
> >    __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> >    __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > -  k1 = _kshiftri_mask64 (k1, 1);
> > +  k1 = kshiftri_mask64 (k1, 1);
> >    /* k2 masks tell us if both chars from needle match */
> > -  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > +  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> >    /* For every match, search for the entire needle for a full match */
> >    while (k2)
> >      {
> > @@ -178,13 +182,13 @@ __strstr_avx512 (const char *haystack, const char *ned)
> >        hay0 = _mm512_loadu_si512 (haystack + hay_index);
> >        hay1 = _mm512_load_si512 (haystack + hay_index
> >                                  + 1); // Always 64 byte aligned
> > -      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > +      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> >        /* Compare only till null char */
> >        cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> >        k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> >        k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> >        /* k2 masks tell us if both chars from needle match */
> > -      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > +      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> >        /* For every match, compare full strings for potential match */
> >        while (k2)
> >          {
> > --
> > 2.34.1
> >
> Sunil,
> Does this work (v2 didn't get chained because I changed commit msg).

Yes, v2 works.

--Sunil
  
H.J. Lu July 12, 2022, 2:26 a.m. UTC | #4
On Mon, Jul 11, 2022 at 4:33 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Mon, Jul 11, 2022 at 3:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Jul 11, 2022 at 3:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > Using standard operators generates and the same code and __mmask
> > > instrinsics are not available before GCC7.
> > >
> > > Removed:
> > >     _cvtmask64_u64
> > >     _kshiftri_mask64
> > >     _kand_mask64
> > > ---
> > >  sysdeps/x86_64/multiarch/strstr-avx512.c | 16 ++++++++++------
> > >  1 file changed, 10 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > index 2ab9e96db8..e44c1a05dc 100644
> > > --- a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > @@ -26,6 +26,10 @@
> > >  #define ZMM_SIZE_IN_BYTES 64
> > >  #define PAGESIZE 4096
> > >
> > > +#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
> > > +#define kshiftri_mask64(x, y) ((x) >> (y))
> > > +#define kand_mask64(x, y) ((x) & (y))
> > > +
> > >  /*
> > >   Returns the index of the first edge within the needle, returns 0 if no edge
> > >   is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > > @@ -133,15 +137,15 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > >    __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > >    /* Search for NULL and compare only till null char */
> > >    uint64_t nullmask
> > > -      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > +      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > >    uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > -  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> > > +  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
> > >    /* Search for the 2 charaters of needle */
> > >    __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > >    __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > -  k1 = _kshiftri_mask64 (k1, 1);
> > > +  k1 = kshiftri_mask64 (k1, 1);
> > >    /* k2 masks tell us if both chars from needle match */
> > > -  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > +  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > >    /* For every match, search for the entire needle for a full match */
> > >    while (k2)
> > >      {
> > > @@ -178,13 +182,13 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > >        hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > >        hay1 = _mm512_load_si512 (haystack + hay_index
> > >                                  + 1); // Always 64 byte aligned
> > > -      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > +      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > >        /* Compare only till null char */
> > >        cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > >        k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > >        k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > >        /* k2 masks tell us if both chars from needle match */
> > > -      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > +      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > >        /* For every match, compare full strings for potential match */
> > >        while (k2)
> > >          {
> > > --
> > > 2.34.1
> > >
> > Sunil,
> > Does this work (v2 didn't get chained because I changed commit msg).
>
> Yes, v2 works.
>

Any performance differences?
  
Noah Goldstein July 12, 2022, 2:37 a.m. UTC | #5
On Mon, Jul 11, 2022 at 7:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jul 11, 2022 at 4:33 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > On Mon, Jul 11, 2022 at 3:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Mon, Jul 11, 2022 at 3:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > Using standard operators generates and the same code and __mmask
> > > > instrinsics are not available before GCC7.
> > > >
> > > > Removed:
> > > >     _cvtmask64_u64
> > > >     _kshiftri_mask64
> > > >     _kand_mask64
> > > > ---
> > > >  sysdeps/x86_64/multiarch/strstr-avx512.c | 16 ++++++++++------
> > > >  1 file changed, 10 insertions(+), 6 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > index 2ab9e96db8..e44c1a05dc 100644
> > > > --- a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > @@ -26,6 +26,10 @@
> > > >  #define ZMM_SIZE_IN_BYTES 64
> > > >  #define PAGESIZE 4096
> > > >
> > > > +#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
> > > > +#define kshiftri_mask64(x, y) ((x) >> (y))
> > > > +#define kand_mask64(x, y) ((x) & (y))
> > > > +
> > > >  /*
> > > >   Returns the index of the first edge within the needle, returns 0 if no edge
> > > >   is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > > > @@ -133,15 +137,15 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > > >    __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > >    /* Search for NULL and compare only till null char */
> > > >    uint64_t nullmask
> > > > -      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > > +      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > >    uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > -  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> > > > +  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
> > > >    /* Search for the 2 charaters of needle */
> > > >    __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > >    __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > > -  k1 = _kshiftri_mask64 (k1, 1);
> > > > +  k1 = kshiftri_mask64 (k1, 1);
> > > >    /* k2 masks tell us if both chars from needle match */
> > > > -  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > > +  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > > >    /* For every match, search for the entire needle for a full match */
> > > >    while (k2)
> > > >      {
> > > > @@ -178,13 +182,13 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > > >        hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > >        hay1 = _mm512_load_si512 (haystack + hay_index
> > > >                                  + 1); // Always 64 byte aligned
> > > > -      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > > +      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > >        /* Compare only till null char */
> > > >        cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > >        k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > >        k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > >        /* k2 masks tell us if both chars from needle match */
> > > > -      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > > +      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > > >        /* For every match, compare full strings for potential match */
> > > >        while (k2)
> > > >          {
> > > > --
> > > > 2.34.1
> > > >
> > > Sunil,
> > > Does this work (v2 didn't get chained because I changed commit msg).
> >
> > Yes, v2 works.
> >
>
> Any performance differences?

It should generate the same code. It's just using standard operators instead
of intrinsics.
>
>
> --
> H.J.
  
H.J. Lu July 12, 2022, 3:56 p.m. UTC | #6
On Mon, Jul 11, 2022 at 7:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Jul 11, 2022 at 7:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Jul 11, 2022 at 4:33 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > >
> > > On Mon, Jul 11, 2022 at 3:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Mon, Jul 11, 2022 at 3:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > Using standard operators generates and the same code and __mmask
> > > > > instrinsics are not available before GCC7.
> > > > >
> > > > > Removed:
> > > > >     _cvtmask64_u64
> > > > >     _kshiftri_mask64
> > > > >     _kand_mask64
> > > > > ---
> > > > >  sysdeps/x86_64/multiarch/strstr-avx512.c | 16 ++++++++++------
> > > > >  1 file changed, 10 insertions(+), 6 deletions(-)
> > > > >
> > > > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > index 2ab9e96db8..e44c1a05dc 100644
> > > > > --- a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > @@ -26,6 +26,10 @@
> > > > >  #define ZMM_SIZE_IN_BYTES 64
> > > > >  #define PAGESIZE 4096
> > > > >
> > > > > +#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
> > > > > +#define kshiftri_mask64(x, y) ((x) >> (y))
> > > > > +#define kand_mask64(x, y) ((x) & (y))
> > > > > +
> > > > >  /*
> > > > >   Returns the index of the first edge within the needle, returns 0 if no edge
> > > > >   is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > > > > @@ -133,15 +137,15 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > > > >    __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > > >    /* Search for NULL and compare only till null char */
> > > > >    uint64_t nullmask
> > > > > -      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > > > +      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > > >    uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > > -  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> > > > > +  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
> > > > >    /* Search for the 2 charaters of needle */
> > > > >    __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > >    __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > > > -  k1 = _kshiftri_mask64 (k1, 1);
> > > > > +  k1 = kshiftri_mask64 (k1, 1);
> > > > >    /* k2 masks tell us if both chars from needle match */
> > > > > -  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > > > +  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > > > >    /* For every match, search for the entire needle for a full match */
> > > > >    while (k2)
> > > > >      {
> > > > > @@ -178,13 +182,13 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > > > >        hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > > >        hay1 = _mm512_load_si512 (haystack + hay_index
> > > > >                                  + 1); // Always 64 byte aligned
> > > > > -      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > > > +      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > > >        /* Compare only till null char */
> > > > >        cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > >        k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > >        k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > > >        /* k2 masks tell us if both chars from needle match */
> > > > > -      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > > > +      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > > > >        /* For every match, compare full strings for potential match */
> > > > >        while (k2)
> > > > >          {
> > > > > --
> > > > > 2.34.1
> > > > >
> > > > Sunil,
> > > > Does this work (v2 didn't get chained because I changed commit msg).
> > >
> > > Yes, v2 works.
> > >
> >
> > Any performance differences?
>
> It should generate the same code. It's just using standard operators instead
> of intrinsics.

Has it been verified?

> >
> >
> > --
> > H.J.
  
Noah Goldstein July 12, 2022, 5:11 p.m. UTC | #7
On Tue, Jul 12, 2022 at 8:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jul 11, 2022 at 7:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Jul 11, 2022 at 7:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Jul 11, 2022 at 4:33 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > > >
> > > > On Mon, Jul 11, 2022 at 3:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jul 11, 2022 at 3:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > Using standard operators generates and the same code and __mmask
> > > > > > instrinsics are not available before GCC7.
> > > > > >
> > > > > > Removed:
> > > > > >     _cvtmask64_u64
> > > > > >     _kshiftri_mask64
> > > > > >     _kand_mask64
> > > > > > ---
> > > > > >  sysdeps/x86_64/multiarch/strstr-avx512.c | 16 ++++++++++------
> > > > > >  1 file changed, 10 insertions(+), 6 deletions(-)
> > > > > >
> > > > > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > > index 2ab9e96db8..e44c1a05dc 100644
> > > > > > --- a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > > @@ -26,6 +26,10 @@
> > > > > >  #define ZMM_SIZE_IN_BYTES 64
> > > > > >  #define PAGESIZE 4096
> > > > > >
> > > > > > +#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
> > > > > > +#define kshiftri_mask64(x, y) ((x) >> (y))
> > > > > > +#define kand_mask64(x, y) ((x) & (y))
> > > > > > +
> > > > > >  /*
> > > > > >   Returns the index of the first edge within the needle, returns 0 if no edge
> > > > > >   is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > > > > > @@ -133,15 +137,15 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > > > > >    __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > > > >    /* Search for NULL and compare only till null char */
> > > > > >    uint64_t nullmask
> > > > > > -      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > > > > +      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > > > >    uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > > > -  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> > > > > > +  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
> > > > > >    /* Search for the 2 charaters of needle */
> > > > > >    __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > > >    __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > > > > -  k1 = _kshiftri_mask64 (k1, 1);
> > > > > > +  k1 = kshiftri_mask64 (k1, 1);
> > > > > >    /* k2 masks tell us if both chars from needle match */
> > > > > > -  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > > > > +  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > > > > >    /* For every match, search for the entire needle for a full match */
> > > > > >    while (k2)
> > > > > >      {
> > > > > > @@ -178,13 +182,13 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > > > > >        hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > > > >        hay1 = _mm512_load_si512 (haystack + hay_index
> > > > > >                                  + 1); // Always 64 byte aligned
> > > > > > -      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > > > > +      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > > > >        /* Compare only till null char */
> > > > > >        cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > > >        k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > > >        k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > > > >        /* k2 masks tell us if both chars from needle match */
> > > > > > -      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > > > > +      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > > > > >        /* For every match, compare full strings for potential match */
> > > > > >        while (k2)
> > > > > >          {
> > > > > > --
> > > > > > 2.34.1
> > > > > >
> > > > > Sunil,
> > > > > Does this work (v2 didn't get chained because I changed commit msg).
> > > >
> > > > Yes, v2 works.
> > > >
> > >
> > > Any performance differences?
> >
> > It should generate the same code. It's just using standard operators instead
> > of intrinsics.
>
> Has it been verified?

You are right, there are some diffs.

I took a look at the ASM and think the new code is probably fine, but I
will run a quick benchmark.


>
> > >
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.
  
Noah Goldstein July 12, 2022, 6:40 p.m. UTC | #8
On Tue, Jul 12, 2022 at 10:11 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Tue, Jul 12, 2022 at 8:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Jul 11, 2022 at 7:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Mon, Jul 11, 2022 at 7:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Mon, Jul 11, 2022 at 4:33 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jul 11, 2022 at 3:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Jul 11, 2022 at 3:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > > >
> > > > > > > Using standard operators generates and the same code and __mmask
> > > > > > > instrinsics are not available before GCC7.
> > > > > > >
> > > > > > > Removed:
> > > > > > >     _cvtmask64_u64
> > > > > > >     _kshiftri_mask64
> > > > > > >     _kand_mask64
> > > > > > > ---
> > > > > > >  sysdeps/x86_64/multiarch/strstr-avx512.c | 16 ++++++++++------
> > > > > > >  1 file changed, 10 insertions(+), 6 deletions(-)
> > > > > > >
> > > > > > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > > > index 2ab9e96db8..e44c1a05dc 100644
> > > > > > > --- a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > > > @@ -26,6 +26,10 @@
> > > > > > >  #define ZMM_SIZE_IN_BYTES 64
> > > > > > >  #define PAGESIZE 4096
> > > > > > >
> > > > > > > +#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
> > > > > > > +#define kshiftri_mask64(x, y) ((x) >> (y))
> > > > > > > +#define kand_mask64(x, y) ((x) & (y))
> > > > > > > +
> > > > > > >  /*
> > > > > > >   Returns the index of the first edge within the needle, returns 0 if no edge
> > > > > > >   is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > > > > > > @@ -133,15 +137,15 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > > > > > >    __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > > > > >    /* Search for NULL and compare only till null char */
> > > > > > >    uint64_t nullmask
> > > > > > > -      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > > > > > +      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > > > > >    uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > > > > -  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> > > > > > > +  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
> > > > > > >    /* Search for the 2 charaters of needle */
> > > > > > >    __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > > > >    __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > > > > > -  k1 = _kshiftri_mask64 (k1, 1);
> > > > > > > +  k1 = kshiftri_mask64 (k1, 1);
> > > > > > >    /* k2 masks tell us if both chars from needle match */
> > > > > > > -  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > > > > > +  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > > > > > >    /* For every match, search for the entire needle for a full match */
> > > > > > >    while (k2)
> > > > > > >      {
> > > > > > > @@ -178,13 +182,13 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > > > > > >        hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > > > > >        hay1 = _mm512_load_si512 (haystack + hay_index
> > > > > > >                                  + 1); // Always 64 byte aligned
> > > > > > > -      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > > > > > +      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > > > > >        /* Compare only till null char */
> > > > > > >        cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > > > >        k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > > > >        k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > > > > >        /* k2 masks tell us if both chars from needle match */
> > > > > > > -      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > > > > > +      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > > > > > >        /* For every match, compare full strings for potential match */
> > > > > > >        while (k2)
> > > > > > >          {
> > > > > > > --
> > > > > > > 2.34.1
> > > > > > >
> > > > > > Sunil,
> > > > > > Does this work (v2 didn't get chained because I changed commit msg).
> > > > >
> > > > > Yes, v2 works.
> > > > >
> > > >
> > > > Any performance differences?
> > >
> > > It should generate the same code. It's just using standard operators instead
> > > of intrinsics.
> >
> > Has it been verified?
>
> You are right there are some diffs.
>
> Took a look at the ASM and think the new is probably fine, but will
> run a quick benchmark.

Did 5 runs. Think the new one is fine and maybe even gets slightly better
codegen:

N = 5 runs.
Geometric Mean of All Results (New / Old): 0.958
Results For: __strstr_avx512
len_haystack,align_haystack ,align_needle ,fail ,len_needle ,New / Old
8           ,1              ,3            ,0    ,1          ,0.834
8           ,0              ,9            ,1    ,1          ,1.07
9           ,1              ,3            ,0    ,1          ,1.031
9           ,0              ,9            ,1    ,1          ,0.943
8           ,4081           ,9            ,1    ,1          ,0.96
16          ,1              ,3            ,0    ,2          ,0.824
16          ,0              ,9            ,1    ,2          ,1.018
17          ,1              ,3            ,0    ,2          ,0.979
17          ,0              ,9            ,1    ,2          ,1.019
16          ,4081           ,9            ,1    ,2          ,1.015
24          ,1              ,3            ,0    ,3          ,0.979
24          ,0              ,9            ,1    ,3          ,0.972
25          ,1              ,3            ,0    ,3          ,0.99
25          ,0              ,9            ,1    ,3          ,0.998
24          ,4081           ,9            ,1    ,3          ,1.031
32          ,1              ,3            ,0    ,4          ,1.004
32          ,0              ,9            ,1    ,4          ,0.934
33          ,1              ,3            ,0    ,4          ,1.015
33          ,0              ,9            ,1    ,4          ,0.98
32          ,4081           ,9            ,1    ,4          ,1.019
40          ,1              ,3            ,0    ,5          ,1.016
40          ,0              ,9            ,1    ,5          ,1.017
41          ,1              ,3            ,0    ,5          ,1.059
41          ,0              ,9            ,1    ,5          ,1.011
40          ,4081           ,9            ,1    ,5          ,0.989
48          ,1              ,3            ,0    ,6          ,0.985
48          ,0              ,9            ,1    ,6          ,1.016
49          ,1              ,3            ,0    ,6          ,0.979
49          ,0              ,9            ,1    ,6          ,1.041
48          ,4081           ,9            ,1    ,6          ,0.975
56          ,1              ,3            ,0    ,7          ,1.017
56          ,0              ,9            ,1    ,7          ,1.004
57          ,1              ,3            ,0    ,7          ,1.049
57          ,0              ,9            ,1    ,7          ,0.985
56          ,4081           ,9            ,1    ,7          ,0.949
64          ,1              ,3            ,0    ,8          ,1.152
64          ,0              ,9            ,1    ,8          ,1.583
65          ,1              ,3            ,0    ,8          ,1.076
65          ,0              ,9            ,1    ,8          ,1.013
64          ,4081           ,9            ,1    ,8          ,1.038
96          ,1              ,3            ,0    ,9          ,1.007
96          ,0              ,9            ,1    ,9          ,1.001
97          ,1              ,3            ,0    ,9          ,0.973
97          ,0              ,9            ,1    ,9          ,1.021
96          ,4081           ,9            ,1    ,9          ,1.026
128         ,1              ,3            ,0    ,10         ,1.012
128         ,0              ,9            ,1    ,10         ,1.006
129         ,1              ,3            ,0    ,10         ,0.959
129         ,0              ,9            ,1    ,10         ,1.041
128         ,4081           ,9            ,1    ,10         ,0.956
160         ,1              ,3            ,0    ,11         ,0.967
160         ,0              ,9            ,1    ,11         ,1.008
161         ,1              ,3            ,0    ,11         ,0.97
161         ,0              ,9            ,1    ,11         ,0.982
160         ,4081           ,9            ,1    ,11         ,0.999
192         ,1              ,3            ,0    ,12         ,1.057
192         ,0              ,9            ,1    ,12         ,0.973
193         ,1              ,3            ,0    ,12         ,0.956
193         ,0              ,9            ,1    ,12         ,0.982
192         ,4081           ,9            ,1    ,12         ,0.955
224         ,1              ,3            ,0    ,13         ,0.979
224         ,0              ,9            ,1    ,13         ,0.995
225         ,1              ,3            ,0    ,13         ,0.966
225         ,0              ,9            ,1    ,13         ,0.973
224         ,4081           ,9            ,1    ,13         ,1.011
256         ,1              ,3            ,0    ,14         ,0.981
256         ,0              ,9            ,1    ,14         ,1.019
257         ,1              ,3            ,0    ,14         ,0.905
257         ,0              ,9            ,1    ,14         ,0.99
256         ,4081           ,9            ,1    ,14         ,0.994
288         ,1              ,3            ,0    ,15         ,0.963
288         ,0              ,9            ,1    ,15         ,0.981
289         ,1              ,3            ,0    ,15         ,0.97
289         ,0              ,9            ,1    ,15         ,0.855
288         ,4081           ,9            ,1    ,15         ,0.935
320         ,1              ,3            ,0    ,16         ,0.962
320         ,0              ,9            ,1    ,16         ,0.978
321         ,1              ,3            ,0    ,16         ,0.952
321         ,0              ,9            ,1    ,16         ,0.949
320         ,4081           ,9            ,1    ,16         ,0.983
256         ,1              ,11           ,0    ,4          ,0.967
256         ,14             ,5            ,1    ,4          ,0.997
257         ,1              ,11           ,0    ,5          ,1.008
257         ,14             ,5            ,1    ,5          ,0.97
257         ,1              ,11           ,0    ,4          ,1.001
257         ,14             ,5            ,1    ,4          ,0.985
257         ,4081           ,5            ,1    ,4          ,1.0
256         ,1              ,11           ,0    ,8          ,0.958
256         ,14             ,5            ,1    ,8          ,0.967
257         ,1              ,11           ,0    ,9          ,0.991
257         ,14             ,5            ,1    ,9          ,0.91
257         ,1              ,11           ,0    ,8          ,0.974
257         ,14             ,5            ,1    ,8          ,0.999
257         ,4081           ,5            ,1    ,8          ,0.958
256         ,1              ,11           ,0    ,16         ,0.957
256         ,14             ,5            ,1    ,16         ,1.009
257         ,1              ,11           ,0    ,17         ,0.983
257         ,14             ,5            ,1    ,17         ,0.94
257         ,1              ,11           ,0    ,16         ,0.956
257         ,14             ,5            ,1    ,16         ,0.99
257         ,4081           ,5            ,1    ,16         ,1.008
256         ,1              ,11           ,0    ,32         ,0.982
256         ,14             ,5            ,1    ,32         ,0.977
257         ,1              ,11           ,0    ,33         ,0.942
257         ,14             ,5            ,1    ,33         ,0.912
257         ,1              ,11           ,0    ,32         ,1.024
257         ,14             ,5            ,1    ,32         ,0.974
257         ,4081           ,5            ,1    ,32         ,0.953
256         ,1              ,11           ,0    ,64         ,0.955
256         ,14             ,5            ,1    ,64         ,0.938
257         ,1              ,11           ,0    ,65         ,0.916
257         ,14             ,5            ,1    ,65         ,0.942
257         ,1              ,11           ,0    ,64         ,0.94
257         ,14             ,5            ,1    ,64         ,0.952
257         ,4081           ,5            ,1    ,64         ,0.989
256         ,1              ,11           ,0    ,128        ,0.87
256         ,14             ,5            ,1    ,128        ,1.044
257         ,1              ,11           ,0    ,129        ,0.933
257         ,14             ,5            ,1    ,129        ,0.972
257         ,1              ,11           ,0    ,128        ,0.919
257         ,14             ,5            ,1    ,128        ,0.911
257         ,4081           ,5            ,1    ,128        ,0.958
256         ,1              ,11           ,0    ,256        ,1.069
256         ,14             ,5            ,1    ,256        ,0.984
257         ,1              ,11           ,0    ,257        ,1.067
257         ,14             ,5            ,1    ,257        ,0.969
257         ,1              ,11           ,0    ,256        ,1.081
257         ,14             ,5            ,1    ,256        ,0.936
257         ,4081           ,5            ,1    ,256        ,0.975
512         ,1              ,11           ,0    ,4          ,0.984
512         ,14             ,5            ,1    ,4          ,0.955
513         ,1              ,11           ,0    ,5          ,0.976
513         ,14             ,5            ,1    ,5          ,1.221
513         ,1              ,11           ,0    ,4          ,0.983
513         ,14             ,5            ,1    ,4          ,0.989
513         ,4081           ,5            ,1    ,4          ,0.962
512         ,1              ,11           ,0    ,8          ,0.949
512         ,14             ,5            ,1    ,8          ,0.957
513         ,1              ,11           ,0    ,9          ,0.959
513         ,14             ,5            ,1    ,9          ,0.972
513         ,1              ,11           ,0    ,8          ,0.977
513         ,14             ,5            ,1    ,8          ,0.951
513         ,4081           ,5            ,1    ,8          ,0.957
512         ,1              ,11           ,0    ,16         ,0.975
512         ,14             ,5            ,1    ,16         ,0.953
513         ,1              ,11           ,0    ,17         ,0.898
513         ,14             ,5            ,1    ,17         ,0.97
513         ,1              ,11           ,0    ,16         ,0.941
513         ,14             ,5            ,1    ,16         ,0.948
513         ,4081           ,5            ,1    ,16         ,0.948
512         ,1              ,11           ,0    ,32         ,0.917
512         ,14             ,5            ,1    ,32         ,0.963
513         ,1              ,11           ,0    ,33         ,0.937
513         ,14             ,5            ,1    ,33         ,0.969
513         ,1              ,11           ,0    ,32         ,0.995
513         ,14             ,5            ,1    ,32         ,1.001
513         ,4081           ,5            ,1    ,32         ,0.942
512         ,1              ,11           ,0    ,64         ,0.943
512         ,14             ,5            ,1    ,64         ,0.972
513         ,1              ,11           ,0    ,65         ,0.979
513         ,14             ,5            ,1    ,65         ,1.008
513         ,1              ,11           ,0    ,64         ,0.938
513         ,14             ,5            ,1    ,64         ,0.98
513         ,4081           ,5            ,1    ,64         ,0.988
512         ,1              ,11           ,0    ,128        ,0.91
512         ,14             ,5            ,1    ,128        ,0.964
513         ,1              ,11           ,0    ,129        ,0.918
513         ,14             ,5            ,1    ,129        ,0.958
513         ,1              ,11           ,0    ,128        ,0.931
513         ,14             ,5            ,1    ,128        ,0.957
513         ,4081           ,5            ,1    ,128        ,0.956
512         ,1              ,11           ,0    ,256        ,0.835
512         ,14             ,5            ,1    ,256        ,0.955
513         ,1              ,11           ,0    ,257        ,0.842
513         ,14             ,5            ,1    ,257        ,0.959
513         ,1              ,11           ,0    ,256        ,0.832
513         ,14             ,5            ,1    ,256        ,0.879
513         ,4081           ,5            ,1    ,256        ,0.95
1024        ,1              ,11           ,0    ,4          ,0.929
1024        ,14             ,5            ,1    ,4          ,0.965
1025        ,1              ,11           ,0    ,5          ,0.969
1025        ,14             ,5            ,1    ,5          ,0.951
1025        ,1              ,11           ,0    ,4          ,0.943
1025        ,14             ,5            ,1    ,4          ,0.973
1025        ,4081           ,5            ,1    ,4          ,0.982
1024        ,1              ,11           ,0    ,8          ,0.881
1024        ,14             ,5            ,1    ,8          ,0.924
1025        ,1              ,11           ,0    ,9          ,0.971
1025        ,14             ,5            ,1    ,9          ,0.948
1025        ,1              ,11           ,0    ,8          ,0.968
1025        ,14             ,5            ,1    ,8          ,0.967
1025        ,4081           ,5            ,1    ,8          ,0.965
1024        ,1              ,11           ,0    ,16         ,0.959
1024        ,14             ,5            ,1    ,16         ,0.957
1025        ,1              ,11           ,0    ,17         ,0.984
1025        ,14             ,5            ,1    ,17         ,0.964
1025        ,1              ,11           ,0    ,16         ,0.971
1025        ,14             ,5            ,1    ,16         ,0.977
1025        ,4081           ,5            ,1    ,16         ,0.97
1024        ,1              ,11           ,0    ,32         ,0.946
1024        ,14             ,5            ,1    ,32         ,0.967
1025        ,1              ,11           ,0    ,33         ,0.922
1025        ,14             ,5            ,1    ,33         ,0.843
1025        ,1              ,11           ,0    ,32         ,0.954
1025        ,14             ,5            ,1    ,32         ,0.98
1025        ,4081           ,5            ,1    ,32         ,0.97
1024        ,1              ,11           ,0    ,64         ,0.941
1024        ,14             ,5            ,1    ,64         ,0.958
1025        ,1              ,11           ,0    ,65         ,0.931
1025        ,14             ,5            ,1    ,65         ,0.866
1025        ,1              ,11           ,0    ,64         ,0.944
1025        ,14             ,5            ,1    ,64         ,0.915
1025        ,4081           ,5            ,1    ,64         ,0.956
1024        ,1              ,11           ,0    ,128        ,0.827
1024        ,14             ,5            ,1    ,128        ,0.938
1025        ,1              ,11           ,0    ,129        ,0.921
1025        ,14             ,5            ,1    ,129        ,0.95
1025        ,1              ,11           ,0    ,128        ,0.912
1025        ,14             ,5            ,1    ,128        ,1.005
1025        ,4081           ,5            ,1    ,128        ,0.965
1024        ,1              ,11           ,0    ,256        ,0.835
1024        ,14             ,5            ,1    ,256        ,0.967
1025        ,1              ,11           ,0    ,257        ,0.85
1025        ,14             ,5            ,1    ,257        ,0.96
1025        ,1              ,11           ,0    ,256        ,0.83
1025        ,14             ,5            ,1    ,256        ,0.948
1025        ,4081           ,5            ,1    ,256        ,0.946
2048        ,1              ,11           ,0    ,4          ,0.933
2048        ,14             ,5            ,1    ,4          ,0.913
2049        ,1              ,11           ,0    ,5          ,0.962
2049        ,14             ,5            ,1    ,5          ,0.961
2049        ,1              ,11           ,0    ,4          ,0.866
2049        ,14             ,5            ,1    ,4          ,1.05
2049        ,4081           ,5            ,1    ,4          ,0.995
2048        ,1              ,11           ,0    ,8          ,0.929
2048        ,14             ,5            ,1    ,8          ,0.976
2049        ,1              ,11           ,0    ,9          ,0.899
2049        ,14             ,5            ,1    ,9          ,0.995
2049        ,1              ,11           ,0    ,8          ,0.965
2049        ,14             ,5            ,1    ,8          ,1.03
2049        ,4081           ,5            ,1    ,8          ,0.958
2048        ,1              ,11           ,0    ,16         ,0.858
2048        ,14             ,5            ,1    ,16         ,0.963
2049        ,1              ,11           ,0    ,17         ,0.97
2049        ,14             ,5            ,1    ,17         ,0.875
2049        ,1              ,11           ,0    ,16         ,0.862
2049        ,14             ,5            ,1    ,16         ,0.97
2049        ,4081           ,5            ,1    ,16         ,0.937
2048        ,1              ,11           ,0    ,32         ,0.919
2048        ,14             ,5            ,1    ,32         ,0.875
2049        ,1              ,11           ,0    ,33         ,0.955
2049        ,14             ,5            ,1    ,33         ,0.96
2049        ,1              ,11           ,0    ,32         ,0.937
2049        ,14             ,5            ,1    ,32         ,1.028
2049        ,4081           ,5            ,1    ,32         ,0.94
2048        ,1              ,11           ,0    ,64         ,0.966
2048        ,14             ,5            ,1    ,64         ,0.944
2049        ,1              ,11           ,0    ,65         ,0.948
2049        ,14             ,5            ,1    ,65         ,0.947
2049        ,1              ,11           ,0    ,64         ,1.008
2049        ,14             ,5            ,1    ,64         ,0.98
2049        ,4081           ,5            ,1    ,64         ,0.974
2048        ,1              ,11           ,0    ,128        ,0.97
2048        ,14             ,5            ,1    ,128        ,0.947
2049        ,1              ,11           ,0    ,129        ,0.95
2049        ,14             ,5            ,1    ,129        ,0.939
2049        ,1              ,11           ,0    ,128        ,0.852
2049        ,14             ,5            ,1    ,128        ,0.742
2049        ,4081           ,5            ,1    ,128        ,0.674
2048        ,1              ,11           ,0    ,256        ,0.86
2048        ,14             ,5            ,1    ,256        ,0.89
2049        ,1              ,11           ,0    ,257        ,0.76
2049        ,14             ,5            ,1    ,257        ,0.886
2049        ,1              ,11           ,0    ,256        ,0.876
2049        ,14             ,5            ,1    ,256        ,0.915
2049        ,4081           ,5            ,1    ,256        ,0.942
4096        ,1              ,11           ,0    ,4          ,0.926
4096        ,14             ,5            ,1    ,4          ,0.927
4097        ,1              ,11           ,0    ,5          ,0.965
4097        ,14             ,5            ,1    ,5          ,0.978
4097        ,1              ,11           ,0    ,4          ,0.973
4097        ,14             ,5            ,1    ,4          ,0.997
4097        ,4081           ,5            ,1    ,4          ,0.982
4096        ,1              ,11           ,0    ,8          ,0.948
4096        ,14             ,5            ,1    ,8          ,0.937
4097        ,1              ,11           ,0    ,9          ,0.949
4097        ,14             ,5            ,1    ,9          ,0.964
4097        ,1              ,11           ,0    ,8          ,0.998
4097        ,14             ,5            ,1    ,8          ,0.98
4097        ,4081           ,5            ,1    ,8          ,0.972
4096        ,1              ,11           ,0    ,16         ,0.95
4096        ,14             ,5            ,1    ,16         ,0.954
4097        ,1              ,11           ,0    ,17         ,0.978
4097        ,14             ,5            ,1    ,17         ,0.964
4097        ,1              ,11           ,0    ,16         ,0.962
4097        ,14             ,5            ,1    ,16         ,0.952
4097        ,4081           ,5            ,1    ,16         ,0.944
4096        ,1              ,11           ,0    ,32         ,0.965
4096        ,14             ,5            ,1    ,32         ,0.971
4097        ,1              ,11           ,0    ,33         ,0.996
4097        ,14             ,5            ,1    ,33         ,0.976
4097        ,1              ,11           ,0    ,32         ,1.029
4097        ,14             ,5            ,1    ,32         ,1.255
4097        ,4081           ,5            ,1    ,32         ,0.992
4096        ,1              ,11           ,0    ,64         ,1.0
4096        ,14             ,5            ,1    ,64         ,0.98
4097        ,1              ,11           ,0    ,65         ,0.996
4097        ,14             ,5            ,1    ,65         ,0.985
4097        ,1              ,11           ,0    ,64         ,0.98
4097        ,14             ,5            ,1    ,64         ,0.966
4097        ,4081           ,5            ,1    ,64         ,0.955
4096        ,1              ,11           ,0    ,128        ,0.914
4096        ,14             ,5            ,1    ,128        ,0.897
4097        ,1              ,11           ,0    ,129        ,0.93
4097        ,14             ,5            ,1    ,129        ,0.98
4097        ,1              ,11           ,0    ,128        ,0.94
4097        ,14             ,5            ,1    ,128        ,0.965
4097        ,4081           ,5            ,1    ,128        ,0.964
4096        ,1              ,11           ,0    ,256        ,0.904
4096        ,14             ,5            ,1    ,256        ,0.981
4097        ,1              ,11           ,0    ,257        ,0.968
4097        ,14             ,5            ,1    ,257        ,0.921
4097        ,1              ,11           ,0    ,256        ,0.884
4097        ,14             ,5            ,1    ,256        ,0.925
4097        ,4081           ,5            ,1    ,256        ,0.94
8192        ,1              ,11           ,0    ,4          ,0.961
8192        ,14             ,5            ,1    ,4          ,0.959
8193        ,1              ,11           ,0    ,5          ,0.927
8193        ,14             ,5            ,1    ,5          ,0.961
8193        ,1              ,11           ,0    ,4          ,0.938
8193        ,14             ,5            ,1    ,4          ,0.956
8193        ,4081           ,5            ,1    ,4          ,0.979
8192        ,1              ,11           ,0    ,8          ,0.962
8192        ,14             ,5            ,1    ,8          ,1.139
8193        ,1              ,11           ,0    ,9          ,1.178
8193        ,14             ,5            ,1    ,9          ,0.95
8193        ,1              ,11           ,0    ,8          ,0.97
8193        ,14             ,5            ,1    ,8          ,0.935
8193        ,4081           ,5            ,1    ,8          ,0.999
8192        ,1              ,11           ,0    ,16         ,0.908
8192        ,14             ,5            ,1    ,16         ,0.907
8193        ,1              ,11           ,0    ,17         ,0.997
8193        ,14             ,5            ,1    ,17         ,0.838
8193        ,1              ,11           ,0    ,16         ,0.968
8193        ,14             ,5            ,1    ,16         ,0.713
8193        ,4081           ,5            ,1    ,16         ,0.98
8192        ,1              ,11           ,0    ,32         ,1.01
8192        ,14             ,5            ,1    ,32         ,0.949
8193        ,1              ,11           ,0    ,33         ,0.958
8193        ,14             ,5            ,1    ,33         ,0.941
8193        ,1              ,11           ,0    ,32         ,0.953
8193        ,14             ,5            ,1    ,32         ,0.929
8193        ,4081           ,5            ,1    ,32         ,0.97
8192        ,1              ,11           ,0    ,64         ,0.95
8192        ,14             ,5            ,1    ,64         ,0.913
8193        ,1              ,11           ,0    ,65         ,0.958
8193        ,14             ,5            ,1    ,65         ,0.934
8193        ,1              ,11           ,0    ,64         ,0.905
8193        ,14             ,5            ,1    ,64         ,0.89
8193        ,4081           ,5            ,1    ,64         ,0.885
8192        ,1              ,11           ,0    ,128        ,0.885
8192        ,14             ,5            ,1    ,128        ,0.934
8193        ,1              ,11           ,0    ,129        ,0.933
8193        ,14             ,5            ,1    ,129        ,0.985
8193        ,1              ,11           ,0    ,128        ,0.913
8193        ,14             ,5            ,1    ,128        ,0.931
8193        ,4081           ,5            ,1    ,128        ,0.967
8192        ,1              ,11           ,0    ,256        ,0.903
8192        ,14             ,5            ,1    ,256        ,0.908
8193        ,1              ,11           ,0    ,257        ,0.915
8193        ,14             ,5            ,1    ,257        ,0.969
8193        ,1              ,11           ,0    ,256        ,0.918
8193        ,14             ,5            ,1    ,256        ,0.92
8193        ,4081           ,5            ,1    ,256        ,0.967
16384       ,1              ,11           ,0    ,4          ,0.99
16384       ,14             ,5            ,1    ,4          ,0.961
16385       ,1              ,11           ,0    ,5          ,0.862
16385       ,14             ,5            ,1    ,5          ,0.836
16385       ,1              ,11           ,0    ,4          ,0.969
16385       ,14             ,5            ,1    ,4          ,0.97
16385       ,4081           ,5            ,1    ,4          ,0.973
16384       ,1              ,11           ,0    ,8          ,0.931
16384       ,14             ,5            ,1    ,8          ,0.953
16385       ,1              ,11           ,0    ,9          ,0.923
16385       ,14             ,5            ,1    ,9          ,0.821
16385       ,1              ,11           ,0    ,8          ,0.829
16385       ,14             ,5            ,1    ,8          ,0.953
16385       ,4081           ,5            ,1    ,8          ,0.953
16384       ,1              ,11           ,0    ,16         ,0.951
16384       ,14             ,5            ,1    ,16         ,0.932
16385       ,1              ,11           ,0    ,17         ,0.954
16385       ,14             ,5            ,1    ,17         ,0.981
16385       ,1              ,11           ,0    ,16         ,0.955
16385       ,14             ,5            ,1    ,16         ,0.982
16385       ,4081           ,5            ,1    ,16         ,0.951
16384       ,1              ,11           ,0    ,32         ,0.995
16384       ,14             ,5            ,1    ,32         ,0.982
16385       ,1              ,11           ,0    ,33         ,0.967
16385       ,14             ,5            ,1    ,33         ,0.945
16385       ,1              ,11           ,0    ,32         ,0.953
16385       ,14             ,5            ,1    ,32         ,0.942
16385       ,4081           ,5            ,1    ,32         ,0.967
16384       ,1              ,11           ,0    ,64         ,0.962
16384       ,14             ,5            ,1    ,64         ,0.957
16385       ,1              ,11           ,0    ,65         ,1.011
16385       ,14             ,5            ,1    ,65         ,0.931
16385       ,1              ,11           ,0    ,64         ,0.965
16385       ,14             ,5            ,1    ,64         ,0.947
16385       ,4081           ,5            ,1    ,64         ,0.96
16384       ,1              ,11           ,0    ,128        ,0.931
16384       ,14             ,5            ,1    ,128        ,0.935
16385       ,1              ,11           ,0    ,129        ,0.948
16385       ,14             ,5            ,1    ,129        ,0.943
16385       ,1              ,11           ,0    ,128        ,0.94
16385       ,14             ,5            ,1    ,128        ,0.841
16385       ,4081           ,5            ,1    ,128        ,0.97
16384       ,1              ,11           ,0    ,256        ,0.911
16384       ,14             ,5            ,1    ,256        ,0.944
16385       ,1              ,11           ,0    ,257        ,0.913
16385       ,14             ,5            ,1    ,257        ,0.925
16385       ,1              ,11           ,0    ,256        ,0.846
16385       ,14             ,5            ,1    ,256        ,0.966
16385       ,4081           ,5            ,1    ,256        ,1.037
32768       ,1              ,11           ,0    ,4          ,0.984
32768       ,14             ,5            ,1    ,4          ,1.015
32769       ,1              ,11           ,0    ,5          ,0.972
32769       ,14             ,5            ,1    ,5          ,0.958
32769       ,1              ,11           ,0    ,4          ,0.955
32769       ,14             ,5            ,1    ,4          ,0.95
32769       ,4081           ,5            ,1    ,4          ,1.041
32768       ,1              ,11           ,0    ,8          ,0.934
32768       ,14             ,5            ,1    ,8          ,1.067
32769       ,1              ,11           ,0    ,9          ,1.003
32769       ,14             ,5            ,1    ,9          ,1.089
32769       ,1              ,11           ,0    ,8          ,1.0
32769       ,14             ,5            ,1    ,8          ,1.068
32769       ,4081           ,5            ,1    ,8          ,0.986
32768       ,1              ,11           ,0    ,16         ,0.994
32768       ,14             ,5            ,1    ,16         ,0.994
32769       ,1              ,11           ,0    ,17         ,1.008
32769       ,14             ,5            ,1    ,17         ,0.95
32769       ,1              ,11           ,0    ,16         ,0.953
32769       ,14             ,5            ,1    ,16         ,0.954
32769       ,4081           ,5            ,1    ,16         ,0.956
32768       ,1              ,11           ,0    ,32         ,1.037
32768       ,14             ,5            ,1    ,32         ,0.751
32769       ,1              ,11           ,0    ,33         ,0.769
32769       ,14             ,5            ,1    ,33         ,0.906
32769       ,1              ,11           ,0    ,32         ,0.867
32769       ,14             ,5            ,1    ,32         ,0.919
32769       ,4081           ,5            ,1    ,32         ,1.145
32768       ,1              ,11           ,0    ,64         ,0.909
32768       ,14             ,5            ,1    ,64         ,0.947
32769       ,1              ,11           ,0    ,65         ,0.896
32769       ,14             ,5            ,1    ,65         ,0.964
32769       ,1              ,11           ,0    ,64         ,0.996
32769       ,14             ,5            ,1    ,64         ,0.905
32769       ,4081           ,5            ,1    ,64         ,0.996
32768       ,1              ,11           ,0    ,128        ,0.856
32768       ,14             ,5            ,1    ,128        ,1.061
32769       ,1              ,11           ,0    ,129        ,1.027
32769       ,14             ,5            ,1    ,129        ,0.886
32769       ,1              ,11           ,0    ,128        ,0.957
32769       ,14             ,5            ,1    ,128        ,0.97
32769       ,4081           ,5            ,1    ,128        ,1.097
32768       ,1              ,11           ,0    ,256        ,0.914
32768       ,14             ,5            ,1    ,256        ,0.994
32769       ,1              ,11           ,0    ,257        ,0.865
32769       ,14             ,5            ,1    ,257        ,0.889
32769       ,1              ,11           ,0    ,256        ,0.951
32769       ,14             ,5            ,1    ,256        ,0.938
32769       ,4081           ,5            ,1    ,256        ,0.972
65536       ,1              ,11           ,0    ,4          ,1.02
65536       ,14             ,5            ,1    ,4          ,0.962
65537       ,1              ,11           ,0    ,5          ,1.006
65537       ,14             ,5            ,1    ,5          ,0.959
65537       ,1              ,11           ,0    ,4          ,0.949
65537       ,14             ,5            ,1    ,4          ,0.945
65537       ,4081           ,5            ,1    ,4          ,0.976
65536       ,1              ,11           ,0    ,8          ,1.007
65536       ,14             ,5            ,1    ,8          ,0.997
65537       ,1              ,11           ,0    ,9          ,1.008
65537       ,14             ,5            ,1    ,9          ,0.971
65537       ,1              ,11           ,0    ,8          ,0.893
65537       ,14             ,5            ,1    ,8          ,0.929
65537       ,4081           ,5            ,1    ,8          ,0.956
65536       ,1              ,11           ,0    ,16         ,0.921
65536       ,14             ,5            ,1    ,16         ,0.909
65537       ,1              ,11           ,0    ,17         ,0.986
65537       ,14             ,5            ,1    ,17         ,0.962
65537       ,1              ,11           ,0    ,16         ,0.93
65537       ,14             ,5            ,1    ,16         ,0.947
65537       ,4081           ,5            ,1    ,16         ,0.885
65536       ,1              ,11           ,0    ,32         ,1.001
65536       ,14             ,5            ,1    ,32         ,0.93
65537       ,1              ,11           ,0    ,33         ,0.87
65537       ,14             ,5            ,1    ,33         ,1.038
65537       ,1              ,11           ,0    ,32         ,0.934
65537       ,14             ,5            ,1    ,32         ,1.094
65537       ,4081           ,5            ,1    ,32         ,0.997
65536       ,1              ,11           ,0    ,64         ,0.975
65536       ,14             ,5            ,1    ,64         ,0.964
65537       ,1              ,11           ,0    ,65         ,1.027
65537       ,14             ,5            ,1    ,65         ,0.942
65537       ,1              ,11           ,0    ,64         ,0.996
65537       ,14             ,5            ,1    ,64         ,0.938
65537       ,4081           ,5            ,1    ,64         ,0.913
65536       ,1              ,11           ,0    ,128        ,0.967
65536       ,14             ,5            ,1    ,128        ,0.991
65537       ,1              ,11           ,0    ,129        ,0.949
65537       ,14             ,5            ,1    ,129        ,0.948
65537       ,1              ,11           ,0    ,128        ,1.019
65537       ,14             ,5            ,1    ,128        ,1.028
65537       ,4081           ,5            ,1    ,128        ,0.978
65536       ,1              ,11           ,0    ,256        ,0.956
65536       ,14             ,5            ,1    ,256        ,0.932
65537       ,1              ,11           ,0    ,257        ,0.982
65537       ,14             ,5            ,1    ,257        ,0.972
65537       ,1              ,11           ,0    ,256        ,0.933
65537       ,14             ,5            ,1    ,256        ,0.947
65537       ,4081           ,5            ,1    ,256        ,0.939
65536       ,0              ,0            ,1    ,64         ,0.974
65536       ,0              ,0            ,1    ,256        ,0.948
65536       ,0              ,0            ,1    ,1024       ,0.917
>
>
> >
> > > >
> > > >
> > > > --
> > > > H.J.
> >
> >
> >
> > --
> > H.J.
  
H.J. Lu July 12, 2022, 6:43 p.m. UTC | #9
On Tue, Jul 12, 2022 at 11:41 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Tue, Jul 12, 2022 at 10:11 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Tue, Jul 12, 2022 at 8:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Jul 11, 2022 at 7:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Mon, Jul 11, 2022 at 7:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jul 11, 2022 at 4:33 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Jul 11, 2022 at 3:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > > >
> > > > > > > On Mon, Jul 11, 2022 at 3:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > > > >
> > > > > > > > > Using standard operators generates the same code, and __mmask
> > > > > > > > > intrinsics are not available before GCC7.
> > > > > > > >
> > > > > > > > Removed:
> > > > > > > >     _cvtmask64_u64
> > > > > > > >     _kshiftri_mask64
> > > > > > > >     _kand_mask64
> > > > > > > > ---
> > > > > > > >  sysdeps/x86_64/multiarch/strstr-avx512.c | 16 ++++++++++------
> > > > > > > >  1 file changed, 10 insertions(+), 6 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > > > > index 2ab9e96db8..e44c1a05dc 100644
> > > > > > > > --- a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > > > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > > > > @@ -26,6 +26,10 @@
> > > > > > > >  #define ZMM_SIZE_IN_BYTES 64
> > > > > > > >  #define PAGESIZE 4096
> > > > > > > >
> > > > > > > > +#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
> > > > > > > > +#define kshiftri_mask64(x, y) ((x) >> (y))
> > > > > > > > +#define kand_mask64(x, y) ((x) & (y))
> > > > > > > > +
> > > > > > > >  /*
> > > > > > > >   Returns the index of the first edge within the needle, returns 0 if no edge
> > > > > > > >   is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > > > > > > > @@ -133,15 +137,15 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > > > > > > >    __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > > > > > >    /* Search for NULL and compare only till null char */
> > > > > > > >    uint64_t nullmask
> > > > > > > > -      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > > > > > > +      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > > > > > >    uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > > > > > -  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> > > > > > > > +  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
> > > > > > > >    /* Search for the 2 charaters of needle */
> > > > > > > >    __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > > > > >    __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > > > > > > -  k1 = _kshiftri_mask64 (k1, 1);
> > > > > > > > +  k1 = kshiftri_mask64 (k1, 1);
> > > > > > > >    /* k2 masks tell us if both chars from needle match */
> > > > > > > > -  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > > > > > > +  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > > > > > > >    /* For every match, search for the entire needle for a full match */
> > > > > > > >    while (k2)
> > > > > > > >      {
> > > > > > > > @@ -178,13 +182,13 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > > > > > > >        hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > > > > > >        hay1 = _mm512_load_si512 (haystack + hay_index
> > > > > > > >                                  + 1); // Always 64 byte aligned
> > > > > > > > -      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > > > > > > +      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > > > > > >        /* Compare only till null char */
> > > > > > > >        cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > > > > >        k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > > > > >        k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > > > > > >        /* k2 masks tell us if both chars from needle match */
> > > > > > > > -      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > > > > > > +      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > > > > > > >        /* For every match, compare full strings for potential match */
> > > > > > > >        while (k2)
> > > > > > > >          {
> > > > > > > > --
> > > > > > > > 2.34.1
> > > > > > > >
> > > > > > > Sunil,
> > > > > > > > Does this work? (v2 didn't get chained because I changed the commit msg.)
> > > > > >
> > > > > > Yes, v2 works.
> > > > > >
> > > > >
> > > > > Any performance differences?
> > > >
> > > > It should generate the same code. It's just using standard operators instead
> > > > of intrinsics.
> > >
> > > Has it been verified?
> >
> > You are right, there are some diffs.
> >
> > I took a look at the ASM and think the new code is probably fine, but
> > will run a quick benchmark.
>
> Did 5 runs. Think the new one is fine and maybe even gets slightly better
> codegen:

Please update the commit log to remove "the same code".

Thanks.

> N = 5 runs.
> Geometric Mean of All Results (New / Old): 0.958
> Results For: __strstr_avx512
> len_haystack,align_haystack ,align_needle ,fail ,len_needle ,New / Old
> 8           ,1              ,3            ,0    ,1          ,0.834
> 8           ,0              ,9            ,1    ,1          ,1.07
> 9           ,1              ,3            ,0    ,1          ,1.031
> 9           ,0              ,9            ,1    ,1          ,0.943
> 8           ,4081           ,9            ,1    ,1          ,0.96
> 16          ,1              ,3            ,0    ,2          ,0.824
> 16          ,0              ,9            ,1    ,2          ,1.018
> 17          ,1              ,3            ,0    ,2          ,0.979
> 17          ,0              ,9            ,1    ,2          ,1.019
> 16          ,4081           ,9            ,1    ,2          ,1.015
> 24          ,1              ,3            ,0    ,3          ,0.979
> 24          ,0              ,9            ,1    ,3          ,0.972
> 25          ,1              ,3            ,0    ,3          ,0.99
> 25          ,0              ,9            ,1    ,3          ,0.998
> 24          ,4081           ,9            ,1    ,3          ,1.031
> 32          ,1              ,3            ,0    ,4          ,1.004
> 32          ,0              ,9            ,1    ,4          ,0.934
> 33          ,1              ,3            ,0    ,4          ,1.015
> 33          ,0              ,9            ,1    ,4          ,0.98
> 32          ,4081           ,9            ,1    ,4          ,1.019
> 40          ,1              ,3            ,0    ,5          ,1.016
> 40          ,0              ,9            ,1    ,5          ,1.017
> 41          ,1              ,3            ,0    ,5          ,1.059
> 41          ,0              ,9            ,1    ,5          ,1.011
> 40          ,4081           ,9            ,1    ,5          ,0.989
> 48          ,1              ,3            ,0    ,6          ,0.985
> 48          ,0              ,9            ,1    ,6          ,1.016
> 49          ,1              ,3            ,0    ,6          ,0.979
> 49          ,0              ,9            ,1    ,6          ,1.041
> 48          ,4081           ,9            ,1    ,6          ,0.975
> 56          ,1              ,3            ,0    ,7          ,1.017
> 56          ,0              ,9            ,1    ,7          ,1.004
> 57          ,1              ,3            ,0    ,7          ,1.049
> 57          ,0              ,9            ,1    ,7          ,0.985
> 56          ,4081           ,9            ,1    ,7          ,0.949
> 64          ,1              ,3            ,0    ,8          ,1.152
> 64          ,0              ,9            ,1    ,8          ,1.583
> 65          ,1              ,3            ,0    ,8          ,1.076
> 65          ,0              ,9            ,1    ,8          ,1.013
> 64          ,4081           ,9            ,1    ,8          ,1.038
> 96          ,1              ,3            ,0    ,9          ,1.007
> 96          ,0              ,9            ,1    ,9          ,1.001
> 97          ,1              ,3            ,0    ,9          ,0.973
> 97          ,0              ,9            ,1    ,9          ,1.021
> 96          ,4081           ,9            ,1    ,9          ,1.026
> 128         ,1              ,3            ,0    ,10         ,1.012
> 128         ,0              ,9            ,1    ,10         ,1.006
> 129         ,1              ,3            ,0    ,10         ,0.959
> 129         ,0              ,9            ,1    ,10         ,1.041
> 128         ,4081           ,9            ,1    ,10         ,0.956
> 160         ,1              ,3            ,0    ,11         ,0.967
> 160         ,0              ,9            ,1    ,11         ,1.008
> 161         ,1              ,3            ,0    ,11         ,0.97
> 161         ,0              ,9            ,1    ,11         ,0.982
> 160         ,4081           ,9            ,1    ,11         ,0.999
> 192         ,1              ,3            ,0    ,12         ,1.057
> 192         ,0              ,9            ,1    ,12         ,0.973
> 193         ,1              ,3            ,0    ,12         ,0.956
> 193         ,0              ,9            ,1    ,12         ,0.982
> 192         ,4081           ,9            ,1    ,12         ,0.955
> 224         ,1              ,3            ,0    ,13         ,0.979
> 224         ,0              ,9            ,1    ,13         ,0.995
> 225         ,1              ,3            ,0    ,13         ,0.966
> 225         ,0              ,9            ,1    ,13         ,0.973
> 224         ,4081           ,9            ,1    ,13         ,1.011
> 256         ,1              ,3            ,0    ,14         ,0.981
> 256         ,0              ,9            ,1    ,14         ,1.019
> 257         ,1              ,3            ,0    ,14         ,0.905
> 257         ,0              ,9            ,1    ,14         ,0.99
> 256         ,4081           ,9            ,1    ,14         ,0.994
> 288         ,1              ,3            ,0    ,15         ,0.963
> 288         ,0              ,9            ,1    ,15         ,0.981
> 289         ,1              ,3            ,0    ,15         ,0.97
> 289         ,0              ,9            ,1    ,15         ,0.855
> 288         ,4081           ,9            ,1    ,15         ,0.935
> 320         ,1              ,3            ,0    ,16         ,0.962
> 320         ,0              ,9            ,1    ,16         ,0.978
> 321         ,1              ,3            ,0    ,16         ,0.952
> 321         ,0              ,9            ,1    ,16         ,0.949
> 320         ,4081           ,9            ,1    ,16         ,0.983
> 256         ,1              ,11           ,0    ,4          ,0.967
> 256         ,14             ,5            ,1    ,4          ,0.997
> 257         ,1              ,11           ,0    ,5          ,1.008
> 257         ,14             ,5            ,1    ,5          ,0.97
> 257         ,1              ,11           ,0    ,4          ,1.001
> 257         ,14             ,5            ,1    ,4          ,0.985
> 257         ,4081           ,5            ,1    ,4          ,1.0
> 256         ,1              ,11           ,0    ,8          ,0.958
> 256         ,14             ,5            ,1    ,8          ,0.967
> 257         ,1              ,11           ,0    ,9          ,0.991
> 257         ,14             ,5            ,1    ,9          ,0.91
> 257         ,1              ,11           ,0    ,8          ,0.974
> 257         ,14             ,5            ,1    ,8          ,0.999
> 257         ,4081           ,5            ,1    ,8          ,0.958
> 256         ,1              ,11           ,0    ,16         ,0.957
> 256         ,14             ,5            ,1    ,16         ,1.009
> 257         ,1              ,11           ,0    ,17         ,0.983
> 257         ,14             ,5            ,1    ,17         ,0.94
> 257         ,1              ,11           ,0    ,16         ,0.956
> 257         ,14             ,5            ,1    ,16         ,0.99
> 257         ,4081           ,5            ,1    ,16         ,1.008
> 256         ,1              ,11           ,0    ,32         ,0.982
> 256         ,14             ,5            ,1    ,32         ,0.977
> 257         ,1              ,11           ,0    ,33         ,0.942
> 257         ,14             ,5            ,1    ,33         ,0.912
> 257         ,1              ,11           ,0    ,32         ,1.024
> 257         ,14             ,5            ,1    ,32         ,0.974
> 257         ,4081           ,5            ,1    ,32         ,0.953
> 256         ,1              ,11           ,0    ,64         ,0.955
> 256         ,14             ,5            ,1    ,64         ,0.938
> 257         ,1              ,11           ,0    ,65         ,0.916
> 257         ,14             ,5            ,1    ,65         ,0.942
> 257         ,1              ,11           ,0    ,64         ,0.94
> 257         ,14             ,5            ,1    ,64         ,0.952
> 257         ,4081           ,5            ,1    ,64         ,0.989
> 256         ,1              ,11           ,0    ,128        ,0.87
> 256         ,14             ,5            ,1    ,128        ,1.044
> 257         ,1              ,11           ,0    ,129        ,0.933
> 257         ,14             ,5            ,1    ,129        ,0.972
> 257         ,1              ,11           ,0    ,128        ,0.919
> 257         ,14             ,5            ,1    ,128        ,0.911
> 257         ,4081           ,5            ,1    ,128        ,0.958
> 256         ,1              ,11           ,0    ,256        ,1.069
> 256         ,14             ,5            ,1    ,256        ,0.984
> 257         ,1              ,11           ,0    ,257        ,1.067
> 257         ,14             ,5            ,1    ,257        ,0.969
> 257         ,1              ,11           ,0    ,256        ,1.081
> 257         ,14             ,5            ,1    ,256        ,0.936
> 257         ,4081           ,5            ,1    ,256        ,0.975
> 512         ,1              ,11           ,0    ,4          ,0.984
> 512         ,14             ,5            ,1    ,4          ,0.955
> 513         ,1              ,11           ,0    ,5          ,0.976
> 513         ,14             ,5            ,1    ,5          ,1.221
> 513         ,1              ,11           ,0    ,4          ,0.983
> 513         ,14             ,5            ,1    ,4          ,0.989
> 513         ,4081           ,5            ,1    ,4          ,0.962
> 512         ,1              ,11           ,0    ,8          ,0.949
> 512         ,14             ,5            ,1    ,8          ,0.957
> 513         ,1              ,11           ,0    ,9          ,0.959
> 513         ,14             ,5            ,1    ,9          ,0.972
> 513         ,1              ,11           ,0    ,8          ,0.977
> 513         ,14             ,5            ,1    ,8          ,0.951
> 513         ,4081           ,5            ,1    ,8          ,0.957
> 512         ,1              ,11           ,0    ,16         ,0.975
> 512         ,14             ,5            ,1    ,16         ,0.953
> 513         ,1              ,11           ,0    ,17         ,0.898
> 513         ,14             ,5            ,1    ,17         ,0.97
> 513         ,1              ,11           ,0    ,16         ,0.941
> 513         ,14             ,5            ,1    ,16         ,0.948
> 513         ,4081           ,5            ,1    ,16         ,0.948
> 512         ,1              ,11           ,0    ,32         ,0.917
> 512         ,14             ,5            ,1    ,32         ,0.963
> 513         ,1              ,11           ,0    ,33         ,0.937
> 513         ,14             ,5            ,1    ,33         ,0.969
> 513         ,1              ,11           ,0    ,32         ,0.995
> 513         ,14             ,5            ,1    ,32         ,1.001
> 513         ,4081           ,5            ,1    ,32         ,0.942
> 512         ,1              ,11           ,0    ,64         ,0.943
> 512         ,14             ,5            ,1    ,64         ,0.972
> 513         ,1              ,11           ,0    ,65         ,0.979
> 513         ,14             ,5            ,1    ,65         ,1.008
> 513         ,1              ,11           ,0    ,64         ,0.938
> 513         ,14             ,5            ,1    ,64         ,0.98
> 513         ,4081           ,5            ,1    ,64         ,0.988
> 512         ,1              ,11           ,0    ,128        ,0.91
> 512         ,14             ,5            ,1    ,128        ,0.964
> 513         ,1              ,11           ,0    ,129        ,0.918
> 513         ,14             ,5            ,1    ,129        ,0.958
> 513         ,1              ,11           ,0    ,128        ,0.931
> 513         ,14             ,5            ,1    ,128        ,0.957
> 513         ,4081           ,5            ,1    ,128        ,0.956
> 512         ,1              ,11           ,0    ,256        ,0.835
> 512         ,14             ,5            ,1    ,256        ,0.955
> 513         ,1              ,11           ,0    ,257        ,0.842
> 513         ,14             ,5            ,1    ,257        ,0.959
> 513         ,1              ,11           ,0    ,256        ,0.832
> 513         ,14             ,5            ,1    ,256        ,0.879
> 513         ,4081           ,5            ,1    ,256        ,0.95
> 1024        ,1              ,11           ,0    ,4          ,0.929
> 1024        ,14             ,5            ,1    ,4          ,0.965
> 1025        ,1              ,11           ,0    ,5          ,0.969
> 1025        ,14             ,5            ,1    ,5          ,0.951
> 1025        ,1              ,11           ,0    ,4          ,0.943
> 1025        ,14             ,5            ,1    ,4          ,0.973
> 1025        ,4081           ,5            ,1    ,4          ,0.982
> 1024        ,1              ,11           ,0    ,8          ,0.881
> 1024        ,14             ,5            ,1    ,8          ,0.924
> 1025        ,1              ,11           ,0    ,9          ,0.971
> 1025        ,14             ,5            ,1    ,9          ,0.948
> 1025        ,1              ,11           ,0    ,8          ,0.968
> 1025        ,14             ,5            ,1    ,8          ,0.967
> 1025        ,4081           ,5            ,1    ,8          ,0.965
> 1024        ,1              ,11           ,0    ,16         ,0.959
> 1024        ,14             ,5            ,1    ,16         ,0.957
> 1025        ,1              ,11           ,0    ,17         ,0.984
> 1025        ,14             ,5            ,1    ,17         ,0.964
> 1025        ,1              ,11           ,0    ,16         ,0.971
> 1025        ,14             ,5            ,1    ,16         ,0.977
> 1025        ,4081           ,5            ,1    ,16         ,0.97
> 1024        ,1              ,11           ,0    ,32         ,0.946
> 1024        ,14             ,5            ,1    ,32         ,0.967
> 1025        ,1              ,11           ,0    ,33         ,0.922
> 1025        ,14             ,5            ,1    ,33         ,0.843
> 1025        ,1              ,11           ,0    ,32         ,0.954
> 1025        ,14             ,5            ,1    ,32         ,0.98
> 1025        ,4081           ,5            ,1    ,32         ,0.97
> 1024        ,1              ,11           ,0    ,64         ,0.941
> 1024        ,14             ,5            ,1    ,64         ,0.958
> 1025        ,1              ,11           ,0    ,65         ,0.931
> 1025        ,14             ,5            ,1    ,65         ,0.866
> 1025        ,1              ,11           ,0    ,64         ,0.944
> 1025        ,14             ,5            ,1    ,64         ,0.915
> 1025        ,4081           ,5            ,1    ,64         ,0.956
> 1024        ,1              ,11           ,0    ,128        ,0.827
> 1024        ,14             ,5            ,1    ,128        ,0.938
> 1025        ,1              ,11           ,0    ,129        ,0.921
> 1025        ,14             ,5            ,1    ,129        ,0.95
> 1025        ,1              ,11           ,0    ,128        ,0.912
> 1025        ,14             ,5            ,1    ,128        ,1.005
> 1025        ,4081           ,5            ,1    ,128        ,0.965
> 1024        ,1              ,11           ,0    ,256        ,0.835
> 1024        ,14             ,5            ,1    ,256        ,0.967
> 1025        ,1              ,11           ,0    ,257        ,0.85
> 1025        ,14             ,5            ,1    ,257        ,0.96
> 1025        ,1              ,11           ,0    ,256        ,0.83
> 1025        ,14             ,5            ,1    ,256        ,0.948
> 1025        ,4081           ,5            ,1    ,256        ,0.946
> 2048        ,1              ,11           ,0    ,4          ,0.933
> 2048        ,14             ,5            ,1    ,4          ,0.913
> 2049        ,1              ,11           ,0    ,5          ,0.962
> 2049        ,14             ,5            ,1    ,5          ,0.961
> 2049        ,1              ,11           ,0    ,4          ,0.866
> 2049        ,14             ,5            ,1    ,4          ,1.05
> 2049        ,4081           ,5            ,1    ,4          ,0.995
> 2048        ,1              ,11           ,0    ,8          ,0.929
> 2048        ,14             ,5            ,1    ,8          ,0.976
> 2049        ,1              ,11           ,0    ,9          ,0.899
> 2049        ,14             ,5            ,1    ,9          ,0.995
> 2049        ,1              ,11           ,0    ,8          ,0.965
> 2049        ,14             ,5            ,1    ,8          ,1.03
> 2049        ,4081           ,5            ,1    ,8          ,0.958
> 2048        ,1              ,11           ,0    ,16         ,0.858
> 2048        ,14             ,5            ,1    ,16         ,0.963
> 2049        ,1              ,11           ,0    ,17         ,0.97
> 2049        ,14             ,5            ,1    ,17         ,0.875
> 2049        ,1              ,11           ,0    ,16         ,0.862
> 2049        ,14             ,5            ,1    ,16         ,0.97
> 2049        ,4081           ,5            ,1    ,16         ,0.937
> 2048        ,1              ,11           ,0    ,32         ,0.919
> 2048        ,14             ,5            ,1    ,32         ,0.875
> 2049        ,1              ,11           ,0    ,33         ,0.955
> 2049        ,14             ,5            ,1    ,33         ,0.96
> 2049        ,1              ,11           ,0    ,32         ,0.937
> 2049        ,14             ,5            ,1    ,32         ,1.028
> 2049        ,4081           ,5            ,1    ,32         ,0.94
> 2048        ,1              ,11           ,0    ,64         ,0.966
> 2048        ,14             ,5            ,1    ,64         ,0.944
> 2049        ,1              ,11           ,0    ,65         ,0.948
> 2049        ,14             ,5            ,1    ,65         ,0.947
> 2049        ,1              ,11           ,0    ,64         ,1.008
> 2049        ,14             ,5            ,1    ,64         ,0.98
> 2049        ,4081           ,5            ,1    ,64         ,0.974
> 2048        ,1              ,11           ,0    ,128        ,0.97
> 2048        ,14             ,5            ,1    ,128        ,0.947
> 2049        ,1              ,11           ,0    ,129        ,0.95
> 2049        ,14             ,5            ,1    ,129        ,0.939
> 2049        ,1              ,11           ,0    ,128        ,0.852
> 2049        ,14             ,5            ,1    ,128        ,0.742
> 2049        ,4081           ,5            ,1    ,128        ,0.674
> 2048        ,1              ,11           ,0    ,256        ,0.86
> 2048        ,14             ,5            ,1    ,256        ,0.89
> 2049        ,1              ,11           ,0    ,257        ,0.76
> 2049        ,14             ,5            ,1    ,257        ,0.886
> 2049        ,1              ,11           ,0    ,256        ,0.876
> 2049        ,14             ,5            ,1    ,256        ,0.915
> 2049        ,4081           ,5            ,1    ,256        ,0.942
> 4096        ,1              ,11           ,0    ,4          ,0.926
> 4096        ,14             ,5            ,1    ,4          ,0.927
> 4097        ,1              ,11           ,0    ,5          ,0.965
> 4097        ,14             ,5            ,1    ,5          ,0.978
> 4097        ,1              ,11           ,0    ,4          ,0.973
> 4097        ,14             ,5            ,1    ,4          ,0.997
> 4097        ,4081           ,5            ,1    ,4          ,0.982
> 4096        ,1              ,11           ,0    ,8          ,0.948
> 4096        ,14             ,5            ,1    ,8          ,0.937
> 4097        ,1              ,11           ,0    ,9          ,0.949
> 4097        ,14             ,5            ,1    ,9          ,0.964
> 4097        ,1              ,11           ,0    ,8          ,0.998
> 4097        ,14             ,5            ,1    ,8          ,0.98
> 4097        ,4081           ,5            ,1    ,8          ,0.972
> 4096        ,1              ,11           ,0    ,16         ,0.95
> 4096        ,14             ,5            ,1    ,16         ,0.954
> 4097        ,1              ,11           ,0    ,17         ,0.978
> 4097        ,14             ,5            ,1    ,17         ,0.964
> 4097        ,1              ,11           ,0    ,16         ,0.962
> 4097        ,14             ,5            ,1    ,16         ,0.952
> 4097        ,4081           ,5            ,1    ,16         ,0.944
> 4096        ,1              ,11           ,0    ,32         ,0.965
> 4096        ,14             ,5            ,1    ,32         ,0.971
> 4097        ,1              ,11           ,0    ,33         ,0.996
> 4097        ,14             ,5            ,1    ,33         ,0.976
> 4097        ,1              ,11           ,0    ,32         ,1.029
> 4097        ,14             ,5            ,1    ,32         ,1.255
> 4097        ,4081           ,5            ,1    ,32         ,0.992
> 4096        ,1              ,11           ,0    ,64         ,1.0
> 4096        ,14             ,5            ,1    ,64         ,0.98
> 4097        ,1              ,11           ,0    ,65         ,0.996
> 4097        ,14             ,5            ,1    ,65         ,0.985
> 4097        ,1              ,11           ,0    ,64         ,0.98
> 4097        ,14             ,5            ,1    ,64         ,0.966
> 4097        ,4081           ,5            ,1    ,64         ,0.955
> 4096        ,1              ,11           ,0    ,128        ,0.914
> 4096        ,14             ,5            ,1    ,128        ,0.897
> 4097        ,1              ,11           ,0    ,129        ,0.93
> 4097        ,14             ,5            ,1    ,129        ,0.98
> 4097        ,1              ,11           ,0    ,128        ,0.94
> 4097        ,14             ,5            ,1    ,128        ,0.965
> 4097        ,4081           ,5            ,1    ,128        ,0.964
> 4096        ,1              ,11           ,0    ,256        ,0.904
> 4096        ,14             ,5            ,1    ,256        ,0.981
> 4097        ,1              ,11           ,0    ,257        ,0.968
> 4097        ,14             ,5            ,1    ,257        ,0.921
> 4097        ,1              ,11           ,0    ,256        ,0.884
> 4097        ,14             ,5            ,1    ,256        ,0.925
> 4097        ,4081           ,5            ,1    ,256        ,0.94
> 8192        ,1              ,11           ,0    ,4          ,0.961
> 8192        ,14             ,5            ,1    ,4          ,0.959
> 8193        ,1              ,11           ,0    ,5          ,0.927
> 8193        ,14             ,5            ,1    ,5          ,0.961
> 8193        ,1              ,11           ,0    ,4          ,0.938
> 8193        ,14             ,5            ,1    ,4          ,0.956
> 8193        ,4081           ,5            ,1    ,4          ,0.979
> 8192        ,1              ,11           ,0    ,8          ,0.962
> 8192        ,14             ,5            ,1    ,8          ,1.139
> 8193        ,1              ,11           ,0    ,9          ,1.178
> 8193        ,14             ,5            ,1    ,9          ,0.95
> 8193        ,1              ,11           ,0    ,8          ,0.97
> 8193        ,14             ,5            ,1    ,8          ,0.935
> 8193        ,4081           ,5            ,1    ,8          ,0.999
> 8192        ,1              ,11           ,0    ,16         ,0.908
> 8192        ,14             ,5            ,1    ,16         ,0.907
> 8193        ,1              ,11           ,0    ,17         ,0.997
> 8193        ,14             ,5            ,1    ,17         ,0.838
> 8193        ,1              ,11           ,0    ,16         ,0.968
> 8193        ,14             ,5            ,1    ,16         ,0.713
> 8193        ,4081           ,5            ,1    ,16         ,0.98
> 8192        ,1              ,11           ,0    ,32         ,1.01
> 8192        ,14             ,5            ,1    ,32         ,0.949
> 8193        ,1              ,11           ,0    ,33         ,0.958
> 8193        ,14             ,5            ,1    ,33         ,0.941
> 8193        ,1              ,11           ,0    ,32         ,0.953
> 8193        ,14             ,5            ,1    ,32         ,0.929
> 8193        ,4081           ,5            ,1    ,32         ,0.97
> 8192        ,1              ,11           ,0    ,64         ,0.95
> 8192        ,14             ,5            ,1    ,64         ,0.913
> 8193        ,1              ,11           ,0    ,65         ,0.958
> 8193        ,14             ,5            ,1    ,65         ,0.934
> 8193        ,1              ,11           ,0    ,64         ,0.905
> 8193        ,14             ,5            ,1    ,64         ,0.89
> 8193        ,4081           ,5            ,1    ,64         ,0.885
> 8192        ,1              ,11           ,0    ,128        ,0.885
> 8192        ,14             ,5            ,1    ,128        ,0.934
> 8193        ,1              ,11           ,0    ,129        ,0.933
> 8193        ,14             ,5            ,1    ,129        ,0.985
> 8193        ,1              ,11           ,0    ,128        ,0.913
> 8193        ,14             ,5            ,1    ,128        ,0.931
> 8193        ,4081           ,5            ,1    ,128        ,0.967
> 8192        ,1              ,11           ,0    ,256        ,0.903
> 8192        ,14             ,5            ,1    ,256        ,0.908
> 8193        ,1              ,11           ,0    ,257        ,0.915
> 8193        ,14             ,5            ,1    ,257        ,0.969
> 8193        ,1              ,11           ,0    ,256        ,0.918
> 8193        ,14             ,5            ,1    ,256        ,0.92
> 8193        ,4081           ,5            ,1    ,256        ,0.967
> 16384       ,1              ,11           ,0    ,4          ,0.99
> 16384       ,14             ,5            ,1    ,4          ,0.961
> 16385       ,1              ,11           ,0    ,5          ,0.862
> 16385       ,14             ,5            ,1    ,5          ,0.836
> 16385       ,1              ,11           ,0    ,4          ,0.969
> 16385       ,14             ,5            ,1    ,4          ,0.97
> 16385       ,4081           ,5            ,1    ,4          ,0.973
> 16384       ,1              ,11           ,0    ,8          ,0.931
> 16384       ,14             ,5            ,1    ,8          ,0.953
> 16385       ,1              ,11           ,0    ,9          ,0.923
> 16385       ,14             ,5            ,1    ,9          ,0.821
> 16385       ,1              ,11           ,0    ,8          ,0.829
> 16385       ,14             ,5            ,1    ,8          ,0.953
> 16385       ,4081           ,5            ,1    ,8          ,0.953
> 16384       ,1              ,11           ,0    ,16         ,0.951
> 16384       ,14             ,5            ,1    ,16         ,0.932
> 16385       ,1              ,11           ,0    ,17         ,0.954
> 16385       ,14             ,5            ,1    ,17         ,0.981
> 16385       ,1              ,11           ,0    ,16         ,0.955
> 16385       ,14             ,5            ,1    ,16         ,0.982
> 16385       ,4081           ,5            ,1    ,16         ,0.951
> 16384       ,1              ,11           ,0    ,32         ,0.995
> 16384       ,14             ,5            ,1    ,32         ,0.982
> 16385       ,1              ,11           ,0    ,33         ,0.967
> 16385       ,14             ,5            ,1    ,33         ,0.945
> 16385       ,1              ,11           ,0    ,32         ,0.953
> 16385       ,14             ,5            ,1    ,32         ,0.942
> 16385       ,4081           ,5            ,1    ,32         ,0.967
> 16384       ,1              ,11           ,0    ,64         ,0.962
> 16384       ,14             ,5            ,1    ,64         ,0.957
> 16385       ,1              ,11           ,0    ,65         ,1.011
> 16385       ,14             ,5            ,1    ,65         ,0.931
> 16385       ,1              ,11           ,0    ,64         ,0.965
> 16385       ,14             ,5            ,1    ,64         ,0.947
> 16385       ,4081           ,5            ,1    ,64         ,0.96
> 16384       ,1              ,11           ,0    ,128        ,0.931
> 16384       ,14             ,5            ,1    ,128        ,0.935
> 16385       ,1              ,11           ,0    ,129        ,0.948
> 16385       ,14             ,5            ,1    ,129        ,0.943
> 16385       ,1              ,11           ,0    ,128        ,0.94
> 16385       ,14             ,5            ,1    ,128        ,0.841
> 16385       ,4081           ,5            ,1    ,128        ,0.97
> 16384       ,1              ,11           ,0    ,256        ,0.911
> 16384       ,14             ,5            ,1    ,256        ,0.944
> 16385       ,1              ,11           ,0    ,257        ,0.913
> 16385       ,14             ,5            ,1    ,257        ,0.925
> 16385       ,1              ,11           ,0    ,256        ,0.846
> 16385       ,14             ,5            ,1    ,256        ,0.966
> 16385       ,4081           ,5            ,1    ,256        ,1.037
> 32768       ,1              ,11           ,0    ,4          ,0.984
> 32768       ,14             ,5            ,1    ,4          ,1.015
> 32769       ,1              ,11           ,0    ,5          ,0.972
> 32769       ,14             ,5            ,1    ,5          ,0.958
> 32769       ,1              ,11           ,0    ,4          ,0.955
> 32769       ,14             ,5            ,1    ,4          ,0.95
> 32769       ,4081           ,5            ,1    ,4          ,1.041
> 32768       ,1              ,11           ,0    ,8          ,0.934
> 32768       ,14             ,5            ,1    ,8          ,1.067
> 32769       ,1              ,11           ,0    ,9          ,1.003
> 32769       ,14             ,5            ,1    ,9          ,1.089
> 32769       ,1              ,11           ,0    ,8          ,1.0
> 32769       ,14             ,5            ,1    ,8          ,1.068
> 32769       ,4081           ,5            ,1    ,8          ,0.986
> 32768       ,1              ,11           ,0    ,16         ,0.994
> 32768       ,14             ,5            ,1    ,16         ,0.994
> 32769       ,1              ,11           ,0    ,17         ,1.008
> 32769       ,14             ,5            ,1    ,17         ,0.95
> 32769       ,1              ,11           ,0    ,16         ,0.953
> 32769       ,14             ,5            ,1    ,16         ,0.954
> 32769       ,4081           ,5            ,1    ,16         ,0.956
> 32768       ,1              ,11           ,0    ,32         ,1.037
> 32768       ,14             ,5            ,1    ,32         ,0.751
> 32769       ,1              ,11           ,0    ,33         ,0.769
> 32769       ,14             ,5            ,1    ,33         ,0.906
> 32769       ,1              ,11           ,0    ,32         ,0.867
> 32769       ,14             ,5            ,1    ,32         ,0.919
> 32769       ,4081           ,5            ,1    ,32         ,1.145
> 32768       ,1              ,11           ,0    ,64         ,0.909
> 32768       ,14             ,5            ,1    ,64         ,0.947
> 32769       ,1              ,11           ,0    ,65         ,0.896
> 32769       ,14             ,5            ,1    ,65         ,0.964
> 32769       ,1              ,11           ,0    ,64         ,0.996
> 32769       ,14             ,5            ,1    ,64         ,0.905
> 32769       ,4081           ,5            ,1    ,64         ,0.996
> 32768       ,1              ,11           ,0    ,128        ,0.856
> 32768       ,14             ,5            ,1    ,128        ,1.061
> 32769       ,1              ,11           ,0    ,129        ,1.027
> 32769       ,14             ,5            ,1    ,129        ,0.886
> 32769       ,1              ,11           ,0    ,128        ,0.957
> 32769       ,14             ,5            ,1    ,128        ,0.97
> 32769       ,4081           ,5            ,1    ,128        ,1.097
> 32768       ,1              ,11           ,0    ,256        ,0.914
> 32768       ,14             ,5            ,1    ,256        ,0.994
> 32769       ,1              ,11           ,0    ,257        ,0.865
> 32769       ,14             ,5            ,1    ,257        ,0.889
> 32769       ,1              ,11           ,0    ,256        ,0.951
> 32769       ,14             ,5            ,1    ,256        ,0.938
> 32769       ,4081           ,5            ,1    ,256        ,0.972
> 65536       ,1              ,11           ,0    ,4          ,1.02
> 65536       ,14             ,5            ,1    ,4          ,0.962
> 65537       ,1              ,11           ,0    ,5          ,1.006
> 65537       ,14             ,5            ,1    ,5          ,0.959
> 65537       ,1              ,11           ,0    ,4          ,0.949
> 65537       ,14             ,5            ,1    ,4          ,0.945
> 65537       ,4081           ,5            ,1    ,4          ,0.976
> 65536       ,1              ,11           ,0    ,8          ,1.007
> 65536       ,14             ,5            ,1    ,8          ,0.997
> 65537       ,1              ,11           ,0    ,9          ,1.008
> 65537       ,14             ,5            ,1    ,9          ,0.971
> 65537       ,1              ,11           ,0    ,8          ,0.893
> 65537       ,14             ,5            ,1    ,8          ,0.929
> 65537       ,4081           ,5            ,1    ,8          ,0.956
> 65536       ,1              ,11           ,0    ,16         ,0.921
> 65536       ,14             ,5            ,1    ,16         ,0.909
> 65537       ,1              ,11           ,0    ,17         ,0.986
> 65537       ,14             ,5            ,1    ,17         ,0.962
> 65537       ,1              ,11           ,0    ,16         ,0.93
> 65537       ,14             ,5            ,1    ,16         ,0.947
> 65537       ,4081           ,5            ,1    ,16         ,0.885
> 65536       ,1              ,11           ,0    ,32         ,1.001
> 65536       ,14             ,5            ,1    ,32         ,0.93
> 65537       ,1              ,11           ,0    ,33         ,0.87
> 65537       ,14             ,5            ,1    ,33         ,1.038
> 65537       ,1              ,11           ,0    ,32         ,0.934
> 65537       ,14             ,5            ,1    ,32         ,1.094
> 65537       ,4081           ,5            ,1    ,32         ,0.997
> 65536       ,1              ,11           ,0    ,64         ,0.975
> 65536       ,14             ,5            ,1    ,64         ,0.964
> 65537       ,1              ,11           ,0    ,65         ,1.027
> 65537       ,14             ,5            ,1    ,65         ,0.942
> 65537       ,1              ,11           ,0    ,64         ,0.996
> 65537       ,14             ,5            ,1    ,64         ,0.938
> 65537       ,4081           ,5            ,1    ,64         ,0.913
> 65536       ,1              ,11           ,0    ,128        ,0.967
> 65536       ,14             ,5            ,1    ,128        ,0.991
> 65537       ,1              ,11           ,0    ,129        ,0.949
> 65537       ,14             ,5            ,1    ,129        ,0.948
> 65537       ,1              ,11           ,0    ,128        ,1.019
> 65537       ,14             ,5            ,1    ,128        ,1.028
> 65537       ,4081           ,5            ,1    ,128        ,0.978
> 65536       ,1              ,11           ,0    ,256        ,0.956
> 65536       ,14             ,5            ,1    ,256        ,0.932
> 65537       ,1              ,11           ,0    ,257        ,0.982
> 65537       ,14             ,5            ,1    ,257        ,0.972
> 65537       ,1              ,11           ,0    ,256        ,0.933
> 65537       ,14             ,5            ,1    ,256        ,0.947
> 65537       ,4081           ,5            ,1    ,256        ,0.939
> 65536       ,0              ,0            ,1    ,64         ,0.974
> 65536       ,0              ,0            ,1    ,256        ,0.948
> 65536       ,0              ,0            ,1    ,1024       ,0.917
> >
> >
> > >
> > > > >
> > > > >
> > > > > --
> > > > > H.J.
> > >
> > >
> > >
> > > --
> > > H.J.
  
Noah Goldstein July 12, 2022, 6:48 p.m. UTC | #10
On Tue, Jul 12, 2022 at 11:44 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Jul 12, 2022 at 11:41 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Tue, Jul 12, 2022 at 10:11 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Tue, Jul 12, 2022 at 8:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Mon, Jul 11, 2022 at 7:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jul 11, 2022 at 7:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Jul 11, 2022 at 4:33 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > > > > > >
> > > > > > > On Mon, Jul 11, 2022 at 3:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Jul 11, 2022 at 3:38 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > > Using standard operators generates the same code, and __mmask
> > > > > > > > > > intrinsics are not available before GCC7.
> > > > > > > > >
> > > > > > > > > Removed:
> > > > > > > > >     _cvtmask64_u64
> > > > > > > > >     _kshiftri_mask64
> > > > > > > > >     _kand_mask64
> > > > > > > > > ---
> > > > > > > > >  sysdeps/x86_64/multiarch/strstr-avx512.c | 16 ++++++++++------
> > > > > > > > >  1 file changed, 10 insertions(+), 6 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > > > > > index 2ab9e96db8..e44c1a05dc 100644
> > > > > > > > > --- a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > > > > > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > > > > > > > > @@ -26,6 +26,10 @@
> > > > > > > > >  #define ZMM_SIZE_IN_BYTES 64
> > > > > > > > >  #define PAGESIZE 4096
> > > > > > > > >
> > > > > > > > > +#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
> > > > > > > > > +#define kshiftri_mask64(x, y) ((x) >> (y))
> > > > > > > > > +#define kand_mask64(x, y) ((x) & (y))
> > > > > > > > > +
> > > > > > > > >  /*
> > > > > > > > >   Returns the index of the first edge within the needle, returns 0 if no edge
> > > > > > > > >   is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > > > > > > > > @@ -133,15 +137,15 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > > > > > > > >    __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > > > > > > > >    /* Search for NULL and compare only till null char */
> > > > > > > > >    uint64_t nullmask
> > > > > > > > > -      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > > > > > > > +      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
> > > > > > > > >    uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > > > > > > -  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
> > > > > > > > > +  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
> > > > > > > > >    /* Search for the 2 charaters of needle */
> > > > > > > > >    __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > > > > > >    __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > > > > > > > > -  k1 = _kshiftri_mask64 (k1, 1);
> > > > > > > > > +  k1 = kshiftri_mask64 (k1, 1);
> > > > > > > > >    /* k2 masks tell us if both chars from needle match */
> > > > > > > > > -  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > > > > > > > +  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > > > > > > > >    /* For every match, search for the entire needle for a full match */
> > > > > > > > >    while (k2)
> > > > > > > > >      {
> > > > > > > > > @@ -178,13 +182,13 @@ __strstr_avx512 (const char *haystack, const char *ned)
> > > > > > > > >        hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > > > > > > > >        hay1 = _mm512_load_si512 (haystack + hay_index
> > > > > > > > >                                  + 1); // Always 64 byte aligned
> > > > > > > > > -      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > > > > > > > +      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
> > > > > > > > >        /* Compare only till null char */
> > > > > > > > >        cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > > > > > > > >        k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > > > > > > > >        k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > > > > > > > >        /* k2 masks tell us if both chars from needle match */
> > > > > > > > > -      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
> > > > > > > > > +      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
> > > > > > > > >        /* For every match, compare full strings for potential match */
> > > > > > > > >        while (k2)
> > > > > > > > >          {
> > > > > > > > > --
> > > > > > > > > 2.34.1
> > > > > > > > >
> > > > > > > > Sunil,
> > > > > > > > > Does this work? (v2 didn't get chained because I changed the commit msg.)
> > > > > > >
> > > > > > > Yes, v2 works.
> > > > > > >
> > > > > >
> > > > > > Any performance differences?
> > > > >
> > > > > It should generate the same code. It's just using standard operators instead
> > > > > of intrinsics.
> > > >
> > > > Has it been verified?
> > >
> > > You are right there are some diffs.
> > >
> > > I took a look at the ASM and think the new code is probably fine, but I will
> > > run a quick benchmark.
> >
> > Did 5 runs. Think the new one is fine and maybe even gets slightly better
> > codegen:
>
> Please update the commit log to remove "the same code".

Done in v3.
>
> Thanks.
>
> > N = 5 runs.
> > Geometric Mean of All Results (New / Old): 0.958
> > Results For: __strstr_avx512
> > len_haystack,align_haystack ,align_needle ,fail ,len_needle ,New / Old
> > 8           ,1              ,3            ,0    ,1          ,0.834
> > 8           ,0              ,9            ,1    ,1          ,1.07
> > 9           ,1              ,3            ,0    ,1          ,1.031
> > 9           ,0              ,9            ,1    ,1          ,0.943
> > 8           ,4081           ,9            ,1    ,1          ,0.96
> > 16          ,1              ,3            ,0    ,2          ,0.824
> > 16          ,0              ,9            ,1    ,2          ,1.018
> > 17          ,1              ,3            ,0    ,2          ,0.979
> > 17          ,0              ,9            ,1    ,2          ,1.019
> > 16          ,4081           ,9            ,1    ,2          ,1.015
> > 24          ,1              ,3            ,0    ,3          ,0.979
> > 24          ,0              ,9            ,1    ,3          ,0.972
> > 25          ,1              ,3            ,0    ,3          ,0.99
> > 25          ,0              ,9            ,1    ,3          ,0.998
> > 24          ,4081           ,9            ,1    ,3          ,1.031
> > 32          ,1              ,3            ,0    ,4          ,1.004
> > 32          ,0              ,9            ,1    ,4          ,0.934
> > 33          ,1              ,3            ,0    ,4          ,1.015
> > 33          ,0              ,9            ,1    ,4          ,0.98
> > 32          ,4081           ,9            ,1    ,4          ,1.019
> > 40          ,1              ,3            ,0    ,5          ,1.016
> > 40          ,0              ,9            ,1    ,5          ,1.017
> > 41          ,1              ,3            ,0    ,5          ,1.059
> > 41          ,0              ,9            ,1    ,5          ,1.011
> > 40          ,4081           ,9            ,1    ,5          ,0.989
> > 48          ,1              ,3            ,0    ,6          ,0.985
> > 48          ,0              ,9            ,1    ,6          ,1.016
> > 49          ,1              ,3            ,0    ,6          ,0.979
> > 49          ,0              ,9            ,1    ,6          ,1.041
> > 48          ,4081           ,9            ,1    ,6          ,0.975
> > 56          ,1              ,3            ,0    ,7          ,1.017
> > 56          ,0              ,9            ,1    ,7          ,1.004
> > 57          ,1              ,3            ,0    ,7          ,1.049
> > 57          ,0              ,9            ,1    ,7          ,0.985
> > 56          ,4081           ,9            ,1    ,7          ,0.949
> > 64          ,1              ,3            ,0    ,8          ,1.152
> > 64          ,0              ,9            ,1    ,8          ,1.583
> > 65          ,1              ,3            ,0    ,8          ,1.076
> > 65          ,0              ,9            ,1    ,8          ,1.013
> > 64          ,4081           ,9            ,1    ,8          ,1.038
> > 96          ,1              ,3            ,0    ,9          ,1.007
> > 96          ,0              ,9            ,1    ,9          ,1.001
> > 97          ,1              ,3            ,0    ,9          ,0.973
> > 97          ,0              ,9            ,1    ,9          ,1.021
> > 96          ,4081           ,9            ,1    ,9          ,1.026
> > 128         ,1              ,3            ,0    ,10         ,1.012
> > 128         ,0              ,9            ,1    ,10         ,1.006
> > 129         ,1              ,3            ,0    ,10         ,0.959
> > 129         ,0              ,9            ,1    ,10         ,1.041
> > 128         ,4081           ,9            ,1    ,10         ,0.956
> > 160         ,1              ,3            ,0    ,11         ,0.967
> > 160         ,0              ,9            ,1    ,11         ,1.008
> > 161         ,1              ,3            ,0    ,11         ,0.97
> > 161         ,0              ,9            ,1    ,11         ,0.982
> > 160         ,4081           ,9            ,1    ,11         ,0.999
> > 192         ,1              ,3            ,0    ,12         ,1.057
> > 192         ,0              ,9            ,1    ,12         ,0.973
> > 193         ,1              ,3            ,0    ,12         ,0.956
> > 193         ,0              ,9            ,1    ,12         ,0.982
> > 192         ,4081           ,9            ,1    ,12         ,0.955
> > 224         ,1              ,3            ,0    ,13         ,0.979
> > 224         ,0              ,9            ,1    ,13         ,0.995
> > 225         ,1              ,3            ,0    ,13         ,0.966
> > 225         ,0              ,9            ,1    ,13         ,0.973
> > 224         ,4081           ,9            ,1    ,13         ,1.011
> > 256         ,1              ,3            ,0    ,14         ,0.981
> > 256         ,0              ,9            ,1    ,14         ,1.019
> > 257         ,1              ,3            ,0    ,14         ,0.905
> > 257         ,0              ,9            ,1    ,14         ,0.99
> > 256         ,4081           ,9            ,1    ,14         ,0.994
> > 288         ,1              ,3            ,0    ,15         ,0.963
> > 288         ,0              ,9            ,1    ,15         ,0.981
> > 289         ,1              ,3            ,0    ,15         ,0.97
> > 289         ,0              ,9            ,1    ,15         ,0.855
> > 288         ,4081           ,9            ,1    ,15         ,0.935
> > 320         ,1              ,3            ,0    ,16         ,0.962
> > 320         ,0              ,9            ,1    ,16         ,0.978
> > 321         ,1              ,3            ,0    ,16         ,0.952
> > 321         ,0              ,9            ,1    ,16         ,0.949
> > 320         ,4081           ,9            ,1    ,16         ,0.983
> > 256         ,1              ,11           ,0    ,4          ,0.967
> > 256         ,14             ,5            ,1    ,4          ,0.997
> > 257         ,1              ,11           ,0    ,5          ,1.008
> > 257         ,14             ,5            ,1    ,5          ,0.97
> > 257         ,1              ,11           ,0    ,4          ,1.001
> > 257         ,14             ,5            ,1    ,4          ,0.985
> > 257         ,4081           ,5            ,1    ,4          ,1.0
> > 256         ,1              ,11           ,0    ,8          ,0.958
> > 256         ,14             ,5            ,1    ,8          ,0.967
> > 257         ,1              ,11           ,0    ,9          ,0.991
> > 257         ,14             ,5            ,1    ,9          ,0.91
> > 257         ,1              ,11           ,0    ,8          ,0.974
> > 257         ,14             ,5            ,1    ,8          ,0.999
> > 257         ,4081           ,5            ,1    ,8          ,0.958
> > 256         ,1              ,11           ,0    ,16         ,0.957
> > 256         ,14             ,5            ,1    ,16         ,1.009
> > 257         ,1              ,11           ,0    ,17         ,0.983
> > 257         ,14             ,5            ,1    ,17         ,0.94
> > 257         ,1              ,11           ,0    ,16         ,0.956
> > 257         ,14             ,5            ,1    ,16         ,0.99
> > 257         ,4081           ,5            ,1    ,16         ,1.008
> > 256         ,1              ,11           ,0    ,32         ,0.982
> > 256         ,14             ,5            ,1    ,32         ,0.977
> > 257         ,1              ,11           ,0    ,33         ,0.942
> > 257         ,14             ,5            ,1    ,33         ,0.912
> > 257         ,1              ,11           ,0    ,32         ,1.024
> > 257         ,14             ,5            ,1    ,32         ,0.974
> > 257         ,4081           ,5            ,1    ,32         ,0.953
> > 256         ,1              ,11           ,0    ,64         ,0.955
> > 256         ,14             ,5            ,1    ,64         ,0.938
> > 257         ,1              ,11           ,0    ,65         ,0.916
> > 257         ,14             ,5            ,1    ,65         ,0.942
> > 257         ,1              ,11           ,0    ,64         ,0.94
> > 257         ,14             ,5            ,1    ,64         ,0.952
> > 257         ,4081           ,5            ,1    ,64         ,0.989
> > 256         ,1              ,11           ,0    ,128        ,0.87
> > 256         ,14             ,5            ,1    ,128        ,1.044
> > 257         ,1              ,11           ,0    ,129        ,0.933
> > 257         ,14             ,5            ,1    ,129        ,0.972
> > 257         ,1              ,11           ,0    ,128        ,0.919
> > 257         ,14             ,5            ,1    ,128        ,0.911
> > 257         ,4081           ,5            ,1    ,128        ,0.958
> > 256         ,1              ,11           ,0    ,256        ,1.069
> > 256         ,14             ,5            ,1    ,256        ,0.984
> > 257         ,1              ,11           ,0    ,257        ,1.067
> > 257         ,14             ,5            ,1    ,257        ,0.969
> > 257         ,1              ,11           ,0    ,256        ,1.081
> > 257         ,14             ,5            ,1    ,256        ,0.936
> > 257         ,4081           ,5            ,1    ,256        ,0.975
> > 512         ,1              ,11           ,0    ,4          ,0.984
> > 512         ,14             ,5            ,1    ,4          ,0.955
> > 513         ,1              ,11           ,0    ,5          ,0.976
> > 513         ,14             ,5            ,1    ,5          ,1.221
> > 513         ,1              ,11           ,0    ,4          ,0.983
> > 513         ,14             ,5            ,1    ,4          ,0.989
> > 513         ,4081           ,5            ,1    ,4          ,0.962
> > 512         ,1              ,11           ,0    ,8          ,0.949
> > 512         ,14             ,5            ,1    ,8          ,0.957
> > 513         ,1              ,11           ,0    ,9          ,0.959
> > 513         ,14             ,5            ,1    ,9          ,0.972
> > 513         ,1              ,11           ,0    ,8          ,0.977
> > 513         ,14             ,5            ,1    ,8          ,0.951
> > 513         ,4081           ,5            ,1    ,8          ,0.957
> > 512         ,1              ,11           ,0    ,16         ,0.975
> > 512         ,14             ,5            ,1    ,16         ,0.953
> > 513         ,1              ,11           ,0    ,17         ,0.898
> > 513         ,14             ,5            ,1    ,17         ,0.97
> > 513         ,1              ,11           ,0    ,16         ,0.941
> > 513         ,14             ,5            ,1    ,16         ,0.948
> > 513         ,4081           ,5            ,1    ,16         ,0.948
> > 512         ,1              ,11           ,0    ,32         ,0.917
> > 512         ,14             ,5            ,1    ,32         ,0.963
> > 513         ,1              ,11           ,0    ,33         ,0.937
> > 513         ,14             ,5            ,1    ,33         ,0.969
> > 513         ,1              ,11           ,0    ,32         ,0.995
> > 513         ,14             ,5            ,1    ,32         ,1.001
> > 513         ,4081           ,5            ,1    ,32         ,0.942
> > 512         ,1              ,11           ,0    ,64         ,0.943
> > 512         ,14             ,5            ,1    ,64         ,0.972
> > 513         ,1              ,11           ,0    ,65         ,0.979
> > 513         ,14             ,5            ,1    ,65         ,1.008
> > 513         ,1              ,11           ,0    ,64         ,0.938
> > 513         ,14             ,5            ,1    ,64         ,0.98
> > 513         ,4081           ,5            ,1    ,64         ,0.988
> > 512         ,1              ,11           ,0    ,128        ,0.91
> > 512         ,14             ,5            ,1    ,128        ,0.964
> > 513         ,1              ,11           ,0    ,129        ,0.918
> > 513         ,14             ,5            ,1    ,129        ,0.958
> > 513         ,1              ,11           ,0    ,128        ,0.931
> > 513         ,14             ,5            ,1    ,128        ,0.957
> > 513         ,4081           ,5            ,1    ,128        ,0.956
> > 512         ,1              ,11           ,0    ,256        ,0.835
> > 512         ,14             ,5            ,1    ,256        ,0.955
> > 513         ,1              ,11           ,0    ,257        ,0.842
> > 513         ,14             ,5            ,1    ,257        ,0.959
> > 513         ,1              ,11           ,0    ,256        ,0.832
> > 513         ,14             ,5            ,1    ,256        ,0.879
> > 513         ,4081           ,5            ,1    ,256        ,0.95
> > 1024        ,1              ,11           ,0    ,4          ,0.929
> > 1024        ,14             ,5            ,1    ,4          ,0.965
> > 1025        ,1              ,11           ,0    ,5          ,0.969
> > 1025        ,14             ,5            ,1    ,5          ,0.951
> > 1025        ,1              ,11           ,0    ,4          ,0.943
> > 1025        ,14             ,5            ,1    ,4          ,0.973
> > 1025        ,4081           ,5            ,1    ,4          ,0.982
> > 1024        ,1              ,11           ,0    ,8          ,0.881
> > 1024        ,14             ,5            ,1    ,8          ,0.924
> > 1025        ,1              ,11           ,0    ,9          ,0.971
> > 1025        ,14             ,5            ,1    ,9          ,0.948
> > 1025        ,1              ,11           ,0    ,8          ,0.968
> > 1025        ,14             ,5            ,1    ,8          ,0.967
> > 1025        ,4081           ,5            ,1    ,8          ,0.965
> > 1024        ,1              ,11           ,0    ,16         ,0.959
> > 1024        ,14             ,5            ,1    ,16         ,0.957
> > 1025        ,1              ,11           ,0    ,17         ,0.984
> > 1025        ,14             ,5            ,1    ,17         ,0.964
> > 1025        ,1              ,11           ,0    ,16         ,0.971
> > 1025        ,14             ,5            ,1    ,16         ,0.977
> > 1025        ,4081           ,5            ,1    ,16         ,0.97
> > 1024        ,1              ,11           ,0    ,32         ,0.946
> > 1024        ,14             ,5            ,1    ,32         ,0.967
> > 1025        ,1              ,11           ,0    ,33         ,0.922
> > 1025        ,14             ,5            ,1    ,33         ,0.843
> > 1025        ,1              ,11           ,0    ,32         ,0.954
> > 1025        ,14             ,5            ,1    ,32         ,0.98
> > 1025        ,4081           ,5            ,1    ,32         ,0.97
> > 1024        ,1              ,11           ,0    ,64         ,0.941
> > 1024        ,14             ,5            ,1    ,64         ,0.958
> > 1025        ,1              ,11           ,0    ,65         ,0.931
> > 1025        ,14             ,5            ,1    ,65         ,0.866
> > 1025        ,1              ,11           ,0    ,64         ,0.944
> > 1025        ,14             ,5            ,1    ,64         ,0.915
> > 1025        ,4081           ,5            ,1    ,64         ,0.956
> > 1024        ,1              ,11           ,0    ,128        ,0.827
> > 1024        ,14             ,5            ,1    ,128        ,0.938
> > 1025        ,1              ,11           ,0    ,129        ,0.921
> > 1025        ,14             ,5            ,1    ,129        ,0.95
> > 1025        ,1              ,11           ,0    ,128        ,0.912
> > 1025        ,14             ,5            ,1    ,128        ,1.005
> > 1025        ,4081           ,5            ,1    ,128        ,0.965
> > 1024        ,1              ,11           ,0    ,256        ,0.835
> > 1024        ,14             ,5            ,1    ,256        ,0.967
> > 1025        ,1              ,11           ,0    ,257        ,0.85
> > 1025        ,14             ,5            ,1    ,257        ,0.96
> > 1025        ,1              ,11           ,0    ,256        ,0.83
> > 1025        ,14             ,5            ,1    ,256        ,0.948
> > 1025        ,4081           ,5            ,1    ,256        ,0.946
> > 2048        ,1              ,11           ,0    ,4          ,0.933
> > 2048        ,14             ,5            ,1    ,4          ,0.913
> > 2049        ,1              ,11           ,0    ,5          ,0.962
> > 2049        ,14             ,5            ,1    ,5          ,0.961
> > 2049        ,1              ,11           ,0    ,4          ,0.866
> > 2049        ,14             ,5            ,1    ,4          ,1.05
> > 2049        ,4081           ,5            ,1    ,4          ,0.995
> > 2048        ,1              ,11           ,0    ,8          ,0.929
> > 2048        ,14             ,5            ,1    ,8          ,0.976
> > 2049        ,1              ,11           ,0    ,9          ,0.899
> > 2049        ,14             ,5            ,1    ,9          ,0.995
> > 2049        ,1              ,11           ,0    ,8          ,0.965
> > 2049        ,14             ,5            ,1    ,8          ,1.03
> > 2049        ,4081           ,5            ,1    ,8          ,0.958
> > 2048        ,1              ,11           ,0    ,16         ,0.858
> > 2048        ,14             ,5            ,1    ,16         ,0.963
> > 2049        ,1              ,11           ,0    ,17         ,0.97
> > 2049        ,14             ,5            ,1    ,17         ,0.875
> > 2049        ,1              ,11           ,0    ,16         ,0.862
> > 2049        ,14             ,5            ,1    ,16         ,0.97
> > 2049        ,4081           ,5            ,1    ,16         ,0.937
> > 2048        ,1              ,11           ,0    ,32         ,0.919
> > 2048        ,14             ,5            ,1    ,32         ,0.875
> > 2049        ,1              ,11           ,0    ,33         ,0.955
> > 2049        ,14             ,5            ,1    ,33         ,0.96
> > 2049        ,1              ,11           ,0    ,32         ,0.937
> > 2049        ,14             ,5            ,1    ,32         ,1.028
> > 2049        ,4081           ,5            ,1    ,32         ,0.94
> > 2048        ,1              ,11           ,0    ,64         ,0.966
> > 2048        ,14             ,5            ,1    ,64         ,0.944
> > 2049        ,1              ,11           ,0    ,65         ,0.948
> > 2049        ,14             ,5            ,1    ,65         ,0.947
> > 2049        ,1              ,11           ,0    ,64         ,1.008
> > 2049        ,14             ,5            ,1    ,64         ,0.98
> > 2049        ,4081           ,5            ,1    ,64         ,0.974
> > 2048        ,1              ,11           ,0    ,128        ,0.97
> > 2048        ,14             ,5            ,1    ,128        ,0.947
> > 2049        ,1              ,11           ,0    ,129        ,0.95
> > 2049        ,14             ,5            ,1    ,129        ,0.939
> > 2049        ,1              ,11           ,0    ,128        ,0.852
> > 2049        ,14             ,5            ,1    ,128        ,0.742
> > 2049        ,4081           ,5            ,1    ,128        ,0.674
> > 2048        ,1              ,11           ,0    ,256        ,0.86
> > 2048        ,14             ,5            ,1    ,256        ,0.89
> > 2049        ,1              ,11           ,0    ,257        ,0.76
> > 2049        ,14             ,5            ,1    ,257        ,0.886
> > 2049        ,1              ,11           ,0    ,256        ,0.876
> > 2049        ,14             ,5            ,1    ,256        ,0.915
> > 2049        ,4081           ,5            ,1    ,256        ,0.942
> > 4096        ,1              ,11           ,0    ,4          ,0.926
> > 4096        ,14             ,5            ,1    ,4          ,0.927
> > 4097        ,1              ,11           ,0    ,5          ,0.965
> > 4097        ,14             ,5            ,1    ,5          ,0.978
> > 4097        ,1              ,11           ,0    ,4          ,0.973
> > 4097        ,14             ,5            ,1    ,4          ,0.997
> > 4097        ,4081           ,5            ,1    ,4          ,0.982
> > 4096        ,1              ,11           ,0    ,8          ,0.948
> > 4096        ,14             ,5            ,1    ,8          ,0.937
> > 4097        ,1              ,11           ,0    ,9          ,0.949
> > 4097        ,14             ,5            ,1    ,9          ,0.964
> > 4097        ,1              ,11           ,0    ,8          ,0.998
> > 4097        ,14             ,5            ,1    ,8          ,0.98
> > 4097        ,4081           ,5            ,1    ,8          ,0.972
> > 4096        ,1              ,11           ,0    ,16         ,0.95
> > 4096        ,14             ,5            ,1    ,16         ,0.954
> > 4097        ,1              ,11           ,0    ,17         ,0.978
> > 4097        ,14             ,5            ,1    ,17         ,0.964
> > 4097        ,1              ,11           ,0    ,16         ,0.962
> > 4097        ,14             ,5            ,1    ,16         ,0.952
> > 4097        ,4081           ,5            ,1    ,16         ,0.944
> > 4096        ,1              ,11           ,0    ,32         ,0.965
> > 4096        ,14             ,5            ,1    ,32         ,0.971
> > 4097        ,1              ,11           ,0    ,33         ,0.996
> > 4097        ,14             ,5            ,1    ,33         ,0.976
> > 4097        ,1              ,11           ,0    ,32         ,1.029
> > 4097        ,14             ,5            ,1    ,32         ,1.255
> > 4097        ,4081           ,5            ,1    ,32         ,0.992
> > 4096        ,1              ,11           ,0    ,64         ,1.0
> > 4096        ,14             ,5            ,1    ,64         ,0.98
> > 4097        ,1              ,11           ,0    ,65         ,0.996
> > 4097        ,14             ,5            ,1    ,65         ,0.985
> > 4097        ,1              ,11           ,0    ,64         ,0.98
> > 4097        ,14             ,5            ,1    ,64         ,0.966
> > 4097        ,4081           ,5            ,1    ,64         ,0.955
> > 4096        ,1              ,11           ,0    ,128        ,0.914
> > 4096        ,14             ,5            ,1    ,128        ,0.897
> > 4097        ,1              ,11           ,0    ,129        ,0.93
> > 4097        ,14             ,5            ,1    ,129        ,0.98
> > 4097        ,1              ,11           ,0    ,128        ,0.94
> > 4097        ,14             ,5            ,1    ,128        ,0.965
> > 4097        ,4081           ,5            ,1    ,128        ,0.964
> > 4096        ,1              ,11           ,0    ,256        ,0.904
> > 4096        ,14             ,5            ,1    ,256        ,0.981
> > 4097        ,1              ,11           ,0    ,257        ,0.968
> > 4097        ,14             ,5            ,1    ,257        ,0.921
> > 4097        ,1              ,11           ,0    ,256        ,0.884
> > 4097        ,14             ,5            ,1    ,256        ,0.925
> > 4097        ,4081           ,5            ,1    ,256        ,0.94
> > 8192        ,1              ,11           ,0    ,4          ,0.961
> > 8192        ,14             ,5            ,1    ,4          ,0.959
> > 8193        ,1              ,11           ,0    ,5          ,0.927
> > 8193        ,14             ,5            ,1    ,5          ,0.961
> > 8193        ,1              ,11           ,0    ,4          ,0.938
> > 8193        ,14             ,5            ,1    ,4          ,0.956
> > 8193        ,4081           ,5            ,1    ,4          ,0.979
> > 8192        ,1              ,11           ,0    ,8          ,0.962
> > 8192        ,14             ,5            ,1    ,8          ,1.139
> > 8193        ,1              ,11           ,0    ,9          ,1.178
> > 8193        ,14             ,5            ,1    ,9          ,0.95
> > 8193        ,1              ,11           ,0    ,8          ,0.97
> > 8193        ,14             ,5            ,1    ,8          ,0.935
> > 8193        ,4081           ,5            ,1    ,8          ,0.999
> > 8192        ,1              ,11           ,0    ,16         ,0.908
> > 8192        ,14             ,5            ,1    ,16         ,0.907
> > 8193        ,1              ,11           ,0    ,17         ,0.997
> > 8193        ,14             ,5            ,1    ,17         ,0.838
> > 8193        ,1              ,11           ,0    ,16         ,0.968
> > 8193        ,14             ,5            ,1    ,16         ,0.713
> > 8193        ,4081           ,5            ,1    ,16         ,0.98
> > 8192        ,1              ,11           ,0    ,32         ,1.01
> > 8192        ,14             ,5            ,1    ,32         ,0.949
> > 8193        ,1              ,11           ,0    ,33         ,0.958
> > 8193        ,14             ,5            ,1    ,33         ,0.941
> > 8193        ,1              ,11           ,0    ,32         ,0.953
> > 8193        ,14             ,5            ,1    ,32         ,0.929
> > 8193        ,4081           ,5            ,1    ,32         ,0.97
> > 8192        ,1              ,11           ,0    ,64         ,0.95
> > 8192        ,14             ,5            ,1    ,64         ,0.913
> > 8193        ,1              ,11           ,0    ,65         ,0.958
> > 8193        ,14             ,5            ,1    ,65         ,0.934
> > 8193        ,1              ,11           ,0    ,64         ,0.905
> > 8193        ,14             ,5            ,1    ,64         ,0.89
> > 8193        ,4081           ,5            ,1    ,64         ,0.885
> > 8192        ,1              ,11           ,0    ,128        ,0.885
> > 8192        ,14             ,5            ,1    ,128        ,0.934
> > 8193        ,1              ,11           ,0    ,129        ,0.933
> > 8193        ,14             ,5            ,1    ,129        ,0.985
> > 8193        ,1              ,11           ,0    ,128        ,0.913
> > 8193        ,14             ,5            ,1    ,128        ,0.931
> > 8193        ,4081           ,5            ,1    ,128        ,0.967
> > 8192        ,1              ,11           ,0    ,256        ,0.903
> > 8192        ,14             ,5            ,1    ,256        ,0.908
> > 8193        ,1              ,11           ,0    ,257        ,0.915
> > 8193        ,14             ,5            ,1    ,257        ,0.969
> > 8193        ,1              ,11           ,0    ,256        ,0.918
> > 8193        ,14             ,5            ,1    ,256        ,0.92
> > 8193        ,4081           ,5            ,1    ,256        ,0.967
> > 16384       ,1              ,11           ,0    ,4          ,0.99
> > 16384       ,14             ,5            ,1    ,4          ,0.961
> > 16385       ,1              ,11           ,0    ,5          ,0.862
> > 16385       ,14             ,5            ,1    ,5          ,0.836
> > 16385       ,1              ,11           ,0    ,4          ,0.969
> > 16385       ,14             ,5            ,1    ,4          ,0.97
> > 16385       ,4081           ,5            ,1    ,4          ,0.973
> > 16384       ,1              ,11           ,0    ,8          ,0.931
> > 16384       ,14             ,5            ,1    ,8          ,0.953
> > 16385       ,1              ,11           ,0    ,9          ,0.923
> > 16385       ,14             ,5            ,1    ,9          ,0.821
> > 16385       ,1              ,11           ,0    ,8          ,0.829
> > 16385       ,14             ,5            ,1    ,8          ,0.953
> > 16385       ,4081           ,5            ,1    ,8          ,0.953
> > 16384       ,1              ,11           ,0    ,16         ,0.951
> > 16384       ,14             ,5            ,1    ,16         ,0.932
> > 16385       ,1              ,11           ,0    ,17         ,0.954
> > 16385       ,14             ,5            ,1    ,17         ,0.981
> > 16385       ,1              ,11           ,0    ,16         ,0.955
> > 16385       ,14             ,5            ,1    ,16         ,0.982
> > 16385       ,4081           ,5            ,1    ,16         ,0.951
> > 16384       ,1              ,11           ,0    ,32         ,0.995
> > 16384       ,14             ,5            ,1    ,32         ,0.982
> > 16385       ,1              ,11           ,0    ,33         ,0.967
> > 16385       ,14             ,5            ,1    ,33         ,0.945
> > 16385       ,1              ,11           ,0    ,32         ,0.953
> > 16385       ,14             ,5            ,1    ,32         ,0.942
> > 16385       ,4081           ,5            ,1    ,32         ,0.967
> > 16384       ,1              ,11           ,0    ,64         ,0.962
> > 16384       ,14             ,5            ,1    ,64         ,0.957
> > 16385       ,1              ,11           ,0    ,65         ,1.011
> > 16385       ,14             ,5            ,1    ,65         ,0.931
> > 16385       ,1              ,11           ,0    ,64         ,0.965
> > 16385       ,14             ,5            ,1    ,64         ,0.947
> > 16385       ,4081           ,5            ,1    ,64         ,0.96
> > 16384       ,1              ,11           ,0    ,128        ,0.931
> > 16384       ,14             ,5            ,1    ,128        ,0.935
> > 16385       ,1              ,11           ,0    ,129        ,0.948
> > 16385       ,14             ,5            ,1    ,129        ,0.943
> > 16385       ,1              ,11           ,0    ,128        ,0.94
> > 16385       ,14             ,5            ,1    ,128        ,0.841
> > 16385       ,4081           ,5            ,1    ,128        ,0.97
> > 16384       ,1              ,11           ,0    ,256        ,0.911
> > 16384       ,14             ,5            ,1    ,256        ,0.944
> > 16385       ,1              ,11           ,0    ,257        ,0.913
> > 16385       ,14             ,5            ,1    ,257        ,0.925
> > 16385       ,1              ,11           ,0    ,256        ,0.846
> > 16385       ,14             ,5            ,1    ,256        ,0.966
> > 16385       ,4081           ,5            ,1    ,256        ,1.037
> > 32768       ,1              ,11           ,0    ,4          ,0.984
> > 32768       ,14             ,5            ,1    ,4          ,1.015
> > 32769       ,1              ,11           ,0    ,5          ,0.972
> > 32769       ,14             ,5            ,1    ,5          ,0.958
> > 32769       ,1              ,11           ,0    ,4          ,0.955
> > 32769       ,14             ,5            ,1    ,4          ,0.95
> > 32769       ,4081           ,5            ,1    ,4          ,1.041
> > 32768       ,1              ,11           ,0    ,8          ,0.934
> > 32768       ,14             ,5            ,1    ,8          ,1.067
> > 32769       ,1              ,11           ,0    ,9          ,1.003
> > 32769       ,14             ,5            ,1    ,9          ,1.089
> > 32769       ,1              ,11           ,0    ,8          ,1.0
> > 32769       ,14             ,5            ,1    ,8          ,1.068
> > 32769       ,4081           ,5            ,1    ,8          ,0.986
> > 32768       ,1              ,11           ,0    ,16         ,0.994
> > 32768       ,14             ,5            ,1    ,16         ,0.994
> > 32769       ,1              ,11           ,0    ,17         ,1.008
> > 32769       ,14             ,5            ,1    ,17         ,0.95
> > 32769       ,1              ,11           ,0    ,16         ,0.953
> > 32769       ,14             ,5            ,1    ,16         ,0.954
> > 32769       ,4081           ,5            ,1    ,16         ,0.956
> > 32768       ,1              ,11           ,0    ,32         ,1.037
> > 32768       ,14             ,5            ,1    ,32         ,0.751
> > 32769       ,1              ,11           ,0    ,33         ,0.769
> > 32769       ,14             ,5            ,1    ,33         ,0.906
> > 32769       ,1              ,11           ,0    ,32         ,0.867
> > 32769       ,14             ,5            ,1    ,32         ,0.919
> > 32769       ,4081           ,5            ,1    ,32         ,1.145
> > 32768       ,1              ,11           ,0    ,64         ,0.909
> > 32768       ,14             ,5            ,1    ,64         ,0.947
> > 32769       ,1              ,11           ,0    ,65         ,0.896
> > 32769       ,14             ,5            ,1    ,65         ,0.964
> > 32769       ,1              ,11           ,0    ,64         ,0.996
> > 32769       ,14             ,5            ,1    ,64         ,0.905
> > 32769       ,4081           ,5            ,1    ,64         ,0.996
> > 32768       ,1              ,11           ,0    ,128        ,0.856
> > 32768       ,14             ,5            ,1    ,128        ,1.061
> > 32769       ,1              ,11           ,0    ,129        ,1.027
> > 32769       ,14             ,5            ,1    ,129        ,0.886
> > 32769       ,1              ,11           ,0    ,128        ,0.957
> > 32769       ,14             ,5            ,1    ,128        ,0.97
> > 32769       ,4081           ,5            ,1    ,128        ,1.097
> > 32768       ,1              ,11           ,0    ,256        ,0.914
> > 32768       ,14             ,5            ,1    ,256        ,0.994
> > 32769       ,1              ,11           ,0    ,257        ,0.865
> > 32769       ,14             ,5            ,1    ,257        ,0.889
> > 32769       ,1              ,11           ,0    ,256        ,0.951
> > 32769       ,14             ,5            ,1    ,256        ,0.938
> > 32769       ,4081           ,5            ,1    ,256        ,0.972
> > 65536       ,1              ,11           ,0    ,4          ,1.02
> > 65536       ,14             ,5            ,1    ,4          ,0.962
> > 65537       ,1              ,11           ,0    ,5          ,1.006
> > 65537       ,14             ,5            ,1    ,5          ,0.959
> > 65537       ,1              ,11           ,0    ,4          ,0.949
> > 65537       ,14             ,5            ,1    ,4          ,0.945
> > 65537       ,4081           ,5            ,1    ,4          ,0.976
> > 65536       ,1              ,11           ,0    ,8          ,1.007
> > 65536       ,14             ,5            ,1    ,8          ,0.997
> > 65537       ,1              ,11           ,0    ,9          ,1.008
> > 65537       ,14             ,5            ,1    ,9          ,0.971
> > 65537       ,1              ,11           ,0    ,8          ,0.893
> > 65537       ,14             ,5            ,1    ,8          ,0.929
> > 65537       ,4081           ,5            ,1    ,8          ,0.956
> > 65536       ,1              ,11           ,0    ,16         ,0.921
> > 65536       ,14             ,5            ,1    ,16         ,0.909
> > 65537       ,1              ,11           ,0    ,17         ,0.986
> > 65537       ,14             ,5            ,1    ,17         ,0.962
> > 65537       ,1              ,11           ,0    ,16         ,0.93
> > 65537       ,14             ,5            ,1    ,16         ,0.947
> > 65537       ,4081           ,5            ,1    ,16         ,0.885
> > 65536       ,1              ,11           ,0    ,32         ,1.001
> > 65536       ,14             ,5            ,1    ,32         ,0.93
> > 65537       ,1              ,11           ,0    ,33         ,0.87
> > 65537       ,14             ,5            ,1    ,33         ,1.038
> > 65537       ,1              ,11           ,0    ,32         ,0.934
> > 65537       ,14             ,5            ,1    ,32         ,1.094
> > 65537       ,4081           ,5            ,1    ,32         ,0.997
> > 65536       ,1              ,11           ,0    ,64         ,0.975
> > 65536       ,14             ,5            ,1    ,64         ,0.964
> > 65537       ,1              ,11           ,0    ,65         ,1.027
> > 65537       ,14             ,5            ,1    ,65         ,0.942
> > 65537       ,1              ,11           ,0    ,64         ,0.996
> > 65537       ,14             ,5            ,1    ,64         ,0.938
> > 65537       ,4081           ,5            ,1    ,64         ,0.913
> > 65536       ,1              ,11           ,0    ,128        ,0.967
> > 65536       ,14             ,5            ,1    ,128        ,0.991
> > 65537       ,1              ,11           ,0    ,129        ,0.949
> > 65537       ,14             ,5            ,1    ,129        ,0.948
> > 65537       ,1              ,11           ,0    ,128        ,1.019
> > 65537       ,14             ,5            ,1    ,128        ,1.028
> > 65537       ,4081           ,5            ,1    ,128        ,0.978
> > 65536       ,1              ,11           ,0    ,256        ,0.956
> > 65536       ,14             ,5            ,1    ,256        ,0.932
> > 65537       ,1              ,11           ,0    ,257        ,0.982
> > 65537       ,14             ,5            ,1    ,257        ,0.972
> > 65537       ,1              ,11           ,0    ,256        ,0.933
> > 65537       ,14             ,5            ,1    ,256        ,0.947
> > 65537       ,4081           ,5            ,1    ,256        ,0.939
> > 65536       ,0              ,0            ,1    ,64         ,0.974
> > 65536       ,0              ,0            ,1    ,256        ,0.948
> > 65536       ,0              ,0            ,1    ,1024       ,0.917
> > >
> > >
> > > >
> > > > > >
> > > > > >
> > > > > > --
> > > > > > H.J.
> > > >
> > > >
> > > >
> > > > --
> > > > H.J.
>
>
>
> --
> H.J.
  

Patch

diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
index 2ab9e96db8..e44c1a05dc 100644
--- a/sysdeps/x86_64/multiarch/strstr-avx512.c
+++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
@@ -26,6 +26,10 @@ 
 #define ZMM_SIZE_IN_BYTES 64
 #define PAGESIZE 4096
 
+#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
+#define kshiftri_mask64(x, y) ((x) >> (y))
+#define kand_mask64(x, y) ((x) & (y))
+
 /*
  Returns the index of the first edge within the needle, returns 0 if no edge
  is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
@@ -133,15 +137,15 @@  __strstr_avx512 (const char *haystack, const char *ned)
   __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
   /* Search for NULL and compare only till null char */
   uint64_t nullmask
-      = _cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
+      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
   uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
-  cmpmask = cmpmask & _cvtmask64_u64 (loadmask);
+  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
   /* Search for the 2 charaters of needle */
   __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
   __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
-  k1 = _kshiftri_mask64 (k1, 1);
+  k1 = kshiftri_mask64 (k1, 1);
   /* k2 masks tell us if both chars from needle match */
-  uint64_t k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
+  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
   /* For every match, search for the entire needle for a full match */
   while (k2)
     {
@@ -178,13 +182,13 @@  __strstr_avx512 (const char *haystack, const char *ned)
       hay0 = _mm512_loadu_si512 (haystack + hay_index);
       hay1 = _mm512_load_si512 (haystack + hay_index
                                 + 1); // Always 64 byte aligned
-      nullmask = _cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
+      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
       /* Compare only till null char */
       cmpmask = nullmask ^ (nullmask - ONE_64BIT);
       k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
       k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
       /* k2 masks tell us if both chars from needle match */
-      k2 = _cvtmask64_u64 (_kand_mask64 (k0, k1)) & cmpmask;
+      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
       /* For every match, compare full strings for potential match */
       while (k2)
         {