[v10,6/6] elf: Optimize _dl_new_hash in dl-new-hash.h

Message ID 20220518172635.291119-6-goldstein.w.n@gmail.com
State Superseded
Headers
Series [v10,1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit fail Patch series failed to apply

Commit Message

Noah Goldstein May 18, 2022, 5:26 p.m. UTC
  Unroll slightly and enforce good instruction scheduling. This improves
performance on out-of-order machines. The unrolling allows for
pipelined multiplies.

As well, as an optional sysdep, reorder the operations and prevent
reassosiation for better scheduling and higher ILP. This commit
only adds the barrier for x86, although it should be either no
change or a win for any architecture.

Unrolling further started to induce slowdowns for sizes [0, 4]
but can help the loop so if larger sizes are the target further
unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=30 runs
Geometric of all benchmark New / Old: 0.674
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    2.865,     2.72,               1.053
 fixed,      1,    3.567,    2.489,               1.433
 fixed,      2,    2.577,    3.649,               0.706
 fixed,      3,    3.644,    5.983,               0.609
 fixed,      4,    4.211,    6.833,               0.616
 fixed,      5,    4.741,    9.372,               0.506
 fixed,      6,    5.415,    9.561,               0.566
 fixed,      7,    6.649,   10.789,               0.616
 fixed,      8,    8.081,   11.808,               0.684
 fixed,      9,    8.427,   12.935,               0.651
 fixed,     10,    8.673,   14.134,               0.614
 fixed,     11,    10.69,   15.408,               0.694
 fixed,     12,   10.789,   16.982,               0.635
 fixed,     13,   12.169,   18.411,               0.661
 fixed,     14,   12.659,   19.914,               0.636
 fixed,     15,   13.526,   21.541,               0.628
 fixed,     16,   14.211,   23.088,               0.616
 fixed,     32,   29.412,   52.722,               0.558
 fixed,     64,    65.41,  142.351,               0.459
 fixed,    128,  138.505,  295.625,               0.469
 fixed,    256,  291.707,  601.983,               0.485
random,      2,   12.698,   12.849,               0.988
random,      4,   16.065,   15.857,               1.013
random,      8,   19.564,   21.105,               0.927
random,     16,   23.919,   26.823,               0.892
random,     32,   31.987,   39.591,               0.808
random,     64,   49.282,   71.487,               0.689
random,    128,    82.23,  145.364,               0.566
random,    256,  152.209,  298.434,                0.51

Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
---
 benchtests/bench-dl-new-hash.c              |   3 +-
 elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
 elf/tst-dl-hash.c                           |   1 +
 sysdeps/generic/dl-new-hash.h               | 111 ++++++++++++++++++++
 sysdeps/x86/dl-new-hash.h                   |  24 +++++
 5 files changed, 146 insertions(+), 13 deletions(-)
 rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
 create mode 100644 sysdeps/generic/dl-new-hash.h
 create mode 100644 sysdeps/x86/dl-new-hash.h
  

Comments

H.J. Lu May 18, 2022, 5:32 p.m. UTC | #1
andOn Wed, May 18, 2022 at 10:26 AM Noah Goldstein
<goldstein.w.n@gmail.com> wrote:
>
> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. The unrolling allows for
> pipelined multiplies.
>
> As well, as an optional sysdep, reorder the operations and prevent
> reassosiation for better scheduling and higher ILP. This commit
> only adds the barrier for x86, although it should be either no
> change or a win for any architecture.
>
> Unrolling further started to induce slowdowns for sizes [0, 4]
> but can help the loop so if larger sizes are the target further
> unrolling can be beneficial.
>
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
>
> Time as Geometric Mean of N=30 runs
> Geometric of all benchmark New / Old: 0.674
>   type, length, New Time, Old Time, New Time / Old Time
>  fixed,      0,    2.865,     2.72,               1.053
>  fixed,      1,    3.567,    2.489,               1.433
>  fixed,      2,    2.577,    3.649,               0.706
>  fixed,      3,    3.644,    5.983,               0.609
>  fixed,      4,    4.211,    6.833,               0.616
>  fixed,      5,    4.741,    9.372,               0.506
>  fixed,      6,    5.415,    9.561,               0.566
>  fixed,      7,    6.649,   10.789,               0.616
>  fixed,      8,    8.081,   11.808,               0.684
>  fixed,      9,    8.427,   12.935,               0.651
>  fixed,     10,    8.673,   14.134,               0.614
>  fixed,     11,    10.69,   15.408,               0.694
>  fixed,     12,   10.789,   16.982,               0.635
>  fixed,     13,   12.169,   18.411,               0.661
>  fixed,     14,   12.659,   19.914,               0.636
>  fixed,     15,   13.526,   21.541,               0.628
>  fixed,     16,   14.211,   23.088,               0.616
>  fixed,     32,   29.412,   52.722,               0.558
>  fixed,     64,    65.41,  142.351,               0.459
>  fixed,    128,  138.505,  295.625,               0.469
>  fixed,    256,  291.707,  601.983,               0.485
> random,      2,   12.698,   12.849,               0.988
> random,      4,   16.065,   15.857,               1.013
> random,      8,   19.564,   21.105,               0.927
> random,     16,   23.919,   26.823,               0.892
> random,     32,   31.987,   39.591,               0.808
> random,     64,   49.282,   71.487,               0.689
> random,    128,    82.23,  145.364,               0.566
> random,    256,  152.209,  298.434,                0.51
>
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>  benchtests/bench-dl-new-hash.c              |   3 +-
>  elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
>  elf/tst-dl-hash.c                           |   1 +
>  sysdeps/generic/dl-new-hash.h               | 111 ++++++++++++++++++++
>  sysdeps/x86/dl-new-hash.h                   |  24 +++++
>  5 files changed, 146 insertions(+), 13 deletions(-)
>  rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
>  create mode 100644 sysdeps/generic/dl-new-hash.h
>  create mode 100644 sysdeps/x86/dl-new-hash.h
>
> diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> index 3c8a1d5a82..040fa7ce01 100644
> --- a/benchtests/bench-dl-new-hash.c
> +++ b/benchtests/bench-dl-new-hash.c
> @@ -16,7 +16,8 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <elf/dl-new-hash.h>
> +#include <dl-new-hash.h>
> +#include <elf/simple-dl-new-hash.h>
>  #define TEST_FUNC(x, y) _dl_new_hash (x)
>  #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
>
> diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
> similarity index 75%
> rename from elf/dl-new-hash.h
> rename to elf/simple-dl-new-hash.h
> index 8641bb4196..1437b1bd36 100644
> --- a/elf/dl-new-hash.h
> +++ b/elf/simple-dl-new-hash.h
> @@ -1,4 +1,4 @@
> -/* _dl_new_hash for elf symbol lookup
> +/* __simple_dl_new_hash for testing true elf symbol lookup.
>     Copyright (C) 2022 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
> @@ -16,16 +16,16 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#ifndef _DL_NEW_HASH_H
> -#define _DL_NEW_HASH_H 1
> +#ifndef _SIMPLE_DL_NEW_HASH_H
> +#define _SIMPLE_DL_NEW_HASH_H 1
>
>  #include <stdint.h>
> -/* For __always_inline.  */
> -#include <sys/cdefs.h>
>
> -static __always_inline uint32_t
> +/* For testing/benchmarking purposes.  Real implementation in
> +   sysdeps/generic/dl-new-hash.h.  */
> +static uint32_t
>  __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +__simple_dl_new_hash (const char *s)
>  {
>    uint32_t h = 5381;
>    for (unsigned char c = *s; c != '\0'; c = *++s)
> @@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
>    return h;
>  }
>
> -/* For testing/benchmarking purposes.  */
> -#define __simple_dl_new_hash _dl_new_hash
> -
> -
> -#endif /* dl-new-hash.h */
> +#endif /* simple-dl-new-hash.h */
> diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> index 8697eb73a0..b21766c63d 100644
> --- a/elf/tst-dl-hash.c
> +++ b/elf/tst-dl-hash.c
> @@ -18,6 +18,7 @@
>
>
>  #include <simple-dl-hash.h>
> +#include <simple-dl-new-hash.h>
>  #include <dl-hash.h>
>  #include <dl-new-hash.h>
>  #include <support/support.h>
> diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
> new file mode 100644
> index 0000000000..1faf309c97
> --- /dev/null
> +++ b/sysdeps/generic/dl-new-hash.h
> @@ -0,0 +1,111 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _DL_NEW_HASH_H
> +#define _DL_NEW_HASH_H 1
> +
> +#include <stdint.h>
> +/* For __always_inline.  */
> +#include <sys/cdefs.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>
> +
> +/* The simplest implementation of _dl_new_hash is:
> +
> +   _dl_new_hash (const char *s)
> +   {
> +      uint32_t h = 5381;
> +      for (unsigned char c = *s; c != '\0'; c = *++s)
> +        h = h * 33 + c;
> +      return h;
> +   }
> +
> +   We can get better performance by slightly unrolling the loop to
> +   pipeline the multiples, which gcc cannot easily do due to
> +   dependencies across iterations.
> +
> +   As well, as an architecture specific option we add asm statements
> +   to explicitly specify order of operations and prevent reassociation
> +   of instructions that lengthens the loop carried dependency. This
> +   may have no affect as the compiler may have ordered instructions
> +   the same way without it but in testing this has not been the case
> +   for GCC. Improving GCC to reliably schedule instructions ideally
> +   cannot be easily done.
> +
> +   Architecture(s) that use the reassociation barries are:
> +   x86
> +
> +   Note it is very unlikely the reassociation barriers would
> +   de-optimize performance on any architecture and with an imperfect
> +   compiler it may help performance, especially on out-of-order cpus,
> +   so it is suggested that the respective maintainers add them.
> +
> +   architecture maintainers are encouraged to benchmark this with
> +   __asm_reassociation_barrier defined to __asm__ like it is in x86.
> +*/
> +
> +
> +#ifndef __asm_reassociation_barrier
> +# define __asm_reassociation_barrier(...)
> +#endif
> +
> +static __always_inline uint32_t
> +__attribute__ ((unused))
> +_dl_new_hash (const char *str)
> +{
> +  const unsigned char *s = (const unsigned char *) str;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = s[0];
> +      /* Since hashed string is normally not empty, this is unlikely on the
> +        first iteration of the loop.  */
> +      if (__glibc_unlikely (c0 == 0))
> +       return h;
> +
> +      c1 = s[1];
> +      if (c1 == 0)
> +       {
> +         /* Ideal computational order is:
> +        c0 += h;
> +        h *= 32;
> +        h += c0;  */
> +         c0 += h;
> +         __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
> +         h = h * 32 + c0;
> +         return h;
> +       }
> +
> +      /* Ideal computational order is:
> +        c1 += c0;
> +        h *= 33 * 33;
> +        c0 *= 32;
> +        c1 += c0;
> +        h  += c1;  */
> +      c1 += c0;
> +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      __asm_reassociation_barrier("" : "+r"(c1));
> +      h += c1;
> +      s += 2;
> +    }
> +}
> +
> +#endif /* dl-new-hash.h */
> diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
> new file mode 100644
> index 0000000000..ce8fb5a838
> --- /dev/null
> +++ b/sysdeps/x86/dl-new-hash.h
> @@ -0,0 +1,24 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifdef __asm_reassociation_barrier
> +# error "__asm_reassociation_barrier should never already be defined."
> +#endif
> +
> +#define __asm_reassociation_barrier __asm__
> +#include <sysdeps/generic/dl-new-hash.h>
> --
> 2.34.1
>

Should the new _dl_new_hash be placed in sysdeps/x86/dl-new-hash.h
and leave the generic one unchanged?
  
Noah Goldstein May 18, 2022, 5:39 p.m. UTC | #2
On Wed, May 18, 2022 at 12:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> andOn Wed, May 18, 2022 at 10:26 AM Noah Goldstein
> <goldstein.w.n@gmail.com> wrote:
> >
> > Unroll slightly and enforce good instruction scheduling. This improves
> > performance on out-of-order machines. The unrolling allows for
> > pipelined multiplies.
> >
> > As well, as an optional sysdep, reorder the operations and prevent
> > reassosiation for better scheduling and higher ILP. This commit
> > only adds the barrier for x86, although it should be either no
> > change or a win for any architecture.
> >
> > Unrolling further started to induce slowdowns for sizes [0, 4]
> > but can help the loop so if larger sizes are the target further
> > unrolling can be beneficial.
> >
> > Results for _dl_new_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> >
> > Time as Geometric Mean of N=30 runs
> > Geometric of all benchmark New / Old: 0.674
> >   type, length, New Time, Old Time, New Time / Old Time
> >  fixed,      0,    2.865,     2.72,               1.053
> >  fixed,      1,    3.567,    2.489,               1.433
> >  fixed,      2,    2.577,    3.649,               0.706
> >  fixed,      3,    3.644,    5.983,               0.609
> >  fixed,      4,    4.211,    6.833,               0.616
> >  fixed,      5,    4.741,    9.372,               0.506
> >  fixed,      6,    5.415,    9.561,               0.566
> >  fixed,      7,    6.649,   10.789,               0.616
> >  fixed,      8,    8.081,   11.808,               0.684
> >  fixed,      9,    8.427,   12.935,               0.651
> >  fixed,     10,    8.673,   14.134,               0.614
> >  fixed,     11,    10.69,   15.408,               0.694
> >  fixed,     12,   10.789,   16.982,               0.635
> >  fixed,     13,   12.169,   18.411,               0.661
> >  fixed,     14,   12.659,   19.914,               0.636
> >  fixed,     15,   13.526,   21.541,               0.628
> >  fixed,     16,   14.211,   23.088,               0.616
> >  fixed,     32,   29.412,   52.722,               0.558
> >  fixed,     64,    65.41,  142.351,               0.459
> >  fixed,    128,  138.505,  295.625,               0.469
> >  fixed,    256,  291.707,  601.983,               0.485
> > random,      2,   12.698,   12.849,               0.988
> > random,      4,   16.065,   15.857,               1.013
> > random,      8,   19.564,   21.105,               0.927
> > random,     16,   23.919,   26.823,               0.892
> > random,     32,   31.987,   39.591,               0.808
> > random,     64,   49.282,   71.487,               0.689
> > random,    128,    82.23,  145.364,               0.566
> > random,    256,  152.209,  298.434,                0.51
> >
> > Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> > ---
> >  benchtests/bench-dl-new-hash.c              |   3 +-
> >  elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
> >  elf/tst-dl-hash.c                           |   1 +
> >  sysdeps/generic/dl-new-hash.h               | 111 ++++++++++++++++++++
> >  sysdeps/x86/dl-new-hash.h                   |  24 +++++
> >  5 files changed, 146 insertions(+), 13 deletions(-)
> >  rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
> >  create mode 100644 sysdeps/generic/dl-new-hash.h
> >  create mode 100644 sysdeps/x86/dl-new-hash.h
> >
> > diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> > index 3c8a1d5a82..040fa7ce01 100644
> > --- a/benchtests/bench-dl-new-hash.c
> > +++ b/benchtests/bench-dl-new-hash.c
> > @@ -16,7 +16,8 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#include <elf/dl-new-hash.h>
> > +#include <dl-new-hash.h>
> > +#include <elf/simple-dl-new-hash.h>
> >  #define TEST_FUNC(x, y) _dl_new_hash (x)
> >  #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
> >
> > diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
> > similarity index 75%
> > rename from elf/dl-new-hash.h
> > rename to elf/simple-dl-new-hash.h
> > index 8641bb4196..1437b1bd36 100644
> > --- a/elf/dl-new-hash.h
> > +++ b/elf/simple-dl-new-hash.h
> > @@ -1,4 +1,4 @@
> > -/* _dl_new_hash for elf symbol lookup
> > +/* __simple_dl_new_hash for testing true elf symbol lookup.
> >     Copyright (C) 2022 Free Software Foundation, Inc.
> >     This file is part of the GNU C Library.
> >
> > @@ -16,16 +16,16 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#ifndef _DL_NEW_HASH_H
> > -#define _DL_NEW_HASH_H 1
> > +#ifndef _SIMPLE_DL_NEW_HASH_H
> > +#define _SIMPLE_DL_NEW_HASH_H 1
> >
> >  #include <stdint.h>
> > -/* For __always_inline.  */
> > -#include <sys/cdefs.h>
> >
> > -static __always_inline uint32_t
> > +/* For testing/benchmarking purposes.  Real implementation in
> > +   sysdeps/generic/dl-new-hash.h.  */
> > +static uint32_t
> >  __attribute__ ((unused))
> > -_dl_new_hash (const char *s)
> > +__simple_dl_new_hash (const char *s)
> >  {
> >    uint32_t h = 5381;
> >    for (unsigned char c = *s; c != '\0'; c = *++s)
> > @@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
> >    return h;
> >  }
> >
> > -/* For testing/benchmarking purposes.  */
> > -#define __simple_dl_new_hash _dl_new_hash
> > -
> > -
> > -#endif /* dl-new-hash.h */
> > +#endif /* simple-dl-new-hash.h */
> > diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> > index 8697eb73a0..b21766c63d 100644
> > --- a/elf/tst-dl-hash.c
> > +++ b/elf/tst-dl-hash.c
> > @@ -18,6 +18,7 @@
> >
> >
> >  #include <simple-dl-hash.h>
> > +#include <simple-dl-new-hash.h>
> >  #include <dl-hash.h>
> >  #include <dl-new-hash.h>
> >  #include <support/support.h>
> > diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
> > new file mode 100644
> > index 0000000000..1faf309c97
> > --- /dev/null
> > +++ b/sysdeps/generic/dl-new-hash.h
> > @@ -0,0 +1,111 @@
> > +/* _dl_new_hash for elf symbol lookup
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _DL_NEW_HASH_H
> > +#define _DL_NEW_HASH_H 1
> > +
> > +#include <stdint.h>
> > +/* For __always_inline.  */
> > +#include <sys/cdefs.h>
> > +/* For __glibc_unlikely.  */
> > +#include <sys/cdefs.h>
> > +
> > +/* The simplest implementation of _dl_new_hash is:
> > +
> > +   _dl_new_hash (const char *s)
> > +   {
> > +      uint32_t h = 5381;
> > +      for (unsigned char c = *s; c != '\0'; c = *++s)
> > +        h = h * 33 + c;
> > +      return h;
> > +   }
> > +
> > +   We can get better performance by slightly unrolling the loop to
> > +   pipeline the multiples, which gcc cannot easily do due to
> > +   dependencies across iterations.
> > +
> > +   As well, as an architecture specific option we add asm statements
> > +   to explicitly specify order of operations and prevent reassociation
> > +   of instructions that lengthens the loop carried dependency. This
> > +   may have no affect as the compiler may have ordered instructions
> > +   the same way without it but in testing this has not been the case
> > +   for GCC. Improving GCC to reliably schedule instructions ideally
> > +   cannot be easily done.
> > +
> > +   Architecture(s) that use the reassociation barries are:
> > +   x86
> > +
> > +   Note it is very unlikely the reassociation barriers would
> > +   de-optimize performance on any architecture and with an imperfect
> > +   compiler it may help performance, especially on out-of-order cpus,
> > +   so it is suggested that the respective maintainers add them.
> > +
> > +   architecture maintainers are encouraged to benchmark this with
> > +   __asm_reassociation_barrier defined to __asm__ like it is in x86.
> > +*/
> > +
> > +
> > +#ifndef __asm_reassociation_barrier
> > +# define __asm_reassociation_barrier(...)
> > +#endif
> > +
> > +static __always_inline uint32_t
> > +__attribute__ ((unused))
> > +_dl_new_hash (const char *str)
> > +{
> > +  const unsigned char *s = (const unsigned char *) str;
> > +  unsigned int h = 5381;
> > +  unsigned int c0, c1;
> > +  for (;;)
> > +    {
> > +      c0 = s[0];
> > +      /* Since hashed string is normally not empty, this is unlikely on the
> > +        first iteration of the loop.  */
> > +      if (__glibc_unlikely (c0 == 0))
> > +       return h;
> > +
> > +      c1 = s[1];
> > +      if (c1 == 0)
> > +       {
> > +         /* Ideal computational order is:
> > +        c0 += h;
> > +        h *= 32;
> > +        h += c0;  */
> > +         c0 += h;
> > +         __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
> > +         h = h * 32 + c0;
> > +         return h;
> > +       }
> > +
> > +      /* Ideal computational order is:
> > +        c1 += c0;
> > +        h *= 33 * 33;
> > +        c0 *= 32;
> > +        c1 += c0;
> > +        h  += c1;  */
> > +      c1 += c0;
> > +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
> > +      h *= 33 * 33;
> > +      c1 += c0 * 32;
> > +      __asm_reassociation_barrier("" : "+r"(c1));
> > +      h += c1;
> > +      s += 2;
> > +    }
> > +}
> > +
> > +#endif /* dl-new-hash.h */
> > diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
> > new file mode 100644
> > index 0000000000..ce8fb5a838
> > --- /dev/null
> > +++ b/sysdeps/x86/dl-new-hash.h
> > @@ -0,0 +1,24 @@
> > +/* _dl_new_hash for elf symbol lookup
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifdef __asm_reassociation_barrier
> > +# error "__asm_reassociation_barrier should never already be defined."
> > +#endif
> > +
> > +#define __asm_reassociation_barrier __asm__
> > +#include <sysdeps/generic/dl-new-hash.h>
> > --
> > 2.34.1
> >
>
> Should the new _dl_new_hash be placed in sysdeps/x86/dl-new-hash.h
> and leave the generic one unchanged?

I think the expectation is the generic code is going to be a win across
the board. The sysdep aspect of the perf were the asm barriers.

But again, I've only benchmarks on x86.
>
> --
> H.J.
  
Siddhesh Poyarekar May 19, 2022, 7:53 a.m. UTC | #3
On 18/05/2022 23:02, H.J. Lu via Libc-alpha wrote:
> andOn Wed, May 18, 2022 at 10:26 AM Noah Goldstein
> <goldstein.w.n@gmail.com> wrote:
>>
>> Unroll slightly and enforce good instruction scheduling. This improves
>> performance on out-of-order machines. The unrolling allows for
>> pipelined multiplies.
>>
>> As well, as an optional sysdep, reorder the operations and prevent
>> reassosiation for better scheduling and higher ILP. This commit
>> only adds the barrier for x86, although it should be either no
>> change or a win for any architecture.
>>
>> Unrolling further started to induce slowdowns for sizes [0, 4]
>> but can help the loop so if larger sizes are the target further
>> unrolling can be beneficial.
>>
>> Results for _dl_new_hash
>> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
>>
>> Time as Geometric Mean of N=30 runs
>> Geometric of all benchmark New / Old: 0.674
>>    type, length, New Time, Old Time, New Time / Old Time
>>   fixed,      0,    2.865,     2.72,               1.053
>>   fixed,      1,    3.567,    2.489,               1.433
>>   fixed,      2,    2.577,    3.649,               0.706
>>   fixed,      3,    3.644,    5.983,               0.609
>>   fixed,      4,    4.211,    6.833,               0.616
>>   fixed,      5,    4.741,    9.372,               0.506
>>   fixed,      6,    5.415,    9.561,               0.566
>>   fixed,      7,    6.649,   10.789,               0.616
>>   fixed,      8,    8.081,   11.808,               0.684
>>   fixed,      9,    8.427,   12.935,               0.651
>>   fixed,     10,    8.673,   14.134,               0.614
>>   fixed,     11,    10.69,   15.408,               0.694
>>   fixed,     12,   10.789,   16.982,               0.635
>>   fixed,     13,   12.169,   18.411,               0.661
>>   fixed,     14,   12.659,   19.914,               0.636
>>   fixed,     15,   13.526,   21.541,               0.628
>>   fixed,     16,   14.211,   23.088,               0.616
>>   fixed,     32,   29.412,   52.722,               0.558
>>   fixed,     64,    65.41,  142.351,               0.459
>>   fixed,    128,  138.505,  295.625,               0.469
>>   fixed,    256,  291.707,  601.983,               0.485
>> random,      2,   12.698,   12.849,               0.988
>> random,      4,   16.065,   15.857,               1.013
>> random,      8,   19.564,   21.105,               0.927
>> random,     16,   23.919,   26.823,               0.892
>> random,     32,   31.987,   39.591,               0.808
>> random,     64,   49.282,   71.487,               0.689
>> random,    128,    82.23,  145.364,               0.566
>> random,    256,  152.209,  298.434,                0.51
>>
>> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
>> ---
>>   benchtests/bench-dl-new-hash.c              |   3 +-
>>   elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
>>   elf/tst-dl-hash.c                           |   1 +
>>   sysdeps/generic/dl-new-hash.h               | 111 ++++++++++++++++++++
>>   sysdeps/x86/dl-new-hash.h                   |  24 +++++
>>   5 files changed, 146 insertions(+), 13 deletions(-)
>>   rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
>>   create mode 100644 sysdeps/generic/dl-new-hash.h
>>   create mode 100644 sysdeps/x86/dl-new-hash.h
>>
>> diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
>> index 3c8a1d5a82..040fa7ce01 100644
>> --- a/benchtests/bench-dl-new-hash.c
>> +++ b/benchtests/bench-dl-new-hash.c
>> @@ -16,7 +16,8 @@
>>      License along with the GNU C Library; if not, see
>>      <https://www.gnu.org/licenses/>.  */
>>
>> -#include <elf/dl-new-hash.h>
>> +#include <dl-new-hash.h>
>> +#include <elf/simple-dl-new-hash.h>
>>   #define TEST_FUNC(x, y) _dl_new_hash (x)
>>   #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
>>
>> diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
>> similarity index 75%
>> rename from elf/dl-new-hash.h
>> rename to elf/simple-dl-new-hash.h
>> index 8641bb4196..1437b1bd36 100644
>> --- a/elf/dl-new-hash.h
>> +++ b/elf/simple-dl-new-hash.h
>> @@ -1,4 +1,4 @@
>> -/* _dl_new_hash for elf symbol lookup
>> +/* __simple_dl_new_hash for testing true elf symbol lookup.
>>      Copyright (C) 2022 Free Software Foundation, Inc.
>>      This file is part of the GNU C Library.
>>
>> @@ -16,16 +16,16 @@
>>      License along with the GNU C Library; if not, see
>>      <https://www.gnu.org/licenses/>.  */
>>
>> -#ifndef _DL_NEW_HASH_H
>> -#define _DL_NEW_HASH_H 1
>> +#ifndef _SIMPLE_DL_NEW_HASH_H
>> +#define _SIMPLE_DL_NEW_HASH_H 1
>>
>>   #include <stdint.h>
>> -/* For __always_inline.  */
>> -#include <sys/cdefs.h>
>>
>> -static __always_inline uint32_t
>> +/* For testing/benchmarking purposes.  Real implementation in
>> +   sysdeps/generic/dl-new-hash.h.  */
>> +static uint32_t
>>   __attribute__ ((unused))
>> -_dl_new_hash (const char *s)
>> +__simple_dl_new_hash (const char *s)
>>   {
>>     uint32_t h = 5381;
>>     for (unsigned char c = *s; c != '\0'; c = *++s)
>> @@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
>>     return h;
>>   }
>>
>> -/* For testing/benchmarking purposes.  */
>> -#define __simple_dl_new_hash _dl_new_hash
>> -
>> -
>> -#endif /* dl-new-hash.h */
>> +#endif /* simple-dl-new-hash.h */
>> diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
>> index 8697eb73a0..b21766c63d 100644
>> --- a/elf/tst-dl-hash.c
>> +++ b/elf/tst-dl-hash.c
>> @@ -18,6 +18,7 @@
>>
>>
>>   #include <simple-dl-hash.h>
>> +#include <simple-dl-new-hash.h>
>>   #include <dl-hash.h>
>>   #include <dl-new-hash.h>
>>   #include <support/support.h>
>> diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
>> new file mode 100644
>> index 0000000000..1faf309c97
>> --- /dev/null
>> +++ b/sysdeps/generic/dl-new-hash.h
>> @@ -0,0 +1,111 @@
>> +/* _dl_new_hash for elf symbol lookup
>> +   Copyright (C) 2022 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#ifndef _DL_NEW_HASH_H
>> +#define _DL_NEW_HASH_H 1
>> +
>> +#include <stdint.h>
>> +/* For __always_inline.  */
>> +#include <sys/cdefs.h>
>> +/* For __glibc_unlikely.  */
>> +#include <sys/cdefs.h>
>> +
>> +/* The simplest implementation of _dl_new_hash is:
>> +
>> +   _dl_new_hash (const char *s)
>> +   {
>> +      uint32_t h = 5381;
>> +      for (unsigned char c = *s; c != '\0'; c = *++s)
>> +        h = h * 33 + c;
>> +      return h;
>> +   }
>> +
>> +   We can get better performance by slightly unrolling the loop to
>> +   pipeline the multiples, which gcc cannot easily do due to
>> +   dependencies across iterations.
>> +
>> +   As well, as an architecture specific option we add asm statements
>> +   to explicitly specify order of operations and prevent reassociation
>> +   of instructions that lengthens the loop carried dependency. This
>> +   may have no affect as the compiler may have ordered instructions
>> +   the same way without it but in testing this has not been the case
>> +   for GCC. Improving GCC to reliably schedule instructions ideally
>> +   cannot be easily done.
>> +
>> +   Architecture(s) that use the reassociation barries are:
>> +   x86
>> +
>> +   Note it is very unlikely the reassociation barriers would
>> +   de-optimize performance on any architecture and with an imperfect
>> +   compiler it may help performance, especially on out-of-order cpus,
>> +   so it is suggested that the respective maintainers add them.
>> +
>> +   architecture maintainers are encouraged to benchmark this with
>> +   __asm_reassociation_barrier defined to __asm__ like it is in x86.
>> +*/
>> +
>> +
>> +#ifndef __asm_reassociation_barrier
>> +# define __asm_reassociation_barrier(...)
>> +#endif
>> +
>> +static __always_inline uint32_t
>> +__attribute__ ((unused))
>> +_dl_new_hash (const char *str)
>> +{
>> +  const unsigned char *s = (const unsigned char *) str;
>> +  unsigned int h = 5381;
>> +  unsigned int c0, c1;
>> +  for (;;)
>> +    {
>> +      c0 = s[0];
>> +      /* Since hashed string is normally not empty, this is unlikely on the
>> +        first iteration of the loop.  */
>> +      if (__glibc_unlikely (c0 == 0))
>> +       return h;
>> +
>> +      c1 = s[1];
>> +      if (c1 == 0)
>> +       {
>> +         /* Ideal computational order is:
>> +        c0 += h;
>> +        h *= 32;
>> +        h += c0;  */
>> +         c0 += h;
>> +         __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
>> +         h = h * 32 + c0;
>> +         return h;
>> +       }
>> +
>> +      /* Ideal computational order is:
>> +        c1 += c0;
>> +        h *= 33 * 33;
>> +        c0 *= 32;
>> +        c1 += c0;
>> +        h  += c1;  */
>> +      c1 += c0;
>> +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
>> +      h *= 33 * 33;
>> +      c1 += c0 * 32;
>> +      __asm_reassociation_barrier("" : "+r"(c1));
>> +      h += c1;
>> +      s += 2;
>> +    }
>> +}
>> +
>> +#endif /* dl-new-hash.h */
>> diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
>> new file mode 100644
>> index 0000000000..ce8fb5a838
>> --- /dev/null
>> +++ b/sysdeps/x86/dl-new-hash.h
>> @@ -0,0 +1,24 @@
>> +/* _dl_new_hash for elf symbol lookup
>> +   Copyright (C) 2022 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#ifdef __asm_reassociation_barrier
>> +# error "__asm_reassociation_barrier should never already be defined."
>> +#endif
>> +
>> +#define __asm_reassociation_barrier __asm__
>> +#include <sysdeps/generic/dl-new-hash.h>
>> --
>> 2.34.1
>>
> 
> Should the new _dl_new_hash be placed in sysdeps/x86/dl-new-hash.h
> and leave the generic one unchanged?
> 

There are 3 implementations: the reference one in elf/dl-new-hash.h 
that's retained for verification, the optimized one in 
sysdeps-generic/dl-new-hash.h that is suitable for all architectures and 
the micro-optimized one with optimized schedule for x86, giving it that 
little bit more.

Siddhesh
  
Siddhesh Poyarekar May 19, 2022, 3:55 p.m. UTC | #4
On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. The unrolling allows for
> pipelined multiplies.
> 
> As well, as an optional sysdep, reorder the operations and prevent
> reassosiation for better scheduling and higher ILP. This commit
> only adds the barrier for x86, although it should be either no
> change or a win for any architecture.
> 
> Unrolling further started to induce slowdowns for sizes [0, 4]
> but can help the loop so if larger sizes are the target further
> unrolling can be beneficial.
> 
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> 
> Time as Geometric Mean of N=30 runs
> Geometric of all benchmark New / Old: 0.674
>    type, length, New Time, Old Time, New Time / Old Time
>   fixed,      0,    2.865,     2.72,               1.053
>   fixed,      1,    3.567,    2.489,               1.433
>   fixed,      2,    2.577,    3.649,               0.706
>   fixed,      3,    3.644,    5.983,               0.609
>   fixed,      4,    4.211,    6.833,               0.616
>   fixed,      5,    4.741,    9.372,               0.506
>   fixed,      6,    5.415,    9.561,               0.566
>   fixed,      7,    6.649,   10.789,               0.616
>   fixed,      8,    8.081,   11.808,               0.684
>   fixed,      9,    8.427,   12.935,               0.651
>   fixed,     10,    8.673,   14.134,               0.614
>   fixed,     11,    10.69,   15.408,               0.694
>   fixed,     12,   10.789,   16.982,               0.635
>   fixed,     13,   12.169,   18.411,               0.661
>   fixed,     14,   12.659,   19.914,               0.636
>   fixed,     15,   13.526,   21.541,               0.628
>   fixed,     16,   14.211,   23.088,               0.616
>   fixed,     32,   29.412,   52.722,               0.558
>   fixed,     64,    65.41,  142.351,               0.459
>   fixed,    128,  138.505,  295.625,               0.469
>   fixed,    256,  291.707,  601.983,               0.485
> random,      2,   12.698,   12.849,               0.988
> random,      4,   16.065,   15.857,               1.013
> random,      8,   19.564,   21.105,               0.927
> random,     16,   23.919,   26.823,               0.892
> random,     32,   31.987,   39.591,               0.808
> random,     64,   49.282,   71.487,               0.689
> random,    128,    82.23,  145.364,               0.566
> random,    256,  152.209,  298.434,                0.51
> 
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>   benchtests/bench-dl-new-hash.c              |   3 +-
>   elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
>   elf/tst-dl-hash.c                           |   1 +
>   sysdeps/generic/dl-new-hash.h               | 111 ++++++++++++++++++++
>   sysdeps/x86/dl-new-hash.h                   |  24 +++++
>   5 files changed, 146 insertions(+), 13 deletions(-)
>   rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
>   create mode 100644 sysdeps/generic/dl-new-hash.h
>   create mode 100644 sysdeps/x86/dl-new-hash.h

Mostly OK, just minor nits to fix below.

> 
> diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> index 3c8a1d5a82..040fa7ce01 100644
> --- a/benchtests/bench-dl-new-hash.c
> +++ b/benchtests/bench-dl-new-hash.c
> @@ -16,7 +16,8 @@
>      License along with the GNU C Library; if not, see
>      <https://www.gnu.org/licenses/>.  */
>   
> -#include <elf/dl-new-hash.h>
> +#include <dl-new-hash.h>
> +#include <elf/simple-dl-new-hash.h>
>   #define TEST_FUNC(x, y) _dl_new_hash (x)
>   #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)

OK.

>   
> diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
> similarity index 75%
> rename from elf/dl-new-hash.h
> rename to elf/simple-dl-new-hash.h
> index 8641bb4196..1437b1bd36 100644
> --- a/elf/dl-new-hash.h
> +++ b/elf/simple-dl-new-hash.h
> @@ -1,4 +1,4 @@
> -/* _dl_new_hash for elf symbol lookup
> +/* __simple_dl_new_hash for testing true elf symbol lookup.
>      Copyright (C) 2022 Free Software Foundation, Inc.
>      This file is part of the GNU C Library.
>   
> @@ -16,16 +16,16 @@
>      License along with the GNU C Library; if not, see
>      <https://www.gnu.org/licenses/>.  */
>   
> -#ifndef _DL_NEW_HASH_H
> -#define _DL_NEW_HASH_H 1
> +#ifndef _SIMPLE_DL_NEW_HASH_H
> +#define _SIMPLE_DL_NEW_HASH_H 1
>   
>   #include <stdint.h>
> -/* For __always_inline.  */
> -#include <sys/cdefs.h>
>   
> -static __always_inline uint32_t
> +/* For testing/benchmarking purposes.  Real implementation in
> +   sysdeps/generic/dl-new-hash.h.  */
> +static uint32_t
>   __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +__simple_dl_new_hash (const char *s)
>   {
>     uint32_t h = 5381;
>     for (unsigned char c = *s; c != '\0'; c = *++s)
> @@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
>     return h;
>   }
>   
> -/* For testing/benchmarking purposes.  */
> -#define __simple_dl_new_hash _dl_new_hash
> -
> -
> -#endif /* dl-new-hash.h */
> +#endif /* simple-dl-new-hash.h */
> diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> index 8697eb73a0..b21766c63d 100644
> --- a/elf/tst-dl-hash.c
> +++ b/elf/tst-dl-hash.c
> @@ -18,6 +18,7 @@
>   
>   
>   #include <simple-dl-hash.h>
> +#include <simple-dl-new-hash.h>
>   #include <dl-hash.h>
>   #include <dl-new-hash.h>
>   #include <support/support.h>
> diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
> new file mode 100644
> index 0000000000..1faf309c97
> --- /dev/null
> +++ b/sysdeps/generic/dl-new-hash.h
> @@ -0,0 +1,111 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _DL_NEW_HASH_H
> +#define _DL_NEW_HASH_H 1
> +
> +#include <stdint.h>
> +/* For __always_inline.  */
> +#include <sys/cdefs.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>

Duplicate, but you already know this.

> +
> +/* The simplest implementation of _dl_new_hash is:
> +
> +   _dl_new_hash (const char *s)
> +   {
> +      uint32_t h = 5381;
> +      for (unsigned char c = *s; c != '\0'; c = *++s)
> +        h = h * 33 + c;
> +      return h;
> +   }
> +
> +   We can get better performance by slightly unrolling the loop to
> +   pipeline the multiples, which gcc cannot easily do due to
> +   dependencies across iterations.
> +
> +   As well, as an architecture specific option we add asm statements
> +   to explicitly specify order of operations and prevent reassociation
> +   of instructions that lengthens the loop carried dependency. This
> +   may have no affect as the compiler may have ordered instructions
> +   the same way without it but in testing this has not been the case
> +   for GCC. Improving GCC to reliably schedule instructions ideally
> +   cannot be easily done.
> +
> +   Architecture(s) that use the reassociation barries are:

barriers

> +   x86
> +
> +   Note it is very unlikely the reassociation barriers would
> +   de-optimize performance on any architecture and with an imperfect
> +   compiler it may help performance, especially on out-of-order cpus,
> +   so it is suggested that the respective maintainers add them.
> +
> +   architecture maintainers are encouraged to benchmark this with

Architecture

> +   __asm_reassociation_barrier defined to __asm__ like it is in x86.
> +*/
> +
> +
> +#ifndef __asm_reassociation_barrier
> +# define __asm_reassociation_barrier(...)
> +#endif
> +
> +static __always_inline uint32_t
> +__attribute__ ((unused))
> +_dl_new_hash (const char *str)
> +{
> +  const unsigned char *s = (const unsigned char *) str;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = s[0];
> +      /* Since hashed string is normally not empty, this is unlikely on the
> +	 first iteration of the loop.  */
> +      if (__glibc_unlikely (c0 == 0))
> +	return h;
> +
> +      c1 = s[1];
> +      if (c1 == 0)
> +	{
> +	  /* Ideal computational order is:
> +	 c0 += h;
> +	 h *= 32;
> +	 h += c0;  */
> +	  c0 += h;
> +	  __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
> +	  h = h * 32 + c0;
> +	  return h;
> +	}
> +
> +      /* Ideal computational order is:
> +	 c1 += c0;
> +	 h *= 33 * 33;
> +	 c0 *= 32;
> +	 c1 += c0;
> +	 h  += c1;  */
> +      c1 += c0;
> +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      __asm_reassociation_barrier("" : "+r"(c1));
> +      h += c1;
> +      s += 2;
> +    }
> +}
> +

OK.

> +#endif /* dl-new-hash.h */
> diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
> new file mode 100644
> index 0000000000..ce8fb5a838
> --- /dev/null
> +++ b/sysdeps/x86/dl-new-hash.h
> @@ -0,0 +1,24 @@
> +/* _dl_new_hash for elf symbol lookup
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifdef __asm_reassociation_barrier
> +# error "__asm_reassociation_barrier should never already be defined."
> +#endif
> +
> +#define __asm_reassociation_barrier __asm__
> +#include <sysdeps/generic/dl-new-hash.h>

OK.
  
Noah Goldstein May 19, 2022, 10:22 p.m. UTC | #5
On Thu, May 19, 2022 at 10:55 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 18/05/2022 22:56, Noah Goldstein via Libc-alpha wrote:
> > Unroll slightly and enforce good instruction scheduling. This improves
> > performance on out-of-order machines. The unrolling allows for
> > pipelined multiplies.
> >
> > As well, as an optional sysdep, reorder the operations and prevent
> > reassosiation for better scheduling and higher ILP. This commit
> > only adds the barrier for x86, although it should be either no
> > change or a win for any architecture.
> >
> > Unrolling further started to induce slowdowns for sizes [0, 4]
> > but can help the loop so if larger sizes are the target further
> > unrolling can be beneficial.
> >
> > Results for _dl_new_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> >
> > Time as Geometric Mean of N=30 runs
> > Geometric of all benchmark New / Old: 0.674
> >    type, length, New Time, Old Time, New Time / Old Time
> >   fixed,      0,    2.865,     2.72,               1.053
> >   fixed,      1,    3.567,    2.489,               1.433
> >   fixed,      2,    2.577,    3.649,               0.706
> >   fixed,      3,    3.644,    5.983,               0.609
> >   fixed,      4,    4.211,    6.833,               0.616
> >   fixed,      5,    4.741,    9.372,               0.506
> >   fixed,      6,    5.415,    9.561,               0.566
> >   fixed,      7,    6.649,   10.789,               0.616
> >   fixed,      8,    8.081,   11.808,               0.684
> >   fixed,      9,    8.427,   12.935,               0.651
> >   fixed,     10,    8.673,   14.134,               0.614
> >   fixed,     11,    10.69,   15.408,               0.694
> >   fixed,     12,   10.789,   16.982,               0.635
> >   fixed,     13,   12.169,   18.411,               0.661
> >   fixed,     14,   12.659,   19.914,               0.636
> >   fixed,     15,   13.526,   21.541,               0.628
> >   fixed,     16,   14.211,   23.088,               0.616
> >   fixed,     32,   29.412,   52.722,               0.558
> >   fixed,     64,    65.41,  142.351,               0.459
> >   fixed,    128,  138.505,  295.625,               0.469
> >   fixed,    256,  291.707,  601.983,               0.485
> > random,      2,   12.698,   12.849,               0.988
> > random,      4,   16.065,   15.857,               1.013
> > random,      8,   19.564,   21.105,               0.927
> > random,     16,   23.919,   26.823,               0.892
> > random,     32,   31.987,   39.591,               0.808
> > random,     64,   49.282,   71.487,               0.689
> > random,    128,    82.23,  145.364,               0.566
> > random,    256,  152.209,  298.434,                0.51
> >
> > Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> > ---
> >   benchtests/bench-dl-new-hash.c              |   3 +-
> >   elf/{dl-new-hash.h => simple-dl-new-hash.h} |  20 ++--
> >   elf/tst-dl-hash.c                           |   1 +
> >   sysdeps/generic/dl-new-hash.h               | 111 ++++++++++++++++++++
> >   sysdeps/x86/dl-new-hash.h                   |  24 +++++
> >   5 files changed, 146 insertions(+), 13 deletions(-)
> >   rename elf/{dl-new-hash.h => simple-dl-new-hash.h} (75%)
> >   create mode 100644 sysdeps/generic/dl-new-hash.h
> >   create mode 100644 sysdeps/x86/dl-new-hash.h
>
> Mostly OK, just minor nits to fix below.
>
> >
> > diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
> > index 3c8a1d5a82..040fa7ce01 100644
> > --- a/benchtests/bench-dl-new-hash.c
> > +++ b/benchtests/bench-dl-new-hash.c
> > @@ -16,7 +16,8 @@
> >      License along with the GNU C Library; if not, see
> >      <https://www.gnu.org/licenses/>.  */
> >
> > -#include <elf/dl-new-hash.h>
> > +#include <dl-new-hash.h>
> > +#include <elf/simple-dl-new-hash.h>
> >   #define TEST_FUNC(x, y) _dl_new_hash (x)
> >   #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
>
> OK.
>
> >
> > diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
> > similarity index 75%
> > rename from elf/dl-new-hash.h
> > rename to elf/simple-dl-new-hash.h
> > index 8641bb4196..1437b1bd36 100644
> > --- a/elf/dl-new-hash.h
> > +++ b/elf/simple-dl-new-hash.h
> > @@ -1,4 +1,4 @@
> > -/* _dl_new_hash for elf symbol lookup
> > +/* __simple_dl_new_hash for testing true elf symbol lookup.
> >      Copyright (C) 2022 Free Software Foundation, Inc.
> >      This file is part of the GNU C Library.
> >
> > @@ -16,16 +16,16 @@
> >      License along with the GNU C Library; if not, see
> >      <https://www.gnu.org/licenses/>.  */
> >
> > -#ifndef _DL_NEW_HASH_H
> > -#define _DL_NEW_HASH_H 1
> > +#ifndef _SIMPLE_DL_NEW_HASH_H
> > +#define _SIMPLE_DL_NEW_HASH_H 1
> >
> >   #include <stdint.h>
> > -/* For __always_inline.  */
> > -#include <sys/cdefs.h>
> >
> > -static __always_inline uint32_t
> > +/* For testing/benchmarking purposes.  Real implementation in
> > +   sysdeps/generic/dl-new-hash.h.  */
> > +static uint32_t
> >   __attribute__ ((unused))
> > -_dl_new_hash (const char *s)
> > +__simple_dl_new_hash (const char *s)
> >   {
> >     uint32_t h = 5381;
> >     for (unsigned char c = *s; c != '\0'; c = *++s)
> > @@ -33,8 +33,4 @@ _dl_new_hash (const char *s)
> >     return h;
> >   }
> >
> > -/* For testing/benchmarking purposes.  */
> > -#define __simple_dl_new_hash _dl_new_hash
> > -
> > -
> > -#endif /* dl-new-hash.h */
> > +#endif /* simple-dl-new-hash.h */
> > diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
> > index 8697eb73a0..b21766c63d 100644
> > --- a/elf/tst-dl-hash.c
> > +++ b/elf/tst-dl-hash.c
> > @@ -18,6 +18,7 @@
> >
> >
> >   #include <simple-dl-hash.h>
> > +#include <simple-dl-new-hash.h>
> >   #include <dl-hash.h>
> >   #include <dl-new-hash.h>
> >   #include <support/support.h>
> > diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
> > new file mode 100644
> > index 0000000000..1faf309c97
> > --- /dev/null
> > +++ b/sysdeps/generic/dl-new-hash.h
> > @@ -0,0 +1,111 @@
> > +/* _dl_new_hash for elf symbol lookup
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _DL_NEW_HASH_H
> > +#define _DL_NEW_HASH_H 1
> > +
> > +#include <stdint.h>
> > +/* For __always_inline.  */
> > +#include <sys/cdefs.h>
> > +/* For __glibc_unlikely.  */
> > +#include <sys/cdefs.h>
>
> Duplicate, but you already know this.
>
> > +
> > +/* The simplest implementation of _dl_new_hash is:
> > +
> > +   _dl_new_hash (const char *s)
> > +   {
> > +      uint32_t h = 5381;
> > +      for (unsigned char c = *s; c != '\0'; c = *++s)
> > +        h = h * 33 + c;
> > +      return h;
> > +   }
> > +
> > +   We can get better performance by slightly unrolling the loop to
> > +   pipeline the multiples, which gcc cannot easily do due to
> > +   dependencies across iterations.
> > +
> > +   As well, as an architecture specific option we add asm statements
> > +   to explicitly specify order of operations and prevent reassociation
> > +   of instructions that lengthens the loop carried dependency. This
> > +   may have no affect as the compiler may have ordered instructions
> > +   the same way without it but in testing this has not been the case
> > +   for GCC. Improving GCC to reliably schedule instructions ideally
> > +   cannot be easily done.
> > +
> > +   Architecture(s) that use the reassociation barries are:
>
> barriers

Fixed in V11.
>
> > +   x86
> > +
> > +   Note it is very unlikely the reassociation barriers would
> > +   de-optimize performance on any architecture and with an imperfect
> > +   compiler it may help performance, especially on out-of-order cpus,
> > +   so it is suggested that the respective maintainers add them.
> > +
> > +   architecture maintainers are encouraged to benchmark this with
>
> Architecture

Fixed in V11.
>
> > +   __asm_reassociation_barrier defined to __asm__ like it is in x86.
> > +*/
> > +
> > +
> > +#ifndef __asm_reassociation_barrier
> > +# define __asm_reassociation_barrier(...)
> > +#endif
> > +
> > +static __always_inline uint32_t
> > +__attribute__ ((unused))
> > +_dl_new_hash (const char *str)
> > +{
> > +  const unsigned char *s = (const unsigned char *) str;
> > +  unsigned int h = 5381;
> > +  unsigned int c0, c1;
> > +  for (;;)
> > +    {
> > +      c0 = s[0];
> > +      /* Since hashed string is normally not empty, this is unlikely on the
> > +      first iteration of the loop.  */
> > +      if (__glibc_unlikely (c0 == 0))
> > +     return h;
> > +
> > +      c1 = s[1];
> > +      if (c1 == 0)
> > +     {
> > +       /* Ideal computational order is:
> > +      c0 += h;
> > +      h *= 32;
> > +      h += c0;  */
> > +       c0 += h;
> > +       __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
> > +       h = h * 32 + c0;
> > +       return h;
> > +     }
> > +
> > +      /* Ideal computational order is:
> > +      c1 += c0;
> > +      h *= 33 * 33;
> > +      c0 *= 32;
> > +      c1 += c0;
> > +      h  += c1;  */
> > +      c1 += c0;
> > +      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
> > +      h *= 33 * 33;
> > +      c1 += c0 * 32;
> > +      __asm_reassociation_barrier("" : "+r"(c1));
> > +      h += c1;
> > +      s += 2;
> > +    }
> > +}
> > +
>
> OK.
>
> > +#endif /* dl-new-hash.h */
> > diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
> > new file mode 100644
> > index 0000000000..ce8fb5a838
> > --- /dev/null
> > +++ b/sysdeps/x86/dl-new-hash.h
> > @@ -0,0 +1,24 @@
> > +/* _dl_new_hash for elf symbol lookup
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifdef __asm_reassociation_barrier
> > +# error "__asm_reassociation_barrier should never already be defined."
> > +#endif
> > +
> > +#define __asm_reassociation_barrier __asm__
> > +#include <sysdeps/generic/dl-new-hash.h>
>
> OK.
  

Patch

diff --git a/benchtests/bench-dl-new-hash.c b/benchtests/bench-dl-new-hash.c
index 3c8a1d5a82..040fa7ce01 100644
--- a/benchtests/bench-dl-new-hash.c
+++ b/benchtests/bench-dl-new-hash.c
@@ -16,7 +16,8 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <elf/dl-new-hash.h>
+#include <dl-new-hash.h>
+#include <elf/simple-dl-new-hash.h>
 #define TEST_FUNC(x, y) _dl_new_hash (x)
 #define SIMPLE_TEST_FUNC(x, y) __simple_dl_new_hash (x)
 
diff --git a/elf/dl-new-hash.h b/elf/simple-dl-new-hash.h
similarity index 75%
rename from elf/dl-new-hash.h
rename to elf/simple-dl-new-hash.h
index 8641bb4196..1437b1bd36 100644
--- a/elf/dl-new-hash.h
+++ b/elf/simple-dl-new-hash.h
@@ -1,4 +1,4 @@ 
-/* _dl_new_hash for elf symbol lookup
+/* __simple_dl_new_hash for testing true elf symbol lookup.
    Copyright (C) 2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,16 +16,16 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#ifndef _DL_NEW_HASH_H
-#define _DL_NEW_HASH_H 1
+#ifndef _SIMPLE_DL_NEW_HASH_H
+#define _SIMPLE_DL_NEW_HASH_H 1
 
 #include <stdint.h>
-/* For __always_inline.  */
-#include <sys/cdefs.h>
 
-static __always_inline uint32_t
+/* For testing/benchmarking purposes.  Real implementation in
+   sysdeps/generic/dl-new-hash.h.  */
+static uint32_t
 __attribute__ ((unused))
-_dl_new_hash (const char *s)
+__simple_dl_new_hash (const char *s)
 {
   uint32_t h = 5381;
   for (unsigned char c = *s; c != '\0'; c = *++s)
@@ -33,8 +33,4 @@  _dl_new_hash (const char *s)
   return h;
 }
 
-/* For testing/benchmarking purposes.  */
-#define __simple_dl_new_hash _dl_new_hash
-
-
-#endif /* dl-new-hash.h */
+#endif /* simple-dl-new-hash.h */
diff --git a/elf/tst-dl-hash.c b/elf/tst-dl-hash.c
index 8697eb73a0..b21766c63d 100644
--- a/elf/tst-dl-hash.c
+++ b/elf/tst-dl-hash.c
@@ -18,6 +18,7 @@ 
 
 
 #include <simple-dl-hash.h>
+#include <simple-dl-new-hash.h>
 #include <dl-hash.h>
 #include <dl-new-hash.h>
 #include <support/support.h>
diff --git a/sysdeps/generic/dl-new-hash.h b/sysdeps/generic/dl-new-hash.h
new file mode 100644
index 0000000000..1faf309c97
--- /dev/null
+++ b/sysdeps/generic/dl-new-hash.h
@@ -0,0 +1,111 @@ 
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_NEW_HASH_H
+#define _DL_NEW_HASH_H 1
+
+#include <stdint.h>
+/* For __always_inline.  */
+#include <sys/cdefs.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
+
+/* The simplest implementation of _dl_new_hash is:
+
+   _dl_new_hash (const char *s)
+   {
+      uint32_t h = 5381;
+      for (unsigned char c = *s; c != '\0'; c = *++s)
+        h = h * 33 + c;
+      return h;
+   }
+
+   We can get better performance by slightly unrolling the loop to
+   pipeline the multiples, which gcc cannot easily do due to
+   dependencies across iterations.
+
+   As well, as an architecture specific option we add asm statements
+   to explicitly specify order of operations and prevent reassociation
+   of instructions that lengthens the loop carried dependency. This
+   may have no affect as the compiler may have ordered instructions
+   the same way without it but in testing this has not been the case
+   for GCC. Improving GCC to reliably schedule instructions ideally
+   cannot be easily done.
+
+   Architecture(s) that use the reassociation barries are:
+   x86
+
+   Note it is very unlikely the reassociation barriers would
+   de-optimize performance on any architecture and with an imperfect
+   compiler it may help performance, especially on out-of-order cpus,
+   so it is suggested that the respective maintainers add them.
+
+   architecture maintainers are encouraged to benchmark this with
+   __asm_reassociation_barrier defined to __asm__ like it is in x86.
+*/
+
+
+#ifndef __asm_reassociation_barrier
+# define __asm_reassociation_barrier(...)
+#endif
+
+static __always_inline uint32_t
+__attribute__ ((unused))
+_dl_new_hash (const char *str)
+{
+  const unsigned char *s = (const unsigned char *) str;
+  unsigned int h = 5381;
+  unsigned int c0, c1;
+  for (;;)
+    {
+      c0 = s[0];
+      /* Since hashed string is normally not empty, this is unlikely on the
+	 first iteration of the loop.  */
+      if (__glibc_unlikely (c0 == 0))
+	return h;
+
+      c1 = s[1];
+      if (c1 == 0)
+	{
+	  /* Ideal computational order is:
+	 c0 += h;
+	 h *= 32;
+	 h += c0;  */
+	  c0 += h;
+	  __asm_reassociation_barrier("" : "+r"(h) : "r"(c0));
+	  h = h * 32 + c0;
+	  return h;
+	}
+
+      /* Ideal computational order is:
+	 c1 += c0;
+	 h *= 33 * 33;
+	 c0 *= 32;
+	 c1 += c0;
+	 h  += c1;  */
+      c1 += c0;
+      __asm_reassociation_barrier("" : "+r"(c1), "+r"(c0));
+      h *= 33 * 33;
+      c1 += c0 * 32;
+      __asm_reassociation_barrier("" : "+r"(c1));
+      h += c1;
+      s += 2;
+    }
+}
+
+#endif /* dl-new-hash.h */
diff --git a/sysdeps/x86/dl-new-hash.h b/sysdeps/x86/dl-new-hash.h
new file mode 100644
index 0000000000..ce8fb5a838
--- /dev/null
+++ b/sysdeps/x86/dl-new-hash.h
@@ -0,0 +1,24 @@ 
+/* _dl_new_hash for elf symbol lookup
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifdef __asm_reassociation_barrier
+# error "__asm_reassociation_barrier should never already be defined."
+#endif
+
+#define __asm_reassociation_barrier __asm__
+#include <sysdeps/generic/dl-new-hash.h>