[v8,6/6] elf: Optimize _dl_new_hash in dl-new-hash.h

Message ID 20220511030635.154689-6-goldstein.w.n@gmail.com
State Superseded
Series [v8,1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Noah Goldstein May 11, 2022, 3:06 a.m. UTC
Unroll the loop slightly and enforce good instruction scheduling. This
improves performance on out-of-order machines. Note that the unrolling
allows for pipelined multiplies, which helps a bit, but most of the gain
is from enforcing better instruction scheduling for more ILP. Unrolling
further started to induce slowdowns for sizes [0, 4], but it can help
the loop, so further unrolling can be beneficial if larger sizes are
the target.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=30 runs
Geometric mean over all benchmarks, New / Old: 0.674
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    2.865,     2.72,               1.053
 fixed,      1,    3.567,    2.489,               1.433
 fixed,      2,    2.577,    3.649,               0.706
 fixed,      3,    3.644,    5.983,               0.609
 fixed,      4,    4.211,    6.833,               0.616
 fixed,      5,    4.741,    9.372,               0.506
 fixed,      6,    5.415,    9.561,               0.566
 fixed,      7,    6.649,   10.789,               0.616
 fixed,      8,    8.081,   11.808,               0.684
 fixed,      9,    8.427,   12.935,               0.651
 fixed,     10,    8.673,   14.134,               0.614
 fixed,     11,    10.69,   15.408,               0.694
 fixed,     12,   10.789,   16.982,               0.635
 fixed,     13,   12.169,   18.411,               0.661
 fixed,     14,   12.659,   19.914,               0.636
 fixed,     15,   13.526,   21.541,               0.628
 fixed,     16,   14.211,   23.088,               0.616
 fixed,     32,   29.412,   52.722,               0.558
 fixed,     64,    65.41,  142.351,               0.459
 fixed,    128,  138.505,  295.625,               0.469
 fixed,    256,  291.707,  601.983,               0.485
random,      2,   12.698,   12.849,               0.988
random,      4,   16.065,   15.857,               1.013
random,      8,   19.564,   21.105,               0.927
random,     16,   23.919,   26.823,               0.892
random,     32,   31.987,   39.591,               0.808
random,     64,   49.282,   71.487,               0.689
random,    128,    82.23,  145.364,               0.566
random,    256,  152.209,  298.434,                0.51

Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
---
 elf/dl-new-hash.h | 66 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 5 deletions(-)
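
(A sketch of the unroll algebra, for reference; not part of the patch.
Two steps of the simple recurrence combine as

   h' = (h * 33 + c0) * 33 + c1
      = h * (33 * 33) + c0 * 33 + c1
      = h * (33 * 33) + ((c0 + c1) + c0 * 32)

which is exactly the form the unrolled loop computes: the only
operations that depend on the previous iteration's h are one multiply
and one add, while the c0/c1 arithmetic depends only on the freshly
loaded bytes.)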
  

Comments

Siddhesh Poyarekar May 16, 2022, 2:12 p.m. UTC | #1
On 11/05/2022 08:36, Noah Goldstein via Libc-alpha wrote:
> Unroll the loop slightly and enforce good instruction scheduling. This
> improves performance on out-of-order machines. Note that the unrolling
> allows for pipelined multiplies, which helps a bit, but most of the gain
> is from enforcing better instruction scheduling for more ILP. Unrolling
> further started to induce slowdowns for sizes [0, 4], but it can help
> the loop, so further unrolling can be beneficial if larger sizes are
> the target.
> 
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> 
> Time as Geometric Mean of N=30 runs
> Geometric mean over all benchmarks, New / Old: 0.674
>    type, length, New Time, Old Time, New Time / Old Time
>   fixed,      0,    2.865,     2.72,               1.053
>   fixed,      1,    3.567,    2.489,               1.433
>   fixed,      2,    2.577,    3.649,               0.706
>   fixed,      3,    3.644,    5.983,               0.609
>   fixed,      4,    4.211,    6.833,               0.616
>   fixed,      5,    4.741,    9.372,               0.506
>   fixed,      6,    5.415,    9.561,               0.566
>   fixed,      7,    6.649,   10.789,               0.616
>   fixed,      8,    8.081,   11.808,               0.684
>   fixed,      9,    8.427,   12.935,               0.651
>   fixed,     10,    8.673,   14.134,               0.614
>   fixed,     11,    10.69,   15.408,               0.694
>   fixed,     12,   10.789,   16.982,               0.635
>   fixed,     13,   12.169,   18.411,               0.661
>   fixed,     14,   12.659,   19.914,               0.636
>   fixed,     15,   13.526,   21.541,               0.628
>   fixed,     16,   14.211,   23.088,               0.616
>   fixed,     32,   29.412,   52.722,               0.558
>   fixed,     64,    65.41,  142.351,               0.459
>   fixed,    128,  138.505,  295.625,               0.469
>   fixed,    256,  291.707,  601.983,               0.485
> random,      2,   12.698,   12.849,               0.988
> random,      4,   16.065,   15.857,               1.013
> random,      8,   19.564,   21.105,               0.927
> random,     16,   23.919,   26.823,               0.892
> random,     32,   31.987,   39.591,               0.808
> random,     64,   49.282,   71.487,               0.689
> random,    128,    82.23,  145.364,               0.566
> random,    256,  152.209,  298.434,                0.51
> 
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>   elf/dl-new-hash.h | 66 +++++++++++++++++++++++++++++++++++++++++++----
>   1 file changed, 61 insertions(+), 5 deletions(-)
> 
> diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> index 40d88c81f9..5269f6eb98 100644
> --- a/elf/dl-new-hash.h
> +++ b/elf/dl-new-hash.h
> @@ -20,15 +20,71 @@
>   #define _DL_NEW_HASH_H 1
>   
>   #include <stdint.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>
> +
> +/* The simplest implementation of _dl_new_hash is:
> +
> +_dl_new_hash (const char *s)
> +{
> +   uint32_t h = 5381;
> +   for (unsigned char c = *s; c != '\0'; c = *++s)
> +     h = h * 33 + c;
> +   return h;
> +}
> +
> +   We can get better performance, however, by slightly unrolling the
> +   loop and explicitly specifying order of operations to prevent
> +   reassociation of instructions and ensure ideal scheduling.  */
>   
>   static inline uint32_t
>   __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +_dl_new_hash (const char *str)
>   {
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;
> +  const unsigned char *s = (const unsigned char *) str;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = s[0];
> +      /* Zero-length strings are unlikely, so even lengths (which
> +	 terminate here) will be slightly less common.  */
> +      if (__glibc_unlikely (c0 == 0))
> +	return h;
> +
> +      c1 = s[1];
> +      if (c1 == 0)
> +	{
> +	  c0 += h;
> +	  /* Ideal instruction scheduling is:
> +	 c0 += h;
> +	 h *= 32;
> +	 h += c0;
> +
> +	 The asm statements are to prevent reassociation that would result in
> +	 more instruction interdependencies and worse scheduling.  */
> +	  asm("" : "+r"(h) : "r"(c0));

There are a couple of things that seem problematic to me about this:

- It seems like we're trying to fix a gcc issue in glibc.  Couldn't we 
file a gcc bug and explore ways in which this could be supported in the 
compiler?  In fact, it might make sense to do that for the original 
loop; it looks like a missed optimization that gcc ought to fix.  IMO 
the bug should be filed even if we do end up with this 
micro-optimization in glibc.

- The patch controls an instruction schedule so that it works well on 
out-of-order processors but quotes only one microarchitecture.  If
it works well on TigerLake (and on x86 in general) then it might be 
better to add it as a sysdep override; I assumed that was the point of 
breaking the function out into its header anyway.  If it is more 
generally useful then please share numbers to that effect in the commit 
message and also explicitly state in the comments why we're trying to 
exert this level of control on codegen in generic C code and why it is 
good for all architectures.

> +	  h = h * 32 + c0;
> +	  return h;
> +	}
> +
> +      /* Ideal instruction scheduling is:
> +	 c1 += c0;
> +	 h *= 33 * 33;
> +	 c0 *= 32;
> +	 c1 += c0;
> +	 h  += c1;
> +
> +	 The asm statements are to prevent reassociation that would result in
> +	 more instruction interdependencies and worse scheduling.  */
> +      c1 += c0;
> +      asm("" : "+r"(c1), "+r"(c0));
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      asm("" : "+r"(c1));
> +      h += c1;
> +      s += 2;
> +    }
>   }
>   
>   

Thanks,
Siddhesh
  
Alexander Monakov May 16, 2022, 2:31 p.m. UTC | #2
On Mon, 16 May 2022, Siddhesh Poyarekar wrote:

> There are a couple of things that seem problematic to me about this:
> 
> - It seems like we're trying to fix a gcc issue in glibc.  Couldn't we file a
> gcc bug and explore ways in which this could be supported in the compiler?  In
> fact, it might make sense to do that for the original loop; it looks like a
> missed optimization that gcc ought to fix.  IMO the bug should be filed even
> if we do end up with this micro-optimization in glibc.

This issue involves a chain of dependencies that goes across all loop
iterations, but the relevant compiler optimizations (reassociation, register
allocation, scheduling) do not consider such global chains. You might
file this as a "wishlist" bug, but compiler infrastructure is simply
not designed to make such nontrivial decisions.

> - The patch controls an instruction schedule so that it works well on
> out-of-order processors but quotes only one microarchitecture.

It's not specific to out-of-order processors: a long chain of dependencies
restricts OoO scheduling in the CPU. So in the end it benefits "classic"
and OoO pipelines in a similar fashion.

> If it
> works well on TigerLake (and on x86 in general) then it might be better to add
> it as a sysdep override; I assumed that was the point of breaking the function
> out into its header anyway.  If it is more generally useful then please share
> numbers to that effect in the commit message and also explicitly state in the
> comments why we're trying to exert this level of control on codegen in generic
> C code and why it is good for all architectures.

I guess it's up to you and Noah to hash it out, but I'd like to remind that
there was an alternative variant which is a strict win on all architectures
(same code size, same instruction mix, no dependency on fast multiplication).
That might be easier to justify from generic code point of view.

Alexander
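
(A back-of-the-envelope sketch of the dependency-chain argument,
assuming a 3-cycle multiply and 1-cycle adds:

   simple loop, per byte:     h = h * 33 + c
     chain through h: mul (3) -> add (1)        ~4 cycles per byte

   unrolled, per byte pair:   h = h * (33 * 33) + (c0 * 33 + c1)
     chain through h: mul (3) -> add (1)        ~4 cycles per 2 bytes
     (the c0 * 33 + c1 term depends only on the loaded bytes and can
      execute in parallel with the multiply)

Halving the per-byte critical path in this way is roughly consistent
with the ~0.46-0.51 New/Old ratios for the larger inputs in the commit
message.)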
  
Siddhesh Poyarekar May 16, 2022, 4:23 p.m. UTC | #3
On 16/05/2022 20:01, Alexander Monakov wrote:
> On Mon, 16 May 2022, Siddhesh Poyarekar wrote:
> 
>> There are a couple of things that seem problematic to me about this:
>>
>> - It seems like we're trying to fix a gcc issue in glibc.  Couldn't we file a
>> gcc bug and explore ways in which this could be supported in the compiler?  In
>> fact, it might make sense to do that for the original loop; it looks like a
>> missed optimization that gcc ought to fix.  IMO the bug should be filed even
>> if we do end up with this micro-optimization in glibc.
> 
> This issue involves a chain of dependencies that goes across all loop
> iterations, but the relevant compiler optimizations (reassociation, register
> allocation, scheduling) do not consider such global chains. You might
> file this as a "wishlist" bug, but compiler infrastructure is simply
> not designed to make such nontrivial decisions.

Thanks for the context, this should go into comments.  A wishlist bug 
would be nice but I suspect it'll just gather dust.  Maybe it's still 
useful for someone coming in after 10-15 years looking for more context 
on it.

>> - The patch controls an instruction schedule so that it works well on
>> out-of-order processors but quotes only one microarchitecture.
> 
> It's not specific to out-of-order processors: a long chain of dependencies
> restricts OoO scheduling in the CPU. So in the end it benefits "classic"
> and OoO pipelines in a similar fashion.
> 
>> If it
>> works well on TigerLake (and on x86 in general) then it might be better to add
>> it as a sysdep override; I assumed that was the point of breaking the function
>> out into its header anyway.  If it is more generally useful then please share
>> numbers to that effect in the commit message and also explicitly state in the
>> comments why we're trying to exert this level of control on codegen in generic
>> C code and why it is good for all architectures.
> 
> I guess it's up to you and Noah to hash it out, but I'd like to remind that
> there was an alternative variant which is a strict win on all architectures
> (same code size, same instruction mix, no dependency on fast multiplication).
> That might be easier to justify from generic code point of view.

I would prefer the earlier variant in generic code, with (if necessary) 
the scheduling hack being a sysdep for x86.  Other architectures that 
want to use the latter should #include it and also post microbenchmark 
results so that we keep track of how we arrived at that decision.

Thanks,
Siddhesh
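
(A sketch of what that sysdep split could look like, with hypothetical
paths following the usual glibc convention that a sysdeps directory
shadows the generic header:

   elf/dl-new-hash.h            -- generic: plain h = h * 33 + c loop
   sysdeps/x86/dl-new-hash.h    -- x86: unrolled variant with the asm
                                   barriers

Another architecture opting in would add its own
sysdeps/<arch>/dl-new-hash.h, together with benchmark results posted to
the list.)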
  
Noah Goldstein May 16, 2022, 4:38 p.m. UTC | #4
On Mon, May 16, 2022 at 11:23 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 16/05/2022 20:01, Alexander Monakov wrote:
> > On Mon, 16 May 2022, Siddhesh Poyarekar wrote:
> >
> >> There are a couple of things that seem problematic to me about this:
> >>
> >> - It seems like we're trying to fix a gcc issue in glibc.  Couldn't we file a
> >> gcc bug and explore ways in which this could be supported in the compiler?  In
> >> fact, it might make sense to do that for the original loop; it looks like a
> >> missed optimization that gcc ought to fix.  IMO the bug should be filed even
> >> if we do end up with this micro-optimization in glibc.
> >
> > This issue involves a chain of dependencies that goes across all loop
> > iterations, but relevant compiler optimization (reassociation, register
> > allocation, scheduling) do not consider such global chains. You might
> > file this as a "wishlist" bug, but compiler infrastructure is simply
> > not designed to make such nontrivial decisions.
>
> Thanks for the context, this should go into comments.  A wishlist bug
> would be nice but I suspect it'll just gather dust.  Maybe it's still
> useful for someone coming in after 10-15 years looking for more context
> on it.

I'll add a comment in the next version.
>
> >> - The patch controls an instruction schedule so that it works well on
> >> out-of-order processors but quotes only one microarchitecture.
> >
> > It's not specific to out-of-order processors: a long chain of dependencies
> > restricts OoO scheduling in the CPU. So in the end it benefits "classic"
> > and OoO pipelines in a similar fashion.
> >
> >> If it
> >> works well on TigerLake (and on x86 in general) then it might be better to add
> >> it as a sysdep override; I assumed that was the point of breaking the function
> >> out into its header anyway.  If it is more generally useful then please share
> >> numbers to that effect in the commit message and also explicitly state in the
> >> comments why we're trying to exert this level of control on codegen in generic
> >> C code and why it is good for all architectures.
> >
> > I guess it's up to you and Noah to hash it out, but I'd like to remind that
> > there was an alternative variant which is a strict win on all architectures
> > (same code size, same instruction mix, no dependency on fast multiplication).
> > That might be easier to justify from generic code point of view.
>
> I would prefer the earlier variant in generic code, with (if necessary)
> the scheduling hack being a sysdep for x86.  Other architectures that
> want to use the latter should #include it and also post microbenchmark
> results so that we keep track of how we arrived at that decision.

I'm happy to switch it back but I don't think the scheduling hack is x86
oriented. I don't think re-ordering could ever de-optimize things.
The only real architectural assumption is a reasonably fast
32-bit multiply which is true for both the more generic earlier version
and the current one.
>
> Thanks,
> Siddhesh
  
Siddhesh Poyarekar May 16, 2022, 4:44 p.m. UTC | #5
On 16/05/2022 22:08, Noah Goldstein wrote:
>> Thanks for the context, this should go into comments.  A wishlist bug
>> would be nice but I suspect it'll just gather dust.  Maybe it's still
>> useful for someone coming in after 10-15 years looking for more context
>> on it.
> 
> I'll add a comment in the next version.

Thanks.

>> I would prefer the earlier variant in generic code, with (if necessary)
>> the scheduling hack being a sysdep for x86.  Other architectures that
>> want to use the latter should #include it and also post microbenchmark
>> results so that we keep track of how we arrived at that decision.
> 
> I'm happy to switch it back but I don't think the scheduling hack is x86
> oriented. I don't think re-ordering could ever de-optimize things.
> The only real architectural assumption is a reasonably fast
> 32-bit multiply which is true for both the more generic earlier version
> and the current one.

I don't entirely disagree, but I think the conservative stance here is 
to keep the scheduling hack in the sysdep that has actually been tested 
and then bring it out into generic if it has been experimentally 
verified to be a universal win for all architectures we support.  That 
is, if we find that every architecture is including the sysdep version, 
then it's time to bring it out to replace the generic version.

FWIW, I'm lowering the bar for acceptance because you only need to 
verify that the scheduling hack is better for the architecture you're 
interested in, not all architectures we support :)

Thanks,
Siddhesh
  
Alexander Monakov May 16, 2022, 6:09 p.m. UTC | #6
On Mon, 16 May 2022, Siddhesh Poyarekar wrote:

> There are a couple of things that seem problematic to me about this:
[snip]

Since Carlos mentioned in today's patch review message that you didn't like
something about the asms, can you (or Adhemerval) please explain what you
meant? The asms in the patch have empty body and standard "r" constraints,
I'd say they are perfectly portable.

Personally I am highly uncomfortable with situations when maintainers have
a substantial (in their opinion) objection to the patch, but the objection
is not communicated back to the submitter, so they cannot defend their patch
or even learn about the issue to account for it in future work, but at the
same time it is discussed "behind the submitter's back" (potentially
disparaging or merely misunderstanding their work).

Alexander
  
Siddhesh Poyarekar May 16, 2022, 6:47 p.m. UTC | #7
On 16/05/2022 23:39, Alexander Monakov wrote:
> On Mon, 16 May 2022, Siddhesh Poyarekar wrote:
> 
>> There are a couple of things that seem problematic to me about this:
> [snip]
> 
> Since Carlos mentioned in today's patch review message that you didn't like
> something about the asms, can you (or Adhemerval) please explain what you
> meant? The asms in the patch have empty body and standard "r" constraints,
> I'd say they are perfectly portable.

I did explain; I am not comfortable with controlling instruction 
scheduling in that manner for generic code because it assumes more about 
underlying processor pipelines and instruction sequences than we 
typically do in generic code.  It has nothing to do with portability. 
Adhemerval raised the question about whether this ought to be done in 
gcc instead, which I concurred with too.

I volunteered to share that position here, which I believe I did to the 
best of my knowledge and understanding of the issues.  I think talking 
about "the asms" would have only confused the issue since it's not 
really about that construct.

> Personally I am highly uncomfortable with situations when maintainers have
> a substantial (in their opinion) objection to the patch, but the objection
> is not communicated back to the submitter, so they cannot defend their patch
> or even learn about the issue to account for it in future work, but at the
> same time it is discussed "behind the submitter's back" (potentially
> disparaging or merely misunderstanding their work).

The submitter (Noah) has often attended the review calls so it's just 
unfortunate that he didn't happen to be on it today.

The main point of this call is finding reviewers for patches and the 
secondary one is for submitters to solicit reviews or discussions if 
they're stuck on a problem.  The call is open to everyone too[1].  While 
some patches get closer scrutiny, the primary purpose is to find out why 
the patch is stuck in review, not to decide on whether to ack or nack 
it.  Usually the submitter is present in the closer scrutiny but if not, 
the action item always is for one of us to volunteer to take a closer 
look and share findings on the mailing list.

Everything, without exception, lands on the mailing list to the last 
detail.  Except maybe jokes about someone's audio not working or 
conversations about the weather.

Siddhesh

[1] https://sourceware.org/glibc/wiki/PatchworkReviewMeetings
  
Alexander Monakov May 16, 2022, 7:28 p.m. UTC | #8
On Tue, 17 May 2022, Siddhesh Poyarekar wrote:

> On 16/05/2022 23:39, Alexander Monakov wrote:
> > On Mon, 16 May 2022, Siddhesh Poyarekar wrote:
> > 
> >> There are a couple of things that seem problematic to me about this:
> > [snip]
> > 
> > Since Carlos mentioned in today's patch review message that you didn't like
> > something about the asms, can you (or Adhemerval) please explain what you
> > meant? The asms in the patch have empty body and standard "r" constraints,
> > I'd say they are perfectly portable.
> 
> I did explain; I am not comfortable with controlling instruction scheduling in
> that manner for generic code because it assumes more about underlying
> processor pipelines and instruction sequences than we typically do in generic
> code.  It has nothing to do with portability. Adhemerval raised the question
> about whether this ought to be done in gcc instead, which I concurred with
> too.

Thank you very much for the detailed response. Allow me to clear up what seems
to be a technical misunderstanding here: this is not about instruction
scheduling, but rather dependencies in the computations (I know Noah mentioned
scheduling, but it's confusing especially in context of benchmarking for an
out-of-order CPU).

I have shown how different variants have different chains of dependencies in
this email: https://sourceware.org/pipermail/libc-alpha/2022-May/138495.html

The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
the dependency graph in context of the whole loop.

There's nothing specific to the x86 architecture in this reasoning. On arm and
aarch64 it's moot because they evaluate 'h*32 + h' in a single cycle, though.

Alexander
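
(A minimal sketch of the barrier idiom, using GCC/Clang extended asm;
variable names are illustrative:

   unsigned int h = ...;         /* loop-carried hash state */
   unsigned int c = ...;         /* byte loaded this iteration */

   c += h;                       /* old h and c added first */
   asm("" : "+r"(h) : "r"(c));   /* h is nominally rewritten here, so
                                    the compiler cannot fold the
                                    additions back together; no
                                    instruction is emitted */
   h = h * 32 + c;               /* h * 32 overlaps with c += h above */

Without the asm, reassociating to (h * 32 + h) + c turns a two-deep
chain on the previous h (shift in parallel with add, then one add) into
a three-deep serial chain (shift, add, add).)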
  
Noah Goldstein May 16, 2022, 7:35 p.m. UTC | #9
On Mon, May 16, 2022 at 2:28 PM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Tue, 17 May 2022, Siddhesh Poyarekar wrote:
>
> > On 16/05/2022 23:39, Alexander Monakov wrote:
> > > On Mon, 16 May 2022, Siddhesh Poyarekar wrote:
> > >
> > >> There are a couple of things that seem problematic to me about this:
> > > [snip]
> > >
> > > Since Carlos mentioned in today's patch review message that you didn't like
> > > something about the asms, can you (or Adhemerval) please explain what you
> > > meant? The asms in the patch have empty body and standard "r" constraints,
> > > I'd say they are perfectly portable.
> >
> > I did explain; I am not comfortable with controlling instruction scheduling in
> > that manner for generic code because it assumes more about underlying
> > processor pipelines and instruction sequences than we typically do in generic
> > code.  It has nothing to do with portability. Adhemerval raised the question
> > about whether this ought to be done in gcc instead, which I concurred with
> > too.
>
> Thank you very much for the detailed response. Allow me to clear up what seems
> to be a technical misunderstanding here: this is not about instruction
> scheduling, but rather dependencies in the computations (I know Noah mentioned
> scheduling, but it's confusing especially in context of benchmarking for an
> out-of-order CPU).
>
> I have shown how different variants have different chains of dependencies in
> this email: https://sourceware.org/pipermail/libc-alpha/2022-May/138495.html
>
> The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
> to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
> the dependency graph in context of the whole loop.

Some architecture could have a really fast integer MADD instruction that
the barrier could either prevent from being emitted or add an extra ADD
instruction at the end.

I think most arches should use the barriers, but it's fair to leave that up
to the arch maintainer.


>
> There's nothing specific to the x86 architecture in this reasoning. On arm and
> aarch64 it's moot because they evaluate 'h*32 + h' in a single cycle, though.
>
> Alexander
  
Alexander Monakov May 16, 2022, 7:41 p.m. UTC | #10
On Mon, 16 May 2022, Noah Goldstein wrote:

> > The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
> > to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
> > the dependency graph in context of the whole loop.
> 
> Some architecture could have a really fast integer MADD instruction that
> the barrier could either prevent from being emitted or add an extra ADD
> instruction at the end.

With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
decides to emit madd vs. 2 cycles if it's only additions.

Alexander
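
(In C terms, the two codegen options for the odd-length tail, given the
latencies quoted above:

   c0 += h;
   h = h * 32 + c0;   /* with the barrier: shift-by-5 plus two adds;
                         the shift and the first add both read the old
                         h and can overlap, so the chain is ~2 cycles */

   h = h * 33 + c0;   /* folded without the barrier: eligible for a
                         single madd on aarch64, ~3-cycle latency */)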
  
Adhemerval Zanella May 16, 2022, 7:47 p.m. UTC | #11
On 16/05/2022 16:41, Alexander Monakov via Libc-alpha wrote:
> On Mon, 16 May 2022, Noah Goldstein wrote:
> 
>>> The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
>>> to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
>>> the dependency graph in context of the whole loop.
>>
>> Some architecture could have a really fast integer MADD instruction that
>> the barrier could either prevent from being emitted or add an extra ADD
>> instruction at the end.
> 
> With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
> aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
> decides to emit madd vs. 2 cycles if it's only additions.
> 
> Alexander

How hard would it be to make the compiler do this very optimization? I raised
this on the weekly call because more and more it seems that tuning computation
dependencies for loops is more a compiler job than libc's
(this is not a blocker, but we have had multiple small micro-optimizations
in the past that turned into dead code once the compiler caught up).
  
Noah Goldstein May 16, 2022, 7:48 p.m. UTC | #12
On Mon, May 16, 2022 at 2:41 PM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Mon, 16 May 2022, Noah Goldstein wrote:
>
> > > The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
> > > to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
> > > the dependency graph in context of the whole loop.
> >
> > Some architecture could have a really fast integer MADD instruction that
> > the barrier could either prevent from being emitted or add an extra ADD
> > instruction at the end.
>
> With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
> aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
> decides to emit madd vs. 2 cycles if it's only additions.

AFAIK the shift-by-5 + 2x adds is the best that exists at the moment but
the point is that some arch we either aren't thinking about, or some future
variant, may implement a <= 2 cycle madd.
>
> Alexander
  
Alexander Monakov May 16, 2022, 8 p.m. UTC | #13
> On 16/05/2022 16:41, Alexander Monakov via Libc-alpha wrote:
> > On Mon, 16 May 2022, Noah Goldstein wrote:
> > 
> >>> The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
> >>> to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
> >>> the dependency graph in context of the whole loop.
> >>
> >> Some architecture could have a really fast integer MADD instruction that
> >> the barrier could either prevent from being emitted or add an extra ADD
> >> instruction at the end.
> > 
> > With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
> > aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
> > decides to emit madd vs. 2 cycles if it's only additions.
> > 
> > Alexander
> 
> How hard would it be to make the compiler do this very optimization? I raised
> this on the weekly call because more and more it seems that tuning computation
> dependencies for loops is more a compiler job than libc's
> (this is not a blocker, but we have had multiple small micro-optimizations
> in the past that turned into dead code once the compiler caught up).

Sorry, since you're responding to a discussion about multiply-add, it's unclear
to me which optimization you mean. Is your question about choosing which
sequence of additions has shorter cross-iteration chain?

Alexander
  
Adhemerval Zanella May 16, 2022, 8:08 p.m. UTC | #14
On 16/05/2022 17:00, Alexander Monakov wrote:
>> On 16/05/2022 16:41, Alexander Monakov via Libc-alpha wrote:
>>> On Mon, 16 May 2022, Noah Goldstein wrote:
>>>
>>>>> The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
>>>>> to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
>>>>> the dependency graph in context of the whole loop.
>>>>
>>>> Some architecture could have a really fast integer MADD instruction that
>>>> the barrier could either prevent from being emitted or add an extra ADD
>>>> instruction at the end.
>>>
>>> With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
>>> aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
>>> decides to emit madd vs. 2 cycles if it's only additions.
>>>
>>> Alexander
>>
>> How hard would it be to make the compiler do this very optimization? I raised
>> this on the weekly call because more and more it seems that tuning computation
>> dependencies for loops is more a compiler job than libc's
>> (this is not a blocker, but we have had multiple small micro-optimizations
>> in the past that turned into dead code once the compiler caught up).
> 
> Sorry, since you're responding to a discussion about multiply-add, it's unclear
> to me which optimization you mean. Is your question about choosing which
> sequence of additions has shorter cross-iteration chain?

Indeed I was not clear; I meant the reply [1], where you explain why
you suggested the asms to prevent the compiler from reassociating.

[1] https://sourceware.org/pipermail/libc-alpha/2022-May/138794.html
  
Alexander Monakov May 16, 2022, 8:27 p.m. UTC | #15
On Mon, 16 May 2022, Adhemerval Zanella via Libc-alpha wrote:

> >> How hard would it be to make the compiler do this very optimization? I raised
> >> this on the weekly call because more and more it seems that tuning computation
> >> dependencies for loops is more a compiler job than libc's
> >> (this is not a blocker, but we have had multiple small micro-optimizations
> >> in the past that turned into dead code once the compiler caught up).
> > 
> > Sorry, since you're responding to a discussion about multiply-add, it's unclear
> > to me which optimization you mean. Is your question about choosing which
> > sequence of additions has shorter cross-iteration chain?
> 
> Indeed I was not clear; I meant the reply [1], where you explain why
> you suggested the asms to prevent the compiler from reassociating.
> 
> [1] https://sourceware.org/pipermail/libc-alpha/2022-May/138794.html

I think it's pretty hard, you'd have to decompose 'h*33' into '(h<<5)+h'
in the reassociation pass, notice that it's part of an addition chain that
feeds the phi node for 'h', and based on that select a specific
association variant (all to shave off one cycle per iteration). To me it
looks like an optimization just for this exact scenario. And then you
need to "hope" that no other pass undoes this transformation.

It would be quite some nontrivial code in the compiler, when the alternative is
getting a guaranteed outcome for any compiler by adding an empty asm statement
in a loop that iterates thousands of times on every process startup.

Alexander
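
(Spelled out, the choice the compiler would have to make is:

   h * 33          ==  (h << 5) + h
   h = h * 33 + c  ==  ((h << 5) + h) + c   /* serial: shift, add, add */
                   ==  (h << 5) + (h + c)   /* shift and add overlap:
                                               shorter chain through
                                               the loop-carried h */

and picking the second association is only profitable because h feeds
the loop's phi node -- precisely the global context the reassociation
pass does not track.)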
  
Noah Goldstein May 16, 2022, 8:32 p.m. UTC | #16
On Mon, May 16, 2022 at 11:44 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
>
> On 16/05/2022 22:08, Noah Goldstein wrote:
> >> Thanks for the context, this should go into comments.  A wishlist bug
> >> would be nice but I suspect it'll just gather dust.  Maybe it's still
> >> useful for someone coming in after 10-15 years looking for more context
> >> on it.
> >
> > I'll add a comment in the next version.
>
> Thanks.
>
> >> I would prefer the earlier variant in generic code, with (if necessary)
> >> the scheduling hack being a sysdep for x86.  Other architectures that
> >> want to use the latter should #include it and also post microbenchmark
> >> results so that we keep track of how we arrived at that decision.
> >
> > I'm happy to switch it back but I don't think the scheduling hack is x86
> > oriented. I don't think re-ordering could ever de-optimize things.
> > The only real architectural assumption is a reasonably fast
> > 32-bit multiply which is true for both the more generic earlier version
> > and the current one.
>
> I don't entirely disagree, but I think the conservative stance here is
> to keep the scheduling hack in the sysdep that has actually been tested
> and then bring it out into generic if it has been experimentally
> verified to be a universal win for all architectures we support.  That
> is, if we find that every architecture is including the sysdep version,
> then it's time to bring it out to replace the generic version.
>
> FWIW, I'm lowering the bar for acceptance because you only need to
> verify that the scheduling hack is better for the architecture you're
> interested in, not all architectures we support :)

Made the asm statements arch specific in V9.

>
> Thanks,
> Siddhesh
  
Alexander Monakov May 16, 2022, 8:33 p.m. UTC | #17
On Mon, 16 May 2022, Noah Goldstein wrote:

> > With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
> > aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
> > decides to emit madd vs. 2 cycles if it's only additions.
> 
> AFAIK the shift-by-5 + 2x adds is the best that exists at the moment but
> the point is that some arch we either aren't thinking about, or some future
> variant, may implement a <= 2 cycle madd.

Maybe they'll manage to fit two dependent additions in a single cycle like it
was on Pentium 4, too ;)

Alexander
  
Noah Goldstein May 16, 2022, 9:40 p.m. UTC | #18
On Mon, May 16, 2022 at 3:33 PM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Mon, 16 May 2022, Noah Goldstein wrote:
>
> > > With the barrier I'd expect a shift-by-5 and two additions, no madd. Modern
> > > aarch64 cores have 3-cycle madd I believe, so it's 3 cycles if the compiler
> > > decides to emit madd vs. 2 cycles if it's only additions.
> >
> > AFAIK the shift-by-5 + 2x adds is the best that exists at the moment but
> > the point is that some arch we either aren't thinking about, or some future
> > variant, may implement a <= 2 cycle madd.
>
> Maybe they'll manage to fit two dependent additions in a single cycle like it
> was on Pentium 4, too ;)

Yes, but the barriers only preclude the MADD; they don't inherently preclude
2x dependent ADDs. I think the barrier code is better, and the comments/commit
message recommend that arch maintainers use it, but I think there is a fair
case that this is arch dependent.
>
> Alexander
  
Siddhesh Poyarekar May 17, 2022, 1:45 a.m. UTC | #19
On 17/05/2022 00:58, Alexander Monakov wrote:
>> I did explain; I am not comfortable with controlling instruction scheduling in
>> that manner for generic code because it assumes more about underlying
>> processor pipelines and instruction sequences than we typically do in generic
>> code.  It has nothing to do with portability. Adhemerval raised the question
>> about whether this ought to be done in gcc instead, which I concurred with
>> too.
> 
> Thank you very much for the detailed response. Allow me to clear up what seems
> to be a technical misunderstanding here: this is not about instruction
> scheduling, but rather dependencies in the computations (I know Noah mentioned
> scheduling, but it's confusing especially in context of benchmarking for an
> out-of-order CPU).
> 
> I have shown how different variants have different chains of dependencies in
> this email: https://sourceware.org/pipermail/libc-alpha/2022-May/138495.html

Agreed, but again the latencies due to that dependency graph may have
more or less impact depending on the architecture and are ultimately
linked to the code schedule, so the difference is academic IMO.

> The empty asms are used to prevent compiler reassociating 'h*32 + (h + c)'
> to '(h*32 + h) + c' which looks fine in isolation, but significantly changes
> the dependency graph in context of the whole loop.
> 
> There's nothing specific to the x86 architecture in this reasoning. On arm and
> aarch64 it's moot because they evaluate 'h*32 + h' in a single cycle, though.

That may well be true, but there are always architecture quirks to throw 
one off that may have been missed in testing or may turn up later.  Like 
I conceded before, it may well be that my concerns are unfounded and 
that gcc will generate the best code for all architectures with those 
barriers in place but that choice should be explicitly made based on 
benchmarks.

Siddhesh
  

Patch

diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
index 40d88c81f9..5269f6eb98 100644
--- a/elf/dl-new-hash.h
+++ b/elf/dl-new-hash.h
@@ -20,15 +20,71 @@ 
 #define _DL_NEW_HASH_H 1
 
 #include <stdint.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
+
+/* The simplest implementation of _dl_new_hash is:
+
+_dl_new_hash (const char *s)
+{
+   uint32_t h = 5381;
+   for (unsigned char c = *s; c != '\0'; c = *++s)
+     h = h * 33 + c;
+   return h;
+}
+
+   We can get better performance, however, by slightly unrolling the
+   loop and explicitly specifying order of operations to prevent
+   reassociation of instructions and ensure ideal scheduling.  */
 
 static inline uint32_t
 __attribute__ ((unused))
-_dl_new_hash (const char *s)
+_dl_new_hash (const char *str)
 {
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
+  const unsigned char *s = (const unsigned char *) str;
+  unsigned int h = 5381;
+  unsigned int c0, c1;
+  for (;;)
+    {
+      c0 = s[0];
+      /* Zero-length strings are unlikely, so even lengths (which
+	 terminate here) will be slightly less common.  */
+      if (__glibc_unlikely (c0 == 0))
+	return h;
+
+      c1 = s[1];
+      if (c1 == 0)
+	{
+	  c0 += h;
+	  /* Ideal instruction scheduling is:
+	 c0 += h;
+	 h *= 32;
+	 h += c0;
+
+	 The asm statements are to prevent reassociation that would result in
+	 more instruction interdependencies and worse scheduling.  */
+	  asm("" : "+r"(h) : "r"(c0));
+	  h = h * 32 + c0;
+	  return h;
+	}
+
+      /* Ideal instruction scheduling is:
+	 c1 += c0;
+	 h *= 33 * 33;
+	 c0 *= 32;
+	 c1 += c0;
+	 h  += c1;
+
+	 The asm statements are to prevent reassociation that would result in
+	 more instruction interdependencies and worse scheduling.  */
+      c1 += c0;
+      asm("" : "+r"(c1), "+r"(c0));
+      h *= 33 * 33;
+      c1 += c0 * 32;
+      asm("" : "+r"(c1));
+      h += c1;
+      s += 2;
+    }
 }
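
A quick equivalence check (an editorial sketch, not part of the patch:
the bodies below are copied from the reference comment and the diff
above, minus the asm barriers, which constrain codegen but not the
computed value):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Reference: the simple recurrence from the header comment.  */
static uint32_t
ref_hash (const char *s)
{
  uint32_t h = 5381;
  for (unsigned char c = *s; c != '\0'; c = *++s)
    h = h * 33 + c;
  return h;
}

/* The unrolled variant from the patch, without the asm barriers.  */
static uint32_t
unrolled_hash (const char *str)
{
  const unsigned char *s = (const unsigned char *) str;
  unsigned int h = 5381;
  unsigned int c0, c1;
  for (;;)
    {
      c0 = s[0];
      if (c0 == 0)
        return h;
      c1 = s[1];
      if (c1 == 0)
        {
          c0 += h;
          return h * 32 + c0;
        }
      c1 += c0;
      h *= 33 * 33;
      c1 += c0 * 32;
      h += c1;
      s += 2;
    }
}

int
main (void)
{
  char buf[257];
  for (int trial = 0; trial < 100000; trial++)
    {
      int len = rand () % 256;
      for (int i = 0; i < len; i++)
        buf[i] = 1 + rand () % 255;  /* avoid embedded NUL bytes */
      buf[len] = '\0';
      if (ref_hash (buf) != unrolled_hash (buf))
        {
          printf ("FAIL at length %d\n", len);
          return 1;
        }
    }
  puts ("PASS");
  return 0;
}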