Improve RTL CSE hash table hash usage

Message ID 20230203140650.E72031346D@imap2.suse-dmz.suse.de
State Committed
Commit aeeec83b0e26edf43e1460d2e85f62cb9bc57439
Series Improve RTL CSE hash table hash usage

Commit Message

Richard Biener Feb. 3, 2023, 2:06 p.m. UTC
  The RTL CSE hash table has a fixed number of buckets (32) each
with a linked list of entries with the same hash value.  The
actual hash values are computed using hash_rtx which uses adds
for mixing and adds the rtx CODE as CODE << 7 (apart from some
exceptions such as MEM).  The unsigned int typed hash value
is then simply truncated for the actual lookup into the fixed
size table which means that usually CODE is simply lost.

The following improves this truncation by first mixing in more
bits using xor.  It does not change the actual hash function
since that's used outside of CSE as well.
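
As an illustration of the effect (an editorial sketch, not code from
the patch or from GCC: HASH_SHIFT is assumed to be 5, matching the 32
buckets mentioned above, and the hashes are given the CODE << 7 shape
described for the common case):

#include <stdio.h>

/* Sketch only: old_bucket mimics the plain truncation, new_bucket the
   xor mixing done by the patch, both with HASH_SHIFT == 5.  */
#define SKETCH_HASH_SHIFT 5
#define SKETCH_HASH_MASK ((1u << SKETCH_HASH_SHIFT) - 1)

static unsigned
old_bucket (unsigned h)
{
  return h & SKETCH_HASH_MASK;
}

static unsigned
new_bucket (unsigned h)
{
  return (h ^ (h >> SKETCH_HASH_SHIFT)) & SKETCH_HASH_MASK;
}

int
main (void)
{
  /* Two hashes that differ only in the CODE << 7 contribution
     (40 and 41 are arbitrary example code values).  */
  unsigned h1 = (40u << 7) + 3;
  unsigned h2 = (41u << 7) + 3;
  printf ("old: %u %u\n", old_bucket (h1), old_bucket (h2)); /* old: 3 3 */
  printf ("new: %u %u\n", new_bucket (h1), new_bucket (h2)); /* new: 3 7 */
  return 0;
}

With the plain mask both expressions land in bucket 3 because only the
low five bits survive; the xor folds bits 5-9, and with them the low
bits of CODE, back into the bucket index.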

An alternative would be to bump the fixed number of buckets,
say to 256 which would retain the LSB of CODE or to 8192 which
can capture all 6 bits required for the last CODE.
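
Spelling out the arithmetic behind those numbers (assuming, as above,
that CODE enters the hash as CODE << 7):

  32 buckets   = 2^5   mask 0x1f    keeps hash bits 0-4,  no bit of CODE
  256 buckets  = 2^8   mask 0xff    keeps hash bits 0-7,  bit 7 = LSB of CODE
  8192 buckets = 2^13  mask 0x1fff  keeps hash bits 0-12, bits 7-12 = 6 bits of CODE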

As the comment in CSE says, there's invalidate_memory and
flush_hash_table done possibly frequently and those at least
need to walk all slots, so when the hash table is mostly empty
enlarging it will be a loss.  Still, there should be more
regular lookups by hash, so fewer collisions should pay off
as well.
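
For context, a simplified sketch (editorial, not the actual cse.cc
code) of the kind of whole-table walk those two operations imply; the
cost grows with the number of buckets rather than with the number of
live entries, which is why enlarging a mostly empty table hurts:

/* Sketch only: a flush has to visit every bucket, even the empty ones.  */
#define SKETCH_HASH_SIZE 32

struct sketch_elt { struct sketch_elt *next_same_hash; };
static struct sketch_elt *sketch_table[SKETCH_HASH_SIZE];

static void
sketch_flush_table (void)
{
  for (unsigned i = 0; i < SKETCH_HASH_SIZE; i++)
    {
      for (struct sketch_elt *p = sketch_table[i], *next; p; p = next)
        {
          next = p->next_same_hash;
          /* ... return p to a free list ... */
        }
      sketch_table[i] = 0;
    }
}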

Without enlarging the table a better hash function is unlikely
to make a big difference; simple statistics on the number of
collisions at insertion time show a reduction of around 10%.
Bumping HASH_SHIFT by 1 improves that to 30% at the expense of
reducing the average table fill by 10% (all of these stats are
from looking just at fold-const.i at -O2).  Increasing
HASH_SHIFT further leaves the table even more sparse, likely
showing that hash_rtx uses add for mixing, which is quite bad.
Bumping HASH_SHIFT by 2 removes 90% of all collisions.
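
For reference, a sketch of the kind of counter such numbers could come
from (editorial, not the instrumentation actually used here): an
insertion counts as a collision whenever its bucket already holds an
entry.

/* Sketch only: collision accounting at insertion time.  */
struct sketch_elt;  /* only used through pointers here */

static unsigned long sketch_insertions, sketch_collisions;

static void
sketch_count_insertion (struct sketch_elt **table, unsigned bucket)
{
  sketch_insertions++;
  if (table[bucket])
    sketch_collisions++;
}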

Experimenting with using inchash instead of adds for the
mixing does not improve things when looking at the numbers
with HASH_SHIFT bumped by 2.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

Any opinions?

	* cse.cc (HASH): Turn into inline function and mix
	in another HASH_SHIFT bits.
	(SAFE_HASH): Likewise.
---
 gcc/cse.cc | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)
  

Comments

Richard Sandiford Feb. 3, 2023, 2:19 p.m. UTC | #1
Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> The RTL CSE hash table has a fixed number of buckets (32) each
> with a linked list of entries with the same hash value.  The
> actual hash values are computed using hash_rtx which uses adds
> for mixing and adds the rtx CODE as CODE << 7 (apart from some
> exceptions such as MEM).  The unsigned int typed hash value
> is then simply truncated for the actual lookup into the fixed
> size table which means that usually CODE is simply lost.
>
> The following improves this truncation by first mixing in more
> bits using xor.  It does not change the actual hash function
> since that's used outside of CSE as well.
>
> An alternative would be to bump the fixed number of buckets,
> say to 256 which would retain the LSB of CODE or to 8192 which
> can capture all 6 bits required for the last CODE.
>
> As the comment in CSE says, there's invalidate_memory and
> flush_hash_table done possibly frequently and those at least
> need to walk all slots, so when the hash table is mostly empty
> enlarging it will be a loss.  Still, there should be more
> regular lookups by hash, so fewer collisions should pay off
> as well.

Going purely from this description and without having looked
at the code properly, would it be possible to link all current
values together, not just those with the same hash?  And would
that help?  It looks like the list is already doubly-linked,
and there's spare room to store a "start of new hash" marker.

Thanks,
Richard
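
One purely illustrative shape of the suggested layout (the type and
field names below are invented for the sketch and are not code from
the thread): every live element sits on one doubly-linked list,
entries with the same hash are kept adjacent, and the first entry of
each hash run carries the marker, so whole-table walks only touch
live entries.

/* Illustrative sketch only.  */
struct sketch_elt
{
  struct sketch_elt *prev_any, *next_any; /* one list over all values  */
  unsigned hash;
  bool first_of_hash;                     /* "start of new hash" marker  */
};

/* Whole-table invalidation can then walk just the live entries instead
   of every bucket.  */
static void
sketch_flush_all (struct sketch_elt **head)
{
  for (struct sketch_elt *p = *head, *next; p; p = next)
    {
      next = p->next_any;
      /* ... unlink and free p ... */
    }
  *head = 0;
}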

  
Richard Biener Feb. 3, 2023, 3:44 p.m. UTC | #2
> On 03.02.2023 at 15:20, Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> 
> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> The RTL CSE hash table has a fixed number of buckets (32) each
>> with a linked list of entries with the same hash value.  The
>> actual hash values are computed using hash_rtx which uses adds
>> for mixing and adds the rtx CODE as CODE << 7 (apart from some
>> exceptions such as MEM).  The unsigned int typed hash value
>> is then simply truncated for the actual lookup into the fixed
>> size table which means that usually CODE is simply lost.
>> 
>> The following improves this truncation by first mixing in more
>> bits using xor.  It does not change the actual hash function
>> since that's used outside of CSE as well.
>> 
>> An alternative would be to bump the fixed number of buckets,
>> say to 256 which would retain the LSB of CODE or to 8192 which
>> can capture all 6 bits required for the last CODE.
>> 
>> As the comment in CSE says, there's invalidate_memory and
>> flush_hash_table done possibly frequently and those at least
>> need to walk all slots, so when the hash table is mostly empty
>> enlarging it will be a loss.  Still, there should be more
>> regular lookups by hash, so fewer collisions should pay off
>> as well.
> 
> Going purely from this description and without having looked
> at the code properly, would it be possible to link all current
> values together, not just those with the same hash?  And would
> that help?  It looks like the list is already doubly-linked,
> and there's spare room to store a "start of new hash" marker.

We already do have equivalent values linked, but I’m not sure that’s what you are suggesting.  Those should also have the same hash value, so both lists are somewhat redundant, and we might be able to save some storage here by making this a list of lists, i.e. a same-hash list of same-value lists?

  
Richard Sandiford Feb. 3, 2023, 3:54 p.m. UTC | #3
Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> On 03.02.2023 at 15:20, Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>> 
>> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>>> The RTL CSE hash table has a fixed number of buckets (32) each
>>> with a linked list of entries with the same hash value.  The
>>> actual hash values are computed using hash_rtx which uses adds
>>> for mixing and adds the rtx CODE as CODE << 7 (apart from some
>>> exceptions such as MEM).  The unsigned int typed hash value
>>> is then simply truncated for the actual lookup into the fixed
>>> size table which means that usually CODE is simply lost.
>>> 
>>> The following improves this truncation by first mixing in more
>>> bits using xor.  It does not change the actual hash function
>>> since that's used outside of CSE as well.
>>> 
>>> An alternative would be to bump the fixed number of buckets,
>>> say to 256 which would retain the LSB of CODE or to 8192 which
>>> can capture all 6 bits required for the last CODE.
>>> 
>>> As the comment in CSE says, there's invalidate_memory and
>>> flush_hash_table done possibly frequently and those at least
>>> need to walk all slots, so when the hash table is mostly empty
>>> enlarging it will be a loss.  Still, there should be more
>>> regular lookups by hash, so fewer collisions should pay off
>>> as well.
>> 
>> Going purely from this description and without having looked
>> at the code properly, would it be possible to link all current
>> values together, not just those with the same hash?  And would
>> that help?  It looks like the list is already doubly-linked,
>> and there's spare room to store a "start of new hash" marker.
>
> We already do have equivalent values linked, but I’m not sure that’s what you are suggesting.

I was thinking of linking every active value in the table together,
but with entries for the same hash being consecutive.  That way, things
like invalidate_memory can just walk the list and ignore the hash table.

> Those should also have the same hash value, so both lists are somewhat redundant, and we might be able to save some storage here by making this a list of lists, i.e. a same-hash list of same-value lists?

I thought the value-equality list was to establish that (e.g.)
(reg R1) and (reg R2) are known to have the same value, despite
being different expressions with different hash values.

But I suppose if we reused an existing hash table structure (with its
own mechanism for handling collisions), it would make sense to use the
equivalent-value list to join everything together, rather than the
same-hash list.  Again, there could be a marker to establish the start
of a new equivalent-value sublist.

Thanks,
Richard
  
Richard Biener Feb. 3, 2023, 7:05 p.m. UTC | #4
> On 03.02.2023 at 16:55, Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> 
> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>>>> On 03.02.2023 at 15:20, Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>>> 
>>> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>>>> The RTL CSE hash table has a fixed number of buckets (32) each
>>>> with a linked list of entries with the same hash value.  The
>>>> actual hash values are computed using hash_rtx which uses adds
>>>> for mixing and adds the rtx CODE as CODE << 7 (apart from some
>>>> exceptions such as MEM).  The unsigned int typed hash value
>>>> is then simply truncated for the actual lookup into the fixed
>>>> size table which means that usually CODE is simply lost.
>>>> 
>>>> The following improves this truncation by first mixing in more
>>>> bits using xor.  It does not change the actual hash function
>>>> since that's used outside of CSE as well.
>>>> 
>>>> An alternative would be to bump the fixed number of buckets,
>>>> say to 256 which would retain the LSB of CODE or to 8192 which
>>>> can capture all 6 bits required for the last CODE.
>>>> 
>>>> As the comment in CSE says, there's invalidate_memory and
>>>> flush_hash_table done possibly frequently and those at least
>>>> need to walk all slots, so when the hash table is mostly empty
>>>> enlarging it will be a loss.  Still, there should be more
>>>> regular lookups by hash, so fewer collisions should pay off
>>>> as well.
>>> 
>>> Going purely from this description and without having looked
>>> at the code properly, would it be possible to link all current
>>> values together, not just those with the same hash?  And would
>>> that help?  It looks like the list is already doubly-linked,
>>> and there's spare room to store a "start of new hash" marker.
>> 
>> We already do have equivalent values linked, but I’m not sure that’s what you are suggesting.
> 
> I was thinking of linking every active value in the table together,
> but with entries for the same hash being consecutive.  That way, things
> like invalidate_memory can just walk the list and ignore the hash table.

Ah, yeah.  Even better might be a generation count for memory like there’s one (but only for a subset of cases?!) for pseudos.  That would avoid the walking altogether.
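
A purely illustrative sketch of that generation-count idea (names
invented here, not a concrete proposal from the thread): instead of
walking the table to drop memory entries, bump a counter and treat
entries recorded under an older generation as dead at lookup time.

/* Illustrative sketch only.  */
static unsigned sketch_mem_generation;

struct sketch_elt
{
  bool refers_to_memory;
  unsigned mem_generation; /* sketch_mem_generation at record time  */
  /* ...  */
};

/* O(1) replacement for walking the table to invalidate memory.  */
static void
sketch_invalidate_memory (void)
{
  sketch_mem_generation++;
}

/* At lookup time a stale memory entry is simply treated as absent.  */
static bool
sketch_elt_live_p (const struct sketch_elt *p)
{
  return (!p->refers_to_memory
          || p->mem_generation == sketch_mem_generation);
}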

>> Those should also have the same hash value, so both lists are somewhat redundant, and we might be able to save some storage here by making this a list of lists, i.e. a same-hash list of same-value lists?
> 
> I thought the value-equality list was to establish that (e.g.)
> (reg R1) and (reg R2) are known to have the same value, despite
> being different expressions with different hash values.

I’d have to double check; I was just doing a cursory sweep over the code after being pointed to it from a profile of a testcase.  Most of cse.cc dates back to the initial source revision …

Richard 

  
Richard Biener May 3, 2023, 10:22 a.m. UTC | #5
On Fri, Feb 3, 2023 at 3:07 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> The RTL CSE hash table has a fixed number of buckets (32) each
> with a linked list of entries with the same hash value.  The
> actual hash values are computed using hash_rtx which uses adds
> for mixing and adds the rtx CODE as CODE << 7 (apart from some
> exceptions such as MEM).  The unsigned int typed hash value
> is then simply truncated for the actual lookup into the fixed
> size table which means that usually CODE is simply lost.
>
> The following improves this truncation by first mixing in more
> bits using xor.  It does not change the actual hash function
> since that's used outside of CSE as well.
>
> An alternative would be to bump the fixed number of buckets,
> say to 256 which would retain the LSB of CODE or to 8192 which
> can capture all 6 bits required for the last CODE.
>
> As the comment in CSE says, there's invalidate_memory and
> flush_hash_table done possibly frequently and those at least
> need to walk all slots, so when the hash table is mostly empty
> enlarging it will be a loss.  Still, there should be more
> regular lookups by hash, so fewer collisions should pay off
> as well.
>
> Without enlarging the table a better hash function is unlikely
> to make a big difference; simple statistics on the number of
> collisions at insertion time show a reduction of around 10%.
> Bumping HASH_SHIFT by 1 improves that to 30% at the expense of
> reducing the average table fill by 10% (all of these stats are
> from looking just at fold-const.i at -O2).  Increasing
> HASH_SHIFT further leaves the table even more sparse, likely
> showing that hash_rtx uses add for mixing, which is quite bad.
> Bumping HASH_SHIFT by 2 removes 90% of all collisions.
>
> Experimenting with using inchash instead of adds for the
> mixing does not improve things when looking at the numbers
> with HASH_SHIFT bumped by 2.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> Any opinions?

I have pushed this change now after re-bootstrapping and testing
on x86_64-unknown-linux-gnu.

Richard.

  

Patch

diff --git a/gcc/cse.cc b/gcc/cse.cc
index 37afc88b439..4777e559b86 100644
--- a/gcc/cse.cc
+++ b/gcc/cse.cc
@@ -420,20 +420,6 @@  struct table_elt
 #define HASH_SIZE	(1 << HASH_SHIFT)
 #define HASH_MASK	(HASH_SIZE - 1)
 
-/* Compute hash code of X in mode M.  Special-case case where X is a pseudo
-   register (hard registers may require `do_not_record' to be set).  */
-
-#define HASH(X, M)	\
- ((REG_P (X) && REGNO (X) >= FIRST_PSEUDO_REGISTER	\
-  ? (((unsigned) REG << 7) + (unsigned) REG_QTY (REGNO (X)))	\
-  : canon_hash (X, M)) & HASH_MASK)
-
-/* Like HASH, but without side-effects.  */
-#define SAFE_HASH(X, M)	\
- ((REG_P (X) && REGNO (X) >= FIRST_PSEUDO_REGISTER	\
-  ? (((unsigned) REG << 7) + (unsigned) REG_QTY (REGNO (X)))	\
-  : safe_hash (X, M)) & HASH_MASK)
-
 /* Determine whether register number N is considered a fixed register for the
    purpose of approximating register costs.
    It is desirable to replace other regs with fixed regs, to reduce need for
@@ -586,6 +572,29 @@  static machine_mode cse_cc_succs (basic_block, basic_block, rtx, rtx,
 
 static const struct rtl_hooks cse_rtl_hooks = RTL_HOOKS_INITIALIZER;
 
+/* Compute hash code of X in mode M.  Special-case case where X is a pseudo
+   register (hard registers may require `do_not_record' to be set).  */
+
+static inline unsigned
+HASH (rtx x, machine_mode mode)
+{
+  unsigned h = (REG_P (x) && REGNO (x) >= FIRST_PSEUDO_REGISTER
+		? (((unsigned) REG << 7) + (unsigned) REG_QTY (REGNO (x)))
+		: canon_hash (x, mode));
+  return (h ^ (h >> HASH_SHIFT)) & HASH_MASK;
+}
+
+/* Like HASH, but without side-effects.  */
+
+static inline unsigned
+SAFE_HASH (rtx x, machine_mode mode)
+{
+  unsigned h = (REG_P (x) && REGNO (x) >= FIRST_PSEUDO_REGISTER
+		? (((unsigned) REG << 7) + (unsigned) REG_QTY (REGNO (x)))
+		: safe_hash (x, mode));
+  return (h ^ (h >> HASH_SHIFT)) & HASH_MASK;
+}
+
 /* Nonzero if X has the form (PLUS frame-pointer integer).  */
 
 static bool