From patchwork Mon Nov 18 13:58:25 2019
X-Patchwork-Submitter: Florian Weimer <fweimer@redhat.com>
X-Patchwork-Id: 36003
From: Florian Weimer <fweimer@redhat.com>
To: libc-alpha@sourceware.org
Subject: Optimizing hash table lookup in symbol binding
Date: Mon, 18 Nov 2019 14:58:25 +0100
Message-ID: <87lfsd477i.fsf@oldenburg2.str.redhat.com>

I investigated optimizing the hash table lookup used for symbol
binding.  This is about this code in elf/dl-lookup.c:

  /* The tables for this map.  */
  const ElfW(Sym) *symtab = (const void *) D_PTR (map, l_info[DT_SYMTAB]);
  const char *strtab = (const void *) D_PTR (map, l_info[DT_STRTAB]);

  const ElfW(Sym) *sym;
  const ElfW(Addr) *bitmask = map->l_gnu_bitmask;
  if (__glibc_likely (bitmask != NULL))
    {
      ElfW(Addr) bitmask_word
        = bitmask[(new_hash / __ELF_NATIVE_CLASS)
                  & map->l_gnu_bitmask_idxbits];

      unsigned int hashbit1 = new_hash & (__ELF_NATIVE_CLASS - 1);
      unsigned int hashbit2 = ((new_hash >> map->l_gnu_shift)
                               & (__ELF_NATIVE_CLASS - 1));

      if (__glibc_unlikely ((bitmask_word >> hashbit1)
                            & (bitmask_word >> hashbit2) & 1))
        {
          Elf32_Word bucket = map->l_gnu_buckets[new_hash
                                                 % map->l_nbuckets];
          if (bucket != 0)
            {
              const Elf32_Word *hasharr = &map->l_gnu_chain_zero[bucket];

              do
                if (((*hasharr ^ new_hash) >> 1) == 0)
                  {
                    symidx = ELF_MACHINE_HASH_SYMIDX (map, hasharr);
                    sym = check_match (undef_name, ref, version, flags,
                                       type_class, &symtab[symidx], symidx,
                                       strtab, map, &versioned_sym,
                                       &num_versions);
                    if (sym != NULL)
                      goto found_it;
                  }
              while ((*hasharr++ & 1u) == 0);
            }
        }
      /* No symbol found.  */
      symidx = SHN_UNDEF;
    }
  else

My primary interest was the % operator, because it turns out to show
up in some profiles that stress symbol binding during program
startup.  In most cases, however, the search for the right mapping
dominates, and the preceding bitmask check fails most of the time.
But with shallow library dependencies, we can end up in a situation
where the division actually matters.

Strategies for optimizing integer division are discussed in Hacker's
Delight and in a series of blog posts ("episodes") on the subject.
(I have written to the author to get some of the math fixed in minor
ways, but I think the general direction is solid.)
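[As background, here is a minimal, self-contained illustration of the
underlying multiply-and-shift idea, using the well-known magic number
for dividing by 3; the names and the test loop here are mine, not
part of any of the patches below.]

#include <assert.h>
#include <stdint.h>

/* For every 32-bit x, x / 3 == (x * 0xAAAAAAAB) >> 33, where
   0xAAAAAAAB is ceil (2**33 / 3): the division becomes one 32x32→64
   multiplication and a shift.  The patches below compute such
   multiplier/shift pairs at run time, for the hash bucket count.  */
static uint32_t
div3 (uint32_t x)
{
  return ((uint64_t) x * 0xAAAAAAABu) >> 33;
}

int
main (void)
{
  for (uint64_t x = 0; x <= UINT32_MAX; x += 12345)
    assert (div3 (x) == x / 3);
  assert (div3 (UINT32_MAX) == UINT32_MAX / 3);
  return 0;
}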
The algorithm from the first episode is the one implemented in the
first patch below (elf/dl-lookup.c plus the new include/divopt.h
file).

Benchmarking results are mixed.  As I said, for deep DSO dependency
chains, the bitmask check clearly dominates.  I tried to benchmark
this directly using dlsym, with the dladdr avoidance patch here:

  dlsym: Do not determine caller link map if not needed

On a second-generation AMD EPYC, I didn't see a difference at all,
for some reason.  On Cascade Lake, I see a moderate improvement for
the dlsym test, but I don't know how realistic this microbenchmark
is.  Both patches performed on par with each other.

I also tried to remove the bitmask check altogether, but it was not
an improvement.  I suspect that because the bitmasks are much
smaller, consulting them avoids the cache misses of the full table
lookup.

If any of the architecture maintainers think this is worth doing, we
can still incorporate it.

Thanks,
Florian

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index fd44cd4101..205d043717 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -28,6 +28,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <divopt.h>
 #include <...>
 
@@ -394,8 +395,18 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash,
       if (__glibc_unlikely ((bitmask_word >> hashbit1)
                             & (bitmask_word >> hashbit2) & 1))
         {
-          Elf32_Word bucket = map->l_gnu_buckets[new_hash
-                                                 % map->l_nbuckets];
+          Elf32_Word bucket;
+          if (map->l_nbuckets > 1)
+            {
+              uint32_t quotient
+                = divopt_32 (new_hash, map->l_nbuckets_multiplier,
+                             map->l_nbuckets_multiplier_shift);
+              uint32_t remainder = new_hash - map->l_nbuckets * quotient;
+              bucket = map->l_gnu_buckets[remainder];
+            }
+          else
+            bucket = map->l_gnu_buckets[0];
+
           if (bucket != 0)
             {
               const Elf32_Word *hasharr = &map->l_gnu_chain_zero[bucket];
@@ -931,6 +942,11 @@ _dl_setup_hash (struct link_map *map)
 
   /* Initialize MIPS xhash translation table.  */
   ELF_MACHINE_XHASH_SETUP (hash32, symbias, map);
 
+  if (map->l_nbuckets >= 2)
+    map->l_nbuckets_multiplier_shift
+      = precompute_divopt_32 (map->l_nbuckets,
+                              &map->l_nbuckets_multiplier);
+
   return;
 }
diff --git a/include/divopt.h b/include/divopt.h
new file mode 100644
index 0000000000..85eece8fd9
--- /dev/null
+++ b/include/divopt.h
@@ -0,0 +1,78 @@
+/* Optimization of repeated integer division.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include <sys/param.h>
+
+/* Precompute *MULTIPLIER for dividing by DIVISOR, which must be two
+   or larger, and return the shift count (non-negative and less than
+   32), for use with divopt_32 below.  */
+static int __attribute__ ((used))
+precompute_divopt_32 (uint32_t divisor, uint32_t *multiplier)
+{
+  if (divisor == 1)
+    {
+      *multiplier = 1;
+      return 0;
+    }
+
+  int log2 = 32 - __builtin_clz (divisor);
+
+  /* Handle powers-of-two first, so that we do not need to deal with
+     the clz corner cases below.  */
+  if (powerof2 (divisor))
+    {
+      *multiplier = 1;
+      return log2 - 2;
+    }
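[To sanity-check the divopt scheme, the following standalone harness
can be compiled against the include/divopt.h file from the patch
above, saved as "divopt.h" next to it.  The harness is my sketch,
not part of the patch; it exercises the power-of-two path, the
common path, and the long-division path for divisors above 2**31.]

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#include "divopt.h"  /* The new header from the patch above.  */

int
main (void)
{
  /* Divisors of various shapes: small primes, powers of two, and
     values at and above 2**31, which take the long-division path.  */
  static const uint32_t divisors[] =
    { 2, 3, 7, 1009, 4096, 0x7fffffffu, 0x80000000u, 0x80000001u,
      0xffffffffu };

  for (size_t i = 0; i < sizeof divisors / sizeof divisors[0]; i++)
    {
      uint32_t d = divisors[i];
      uint32_t multiplier;
      int shift = precompute_divopt_32 (d, &multiplier);

      /* Check a spread of dividends plus the boundary cases.  */
      for (uint64_t x = 0; x <= UINT32_MAX; x += 99991)
        assert (divopt_32 (x, multiplier, shift) == x / d);
      assert (divopt_32 (d - 1, multiplier, shift) == 0);
      assert (divopt_32 (d, multiplier, shift) == 1);
      assert (divopt_32 (UINT32_MAX, multiplier, shift) == UINT32_MAX / d);

      printf ("divisor %10u: multiplier %10u, shift %2d: OK\n",
              (unsigned) d, (unsigned) multiplier, shift);
    }
  return 0;
}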
+
+  if (log2 != 32)
+    {
+      /* Compute ceil (2**(32 + log2) / divisor).  The
+         most-significant bit is always set and is discarded.  */
+      *multiplier = (((uint64_t) 1 << (32 + log2)) + divisor) / divisor;
+      return log2 - 1;
+    }
+  else
+    {
+      /* Perform a long division of 2**64 + (divisor - 1) by the
+         divisor, encoded in base-2**32, using a 64-by-32 division.
+         Start out with the first two digits, which are (1, 0).  2**32
+         divided by the divisor is 1 because the divisor is larger
+         than 2**31.  This set bit is discarded.  */
+      uint64_t remainder = -divisor;
+
+      /* Combine the remainder of the first division with the third
+         and final base 2**32 digit.  */
+      *multiplier = ((remainder << 32) | (divisor - 1)) / divisor;
+      return 31;
+    }
+}
+
+/* Return the quotient of DIVIDEND divided by the divisor that was
+   used to compute MULTIPLIER and SHIFT via precompute_divopt_32.  */
+static inline uint32_t
+divopt_32 (uint32_t dividend, uint32_t multiplier, int shift)
+{
+  /* Approximation to the quotient.  */
+  uint32_t quotient = ((uint64_t) dividend * multiplier) >> 32;
+  /* Compute (dividend + quotient) / 2 without overflow.  */
+  uint32_t temp = ((dividend - quotient) >> 1) + quotient;
+  /* The result is in the higher-order bits.  */
+  return temp >> shift;
+}
diff --git a/include/link.h b/include/link.h
index 1184201f91..b09aa81bb4 100644
--- a/include/link.h
+++ b/include/link.h
@@ -153,6 +153,8 @@ struct link_map
 
     /* Symbol hash table.  */
     Elf_Symndx l_nbuckets;
+    uint32_t l_nbuckets_multiplier;
+    int l_nbuckets_multiplier_shift;
     Elf32_Word l_gnu_bitmask_idxbits;
     Elf32_Word l_gnu_shift;
    const ElfW(Addr) *l_gnu_bitmask;

It only requires a 32x32→64 multiplier, which is an advantage.
However, the dependency chain in divopt_32 is fairly long, so
performance is less than optimal.

The Episode 3 approach as described is not fully general, so it
would work well only if we told binutils ld to avoid bucket counts
that are problematic for it.  (In both cases, we need run-time
checks to avoid the unsupported cases, but if we could work together
with binutils, this data-dependent branch should be easy to predict
in practice.)
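[One shape such a run-time check could take, sketched here by me and
not part of either patch: it uses the floor (2**64 / divisor)
multiplier that the variant below stores.  Because the approximate
quotient can only undershoot and is nondecreasing in the dividend,
it suffices to test the first dividend of each quotient interval,
i.e. the multiples of the divisor, so the check is quick for
realistic bucket counts.]

#include <stdbool.h>
#include <stdint.h>
#include <sys/param.h>  /* For powerof2.  */

/* Return true if quotient = (m * (n + 1)) >> 64, with
   m = floor (2**64 / divisor), is exact for every 32-bit dividend n.
   Power-of-two divisors always fail (m is then exact, so the + 1
   pushes some quotients one too high); the patch below handles them
   with a mask instead.  */
static bool
round_down_multiplier_works (uint32_t divisor)
{
  if (divisor < 2 || powerof2 (divisor))
    return false;

  unsigned __int128 multiplier
    = ((unsigned __int128) 1 << 64) / divisor;

  /* The approximation can only be too low and is nondecreasing, so
     if it is right at n = k * divisor, it is right on all of
     [k * divisor, (k + 1) * divisor - 1].  */
  for (uint64_t n = 0; n <= UINT32_MAX; n += divisor)
    if ((uint32_t) ((multiplier * (n + 1)) >> 64) != n / divisor)
      return false;

  return true;
}

[binutils ld could apply such a filter when choosing bucket counts,
and _dl_setup_hash could fall back to the plain % path for counts
that fail it.]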
There is a variant of the Episode 3 approach which uses a 64x64→128
multiplication, for a much shorter dependency chain.  For
convenience, I used the __int128 support from libgcc below, but that
could probably be avoided on 64-bit architectures with suitable
multipliers:

diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c
index fd44cd4101..4d2f3b91f0 100644
--- a/elf/dl-lookup.c
+++ b/elf/dl-lookup.c
@@ -394,8 +395,17 @@ do_lookup_x (const char *undef_name, uint_fast32_t new_hash,
       if (__glibc_unlikely ((bitmask_word >> hashbit1)
                             & (bitmask_word >> hashbit2) & 1))
         {
-          Elf32_Word bucket = map->l_gnu_buckets[new_hash
-                                                 % map->l_nbuckets];
+          Elf32_Word bucket;
+          if (powerof2 (map->l_nbuckets))
+            bucket = map->l_gnu_buckets[new_hash & (map->l_nbuckets - 1)];
+          else
+            {
+              uint32_t quotient
+                = ((unsigned __int128) map->l_nbuckets_multiplier * ((uint64_t) new_hash + 1)) >> 64;
+              uint32_t remainder = new_hash - map->l_nbuckets * quotient;
+              bucket = map->l_gnu_buckets[remainder];
+            }
+
           if (bucket != 0)
             {
               const Elf32_Word *hasharr = &map->l_gnu_chain_zero[bucket];
@@ -931,6 +941,10 @@ _dl_setup_hash (struct link_map *map)
 
   /* Initialize MIPS xhash translation table.  */
   ELF_MACHINE_XHASH_SETUP (hash32, symbias, map);
 
+  if (powerof2 (map->l_nbuckets))
+    map->l_nbuckets_multiplier = __builtin_ctz (map->l_nbuckets);
+  else
+    map->l_nbuckets_multiplier = ((unsigned __int128) 1 << 64) / map->l_nbuckets;
   return;
 }
diff --git a/include/link.h b/include/link.h
index 1184201f91..eec4c9ef6e 100644
--- a/include/link.h
+++ b/include/link.h
@@ -153,6 +153,7 @@ struct link_map
 
     /* Symbol hash table.  */
     Elf_Symndx l_nbuckets;
+    uint64_t l_nbuckets_multiplier;
     Elf32_Word l_gnu_bitmask_idxbits;
     Elf32_Word l_gnu_shift;
     const ElfW(Addr) *l_gnu_bitmask;
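[A closing illustration of that last remark, again my sketch and not
part of the patch; mulhi64 is a hypothetical helper name.  The only
128-bit piece the lookup needs is the high half of one 64x64
multiplication, which can be written portably with 32-bit limbs.]

#include <stdint.h>

/* High 64 bits of the 128-bit product a * b, using only 64-bit
   arithmetic.  In the lookup above, b would be new_hash + 1, which
   is at most 2**32, but the helper handles the general case.  */
static uint64_t
mulhi64 (uint64_t a, uint64_t b)
{
  uint64_t a_lo = (uint32_t) a, a_hi = a >> 32;
  uint64_t b_lo = (uint32_t) b, b_hi = b >> 32;

  uint64_t lo_lo = a_lo * b_lo;
  uint64_t lo_hi = a_lo * b_hi;
  uint64_t hi_lo = a_hi * b_lo;
  uint64_t hi_hi = a_hi * b_hi;

  /* Middle partial products plus the carry out of the low word;
     this sum cannot overflow 64 bits.  */
  uint64_t mid = (lo_lo >> 32) + (uint32_t) lo_hi + (uint32_t) hi_lo;

  return hi_hi + (lo_hi >> 32) + (hi_lo >> 32) + (mid >> 32);
}

[With such a helper, the quotient computation in the patch becomes
mulhi64 (map->l_nbuckets_multiplier, (uint64_t) new_hash + 1), with
no libgcc dependency.]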