diff mbox series

[06/11] elf: Implement a string table for ldconfig, with tail merging

Message ID c98d04641f8ab2f616c2be017d7d0eb918250823.1604946656.git.fweimer@redhat.com
State Superseded
Headers show
Series glibc-hwcaps support | expand

Commit Message

Florian Weimer Nov. 9, 2020, 6:40 p.m. UTC
This will be used in ldconfig to reduce the ld.so.cache size slightly.

Tail merging is an optimization where a pointer points into another
string if the first string is a suffix of the second string.

The hash function FNV-1a was chosen because it is simple and achieves
good dispersion even for short strings (so that the hash table bucket
count can be a power of two).  It is clearly superior to the hsearch
hash and the ELF hash in this regard.

The hash table uses chaining for collision resolution.
---
 elf/Makefile           |   2 +-
 elf/stringtable.c      | 209 +++++++++++++++++++++++++++++++++++++++++
 elf/stringtable.h      |  64 +++++++++++++
 elf/stringtable_free.c |  33 +++++++
 elf/tst-stringtable.c  | 141 +++++++++++++++++++++++++++
 5 files changed, 448 insertions(+), 1 deletion(-)
 create mode 100644 elf/stringtable.c
 create mode 100644 elf/stringtable.h
 create mode 100644 elf/stringtable_free.c
 create mode 100644 elf/tst-stringtable.c

Comments

Adhemerval Zanella Nov. 27, 2020, 7:29 p.m. UTC | #1
On 09/11/2020 15:40, Florian Weimer via Libc-alpha wrote:
> This will be used in ldconfig to reduce the ld.so.cache size slightly.
> 
> Tail merging is an optimization where a pointer points into another
> string if the first string is a suffix of the second string.
> 
> The hash function FNV-1a was chosen because it is simple and achieves
> good dispersion even for short strings (so that the hash table bucket
> count can be a power of two).  It is clearly superior to the hsearch
> hash and the ELF hash in this regard.
> 
> The hash table uses chaining for collision resolution.

LGTM, with just a couple of comments below.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> ---
>  elf/Makefile           |   2 +-
>  elf/stringtable.c      | 209 +++++++++++++++++++++++++++++++++++++++++
>  elf/stringtable.h      |  64 +++++++++++++
>  elf/stringtable_free.c |  33 +++++++
>  elf/tst-stringtable.c  | 141 +++++++++++++++++++++++++++
>  5 files changed, 448 insertions(+), 1 deletion(-)
>  create mode 100644 elf/stringtable.c
>  create mode 100644 elf/stringtable.h
>  create mode 100644 elf/stringtable_free.c
>  create mode 100644 elf/tst-stringtable.c
> 
> diff --git a/elf/Makefile b/elf/Makefile
> index d2f7e99863..5a8f116a67 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -172,7 +172,7 @@ tests-container := \
>  
>  tests := tst-tls9 tst-leaks1 \
>  	tst-array1 tst-array2 tst-array3 tst-array4 tst-array5 \
> -	tst-auxv
> +	tst-auxv tst-stringtable
>  tests-internal := tst-tls1 tst-tls2 $(tests-static-internal)
>  tests-static := $(tests-static-normal) $(tests-static-internal)
>  

Ok.

> diff --git a/elf/stringtable.c b/elf/stringtable.c
> new file mode 100644
> index 0000000000..e5b38148a6
> --- /dev/null
> +++ b/elf/stringtable.c
> @@ -0,0 +1,209 @@
> +/* String tables for ld.so.cache construction.  Implementation.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   This program is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published
> +   by the Free Software Foundation; version 2 of the License, or
> +   (at your option) any later version.
> +
> +   This program is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
> +
> +#include <assert.h>
> +#include <error.h>
> +#include <ldconfig.h>
> +#include <libintl.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <stringtable.h>
> +
> +static void
> +stringtable_init (struct stringtable *table)
> +{
> +  table->count = 0;
> +
> +  /* This needs to be a power of two.  128 is sufficient to keep track
> +     of 42 DSOs without resizing (assuming two strings per DSOs).
> +     glibc itself comes with more than 20 DSOs, so 64 would likely to
> +     be too small.  */

Should we use a large value then? Asking because the comment indicates that
the chosen default value is not best suitable one.

> +  table->allocated = 64;
> +
> +  table->entries = xcalloc (table->allocated, sizeof (table->entries[0]));
> +}
> +
> +/* 32-bit FNV-1a hash function.  */
> +static uint32_t
> +fnv1a (const char *string, size_t length)
> +{
> +  const unsigned char *p = (const unsigned char *) string;
> +  uint32_t hash = 2166136261U;
> +  for (size_t i = 0; i < length; ++i)
> +    {
> +      hash ^= p[i];
> +      hash *= 16777619U;
> +    }
> +  return hash;
> +}
> +

Ok.

> +/* Double the capacity of the hash table.  */
> +static void
> +stringtable_rehash (struct stringtable *table)
> +{
> +  /* Cannot overflow because the old allocation size (in bytes) is
> +     larger.  */

This comment is a bit confusing, I think it should be safe to assume
it won't overflow because stringtable_rehash won't be called for
large values (stringtable_add will bail early).

> +  uint32_t new_allocated = table->allocated * 2;
> +  struct stringtable_entry **new_entries
> +    = xcalloc (new_allocated, sizeof (table->entries[0]));
> +
> +  uint32_t mask = new_allocated - 1;
> +  for (uint32_t i = 0; i < table->allocated; ++i)
> +    for (struct stringtable_entry *e = table->entries[i]; e != NULL; )
> +      {
> +        struct stringtable_entry *next = e->next;
> +        uint32_t hash = fnv1a (e->string, e->length);
> +        uint32_t new_index = hash & mask;
> +        e->next = new_entries[new_index];
> +        new_entries[new_index] = e;
> +        e = next;
> +      }
> +
> +  free (table->entries);
> +  table->entries = new_entries;
> +  table->allocated = new_allocated;
> +}
> +

Ok.

> +struct stringtable_entry *
> +stringtable_add (struct stringtable *table, const char *string)
> +{
> +  /* Check for a zero-initialized table.  */
> +  if (table->allocated == 0)
> +    stringtable_init (table);
> +
> +  size_t length = strlen (string);
> +  if (length > (1U << 30))
> +    error (EXIT_FAILURE, 0, _("String table string is too long"));
> +  uint32_t hash = fnv1a (string, length);
> +
> +  /* Return a previously-existing entry.  */
> +  for (struct stringtable_entry *e
> +         = table->entries[hash & (table->allocated - 1)];
> +       e != NULL; e = e->next)
> +    if (e->length == length && memcmp (e->string, string, length) == 0)
> +      return e;
> +
> +  /* Increase the size of the table if necessary.  Keep utilization
> +     below two thirds.  */
> +  if (table->count >= (1U << 30))
> +    error (EXIT_FAILURE, 0, _("String table has too many entries"));
> +  if (table->count * 3 > table->allocated * 2)
> +    stringtable_rehash (table);
> +
> +  /* Add the new table entry.  */
> +  ++table->count;
> +  struct stringtable_entry *e
> +    = xmalloc (offsetof (struct stringtable_entry, string) + length + 1);
> +  uint32_t index = hash & (table->allocated - 1);
> +  e->next = table->entries[index];
> +  table->entries[index] = e;
> +  e->length = length;
> +  e->offset = 0;
> +  memcpy (e->string, string, length + 1);
> +  return e;
> +}

Ok.

> +
> +/* Sort reversed strings in lexicographic order.  This is used for tail
> +   merging.  */
> +static int
> +finalize_compare (const void *l, const void *r)
> +{
> +  struct stringtable_entry *left = *(struct stringtable_entry **) l;
> +  struct stringtable_entry *right = *(struct stringtable_entry **) r;
> +  size_t to_compare;
> +  if (left->length < right->length)
> +    to_compare = left->length;
> +  else
> +    to_compare = right->length;
> +  for (ssize_t i = to_compare - 1; i >= 0; --i)
> +    {
> +      unsigned char lch = left->string[i];
> +      unsigned char rch = right->string[i];
> +      if (lch != rch)
> +        return lch - rch;
> +    }
> +  if (left->length == right->length)
> +    return 0;
> +  else if (left->length < right->length)
> +    /* Longer strings should come first.  */
> +    return 1;
> +  else
> +    return -1;
> +}
> +

Ok.

> +void
> +stringtable_finalize (struct stringtable *table,
> +                      struct stringtable_finalized *result)
> +{
> +  if (table->count == 0)
> +    {
> +      result->strings = xstrdup ("");
> +      result->size = 0;
> +      return;
> +    }

This is confusing because the folliwing.

  struct stringtable s = { 0, };
  struct stringtable_finalized f;
  stringtable_finalize (&s, &f);
  
will result in f being { "", 0 }, while the following

  struct stringtable s = { 0, };
  stringtable_add (&s, "");
  struct stringtable_finalized f;
  stringtable_finalize (&s, &f);
  
will result in f being { "", 1 }.

I would expect that if strings is being "", the 'size' should be the
same (either 0 or 1 for both usages).

> +
> +  /* Optimize the order of the strings.  */
> +  struct stringtable_entry **array = xcalloc (table->count, sizeof (*array));
> +  {
> +    size_t j = 0;
> +    for (uint32_t i = 0; i < table->allocated; ++i)
> +      for (struct stringtable_entry *e = table->entries[i]; e != NULL;
> +           e = e->next)
> +        {
> +          array[j] = e;
> +          ++j;
> +        }
> +    assert (j == table->count);
> +  }
> +  qsort (array, table->count, sizeof (*array), finalize_compare);
> +
> +  /* Assign offsets, using tail merging (sharing suffixes) if possible.  */
> +  array[0]->offset = 0;
> +  for (uint32_t j = 1; j < table->count; ++j)
> +    {
> +      struct stringtable_entry *previous = array[j - 1];
> +      struct stringtable_entry *current = array[j];
> +      if (previous->length >= current->length
> +          && memcmp (&previous->string[previous->length - current->length],
> +                     current->string, current->length) == 0)
> +        current->offset = (previous->offset + previous->length
> +                           - current->length);
> +      else if (__builtin_add_overflow (previous->offset,
> +                                       previous->length + 1,
> +                                       &current->offset))
> +        error (EXIT_FAILURE, 0, _("String table is too large"));
> +    }
> +
> +  /* Allocate the result string.  */
> +  {
> +    struct stringtable_entry *last = array[table->count - 1];
> +    if (__builtin_add_overflow (last->offset, last->length + 1,
> +                                &result->size))
> +      error (EXIT_FAILURE, 0, _("String table is too large"));
> +  }
> +  /* The strings are copied from the hash table, so the array is no
> +     longer needed.  */
> +  free (array);
> +  result->strings = xcalloc (result->size, 1);
> +
> +  /* Copy the strings.  */
> +  for (uint32_t i = 0; i < table->allocated; ++i)
> +    for (struct stringtable_entry *e = table->entries[i]; e != NULL;
> +         e = e->next)
> +      if (result->strings[e->offset] == '\0')
> +        memcpy (&result->strings[e->offset], e->string, e->length + 1);
> +}

Ok.

> diff --git a/elf/stringtable.h b/elf/stringtable.h
> new file mode 100644
> index 0000000000..7d57d1bda9
> --- /dev/null
> +++ b/elf/stringtable.h
> @@ -0,0 +1,64 @@
> +/* String tables for ld.so.cache construction.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   This program is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published
> +   by the Free Software Foundation; version 2 of the License, or
> +   (at your option) any later version.
> +
> +   This program is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _STRINGTABLE_H
> +#define _STRINGTABLE_H
> +
> +#include <stddef.h>
> +#include <stdint.h>
> +
> +/* An entry in the string table.  Only the length and string fields are
> +   expected to be used outside the string table code.  */
> +struct stringtable_entry
> +{
> +  struct stringtable_entry *next; /* For collision resolution.  */
> +  uint32_t length;                /* Length of then string.  */
> +  uint32_t offset;                /* From start of finalized table.  */
> +  char string[];                  /* Null-terminated string.  */
> +};
> +
> +/* A string table.  Zero-initialization produces a valid atable.  */
> +struct stringtable
> +{
> +  struct stringtable_entry **entries;  /* Array of hash table buckets.  */
> +  uint32_t count;                 /* Number of elements in the table.  */
> +  uint32_t allocated;             /* Length of the entries array.  */
> +};
> +
> +/* Adds STRING to TABLE.  May return the address of an existing entry.  */
> +struct stringtable_entry *stringtable_add (struct stringtable *table,
> +                                           const char *string);
> +
> +/* Result of stringtable_finalize.  SIZE bytes at STRINGS should be
> +   written to the file.  */
> +struct stringtable_finalized
> +{
> +  char *strings;
> +  size_t size;
> +};
> +
> +/* Assigns offsets to string table entries and computes the serialized
> +   form of the string table.  */
> +void stringtable_finalize (struct stringtable *table,
> +                           struct stringtable_finalized *result);
> +
> +/* Deallocate the string table (but not the TABLE pointer itself).
> +   (The table can be re-used for adding more strings without
> +   initialization.)  */
> +void stringtable_free (struct stringtable *table);
> +
> +#endif /* _STRINGTABLE_H */

Ok.

> diff --git a/elf/stringtable_free.c b/elf/stringtable_free.c
> new file mode 100644
> index 0000000000..8588a25470
> --- /dev/null
> +++ b/elf/stringtable_free.c
> @@ -0,0 +1,33 @@
> +/* String tables for ld.so.cache construction.  Deallocation (for tests only).
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   This program is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published
> +   by the Free Software Foundation; version 2 of the License, or
> +   (at your option) any later version.
> +
> +   This program is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdlib.h>
> +#include <stringtable.h>
> +
> +void
> +stringtable_free (struct stringtable *table)
> +{
> +  for (uint32_t i = 0; i < table->allocated; ++i)
> +    for (struct stringtable_entry *e = table->entries[i]; e != NULL; )
> +      {
> +        struct stringtable_entry *next = e->next;
> +        free (e);
> +        e = next;
> +      }
> +  free (table->entries);
> +  *table = (struct stringtable) { 0, };
> +}

Ok.

> diff --git a/elf/tst-stringtable.c b/elf/tst-stringtable.c
> new file mode 100644
> index 0000000000..2d044b7151
> --- /dev/null
> +++ b/elf/tst-stringtable.c
> @@ -0,0 +1,141 @@
> +/* Unit test for ldconfig string tables.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   This program is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published
> +   by the Free Software Foundation; version 2 of the License, or
> +   (at your option) any later version.
> +
> +   This program is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdlib.h>
> +#include <string.h>
> +#include <stringtable.h>
> +#include <support/check.h>
> +#include <support/support.h>
> +
> +static int
> +do_test (void)
> +{
> +  /* Empty string table.  */
> +  {
> +    struct stringtable s = { 0, };
> +    struct stringtable_finalized f;
> +    stringtable_finalize (&s, &f);
> +    TEST_COMPARE_STRING (f.strings, "");
> +    TEST_COMPARE (f.size, 0);
> +    free (f.strings);
> +    stringtable_free (&s);
> +  }
> +
> +  /* String table with one empty string.  */
> +  {
> +    struct stringtable s = { 0, };
> +    struct stringtable_entry *e = stringtable_add (&s, "");
> +    TEST_COMPARE_STRING (e->string, "");
> +    TEST_COMPARE (e->length, 0);
> +    TEST_COMPARE (s.count, 1);
> +
> +    struct stringtable_finalized f;
> +    stringtable_finalize (&s, &f);
> +    TEST_COMPARE (e->offset, 0);
> +    TEST_COMPARE_STRING (f.strings, "");
> +    TEST_COMPARE (f.size, 1);
> +    free (f.strings);
> +    stringtable_free (&s);
> +  }
> +
> +  /* String table with one non-empty string.  */
> +  {
> +    struct stringtable s = { 0, };
> +    struct stringtable_entry *e = stringtable_add (&s, "name");
> +    TEST_COMPARE_STRING (e->string, "name");
> +    TEST_COMPARE (e->length, 4);
> +    TEST_COMPARE (s.count, 1);

Ok.

> +
> +    struct stringtable_finalized f;
> +    stringtable_finalize (&s, &f);
> +    TEST_COMPARE (e->offset, 0);
> +    TEST_COMPARE_STRING (f.strings, "name");
> +    TEST_COMPARE (f.size, 5);
> +    free (f.strings);
> +    stringtable_free (&s);
> +  }
> +

Ok.

> +  /* Two strings, one is a prefix of the other.  Tail-merging can only
> +     happen in one way in this case.  */
> +  {
> +    struct stringtable s = { 0, };
> +    struct stringtable_entry *suffix = stringtable_add (&s, "suffix");
> +    TEST_COMPARE_STRING (suffix->string, "suffix");
> +    TEST_COMPARE (suffix->length, 6);
> +    TEST_COMPARE (s.count, 1);
> +
> +    struct stringtable_entry *prefix
> +      = stringtable_add (&s, "prefix-suffix");
> +    TEST_COMPARE_STRING (prefix->string, "prefix-suffix");
> +    TEST_COMPARE (prefix->length, strlen ("prefix-suffix"));
> +    TEST_COMPARE (s.count, 2);
> +
> +    struct stringtable_finalized f;
> +    stringtable_finalize (&s, &f);
> +    TEST_COMPARE (prefix->offset, 0);
> +    TEST_COMPARE (suffix->offset, strlen ("prefix-"));
> +    TEST_COMPARE_STRING (f.strings, "prefix-suffix");
> +    TEST_COMPARE (f.size, sizeof ("prefix-suffix"));
> +    free (f.strings);
> +    stringtable_free (&s);
> +  }

Ok.

> +
> +  /* String table with various shared prefixes.  Triggers hash
> +     resizing.  */
> +  {
> +    enum { count = 1500 };
> +    char *strings[2 * count];
> +    struct stringtable_entry *entries[2 * count];
> +    struct stringtable s = { 0, };
> +    for (int i = 0; i < count; ++i)
> +      {
> +        strings[i] = xasprintf ("%d", i);
> +        entries[i] = stringtable_add (&s, strings[i]);
> +        TEST_COMPARE (entries[i]->length, strlen (strings[i]));
> +        TEST_COMPARE_STRING (entries[i]->string, strings[i]);
> +        strings[i + count] = xasprintf ("prefix/%d", i);
> +        entries[i + count] = stringtable_add (&s, strings[i + count]);
> +        TEST_COMPARE (entries[i + count]->length, strlen (strings[i + count]));
> +        TEST_COMPARE_STRING (entries[i + count]->string, strings[i + count]);
> +      }
> +
> +    struct stringtable_finalized f;
> +    stringtable_finalize (&s, &f);
> +
> +    for (int i = 0; i < 2 * count; ++i)
> +      {
> +        TEST_COMPARE (entries[i]->length, strlen (strings[i]));
> +        TEST_COMPARE_STRING (entries[i]->string, strings[i]);
> +        TEST_COMPARE_STRING (f.strings + entries[i]->offset, strings[i]);
> +        free (strings[i]);
> +      }
> +
> +    free (f.strings);
> +    stringtable_free (&s);
> +  }
> +
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> +
> +/* Re-compile the string table implementation here.  It is not
> +   possible to link against the actual build because it was built for
> +   use in ldconfig.  */
> +#define _(arg) arg
> +#include "stringtable.c"
> +#include "stringtable_free.c"
> 

Ok.
Florian Weimer Nov. 27, 2020, 7:49 p.m. UTC | #2
* Adhemerval Zanella:

>> +static void
>> +stringtable_init (struct stringtable *table)
>> +{
>> +  table->count = 0;
>> +
>> +  /* This needs to be a power of two.  128 is sufficient to keep track
>> +     of 42 DSOs without resizing (assuming two strings per DSOs).
>> +     glibc itself comes with more than 20 DSOs, so 64 would likely to
>> +     be too small.  */
>> +  table->allocated = 64;
>
> Should we use a large value then? Asking because the comment indicates that
> the chosen default value is not best suitable one.

You mean 128?  Eh, yes.

>> +/* Double the capacity of the hash table.  */
>> +static void
>> +stringtable_rehash (struct stringtable *table)
>> +{
>> +  /* Cannot overflow because the old allocation size (in bytes) is
>> +     larger.  */
>
> This comment is a bit confusing, I think it should be safe to assume
> it won't overflow because stringtable_rehash won't be called for
> large values (stringtable_add will bail early).

What I meant is this:

  /* This computation cannot overflow because the old total in-memory
     size of the hash table is larger than the computed value.  */

>> +void
>> +stringtable_finalize (struct stringtable *table,
>> +                      struct stringtable_finalized *result)
>> +{
>> +  if (table->count == 0)
>> +    {
>> +      result->strings = xstrdup ("");
>> +      result->size = 0;
>> +      return;
>> +    }
>
> This is confusing because the folliwing.
>
>   struct stringtable s = { 0, };
>   struct stringtable_finalized f;
>   stringtable_finalize (&s, &f);
>   
> will result in f being { "", 0 }, while the following
>
>   struct stringtable s = { 0, };
>   stringtable_add (&s, "");
>   struct stringtable_finalized f;
>   stringtable_finalize (&s, &f);
>   
> will result in f being { "", 1 }.
>
> I would expect that if strings is being "", the 'size' should be the
> same (either 0 or 1 for both usages).

In the second space, storage space is needed for the null byte because
the table contains the empty string.  In the first case, no storage
space is needed.  We could use NULL, but then glibc has a bit of a
strange relationship with arrays of length 0 that start at a null
pointer.  (They can't be used in memcpy etc.)

Thanks,
Florian
Adhemerval Zanella Nov. 30, 2020, 7:01 p.m. UTC | #3
On 27/11/2020 16:49, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>>> +static void
>>> +stringtable_init (struct stringtable *table)
>>> +{
>>> +  table->count = 0;
>>> +
>>> +  /* This needs to be a power of two.  128 is sufficient to keep track
>>> +     of 42 DSOs without resizing (assuming two strings per DSOs).
>>> +     glibc itself comes with more than 20 DSOs, so 64 would likely to
>>> +     be too small.  */
>>> +  table->allocated = 64;
>>
>> Should we use a large value then? Asking because the comment indicates that
>> the chosen default value is not best suitable one.
> 
> You mean 128?  Eh, yes.
> 
>>> +/* Double the capacity of the hash table.  */
>>> +static void
>>> +stringtable_rehash (struct stringtable *table)
>>> +{
>>> +  /* Cannot overflow because the old allocation size (in bytes) is
>>> +     larger.  */
>>
>> This comment is a bit confusing, I think it should be safe to assume
>> it won't overflow because stringtable_rehash won't be called for
>> large values (stringtable_add will bail early).
> 
> What I meant is this:
> 
>   /* This computation cannot overflow because the old total in-memory
>      size of the hash table is larger than the computed value.  */

Ack.

> 
>>> +void
>>> +stringtable_finalize (struct stringtable *table,
>>> +                      struct stringtable_finalized *result)
>>> +{
>>> +  if (table->count == 0)
>>> +    {
>>> +      result->strings = xstrdup ("");
>>> +      result->size = 0;
>>> +      return;
>>> +    }
>>
>> This is confusing because the folliwing.
>>
>>   struct stringtable s = { 0, };
>>   struct stringtable_finalized f;
>>   stringtable_finalize (&s, &f);
>>   
>> will result in f being { "", 0 }, while the following
>>
>>   struct stringtable s = { 0, };
>>   stringtable_add (&s, "");
>>   struct stringtable_finalized f;
>>   stringtable_finalize (&s, &f);
>>   
>> will result in f being { "", 1 }.
>>
>> I would expect that if strings is being "", the 'size' should be the
>> same (either 0 or 1 for both usages).
> 
> In the second space, storage space is needed for the null byte because
> the table contains the empty string.  In the first case, no storage
> space is needed.  We could use NULL, but then glibc has a bit of a
> strange relationship with arrays of length 0 that start at a null
> pointer.  (They can't be used in memcpy etc.)

Fair enough. 

> 
> Thanks,
> Florian
>
diff mbox series

Patch

diff --git a/elf/Makefile b/elf/Makefile
index d2f7e99863..5a8f116a67 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -172,7 +172,7 @@  tests-container := \
 
 tests := tst-tls9 tst-leaks1 \
 	tst-array1 tst-array2 tst-array3 tst-array4 tst-array5 \
-	tst-auxv
+	tst-auxv tst-stringtable
 tests-internal := tst-tls1 tst-tls2 $(tests-static-internal)
 tests-static := $(tests-static-normal) $(tests-static-internal)
 
diff --git a/elf/stringtable.c b/elf/stringtable.c
new file mode 100644
index 0000000000..e5b38148a6
--- /dev/null
+++ b/elf/stringtable.c
@@ -0,0 +1,209 @@ 
+/* String tables for ld.so.cache construction.  Implementation.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <error.h>
+#include <ldconfig.h>
+#include <libintl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stringtable.h>
+
+static void
+stringtable_init (struct stringtable *table)
+{
+  table->count = 0;
+
+  /* This needs to be a power of two.  128 is sufficient to keep track
+     of 42 DSOs without resizing (assuming two strings per DSOs).
+     glibc itself comes with more than 20 DSOs, so 64 would likely to
+     be too small.  */
+  table->allocated = 64;
+
+  table->entries = xcalloc (table->allocated, sizeof (table->entries[0]));
+}
+
+/* 32-bit FNV-1a hash function.  */
+static uint32_t
+fnv1a (const char *string, size_t length)
+{
+  const unsigned char *p = (const unsigned char *) string;
+  uint32_t hash = 2166136261U;
+  for (size_t i = 0; i < length; ++i)
+    {
+      hash ^= p[i];
+      hash *= 16777619U;
+    }
+  return hash;
+}
+
+/* Double the capacity of the hash table.  */
+static void
+stringtable_rehash (struct stringtable *table)
+{
+  /* Cannot overflow because the old allocation size (in bytes) is
+     larger.  */
+  uint32_t new_allocated = table->allocated * 2;
+  struct stringtable_entry **new_entries
+    = xcalloc (new_allocated, sizeof (table->entries[0]));
+
+  uint32_t mask = new_allocated - 1;
+  for (uint32_t i = 0; i < table->allocated; ++i)
+    for (struct stringtable_entry *e = table->entries[i]; e != NULL; )
+      {
+        struct stringtable_entry *next = e->next;
+        uint32_t hash = fnv1a (e->string, e->length);
+        uint32_t new_index = hash & mask;
+        e->next = new_entries[new_index];
+        new_entries[new_index] = e;
+        e = next;
+      }
+
+  free (table->entries);
+  table->entries = new_entries;
+  table->allocated = new_allocated;
+}
+
+struct stringtable_entry *
+stringtable_add (struct stringtable *table, const char *string)
+{
+  /* Check for a zero-initialized table.  */
+  if (table->allocated == 0)
+    stringtable_init (table);
+
+  size_t length = strlen (string);
+  if (length > (1U << 30))
+    error (EXIT_FAILURE, 0, _("String table string is too long"));
+  uint32_t hash = fnv1a (string, length);
+
+  /* Return a previously-existing entry.  */
+  for (struct stringtable_entry *e
+         = table->entries[hash & (table->allocated - 1)];
+       e != NULL; e = e->next)
+    if (e->length == length && memcmp (e->string, string, length) == 0)
+      return e;
+
+  /* Increase the size of the table if necessary.  Keep utilization
+     below two thirds.  */
+  if (table->count >= (1U << 30))
+    error (EXIT_FAILURE, 0, _("String table has too many entries"));
+  if (table->count * 3 > table->allocated * 2)
+    stringtable_rehash (table);
+
+  /* Add the new table entry.  */
+  ++table->count;
+  struct stringtable_entry *e
+    = xmalloc (offsetof (struct stringtable_entry, string) + length + 1);
+  uint32_t index = hash & (table->allocated - 1);
+  e->next = table->entries[index];
+  table->entries[index] = e;
+  e->length = length;
+  e->offset = 0;
+  memcpy (e->string, string, length + 1);
+  return e;
+}
+
+/* Sort reversed strings in lexicographic order.  This is used for tail
+   merging.  */
+static int
+finalize_compare (const void *l, const void *r)
+{
+  struct stringtable_entry *left = *(struct stringtable_entry **) l;
+  struct stringtable_entry *right = *(struct stringtable_entry **) r;
+  size_t to_compare;
+  if (left->length < right->length)
+    to_compare = left->length;
+  else
+    to_compare = right->length;
+  for (ssize_t i = to_compare - 1; i >= 0; --i)
+    {
+      unsigned char lch = left->string[i];
+      unsigned char rch = right->string[i];
+      if (lch != rch)
+        return lch - rch;
+    }
+  if (left->length == right->length)
+    return 0;
+  else if (left->length < right->length)
+    /* Longer strings should come first.  */
+    return 1;
+  else
+    return -1;
+}
+
+void
+stringtable_finalize (struct stringtable *table,
+                      struct stringtable_finalized *result)
+{
+  if (table->count == 0)
+    {
+      result->strings = xstrdup ("");
+      result->size = 0;
+      return;
+    }
+
+  /* Optimize the order of the strings.  */
+  struct stringtable_entry **array = xcalloc (table->count, sizeof (*array));
+  {
+    size_t j = 0;
+    for (uint32_t i = 0; i < table->allocated; ++i)
+      for (struct stringtable_entry *e = table->entries[i]; e != NULL;
+           e = e->next)
+        {
+          array[j] = e;
+          ++j;
+        }
+    assert (j == table->count);
+  }
+  qsort (array, table->count, sizeof (*array), finalize_compare);
+
+  /* Assign offsets, using tail merging (sharing suffixes) if possible.  */
+  array[0]->offset = 0;
+  for (uint32_t j = 1; j < table->count; ++j)
+    {
+      struct stringtable_entry *previous = array[j - 1];
+      struct stringtable_entry *current = array[j];
+      if (previous->length >= current->length
+          && memcmp (&previous->string[previous->length - current->length],
+                     current->string, current->length) == 0)
+        current->offset = (previous->offset + previous->length
+                           - current->length);
+      else if (__builtin_add_overflow (previous->offset,
+                                       previous->length + 1,
+                                       &current->offset))
+        error (EXIT_FAILURE, 0, _("String table is too large"));
+    }
+
+  /* Allocate the result string.  */
+  {
+    struct stringtable_entry *last = array[table->count - 1];
+    if (__builtin_add_overflow (last->offset, last->length + 1,
+                                &result->size))
+      error (EXIT_FAILURE, 0, _("String table is too large"));
+  }
+  /* The strings are copied from the hash table, so the array is no
+     longer needed.  */
+  free (array);
+  result->strings = xcalloc (result->size, 1);
+
+  /* Copy the strings.  */
+  for (uint32_t i = 0; i < table->allocated; ++i)
+    for (struct stringtable_entry *e = table->entries[i]; e != NULL;
+         e = e->next)
+      if (result->strings[e->offset] == '\0')
+        memcpy (&result->strings[e->offset], e->string, e->length + 1);
+}
diff --git a/elf/stringtable.h b/elf/stringtable.h
new file mode 100644
index 0000000000..7d57d1bda9
--- /dev/null
+++ b/elf/stringtable.h
@@ -0,0 +1,64 @@ 
+/* String tables for ld.so.cache construction.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
+
+#ifndef _STRINGTABLE_H
+#define _STRINGTABLE_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* An entry in the string table.  Only the length and string fields are
+   expected to be used outside the string table code.  */
+struct stringtable_entry
+{
+  struct stringtable_entry *next; /* For collision resolution.  */
+  uint32_t length;                /* Length of then string.  */
+  uint32_t offset;                /* From start of finalized table.  */
+  char string[];                  /* Null-terminated string.  */
+};
+
+/* A string table.  Zero-initialization produces a valid atable.  */
+struct stringtable
+{
+  struct stringtable_entry **entries;  /* Array of hash table buckets.  */
+  uint32_t count;                 /* Number of elements in the table.  */
+  uint32_t allocated;             /* Length of the entries array.  */
+};
+
+/* Adds STRING to TABLE.  May return the address of an existing entry.  */
+struct stringtable_entry *stringtable_add (struct stringtable *table,
+                                           const char *string);
+
+/* Result of stringtable_finalize.  SIZE bytes at STRINGS should be
+   written to the file.  */
+struct stringtable_finalized
+{
+  char *strings;
+  size_t size;
+};
+
+/* Assigns offsets to string table entries and computes the serialized
+   form of the string table.  */
+void stringtable_finalize (struct stringtable *table,
+                           struct stringtable_finalized *result);
+
+/* Deallocate the string table (but not the TABLE pointer itself).
+   (The table can be re-used for adding more strings without
+   initialization.)  */
+void stringtable_free (struct stringtable *table);
+
+#endif /* _STRINGTABLE_H */
diff --git a/elf/stringtable_free.c b/elf/stringtable_free.c
new file mode 100644
index 0000000000..8588a25470
--- /dev/null
+++ b/elf/stringtable_free.c
@@ -0,0 +1,33 @@ 
+/* String tables for ld.so.cache construction.  Deallocation (for tests only).
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include <stringtable.h>
+
+void
+stringtable_free (struct stringtable *table)
+{
+  for (uint32_t i = 0; i < table->allocated; ++i)
+    for (struct stringtable_entry *e = table->entries[i]; e != NULL; )
+      {
+        struct stringtable_entry *next = e->next;
+        free (e);
+        e = next;
+      }
+  free (table->entries);
+  *table = (struct stringtable) { 0, };
+}
diff --git a/elf/tst-stringtable.c b/elf/tst-stringtable.c
new file mode 100644
index 0000000000..2d044b7151
--- /dev/null
+++ b/elf/tst-stringtable.c
@@ -0,0 +1,141 @@ 
+/* Unit test for ldconfig string tables.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include <string.h>
+#include <stringtable.h>
+#include <support/check.h>
+#include <support/support.h>
+
+static int
+do_test (void)
+{
+  /* Empty string table.  */
+  {
+    struct stringtable s = { 0, };
+    struct stringtable_finalized f;
+    stringtable_finalize (&s, &f);
+    TEST_COMPARE_STRING (f.strings, "");
+    TEST_COMPARE (f.size, 0);
+    free (f.strings);
+    stringtable_free (&s);
+  }
+
+  /* String table with one empty string.  */
+  {
+    struct stringtable s = { 0, };
+    struct stringtable_entry *e = stringtable_add (&s, "");
+    TEST_COMPARE_STRING (e->string, "");
+    TEST_COMPARE (e->length, 0);
+    TEST_COMPARE (s.count, 1);
+
+    struct stringtable_finalized f;
+    stringtable_finalize (&s, &f);
+    TEST_COMPARE (e->offset, 0);
+    TEST_COMPARE_STRING (f.strings, "");
+    TEST_COMPARE (f.size, 1);
+    free (f.strings);
+    stringtable_free (&s);
+  }
+
+  /* String table with one non-empty string.  */
+  {
+    struct stringtable s = { 0, };
+    struct stringtable_entry *e = stringtable_add (&s, "name");
+    TEST_COMPARE_STRING (e->string, "name");
+    TEST_COMPARE (e->length, 4);
+    TEST_COMPARE (s.count, 1);
+
+    struct stringtable_finalized f;
+    stringtable_finalize (&s, &f);
+    TEST_COMPARE (e->offset, 0);
+    TEST_COMPARE_STRING (f.strings, "name");
+    TEST_COMPARE (f.size, 5);
+    free (f.strings);
+    stringtable_free (&s);
+  }
+
+  /* Two strings, one is a prefix of the other.  Tail-merging can only
+     happen in one way in this case.  */
+  {
+    struct stringtable s = { 0, };
+    struct stringtable_entry *suffix = stringtable_add (&s, "suffix");
+    TEST_COMPARE_STRING (suffix->string, "suffix");
+    TEST_COMPARE (suffix->length, 6);
+    TEST_COMPARE (s.count, 1);
+
+    struct stringtable_entry *prefix
+      = stringtable_add (&s, "prefix-suffix");
+    TEST_COMPARE_STRING (prefix->string, "prefix-suffix");
+    TEST_COMPARE (prefix->length, strlen ("prefix-suffix"));
+    TEST_COMPARE (s.count, 2);
+
+    struct stringtable_finalized f;
+    stringtable_finalize (&s, &f);
+    TEST_COMPARE (prefix->offset, 0);
+    TEST_COMPARE (suffix->offset, strlen ("prefix-"));
+    TEST_COMPARE_STRING (f.strings, "prefix-suffix");
+    TEST_COMPARE (f.size, sizeof ("prefix-suffix"));
+    free (f.strings);
+    stringtable_free (&s);
+  }
+
+  /* String table with various shared prefixes.  Triggers hash
+     resizing.  */
+  {
+    enum { count = 1500 };
+    char *strings[2 * count];
+    struct stringtable_entry *entries[2 * count];
+    struct stringtable s = { 0, };
+    for (int i = 0; i < count; ++i)
+      {
+        strings[i] = xasprintf ("%d", i);
+        entries[i] = stringtable_add (&s, strings[i]);
+        TEST_COMPARE (entries[i]->length, strlen (strings[i]));
+        TEST_COMPARE_STRING (entries[i]->string, strings[i]);
+        strings[i + count] = xasprintf ("prefix/%d", i);
+        entries[i + count] = stringtable_add (&s, strings[i + count]);
+        TEST_COMPARE (entries[i + count]->length, strlen (strings[i + count]));
+        TEST_COMPARE_STRING (entries[i + count]->string, strings[i + count]);
+      }
+
+    struct stringtable_finalized f;
+    stringtable_finalize (&s, &f);
+
+    for (int i = 0; i < 2 * count; ++i)
+      {
+        TEST_COMPARE (entries[i]->length, strlen (strings[i]));
+        TEST_COMPARE_STRING (entries[i]->string, strings[i]);
+        TEST_COMPARE_STRING (f.strings + entries[i]->offset, strings[i]);
+        free (strings[i]);
+      }
+
+    free (f.strings);
+    stringtable_free (&s);
+  }
+
+  return 0;
+}
+
+#include <support/test-driver.c>
+
+/* Re-compile the string table implementation here.  It is not
+   possible to link against the actual build because it was built for
+   use in ldconfig.  */
+#define _(arg) arg
+#include "stringtable.c"
+#include "stringtable_free.c"