[v2,4/4] malloc: Add Huge Page support for sysmalloc

Message ID 20210818142000.128752-5-adhemerval.zanella@linaro.org
State Superseded
Headers
Series malloc: Improve Huge Page support |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit fail Patch caused testsuite regressions

Commit Message

Adhemerval Zanella Aug. 18, 2021, 2:20 p.m. UTC
  A new tunable, 'glibc.malloc.mmap_hugetlb', adds support for using Huge
Pages directly with mmap() calls.  The supported sizes and the required
flags for mmap() are provided by an arch-specific internal hook,
__malloc_hugepage_config().

Currently it first tries mmap() using the huge page size and falls back to
the default page size and the sbrk() call if mmap() returns MAP_FAILED.

The default __malloc_hugepage_config() implementation does not enable it
even if the tunable is set.

Checked on x86_64-linux-gnu.
---
 NEWS                                       |   4 +
 elf/dl-tunables.list                       |   4 +
 elf/tst-rtld-list-tunables.exp             |   1 +
 malloc/arena.c                             |   2 +
 malloc/malloc.c                            |  35 +++++-
 manual/tunables.texi                       |  14 +++
 sysdeps/generic/malloc-hugepages.c         |   6 +
 sysdeps/generic/malloc-hugepages.h         |  12 ++
 sysdeps/unix/sysv/linux/malloc-hugepages.c | 125 +++++++++++++++++++++
 9 files changed, 200 insertions(+), 3 deletions(-)
  

Comments

Siddhesh Poyarekar Aug. 19, 2021, 1:03 a.m. UTC | #1
On 8/18/21 7:50 PM, Adhemerval Zanella via Libc-alpha wrote:
> A new tunable, 'glibc.malloc.mmap_hugetlb', adds support to use Huge Page
> support directly with mmap() calls.  The required supported sizes and
> flags for mmap() are provided by an arch-specific internal hook
> malloc_hp_config().
> 
> Currently it first try mmap() using the huge page size and fallback to
> default page size and sbrk() call if kernel returns MMAP_FAILED.
> 
> The default malloc_hp_config() implementation does not enable it even
> if the tunable is set.
> 
> Checked on x86_64-linux-gnu.
> ---
>   NEWS                                       |   4 +
>   elf/dl-tunables.list                       |   4 +
>   elf/tst-rtld-list-tunables.exp             |   1 +
>   malloc/arena.c                             |   2 +
>   malloc/malloc.c                            |  35 +++++-
>   manual/tunables.texi                       |  14 +++
>   sysdeps/generic/malloc-hugepages.c         |   6 +
>   sysdeps/generic/malloc-hugepages.h         |  12 ++
>   sysdeps/unix/sysv/linux/malloc-hugepages.c | 125 +++++++++++++++++++++
>   9 files changed, 200 insertions(+), 3 deletions(-)
> 
> diff --git a/NEWS b/NEWS
> index 9b2345d08c..412bf3e6f8 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -14,6 +14,10 @@ Major new features:
>     It might improve performance with Transparent Huge Pages madvise mode
>     depending of the workload.
>   
> +* On Linux, a new tunable, glibc.malloc.mmap_hugetlb, can be used to
> +  instruct malloc to try use Huge Pages when allocate memory with mmap()
> +  calls (through the use of MAP_HUGETLB).
> +
>   Deprecated and removed features, and other changes affecting compatibility:
>   
>     [Add deprecations, removals and changes affecting compatibility here]
> diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
> index 67df6dbc2c..209c2d8592 100644
> --- a/elf/dl-tunables.list
> +++ b/elf/dl-tunables.list
> @@ -97,6 +97,10 @@ glibc {
>         minval: 0
>         maxval: 1
>       }
> +    mmap_hugetlb {
> +      type: SIZE_T
> +      minval: 0
> +    }
>     }
>     cpu {
>       hwcap_mask {
> diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
> index d8109fa31c..49f033ce91 100644
> --- a/elf/tst-rtld-list-tunables.exp
> +++ b/elf/tst-rtld-list-tunables.exp
> @@ -1,6 +1,7 @@
>   glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+)
>   glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+)
>   glibc.malloc.check: 0 (min: 0, max: 3)
> +glibc.malloc.mmap_hugetlb: 0x0 (min: 0x0, max: 0x[f]+)
>   glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
>   glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+)
>   glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+)
> diff --git a/malloc/arena.c b/malloc/arena.c
> index 81bff54303..4efb5581c1 100644
> --- a/malloc/arena.c
> +++ b/malloc/arena.c
> @@ -232,6 +232,7 @@ TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
>   #endif
>   TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t)
>   TUNABLE_CALLBACK_FNDECL (set_thp_madvise, int32_t)
> +TUNABLE_CALLBACK_FNDECL (set_mmap_hugetlb, size_t)
>   #else
>   /* Initialization routine. */
>   #include <string.h>
> @@ -333,6 +334,7 @@ ptmalloc_init (void)
>   # endif
>     TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast));
>     TUNABLE_GET (thp_madvise, int32_t, TUNABLE_CALLBACK (set_thp_madvise));
> +  TUNABLE_GET (mmap_hugetlb, size_t, TUNABLE_CALLBACK (set_mmap_hugetlb));
>   #else
>     if (__glibc_likely (_environ != NULL))
>       {
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 4bfcea286f..8cf2d6855e 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -1884,6 +1884,10 @@ struct malloc_par
>   #if HAVE_TUNABLES
>     /* Transparent Large Page support.  */
>     INTERNAL_SIZE_T thp_pagesize;
> +  /* A value different than 0 means to align mmap allocation to hp_pagesize
> +     add hp_flags on flags.  */
> +  INTERNAL_SIZE_T hp_pagesize;
> +  int hp_flags;
>   #endif
>   
>     /* Memory map support */
> @@ -2415,7 +2419,8 @@ do_check_malloc_state (mstate av)
>    */
>   
>   static void *
> -sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
> +sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av,
> +		bool set_thp)
>   {
>     long int size;
>   
> @@ -2442,7 +2447,8 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
>     if (mm == MAP_FAILED)
>       return mm;
>   
> -  sysmadvise_thp (mm, size);
> +  if (set_thp)
> +    sysmadvise_thp (mm, size);

If MAP_HUGETLB is set in extra_flags then we don't need madvise; 
there's no need for set_thp.

>   
>     /*
>       The offset to the start of the mmapped region is stored in the prev_size
> @@ -2531,7 +2537,18 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
>   	  && (mp_.n_mmaps < mp_.n_mmaps_max)))
>       {
>       try_mmap:
> -      char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
> +      char *mm;
> +#if HAVE_TUNABLES
> +      if (mp_.hp_pagesize > 0)
> +	{
> +	  /* There is no need to isse the THP madvise call if Huge Pages are
> +	     used directly.  */
> +	  mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av, false);
> +	  if (mm != MAP_FAILED)
> +	    return mm;
> +	}
> +#endif
> +      mm = sysmalloc_mmap (nb, pagesize, 0, av, true);

A single tunable ought to allow you to do all this in just sysmalloc_mmap.

>         if (mm != MAP_FAILED)
>   	return mm;
>         tried_mmap = true;
> @@ -5405,6 +5422,18 @@ do_set_thp_madvise (int32_t value)
>       }
>     return 0;
>   }
> +
> +static __always_inline int
> +do_set_mmap_hugetlb (size_t value)
> +{
> +  if (value > 0)
> +    {
> +      struct malloc_hugepage_config_t cfg = __malloc_hugepage_config (value);
> +      mp_.hp_pagesize = cfg.pagesize;
> +      mp_.hp_flags = cfg.flags;

Instead of making a struct to pass it, you could just pass 
&mp_.hp_pagesize and &mp_.hp_flags.  Also, with a single tunable, you do 
this only when value > 1.  For value == 0, you set the default THP 
pagesize and set flags to 0.

> +    }
> +  return 0;
> +}
>   #endif
>   
>   int
> diff --git a/manual/tunables.texi b/manual/tunables.texi
> index 93c46807f9..4da6a02778 100644
> --- a/manual/tunables.texi
> +++ b/manual/tunables.texi
> @@ -279,6 +279,20 @@ The default value of this tunable is @code{0}, which disable its usage.
>   Setting to a positive value enable the @code{madvise} call.
>   @end deftp
>   
> +@deftp Tunable glibc.malloc.mmap_hugetlb
> +This tunable enable the use of Huge Pages when the system supports it (currently
> +only Linux).  It is done by aligning the memory size and passing the required
> +flags (@code{MAP_HUGETLB} on Linux) when issuing the @code{mmap} to allocate
> +memory from the system.
> +
> +The default value of this tunable is @code{0}, which disable its usage.
> +The special value @code{1} will try to gather the system default huge page size,
> +while a value larger than @code{1} will try to match it with the supported system
> +huge page size.  If either no default huge page size could be obtained or if the
> +requested size does not match the supported ones, the huge pages supports will be
> +disabled.
> +@end deftp
> +
>   @node Dynamic Linking Tunables
>   @section Dynamic Linking Tunables
>   @cindex dynamic linking tunables
> diff --git a/sysdeps/generic/malloc-hugepages.c b/sysdeps/generic/malloc-hugepages.c
> index 262bcdbeb8..e5f5c1ec98 100644
> --- a/sysdeps/generic/malloc-hugepages.c
> +++ b/sysdeps/generic/malloc-hugepages.c
> @@ -29,3 +29,9 @@ __malloc_thp_mode (void)
>   {
>     return malloc_thp_mode_not_supported;
>   }
> +
> +/* Return the default transparent huge page size.  */
> +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
> +{
> +  return (struct malloc_hugepage_config_t) { 0, 0 };
> +}
> diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h
> index 664cda9b67..27f7adfea5 100644
> --- a/sysdeps/generic/malloc-hugepages.h
> +++ b/sysdeps/generic/malloc-hugepages.h
> @@ -34,4 +34,16 @@ enum malloc_thp_mode_t
>   
>   enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden;
>   
> +struct malloc_hugepage_config_t
> +{
> +  size_t pagesize;
> +  int flags;
> +};
> +
> +/* Returned the support huge page size from the requested PAGESIZE along
> +   with the requires extra mmap flags.  Returning a 0 value for pagesize
> +   disables its usage.  */
> +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
> +     attribute_hidden;
> +
>   #endif /* _MALLOC_HUGEPAGES_H */
> diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c
> index 66589127cd..0eb0c764ad 100644
> --- a/sysdeps/unix/sysv/linux/malloc-hugepages.c
> +++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c
> @@ -17,8 +17,10 @@
>      not, see <https://www.gnu.org/licenses/>.  */
>   
>   #include <intprops.h>
> +#include <dirent.h>
>   #include <malloc-hugepages.h>
>   #include <not-cancel.h>
> +#include <sys/mman.h>
>   
>   size_t
>   __malloc_default_thp_pagesize (void)
> @@ -74,3 +76,126 @@ __malloc_thp_mode (void)
>       }
>     return malloc_thp_mode_not_supported;
>   }
> +
> +static size_t
> +malloc_default_hugepage_size (void)
> +{
> +  int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY);
> +  if (fd == -1)
> +    return 0;
> +
> +  char buf[512];
> +  off64_t off = 0;
> +  while (1)
> +    {
> +      ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off);
> +      if (r < 0)
> +	break;
> +      buf[r - 1] = '\0';
> +
> +      const char *s = strstr (buf, "Hugepagesize:");
> +      if (s == NULL)
> +	{
> +	  char *nl = strrchr (buf, '\n');
> +	  if (nl == NULL)
> +	    break;
> +	  off += (nl + 1) - buf;
> +	  continue;
> +	}
> +
> +      /* The default huge page size is in the form:
> +	 Hugepagesize:       NUMBER kB  */
> +      size_t hpsize = 0;
> +      s += sizeof ("Hugepagesize: ") - 1;
> +      for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++)
> +	{
> +	  if (s[i] == ' ')
> +	    continue;
> +	  hpsize *= 10;
> +	  hpsize += s[i] - '0';
> +	}
> +      return hpsize * 1024;
> +    }
> +
> +  __close_nocancel (fd);
> +
> +  return 0;
> +}
> +
> +static inline struct malloc_hugepage_config_t
> +make_malloc_hugepage_config (size_t pagesize)
> +{
> +  int flags = MAP_HUGETLB | (__builtin_ctzll (pagesize) << MAP_HUGE_SHIFT);
> +  return (struct malloc_hugepage_config_t) { pagesize, flags };
> +}
> +
> +struct malloc_hugepage_config_t
> +__malloc_hugepage_config (size_t requested)
> +{
> +  if (requested == 1)
> +    {
> +      size_t pagesize = malloc_default_hugepage_size ();
> +      if (pagesize != 0)
> +	return make_malloc_hugepage_config (pagesize);
> +    }
> +
> +  int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages",
> +				 O_RDONLY | O_DIRECTORY, 0);
> +  if (dirfd == -1)
> +    return (struct malloc_hugepage_config_t) { 0, 0 };
> +
> +  bool found = false;
> +
> +  char buffer[1024];
> +  while (true)
> +    {
> +#if !IS_IN(libc)
> +# define __getdents64 getdents64
> +#endif
> +      ssize_t ret = __getdents64 (dirfd, buffer, sizeof (buffer));
> +      if (ret == -1)
> +	break;
> +      else if (ret == 0)
> +        break;
> +
> +      char *begin = buffer, *end = buffer + ret;
> +      while (begin != end)
> +        {
> +          unsigned short int d_reclen;
> +          memcpy (&d_reclen, begin + offsetof (struct dirent64, d_reclen),
> +                  sizeof (d_reclen));
> +          const char *dname = begin + offsetof (struct dirent64, d_name);
> +          begin += d_reclen;
> +
> +          if (dname[0] == '.'
> +	      || strncmp (dname, "hugepages-", sizeof ("hugepages-") - 1) != 0)
> +            continue;
> +
> +	  /* Each entry represents a supported huge page in the form of:
> +	     hugepages-<size>kB.  */
> +	  size_t hpsize = 0;
> +	  const char *sizestr = dname + sizeof ("hugepages-") - 1;
> +	  for (int i = 0; sizestr[i] >= '0' && sizestr[i] <= '9'; i++)
> +	    {
> +	      hpsize *= 10;
> +	      hpsize += sizestr[i] - '0';
> +	    }
> +	  hpsize *= 1024;
> +
> +	  if (hpsize == requested)
> +	    {
> +	      found = true;
> +	      break;
> +	    }
> +        }
> +      if (found)
> +	break;
> +    }
> +
> +  __close_nocancel (dirfd);
> +
> +  if (found)
> +    return make_malloc_hugepage_config (requested);
> +
> +  return (struct malloc_hugepage_config_t) { 0, 0 };
> +}
>
  
Adhemerval Zanella Aug. 19, 2021, 12:08 p.m. UTC | #2
On 18/08/2021 22:03, Siddhesh Poyarekar wrote:
> On 8/18/21 7:50 PM, Adhemerval Zanella via Libc-alpha wrote:
>> A new tunable, 'glibc.malloc.mmap_hugetlb', adds support to use Huge Page
>> support directly with mmap() calls.  The required supported sizes and
>> flags for mmap() are provided by an arch-specific internal hook
>> malloc_hp_config().
>>
>> Currently it first try mmap() using the huge page size and fallback to
>> default page size and sbrk() call if kernel returns MMAP_FAILED.
>>
>> The default malloc_hp_config() implementation does not enable it even
>> if the tunable is set.
>>
>> Checked on x86_64-linux-gnu.
>> ---
>>   NEWS                                       |   4 +
>>   elf/dl-tunables.list                       |   4 +
>>   elf/tst-rtld-list-tunables.exp             |   1 +
>>   malloc/arena.c                             |   2 +
>>   malloc/malloc.c                            |  35 +++++-
>>   manual/tunables.texi                       |  14 +++
>>   sysdeps/generic/malloc-hugepages.c         |   6 +
>>   sysdeps/generic/malloc-hugepages.h         |  12 ++
>>   sysdeps/unix/sysv/linux/malloc-hugepages.c | 125 +++++++++++++++++++++
>>   9 files changed, 200 insertions(+), 3 deletions(-)
>>
>> diff --git a/NEWS b/NEWS
>> index 9b2345d08c..412bf3e6f8 100644
>> --- a/NEWS
>> +++ b/NEWS
>> @@ -14,6 +14,10 @@ Major new features:
>>     It might improve performance with Transparent Huge Pages madvise mode
>>     depending of the workload.
>>   +* On Linux, a new tunable, glibc.malloc.mmap_hugetlb, can be used to
>> +  instruct malloc to try use Huge Pages when allocate memory with mmap()
>> +  calls (through the use of MAP_HUGETLB).
>> +
>>   Deprecated and removed features, and other changes affecting compatibility:
>>       [Add deprecations, removals and changes affecting compatibility here]
>> diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
>> index 67df6dbc2c..209c2d8592 100644
>> --- a/elf/dl-tunables.list
>> +++ b/elf/dl-tunables.list
>> @@ -97,6 +97,10 @@ glibc {
>>         minval: 0
>>         maxval: 1
>>       }
>> +    mmap_hugetlb {
>> +      type: SIZE_T
>> +      minval: 0
>> +    }
>>     }
>>     cpu {
>>       hwcap_mask {
>> diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
>> index d8109fa31c..49f033ce91 100644
>> --- a/elf/tst-rtld-list-tunables.exp
>> +++ b/elf/tst-rtld-list-tunables.exp
>> @@ -1,6 +1,7 @@
>>   glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+)
>>   glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+)
>>   glibc.malloc.check: 0 (min: 0, max: 3)
>> +glibc.malloc.mmap_hugetlb: 0x0 (min: 0x0, max: 0x[f]+)
>>   glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
>>   glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+)
>>   glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+)
>> diff --git a/malloc/arena.c b/malloc/arena.c
>> index 81bff54303..4efb5581c1 100644
>> --- a/malloc/arena.c
>> +++ b/malloc/arena.c
>> @@ -232,6 +232,7 @@ TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
>>   #endif
>>   TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t)
>>   TUNABLE_CALLBACK_FNDECL (set_thp_madvise, int32_t)
>> +TUNABLE_CALLBACK_FNDECL (set_mmap_hugetlb, size_t)
>>   #else
>>   /* Initialization routine. */
>>   #include <string.h>
>> @@ -333,6 +334,7 @@ ptmalloc_init (void)
>>   # endif
>>     TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast));
>>     TUNABLE_GET (thp_madvise, int32_t, TUNABLE_CALLBACK (set_thp_madvise));
>> +  TUNABLE_GET (mmap_hugetlb, size_t, TUNABLE_CALLBACK (set_mmap_hugetlb));
>>   #else
>>     if (__glibc_likely (_environ != NULL))
>>       {
>> diff --git a/malloc/malloc.c b/malloc/malloc.c
>> index 4bfcea286f..8cf2d6855e 100644
>> --- a/malloc/malloc.c
>> +++ b/malloc/malloc.c
>> @@ -1884,6 +1884,10 @@ struct malloc_par
>>   #if HAVE_TUNABLES
>>     /* Transparent Large Page support.  */
>>     INTERNAL_SIZE_T thp_pagesize;
>> +  /* A value different than 0 means to align mmap allocation to hp_pagesize
>> +     add hp_flags on flags.  */
>> +  INTERNAL_SIZE_T hp_pagesize;
>> +  int hp_flags;
>>   #endif
>>       /* Memory map support */
>> @@ -2415,7 +2419,8 @@ do_check_malloc_state (mstate av)
>>    */
>>     static void *
>> -sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
>> +sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av,
>> +        bool set_thp)
>>   {
>>     long int size;
>>   @@ -2442,7 +2447,8 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
>>     if (mm == MAP_FAILED)
>>       return mm;
>>   -  sysmadvise_thp (mm, size);
>> +  if (set_thp)
>> +    sysmadvise_thp (mm, size);
> 
> If MAP_HUGEPAGE is set in extra_flags then we don't need madvise; there's no need for set_thp.

Alright, we can use it instead.  I just added the flag to avoid the extra
ifdef MAP_HUGETLB.

> 
>>       /*
>>       The offset to the start of the mmapped region is stored in the prev_size
>> @@ -2531,7 +2537,18 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
>>         && (mp_.n_mmaps < mp_.n_mmaps_max)))
>>       {
>>       try_mmap:
>> -      char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
>> +      char *mm;
>> +#if HAVE_TUNABLES
>> +      if (mp_.hp_pagesize > 0)
>> +    {
>> +      /* There is no need to isse the THP madvise call if Huge Pages are
>> +         used directly.  */
>> +      mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av, false);
>> +      if (mm != MAP_FAILED)
>> +        return mm;
>> +    }
>> +#endif
>> +      mm = sysmalloc_mmap (nb, pagesize, 0, av, true);
> 
> A single tunable ought to allow you to do all this in just sysmalloc_mmap.
> 
>>         if (mm != MAP_FAILED)
>>       return mm;
>>         tried_mmap = true;
>> @@ -5405,6 +5422,18 @@ do_set_thp_madvise (int32_t value)
>>       }
>>     return 0;
>>   }
>> +
>> +static __always_inline int
>> +do_set_mmap_hugetlb (size_t value)
>> +{
>> +  if (value > 0)
>> +    {
>> +      struct malloc_hugepage_config_t cfg = __malloc_hugepage_config (value);
>> +      mp_.hp_pagesize = cfg.pagesize;
>> +      mp_.hp_flags = cfg.flags;
> 
> Instead of making a struct to pass it, you could just pass &mp.hp_pagesize and &mp.hp_flags.  Also, with a single tunable, you do this only when value > 1.  For value == 0, you set the default THP pagesize and set flags to 0.
> 
>> +    }
>> +  return 0;
>> +}
>>   #endif
>>     int

I don't have a strong opinion here, using pointers should work as well.

>> diff --git a/manual/tunables.texi b/manual/tunables.texi
>> index 93c46807f9..4da6a02778 100644
>> --- a/manual/tunables.texi
>> +++ b/manual/tunables.texi
>> @@ -279,6 +279,20 @@ The default value of this tunable is @code{0}, which disable its usage.
>>   Setting to a positive value enable the @code{madvise} call.
>>   @end deftp
>>   +@deftp Tunable glibc.malloc.mmap_hugetlb
>> +This tunable enable the use of Huge Pages when the system supports it (currently
>> +only Linux).  It is done by aligning the memory size and passing the required
>> +flags (@code{MAP_HUGETLB} on Linux) when issuing the @code{mmap} to allocate
>> +memory from the system.
>> +
>> +The default value of this tunable is @code{0}, which disable its usage.
>> +The special value @code{1} will try to gather the system default huge page size,
>> +while a value larger than @code{1} will try to match it with the supported system
>> +huge page size.  If either no default huge page size could be obtained or if the
>> +requested size does not match the supported ones, the huge pages supports will be
>> +disabled.
>> +@end deftp
>> +
>>   @node Dynamic Linking Tunables
>>   @section Dynamic Linking Tunables
>>   @cindex dynamic linking tunables
>> diff --git a/sysdeps/generic/malloc-hugepages.c b/sysdeps/generic/malloc-hugepages.c
>> index 262bcdbeb8..e5f5c1ec98 100644
>> --- a/sysdeps/generic/malloc-hugepages.c
>> +++ b/sysdeps/generic/malloc-hugepages.c
>> @@ -29,3 +29,9 @@ __malloc_thp_mode (void)
>>   {
>>     return malloc_thp_mode_not_supported;
>>   }
>> +
>> +/* Return the default transparent huge page size.  */
>> +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
>> +{
>> +  return (struct malloc_hugepage_config_t) { 0, 0 };
>> +}
>> diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h
>> index 664cda9b67..27f7adfea5 100644
>> --- a/sysdeps/generic/malloc-hugepages.h
>> +++ b/sysdeps/generic/malloc-hugepages.h
>> @@ -34,4 +34,16 @@ enum malloc_thp_mode_t
>>     enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden;
>>   +struct malloc_hugepage_config_t
>> +{
>> +  size_t pagesize;
>> +  int flags;
>> +};
>> +
>> +/* Returned the support huge page size from the requested PAGESIZE along
>> +   with the requires extra mmap flags.  Returning a 0 value for pagesize
>> +   disables its usage.  */
>> +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
>> +     attribute_hidden;
>> +
>>   #endif /* _MALLOC_HUGEPAGES_H */
>> diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c
>> index 66589127cd..0eb0c764ad 100644
>> --- a/sysdeps/unix/sysv/linux/malloc-hugepages.c
>> +++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c
>> @@ -17,8 +17,10 @@
>>      not, see <https://www.gnu.org/licenses/>.  */
>>     #include <intprops.h>
>> +#include <dirent.h>
>>   #include <malloc-hugepages.h>
>>   #include <not-cancel.h>
>> +#include <sys/mman.h>
>>     size_t
>>   __malloc_default_thp_pagesize (void)
>> @@ -74,3 +76,126 @@ __malloc_thp_mode (void)
>>       }
>>     return malloc_thp_mode_not_supported;
>>   }
>> +
>> +static size_t
>> +malloc_default_hugepage_size (void)
>> +{
>> +  int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY);
>> +  if (fd == -1)
>> +    return 0;
>> +
>> +  char buf[512];
>> +  off64_t off = 0;
>> +  while (1)
>> +    {
>> +      ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off);
>> +      if (r < 0)
>> +    break;
>> +      buf[r - 1] = '\0';
>> +
>> +      const char *s = strstr (buf, "Hugepagesize:");
>> +      if (s == NULL)
>> +    {
>> +      char *nl = strrchr (buf, '\n');
>> +      if (nl == NULL)
>> +        break;
>> +      off += (nl + 1) - buf;
>> +      continue;
>> +    }
>> +
>> +      /* The default huge page size is in the form:
>> +     Hugepagesize:       NUMBER kB  */
>> +      size_t hpsize = 0;
>> +      s += sizeof ("Hugepagesize: ") - 1;
>> +      for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++)
>> +    {
>> +      if (s[i] == ' ')
>> +        continue;
>> +      hpsize *= 10;
>> +      hpsize += s[i] - '0';
>> +    }
>> +      return hpsize * 1024;
>> +    }
>> +
>> +  __close_nocancel (fd);
>> +
>> +  return 0;
>> +}
>> +
>> +static inline struct malloc_hugepage_config_t
>> +make_malloc_hugepage_config (size_t pagesize)
>> +{
>> +  int flags = MAP_HUGETLB | (__builtin_ctzll (pagesize) << MAP_HUGE_SHIFT);
>> +  return (struct malloc_hugepage_config_t) { pagesize, flags };
>> +}
>> +
>> +struct malloc_hugepage_config_t
>> +__malloc_hugepage_config (size_t requested)
>> +{
>> +  if (requested == 1)
>> +    {
>> +      size_t pagesize = malloc_default_hugepage_size ();
>> +      if (pagesize != 0)
>> +    return make_malloc_hugepage_config (pagesize);
>> +    }
>> +
>> +  int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages",
>> +                 O_RDONLY | O_DIRECTORY, 0);
>> +  if (dirfd == -1)
>> +    return (struct malloc_hugepage_config_t) { 0, 0 };
>> +
>> +  bool found = false;
>> +
>> +  char buffer[1024];
>> +  while (true)
>> +    {
>> +#if !IS_IN(libc)
>> +# define __getdents64 getdents64
>> +#endif
>> +      ssize_t ret = __getdents64 (dirfd, buffer, sizeof (buffer));
>> +      if (ret == -1)
>> +    break;
>> +      else if (ret == 0)
>> +        break;
>> +
>> +      char *begin = buffer, *end = buffer + ret;
>> +      while (begin != end)
>> +        {
>> +          unsigned short int d_reclen;
>> +          memcpy (&d_reclen, begin + offsetof (struct dirent64, d_reclen),
>> +                  sizeof (d_reclen));
>> +          const char *dname = begin + offsetof (struct dirent64, d_name);
>> +          begin += d_reclen;
>> +
>> +          if (dname[0] == '.'
>> +          || strncmp (dname, "hugepages-", sizeof ("hugepages-") - 1) != 0)
>> +            continue;
>> +
>> +      /* Each entry represents a supported huge page in the form of:
>> +         hugepages-<size>kB.  */
>> +      size_t hpsize = 0;
>> +      const char *sizestr = dname + sizeof ("hugepages-") - 1;
>> +      for (int i = 0; sizestr[i] >= '0' && sizestr[i] <= '9'; i++)
>> +        {
>> +          hpsize *= 10;
>> +          hpsize += sizestr[i] - '0';
>> +        }
>> +      hpsize *= 1024;
>> +
>> +      if (hpsize == requested)
>> +        {
>> +          found = true;
>> +          break;
>> +        }
>> +        }
>> +      if (found)
>> +    break;
>> +    }
>> +
>> +  __close_nocancel (dirfd);
>> +
>> +  if (found)
>> +    return make_malloc_hugepage_config (requested);
>> +
>> +  return (struct malloc_hugepage_config_t) { 0, 0 };
>> +}
>>
>
  
Matheus Castanho Aug. 19, 2021, 5:58 p.m. UTC | #3
Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org> writes:

> A new tunable, 'glibc.malloc.mmap_hugetlb', adds support to use Huge Page
> support directly with mmap() calls.  The required supported sizes and
> flags for mmap() are provided by an arch-specific internal hook
> malloc_hp_config().
>
> Currently it first try mmap() using the huge page size and fallback to
> default page size and sbrk() call if kernel returns MMAP_FAILED.
>
> The default malloc_hp_config() implementation does not enable it even
> if the tunable is set.
>
> Checked on x86_64-linux-gnu.
> ---
>  NEWS                                       |   4 +
>  elf/dl-tunables.list                       |   4 +
>  elf/tst-rtld-list-tunables.exp             |   1 +
>  malloc/arena.c                             |   2 +
>  malloc/malloc.c                            |  35 +++++-
>  manual/tunables.texi                       |  14 +++
>  sysdeps/generic/malloc-hugepages.c         |   6 +
>  sysdeps/generic/malloc-hugepages.h         |  12 ++
>  sysdeps/unix/sysv/linux/malloc-hugepages.c | 125 +++++++++++++++++++++
>  9 files changed, 200 insertions(+), 3 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index 9b2345d08c..412bf3e6f8 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -14,6 +14,10 @@ Major new features:
>    It might improve performance with Transparent Huge Pages madvise mode
>    depending of the workload.
>
> +* On Linux, a new tunable, glibc.malloc.mmap_hugetlb, can be used to
> +  instruct malloc to try use Huge Pages when allocate memory with mmap()
> +  calls (through the use of MAP_HUGETLB).
> +
>  Deprecated and removed features, and other changes affecting compatibility:
>
>    [Add deprecations, removals and changes affecting compatibility here]
> diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
> index 67df6dbc2c..209c2d8592 100644
> --- a/elf/dl-tunables.list
> +++ b/elf/dl-tunables.list
> @@ -97,6 +97,10 @@ glibc {
>        minval: 0
>        maxval: 1
>      }
> +    mmap_hugetlb {
> +      type: SIZE_T
> +      minval: 0
> +    }
>    }
>    cpu {
>      hwcap_mask {
> diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
> index d8109fa31c..49f033ce91 100644
> --- a/elf/tst-rtld-list-tunables.exp
> +++ b/elf/tst-rtld-list-tunables.exp
> @@ -1,6 +1,7 @@
>  glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+)
>  glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+)
>  glibc.malloc.check: 0 (min: 0, max: 3)
> +glibc.malloc.mmap_hugetlb: 0x0 (min: 0x0, max: 0x[f]+)
>  glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
>  glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+)
>  glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+)
> diff --git a/malloc/arena.c b/malloc/arena.c
> index 81bff54303..4efb5581c1 100644
> --- a/malloc/arena.c
> +++ b/malloc/arena.c
> @@ -232,6 +232,7 @@ TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
>  #endif
>  TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t)
>  TUNABLE_CALLBACK_FNDECL (set_thp_madvise, int32_t)
> +TUNABLE_CALLBACK_FNDECL (set_mmap_hugetlb, size_t)
>  #else
>  /* Initialization routine. */
>  #include <string.h>
> @@ -333,6 +334,7 @@ ptmalloc_init (void)
>  # endif
>    TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast));
>    TUNABLE_GET (thp_madvise, int32_t, TUNABLE_CALLBACK (set_thp_madvise));
> +  TUNABLE_GET (mmap_hugetlb, size_t, TUNABLE_CALLBACK (set_mmap_hugetlb));
>  #else
>    if (__glibc_likely (_environ != NULL))
>      {
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 4bfcea286f..8cf2d6855e 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -1884,6 +1884,10 @@ struct malloc_par
>  #if HAVE_TUNABLES
>    /* Transparent Large Page support.  */
>    INTERNAL_SIZE_T thp_pagesize;
> +  /* A value different than 0 means to align mmap allocation to hp_pagesize
> +     add hp_flags on flags.  */
> +  INTERNAL_SIZE_T hp_pagesize;
> +  int hp_flags;
>  #endif
>
>    /* Memory map support */
> @@ -2415,7 +2419,8 @@ do_check_malloc_state (mstate av)
>   */
>
>  static void *
> -sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
> +sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av,
> +		bool set_thp)
>  {
>    long int size;
>
> @@ -2442,7 +2447,8 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
>    if (mm == MAP_FAILED)
>      return mm;
>
> -  sysmadvise_thp (mm, size);
> +  if (set_thp)
> +    sysmadvise_thp (mm, size);
>
>    /*
>      The offset to the start of the mmapped region is stored in the prev_size
> @@ -2531,7 +2537,18 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
>  	  && (mp_.n_mmaps < mp_.n_mmaps_max)))
>      {
>      try_mmap:
> -      char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
> +      char *mm;
> +#if HAVE_TUNABLES
> +      if (mp_.hp_pagesize > 0)
> +	{
> +	  /* There is no need to issue the THP madvise call if Huge Pages are
> +	     used directly.  */
> +	  mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av, false);
> +	  if (mm != MAP_FAILED)
> +	    return mm;
> +	}
> +#endif
> +      mm = sysmalloc_mmap (nb, pagesize, 0, av, true);
>        if (mm != MAP_FAILED)
>  	return mm;
>        tried_mmap = true;
> @@ -5405,6 +5422,18 @@ do_set_thp_madvise (int32_t value)
>      }
>    return 0;
>  }
> +
> +static __always_inline int
> +do_set_mmap_hugetlb (size_t value)
> +{
> +  if (value > 0)
> +    {
> +      struct malloc_hugepage_config_t cfg = __malloc_hugepage_config (value);
> +      mp_.hp_pagesize = cfg.pagesize;
> +      mp_.hp_flags = cfg.flags;
> +    }
> +  return 0;
> +}
>  #endif
>
>  int
> diff --git a/manual/tunables.texi b/manual/tunables.texi
> index 93c46807f9..4da6a02778 100644
> --- a/manual/tunables.texi
> +++ b/manual/tunables.texi
> @@ -279,6 +279,20 @@ The default value of this tunable is @code{0}, which disable its usage.
>  Setting to a positive value enable the @code{madvise} call.
>  @end deftp
>
> +@deftp Tunable glibc.malloc.mmap_hugetlb
> +This tunable enables the use of Huge Pages when the system supports it (currently
> +only Linux).  It is done by aligning the memory size and passing the required
> +flags (@code{MAP_HUGETLB} on Linux) when issuing the @code{mmap} call to allocate
> +memory from the system.
> +
> +The default value of this tunable is @code{0}, which disables its usage.
> +The special value @code{1} will try to use the system default huge page size,
> +while a value larger than @code{1} will try to match it against the huge page
> +sizes supported by the system.  If either no default huge page size could be
> +obtained or the requested size does not match a supported one, huge page support
> +will be disabled.
> +@end deftp
> +
>  @node Dynamic Linking Tunables
>  @section Dynamic Linking Tunables
>  @cindex dynamic linking tunables
> diff --git a/sysdeps/generic/malloc-hugepages.c b/sysdeps/generic/malloc-hugepages.c
> index 262bcdbeb8..e5f5c1ec98 100644
> --- a/sysdeps/generic/malloc-hugepages.c
> +++ b/sysdeps/generic/malloc-hugepages.c
> @@ -29,3 +29,9 @@ __malloc_thp_mode (void)
>  {
>    return malloc_thp_mode_not_supported;
>  }
> +
> +/* Generic fallback: report no huge page support (pagesize 0 disables it).  */
> +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
> +{
> +  return (struct malloc_hugepage_config_t) { 0, 0 };
> +}
> diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h
> index 664cda9b67..27f7adfea5 100644
> --- a/sysdeps/generic/malloc-hugepages.h
> +++ b/sysdeps/generic/malloc-hugepages.h
> @@ -34,4 +34,16 @@ enum malloc_thp_mode_t
>
>  enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden;
>
> +struct malloc_hugepage_config_t
> +{
> +  size_t pagesize;
> +  int flags;
> +};
> +
> +/* Return the supported huge page size for the REQUESTED size along with
> +   the required extra mmap flags.  Returning a 0 value for pagesize
> +   disables its usage.  */
> +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
> +     attribute_hidden;
> +
>  #endif /* _MALLOC_HUGEPAGES_H */
> diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c
> index 66589127cd..0eb0c764ad 100644
> --- a/sysdeps/unix/sysv/linux/malloc-hugepages.c
> +++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c
> @@ -17,8 +17,10 @@
>     not, see <https://www.gnu.org/licenses/>.  */
>
>  #include <intprops.h>
> +#include <dirent.h>
>  #include <malloc-hugepages.h>
>  #include <not-cancel.h>
> +#include <sys/mman.h>
>
>  size_t
>  __malloc_default_thp_pagesize (void)
> @@ -74,3 +76,126 @@ __malloc_thp_mode (void)
>      }
>    return malloc_thp_mode_not_supported;
>  }
> +
> +static size_t
> +malloc_default_hugepage_size (void)
> +{
> +  int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY);
> +  if (fd == -1)
> +    return 0;
> +
> +  char buf[512];
> +  off64_t off = 0;
> +  while (1)
> +    {
> +      ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off);
> +      if (r < 0)
> +	break;
> +      buf[r - 1] = '\0';
> +
> +      const char *s = strstr (buf, "Hugepagesize:");
> +      if (s == NULL)
> +	{
> +	  char *nl = strrchr (buf, '\n');
> +	  if (nl == NULL)
> +	    break;
> +	  off += (nl + 1) - buf;
> +	  continue;
> +	}
> +
> +      /* The default huge page size is in the form:
> +	 Hugepagesize:       NUMBER kB  */
> +      size_t hpsize = 0;
> +      s += sizeof ("Hugepagesize: ") - 1;
> +      for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++)
> +	{
> +	  if (s[i] == ' ')
> +	    continue;
> +	  hpsize *= 10;
> +	  hpsize += s[i] - '0';
> +	}
> +      return hpsize * 1024;
> +    }
> +
> +  __close_nocancel (fd);
> +
> +  return 0;
> +}
> +
> +static inline struct malloc_hugepage_config_t
> +make_malloc_hugepage_config (size_t pagesize)
> +{
> +  int flags = MAP_HUGETLB | (__builtin_ctzll (pagesize) << MAP_HUGE_SHIFT);
> +  return (struct malloc_hugepage_config_t) { pagesize, flags };
> +}
> +
> +struct malloc_hugepage_config_t
> +__malloc_hugepage_config (size_t requested)
> +{
> +  if (requested == 1)
> +    {
> +      size_t pagesize = malloc_default_hugepage_size ();
> +      if (pagesize != 0)
> +	return make_malloc_hugepage_config (pagesize);
> +    }
> +
> +  int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages",
> +				 O_RDONLY | O_DIRECTORY, 0);
> +  if (dirfd == -1)
> +    return (struct malloc_hugepage_config_t) { 0, 0 };
> +
> +  bool found = false;
> +
> +  char buffer[1024];
> +  while (true)
> +    {
> +#if !IS_IN(libc)
> +# define __getdents64 getdents64
> +#endif
> +      ssize_t ret = __getdents64 (dirfd, buffer, sizeof (buffer));
> +      if (ret == -1)
> +	break;
> +      else if (ret == 0)
> +        break;
> +
> +      char *begin = buffer, *end = buffer + ret;
> +      while (begin != end)
> +        {
> +          unsigned short int d_reclen;
> +          memcpy (&d_reclen, begin + offsetof (struct dirent64, d_reclen),
> +                  sizeof (d_reclen));
> +          const char *dname = begin + offsetof (struct dirent64, d_name);
> +          begin += d_reclen;
> +
> +          if (dname[0] == '.'
> +	      || strncmp (dname, "hugepages-", sizeof ("hugepages-") - 1) != 0)
> +            continue;
> +
> +	  /* Each entry represents a supported huge page in the form of:
> +	     hugepages-<size>kB.  */
> +	  size_t hpsize = 0;
> +	  const char *sizestr = dname + sizeof ("hugepages-") - 1;
> +	  for (int i = 0; sizestr[i] >= '0' && sizestr[i] <= '9'; i++)
> +	    {
> +	      hpsize *= 10;
> +	      hpsize += sizestr[i] - '0';
> +	    }
> +	  hpsize *= 1024;
> +
> +	  if (hpsize == requested)
> +	    {
> +	      found = true;
> +	      break;
> +	    }
> +        }
> +      if (found)
> +	break;
> +    }
> +
> +  __close_nocancel (dirfd);
> +
> +  if (found)
> +    return make_malloc_hugepage_config (requested);
> +
> +  return (struct malloc_hugepage_config_t) { 0, 0 };
> +}

Hi Adhemerval,

I tested this patchset on a POWER9, and I'm seeing the following test
failures when running make check with glibc.malloc.mmap_hugetlb=1:

malloc/tst-free-errno
malloc/tst-free-errno-malloc-check
malloc/tst-free-errno-mcheck
posix/tst-exec
posix/tst-exec-static
posix/tst-spawn
posix/tst-spawn-static
posix/tst-spawn5

I'm attaching a summary of the contents of the .out files for each test.
$ failing="malloc/tst-free-errno malloc/tst-free-errno-malloc-check malloc/tst-free-errno-mcheck posix/tst-exec posix/tst-exec-static posix/tst-spawn posix/tst-spawn-static posix/tst-spawn5"
$
$ for t in $failing; do echo "~> $t"; { make test t=$t; GLIBC_TUNABLES="glibc.malloc.mmap_hugetlb=1" make test t=$t; } | grep -Ei "^fail|pass"; cat $t.out; echo; done

~> malloc/tst-free-errno
double free or corruption (out)
PASS: malloc/tst-free-errno
FAIL: malloc/tst-free-errno
Didn't expect signal from child: got `Aborted'

~> malloc/tst-free-errno-malloc-check
PASS: malloc/tst-free-errno-malloc-check
FAIL: malloc/tst-free-errno-malloc-check
error: xmmap.c:28: mmap of 16908288 bytes, prot=0x3, flags=0x32: Device or resource busy
error: 1 test failures

~> malloc/tst-free-errno-mcheck
memory clobbered past end of allocated block
PASS: malloc/tst-free-errno-mcheck
FAIL: malloc/tst-free-errno-mcheck
Didn't expect signal from child: got `Aborted'

~> posix/tst-exec
/home/mscastanho/build/glibc/posix/tst-exec: file 1 (4) is not closed
PASS: posix/tst-exec
FAIL: posix/tst-exec

~> posix/tst-exec-static
/home/mscastanho/build/glibc/posix/tst-exec-static: file 1 (4) is not closed
PASS: posix/tst-exec-static
FAIL: posix/tst-exec-static

~> posix/tst-spawn
PASS: posix/tst-spawn
FAIL: posix/tst-spawn
tst-spawn.c:127: numeric comparison failure
   left: 0 (0x0); from: lseek (fd1, 0, SEEK_CUR)
  right: -1 (0xffffffffffffffff); from: (off_t) -1
error: 1 test failures
tst-spawn.c:244: numeric comparison failure
   left: 1 (0x1); from: WEXITSTATUS (status)
  right: 0 (0x0); from: 0
tst-spawn.c:127: numeric comparison failure
   left: 0 (0x0); from: lseek (fd1, 0, SEEK_CUR)
  right: -1 (0xffffffffffffffff); from: (off_t) -1
error: 1 test failures
tst-spawn.c:258: numeric comparison failure
   left: 1 (0x1); from: WEXITSTATUS (status)
  right: 0 (0x0); from: 0
error: 2 test failures

~> posix/tst-spawn-static
PASS: posix/tst-spawn-static
FAIL: posix/tst-spawn-static
tst-spawn.c:127: numeric comparison failure
   left: 0 (0x0); from: lseek (fd1, 0, SEEK_CUR)
  right: -1 (0xffffffffffffffff); from: (off_t) -1
error: 1 test failures
tst-spawn.c:244: numeric comparison failure
   left: 1 (0x1); from: WEXITSTATUS (status)
  right: 0 (0x0); from: 0
tst-spawn.c:127: numeric comparison failure
   left: 0 (0x0); from: lseek (fd1, 0, SEEK_CUR)
  right: -1 (0xffffffffffffffff); from: (off_t) -1
error: 1 test failures
tst-spawn.c:258: numeric comparison failure
   left: 1 (0x1); from: WEXITSTATUS (status)
  right: 0 (0x0); from: 0
error: 2 test failures

~> posix/tst-spawn5
PASS: posix/tst-spawn5
FAIL: posix/tst-spawn5
error: tst-spawn5.c:128: unexpected open file descriptor 54: /proc/meminfo
tst-spawn5.c:182: numeric comparison failure
   left: 1 (0x1); from: WEXITSTATUS (status)
  right: 0 (0x0); from: 0
error: tst-spawn5.c:128: unexpected open file descriptor 54: /proc/meminfo
tst-spawn5.c:182: numeric comparison failure
   left: 1 (0x1); from: WEXITSTATUS (status)
  right: 0 (0x0); from: 0
error: tst-spawn5.c:128: unexpected open file descriptor 5: /proc/meminfo
tst-spawn5.c:182: numeric comparison failure
   left: 1 (0x1); from: WEXITSTATUS (status)
  right: 0 (0x0); from: 0
error: tst-spawn5.c:128: unexpected open file descriptor 4: /proc/meminfo
tst-spawn5.c:182: numeric comparison failure
   left: 1 (0x1); from: WEXITSTATUS (status)
  right: 0 (0x0); from: 0
error: tst-spawn5.c:128: unexpected open file descriptor 6: /proc/meminfo
tst-spawn5.c:182: numeric comparison failure
   left: 1 (0x1); from: WEXITSTATUS (status)
  right: 0 (0x0); from: 0
error: 5 test failures
--
Matheus Castanho
  
Adhemerval Zanella Aug. 19, 2021, 6:50 p.m. UTC | #4
On 19/08/2021 14:58, Matheus Castanho wrote:
> Hi Adhemerval,
> 
> I tested this patchset on a POWER9, and I'm seeing the following test
> failures when running make check with glibc.malloc.mmap_hugetlb=1:

Thanks for checking on this.

> 
> malloc/tst-free-errno
> malloc/tst-free-errno-malloc-check
> malloc/tst-free-errno-mcheck

I couldn't really reproduce these on the gcc farm power machines,
a power9 with 2M huge page default and a power8 with 16M default. Neither
had any pages allocated in the pool. I don't have admin access,
so I can't change the pool size to check what is happening.

I also tested on my x86_64 environment without any pages in the pool,
with 4 pages in the pool and with 10 pages.

If you could provide the stack trace from where we get the
"Didn't expect signal from child: got `Aborted'" it would be useful.

It could also be something related to the /proc/sys/vm/max_map_count
value, since mmap seems to be failing for some reason.

> posix/tst-exec
> posix/tst-exec-static
> posix/tst-spawn
> posix/tst-spawn-static
> posix/tst-spawn5

These are due to an oversight in 'malloc_default_hugepage_size()', where it
does not close the file descriptor on success.  I have fixed it.
  
Matheus Castanho Aug. 20, 2021, 12:34 p.m. UTC | #5
Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:

> On 19/08/2021 14:58, Matheus Castanho wrote:
>> Hi Adhemerval,
>>
>> I tested this patchset on a POWER9, and I'm seeing the following test
>> failures when running make check with glibc.malloc.mmap_hugetlb=1:
>
> Thanks for checking on this.
>
>>
>> malloc/tst-free-errno
>> malloc/tst-free-errno-malloc-check
>> malloc/tst-free-errno-mcheck
>
> I couldn't really reproduce these on the gcc farm power machines,
> a power9 with 2M huge page default and a power8 with 16M default. Neither
> had any pages allocated in the pool. I don't have admin access,
> so I can't change the pool size to check what is happening.
>
> I also tested on my x86_64 environment without any pages in the pool,
> with 4 pages in the pool and with 10 pages.
>

I confirm that without pages in the pool the tests pass correctly. Only
when I add them to the pool things start failing. In this case I'm
reserving 500 16 MB pages:

$ grep -i hugepages /proc/meminfo
AnonHugePages:         0 kB
ShmemHugePages:        0 kB
FileHugePages:         0 kB
HugePages_Total:     500
HugePages_Free:      500
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:      16384 kB

> If you could provide the stack trace from where we get the
> "Didn't expect signal from child: got `Aborted'" it would be useful.
>

This is what GDB is showing me when the abort happens:

#0  0x00007ffff7dccf00 in __pthread_kill_internal (threadid=<optimized out>, signo=<optimized out>) at pthread_kill.c:44
#1  0x00007ffff7d6e26c in __GI_raise (sig=<optimized out>) at ../sysdeps/posix/raise.c:26
#2  0x00007ffff7d50490 in __GI_abort () at abort.c:79
#3  0x00007ffff7dba770 in __libc_message (action=<optimized out>, fmt=<optimized out>) at ../sysdeps/posix/libc_fatal.c:155
#4  0x00007ffff7ddc4e8 in malloc_printerr (str=<optimized out>, str@entry=0x7ffff7efdc90 "double free or corruption (out)") at malloc.c:5654
#5  0x00007ffff7ddefe8 in _int_free (av=0x7ffff7f60e30 <main_arena>, p=0x7ffff80203d0, have_lock=<optimized out>, have_lock@entry=0) at malloc.c:4555
#6  0x00007ffff7de2160 in __GI___libc_free (mem=<optimized out>) at malloc.c:3358
#7  0x0000000010001ee4 in do_test () at tst-free-errno.c:123
#8  0x0000000010002730 in run_test_function (argc=argc@entry=1, argv=argv@entry=0x7fffffffede0, config=config@entry=0x7fffffffe950) at support_test_main.c:232
#9  0x00000000100032fc in support_test_main (argc=1, argv=0x7fffffffede0, config=0x7fffffffe950) at support_test_main.c:431
#10 0x00000000100019d0 in main (argc=<optimized out>, argv=<optimized out>) at ../support/test-driver.c:168
#11 0x00007ffff7d50818 in __libc_start_call_main (main=main@entry=0x10001980 <main>, argc=argc@entry=1, argv=argv@entry=0x7fffffffede0, auxvec=auxvec@entry=0x7fffffffef68) at ../sysdeps/nptl/libc_start_call_main.h:58
#12 0x00007ffff7d50a00 in generic_start_main (fini=<optimized out>, stack_end=<optimized out>, rtld_fini=<optimized out>, init=<optimized out>, auxvec=<optimized out>, argv=<optimized out>, argc=<optimized out>, main=<optimized out>) at ../csu/libc-start.c:409
#13 __libc_start_main_impl (argc=1, argv=0x7fffffffede0, ev=<optimized out>, auxvec=0x7fffffffef68, rtld_fini=<optimized out>, stinfo=<optimized out>, stack_on_entry=<optimized out>) at ../sysdeps/unix/sysv/linux/powerpc/libc-start.c:98
#14 0x0000000000000000 in ?? ()

> It could also be something related to the /proc/sys/vm/max_map_count
> value, since mmap seems to be failing for some reason.
>

This is what the machine I'm using now has:

$ cat /proc/sys/vm/max_map_count
65530

>> posix/tst-exec
>> posix/tst-exec-static
>> posix/tst-spawn
>> posix/tst-spawn-static
>> posix/tst-spawn5
>
> These are due to an oversight in 'malloc_default_hugepage_size()', where it
> does not close the file descriptor on success.  I have fixed it.

Ok, thanks!

--
Matheus Castanho
  

Patch

diff --git a/NEWS b/NEWS
index 9b2345d08c..412bf3e6f8 100644
--- a/NEWS
+++ b/NEWS
@@ -14,6 +14,10 @@  Major new features:
   It might improve performance with Transparent Huge Pages madvise mode
   depending of the workload.
 
+* On Linux, a new tunable, glibc.malloc.mmap_hugetlb, can be used to
+  instruct malloc to try to use Huge Pages when allocating memory with
+  mmap() calls (through the use of MAP_HUGETLB).
+
 Deprecated and removed features, and other changes affecting compatibility:
 
   [Add deprecations, removals and changes affecting compatibility here]
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index 67df6dbc2c..209c2d8592 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -97,6 +97,10 @@  glibc {
       minval: 0
       maxval: 1
     }
+    mmap_hugetlb {
+      type: SIZE_T
+      minval: 0
+    }
   }
   cpu {
     hwcap_mask {
diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
index d8109fa31c..49f033ce91 100644
--- a/elf/tst-rtld-list-tunables.exp
+++ b/elf/tst-rtld-list-tunables.exp
@@ -1,6 +1,7 @@ 
 glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+)
 glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+)
 glibc.malloc.check: 0 (min: 0, max: 3)
+glibc.malloc.mmap_hugetlb: 0x0 (min: 0x0, max: 0x[f]+)
 glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
 glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+)
 glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+)
diff --git a/malloc/arena.c b/malloc/arena.c
index 81bff54303..4efb5581c1 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -232,6 +232,7 @@  TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
 #endif
 TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t)
 TUNABLE_CALLBACK_FNDECL (set_thp_madvise, int32_t)
+TUNABLE_CALLBACK_FNDECL (set_mmap_hugetlb, size_t)
 #else
 /* Initialization routine. */
 #include <string.h>
@@ -333,6 +334,7 @@  ptmalloc_init (void)
 # endif
   TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast));
   TUNABLE_GET (thp_madvise, int32_t, TUNABLE_CALLBACK (set_thp_madvise));
+  TUNABLE_GET (mmap_hugetlb, size_t, TUNABLE_CALLBACK (set_mmap_hugetlb));
 #else
   if (__glibc_likely (_environ != NULL))
     {
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 4bfcea286f..8cf2d6855e 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -1884,6 +1884,10 @@  struct malloc_par
 #if HAVE_TUNABLES
   /* Transparent Large Page support.  */
   INTERNAL_SIZE_T thp_pagesize;
+  /* A value different from 0 means to align mmap allocations to hp_pagesize
+     and add hp_flags to the mmap flags.  */
+  INTERNAL_SIZE_T hp_pagesize;
+  int hp_flags;
 #endif
 
   /* Memory map support */
@@ -2415,7 +2419,8 @@  do_check_malloc_state (mstate av)
  */
 
 static void *
-sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
+sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av,
+		bool set_thp)
 {
   long int size;
 
@@ -2442,7 +2447,8 @@  sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
   if (mm == MAP_FAILED)
     return mm;
 
-  sysmadvise_thp (mm, size);
+  if (set_thp)
+    sysmadvise_thp (mm, size);
 
   /*
     The offset to the start of the mmapped region is stored in the prev_size
@@ -2531,7 +2537,18 @@  sysmalloc (INTERNAL_SIZE_T nb, mstate av)
 	  && (mp_.n_mmaps < mp_.n_mmaps_max)))
     {
     try_mmap:
-      char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
+      char *mm;
+#if HAVE_TUNABLES
+      if (mp_.hp_pagesize > 0)
+	{
+	  /* There is no need to issue the THP madvise call if Huge Pages are
+	     used directly.  */
+	  mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av, false);
+	  if (mm != MAP_FAILED)
+	    return mm;
+	}
+#endif
+      mm = sysmalloc_mmap (nb, pagesize, 0, av, true);
       if (mm != MAP_FAILED)
 	return mm;
       tried_mmap = true;
@@ -5405,6 +5422,18 @@  do_set_thp_madvise (int32_t value)
     }
   return 0;
 }
+
+static __always_inline int
+do_set_mmap_hugetlb (size_t value)
+{
+  if (value > 0)
+    {
+      struct malloc_hugepage_config_t cfg = __malloc_hugepage_config (value);
+      mp_.hp_pagesize = cfg.pagesize;
+      mp_.hp_flags = cfg.flags;
+    }
+  return 0;
+}
 #endif
 
 int
diff --git a/manual/tunables.texi b/manual/tunables.texi
index 93c46807f9..4da6a02778 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -279,6 +279,20 @@  The default value of this tunable is @code{0}, which disable its usage.
 Setting to a positive value enable the @code{madvise} call.
 @end deftp
 
+@deftp Tunable glibc.malloc.mmap_hugetlb
+This tunable enables the use of Huge Pages when the system supports it (currently
+only Linux).  It is done by aligning the memory size and passing the required
+flags (@code{MAP_HUGETLB} on Linux) when issuing the @code{mmap} call to allocate
+memory from the system.
+
+The default value of this tunable is @code{0}, which disables its usage.
+The special value @code{1} will try to use the system default huge page size,
+while a value larger than @code{1} will try to match it against the huge page
+sizes supported by the system.  If either no default huge page size could be
+obtained or the requested size does not match a supported one, huge page support
+will be disabled.
+@end deftp
+
 @node Dynamic Linking Tunables
 @section Dynamic Linking Tunables
 @cindex dynamic linking tunables
diff --git a/sysdeps/generic/malloc-hugepages.c b/sysdeps/generic/malloc-hugepages.c
index 262bcdbeb8..e5f5c1ec98 100644
--- a/sysdeps/generic/malloc-hugepages.c
+++ b/sysdeps/generic/malloc-hugepages.c
@@ -29,3 +29,9 @@  __malloc_thp_mode (void)
 {
   return malloc_thp_mode_not_supported;
 }
+
+/* Generic fallback: report no huge page support (pagesize 0 disables it).  */
+struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
+{
+  return (struct malloc_hugepage_config_t) { 0, 0 };
+}
diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h
index 664cda9b67..27f7adfea5 100644
--- a/sysdeps/generic/malloc-hugepages.h
+++ b/sysdeps/generic/malloc-hugepages.h
@@ -34,4 +34,16 @@  enum malloc_thp_mode_t
 
 enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden;
 
+struct malloc_hugepage_config_t
+{
+  size_t pagesize;
+  int flags;
+};
+
+/* Return the supported huge page size for the REQUESTED size along with
+   the required extra mmap flags.  Returning a 0 value for pagesize
+   disables its usage.  */
+struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
+     attribute_hidden;
+
 #endif /* _MALLOC_HUGEPAGES_H */
diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c
index 66589127cd..0eb0c764ad 100644
--- a/sysdeps/unix/sysv/linux/malloc-hugepages.c
+++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c
@@ -17,8 +17,10 @@ 
    not, see <https://www.gnu.org/licenses/>.  */
 
 #include <intprops.h>
+#include <dirent.h>
 #include <malloc-hugepages.h>
 #include <not-cancel.h>
+#include <sys/mman.h>
 
 size_t
 __malloc_default_thp_pagesize (void)
@@ -74,3 +76,126 @@  __malloc_thp_mode (void)
     }
   return malloc_thp_mode_not_supported;
 }
+
+static size_t
+malloc_default_hugepage_size (void)
+{
+  int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY);
+  if (fd == -1)
+    return 0;
+
+  char buf[512];
+  off64_t off = 0;
+  while (1)
+    {
+      ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off);
+      if (r < 0)
+	break;
+      buf[r - 1] = '\0';
+
+      const char *s = strstr (buf, "Hugepagesize:");
+      if (s == NULL)
+	{
+	  char *nl = strrchr (buf, '\n');
+	  if (nl == NULL)
+	    break;
+	  off += (nl + 1) - buf;
+	  continue;
+	}
+
+      /* The default huge page size is in the form:
+	 Hugepagesize:       NUMBER kB  */
+      size_t hpsize = 0;
+      s += sizeof ("Hugepagesize: ") - 1;
+      for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++)
+	{
+	  if (s[i] == ' ')
+	    continue;
+	  hpsize *= 10;
+	  hpsize += s[i] - '0';
+	}
+      return hpsize * 1024;
+    }
+
+  __close_nocancel (fd);
+
+  return 0;
+}
+
+static inline struct malloc_hugepage_config_t
+make_malloc_hugepage_config (size_t pagesize)
+{
+  int flags = MAP_HUGETLB | (__builtin_ctzll (pagesize) << MAP_HUGE_SHIFT);
+  return (struct malloc_hugepage_config_t) { pagesize, flags };
+}
+
+struct malloc_hugepage_config_t
+__malloc_hugepage_config (size_t requested)
+{
+  if (requested == 1)
+    {
+      size_t pagesize = malloc_default_hugepage_size ();
+      if (pagesize != 0)
+	return make_malloc_hugepage_config (pagesize);
+    }
+
+  int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages",
+				 O_RDONLY | O_DIRECTORY, 0);
+  if (dirfd == -1)
+    return (struct malloc_hugepage_config_t) { 0, 0 };
+
+  bool found = false;
+
+  char buffer[1024];
+  while (true)
+    {
+#if !IS_IN(libc)
+# define __getdents64 getdents64
+#endif
+      ssize_t ret = __getdents64 (dirfd, buffer, sizeof (buffer));
+      if (ret == -1)
+	break;
+      else if (ret == 0)
+        break;
+
+      char *begin = buffer, *end = buffer + ret;
+      while (begin != end)
+        {
+          unsigned short int d_reclen;
+          memcpy (&d_reclen, begin + offsetof (struct dirent64, d_reclen),
+                  sizeof (d_reclen));
+          const char *dname = begin + offsetof (struct dirent64, d_name);
+          begin += d_reclen;
+
+          if (dname[0] == '.'
+	      || strncmp (dname, "hugepages-", sizeof ("hugepages-") - 1) != 0)
+            continue;
+
+	  /* Each entry represents a supported huge page in the form of:
+	     hugepages-<size>kB.  */
+	  size_t hpsize = 0;
+	  const char *sizestr = dname + sizeof ("hugepages-") - 1;
+	  for (int i = 0; sizestr[i] >= '0' && sizestr[i] <= '9'; i++)
+	    {
+	      hpsize *= 10;
+	      hpsize += sizestr[i] - '0';
+	    }
+	  hpsize *= 1024;
+
+	  if (hpsize == requested)
+	    {
+	      found = true;
+	      break;
+	    }
+        }
+      if (found)
+	break;
+    }
+
+  __close_nocancel (dirfd);
+
+  if (found)
+    return make_malloc_hugepage_config (requested);
+
+  return (struct malloc_hugepage_config_t) { 0, 0 };
+}