[v2,4/4] malloc: Add Huge Page support for sysmalloc
Checks
Commit Message
A new tunable, 'glibc.malloc.mmap_hugetlb', adds support for using Huge
Pages directly with mmap() calls. The supported sizes and the required
flags for mmap() are provided by an arch-specific internal hook,
__malloc_hugepage_config().
Currently it first tries mmap() using the huge page size and falls back to
the default page size and the sbrk() call if the kernel returns MAP_FAILED.
The default __malloc_hugepage_config() implementation does not enable it
even if the tunable is set.
Checked on x86_64-linux-gnu.
---
NEWS | 4 +
elf/dl-tunables.list | 4 +
elf/tst-rtld-list-tunables.exp | 1 +
malloc/arena.c | 2 +
malloc/malloc.c | 35 +++++-
manual/tunables.texi | 14 +++
sysdeps/generic/malloc-hugepages.c | 6 +
sysdeps/generic/malloc-hugepages.h | 12 ++
sysdeps/unix/sysv/linux/malloc-hugepages.c | 125 +++++++++++++++++++++
9 files changed, 200 insertions(+), 3 deletions(-)
Comments
On 8/18/21 7:50 PM, Adhemerval Zanella via Libc-alpha wrote:
> A new tunable, 'glibc.malloc.mmap_hugetlb', adds support to use Huge Page
> support directly with mmap() calls. The required supported sizes and
> flags for mmap() are provided by an arch-specific internal hook
> malloc_hp_config().
>
> Currently it first try mmap() using the huge page size and fallback to
> default page size and sbrk() call if kernel returns MMAP_FAILED.
>
> The default malloc_hp_config() implementation does not enable it even
> if the tunable is set.
>
> Checked on x86_64-linux-gnu.
> ---
> NEWS | 4 +
> elf/dl-tunables.list | 4 +
> elf/tst-rtld-list-tunables.exp | 1 +
> malloc/arena.c | 2 +
> malloc/malloc.c | 35 +++++-
> manual/tunables.texi | 14 +++
> sysdeps/generic/malloc-hugepages.c | 6 +
> sysdeps/generic/malloc-hugepages.h | 12 ++
> sysdeps/unix/sysv/linux/malloc-hugepages.c | 125 +++++++++++++++++++++
> 9 files changed, 200 insertions(+), 3 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index 9b2345d08c..412bf3e6f8 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -14,6 +14,10 @@ Major new features:
> It might improve performance with Transparent Huge Pages madvise mode
> depending of the workload.
>
> +* On Linux, a new tunable, glibc.malloc.mmap_hugetlb, can be used to
> + instruct malloc to try use Huge Pages when allocate memory with mmap()
> + calls (through the use of MAP_HUGETLB).
> +
> Deprecated and removed features, and other changes affecting compatibility:
>
> [Add deprecations, removals and changes affecting compatibility here]
> diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
> index 67df6dbc2c..209c2d8592 100644
> --- a/elf/dl-tunables.list
> +++ b/elf/dl-tunables.list
> @@ -97,6 +97,10 @@ glibc {
> minval: 0
> maxval: 1
> }
> + mmap_hugetlb {
> + type: SIZE_T
> + minval: 0
> + }
> }
> cpu {
> hwcap_mask {
> diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
> index d8109fa31c..49f033ce91 100644
> --- a/elf/tst-rtld-list-tunables.exp
> +++ b/elf/tst-rtld-list-tunables.exp
> @@ -1,6 +1,7 @@
> glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+)
> glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+)
> glibc.malloc.check: 0 (min: 0, max: 3)
> +glibc.malloc.mmap_hugetlb: 0x0 (min: 0x0, max: 0x[f]+)
> glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
> glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+)
> glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+)
> diff --git a/malloc/arena.c b/malloc/arena.c
> index 81bff54303..4efb5581c1 100644
> --- a/malloc/arena.c
> +++ b/malloc/arena.c
> @@ -232,6 +232,7 @@ TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
> #endif
> TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t)
> TUNABLE_CALLBACK_FNDECL (set_thp_madvise, int32_t)
> +TUNABLE_CALLBACK_FNDECL (set_mmap_hugetlb, size_t)
> #else
> /* Initialization routine. */
> #include <string.h>
> @@ -333,6 +334,7 @@ ptmalloc_init (void)
> # endif
> TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast));
> TUNABLE_GET (thp_madvise, int32_t, TUNABLE_CALLBACK (set_thp_madvise));
> + TUNABLE_GET (mmap_hugetlb, size_t, TUNABLE_CALLBACK (set_mmap_hugetlb));
> #else
> if (__glibc_likely (_environ != NULL))
> {
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 4bfcea286f..8cf2d6855e 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -1884,6 +1884,10 @@ struct malloc_par
> #if HAVE_TUNABLES
> /* Transparent Large Page support. */
> INTERNAL_SIZE_T thp_pagesize;
> + /* A value different than 0 means to align mmap allocation to hp_pagesize
> + add hp_flags on flags. */
> + INTERNAL_SIZE_T hp_pagesize;
> + int hp_flags;
> #endif
>
> /* Memory map support */
> @@ -2415,7 +2419,8 @@ do_check_malloc_state (mstate av)
> */
>
> static void *
> -sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
> +sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av,
> + bool set_thp)
> {
> long int size;
>
> @@ -2442,7 +2447,8 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
> if (mm == MAP_FAILED)
> return mm;
>
> - sysmadvise_thp (mm, size);
> + if (set_thp)
> + sysmadvise_thp (mm, size);
If MAP_HUGETLB is set in extra_flags then we don't need madvise;
there's no need for set_thp.
>
> /*
> The offset to the start of the mmapped region is stored in the prev_size
> @@ -2531,7 +2537,18 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
> && (mp_.n_mmaps < mp_.n_mmaps_max)))
> {
> try_mmap:
> - char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
> + char *mm;
> +#if HAVE_TUNABLES
> + if (mp_.hp_pagesize > 0)
> + {
> + /* There is no need to isse the THP madvise call if Huge Pages are
> + used directly. */
> + mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av, false);
> + if (mm != MAP_FAILED)
> + return mm;
> + }
> +#endif
> + mm = sysmalloc_mmap (nb, pagesize, 0, av, true);
A single tunable ought to allow you to do all this in just sysmalloc_mmap.
> if (mm != MAP_FAILED)
> return mm;
> tried_mmap = true;
> @@ -5405,6 +5422,18 @@ do_set_thp_madvise (int32_t value)
> }
> return 0;
> }
> +
> +static __always_inline int
> +do_set_mmap_hugetlb (size_t value)
> +{
> + if (value > 0)
> + {
> + struct malloc_hugepage_config_t cfg = __malloc_hugepage_config (value);
> + mp_.hp_pagesize = cfg.pagesize;
> + mp_.hp_flags = cfg.flags;
Instead of making a struct to return them, you could just pass
&mp_.hp_pagesize and &mp_.hp_flags. Also, with a single tunable, you do
this only when value > 1. For value == 0, you set the default THP
pagesize and set flags to 0.
> + }
> + return 0;
> +}
> #endif
>
> int
> diff --git a/manual/tunables.texi b/manual/tunables.texi
> index 93c46807f9..4da6a02778 100644
> --- a/manual/tunables.texi
> +++ b/manual/tunables.texi
> @@ -279,6 +279,20 @@ The default value of this tunable is @code{0}, which disable its usage.
> Setting to a positive value enable the @code{madvise} call.
> @end deftp
>
> +@deftp Tunable glibc.malloc.mmap_hugetlb
> +This tunable enable the use of Huge Pages when the system supports it (currently
> +only Linux). It is done by aligning the memory size and passing the required
> +flags (@code{MAP_HUGETLB} on Linux) when issuing the @code{mmap} to allocate
> +memory from the system.
> +
> +The default value of this tunable is @code{0}, which disable its usage.
> +The special value @code{1} will try to gather the system default huge page size,
> +while a value larger than @code{1} will try to match it with the supported system
> +huge page size. If either no default huge page size could be obtained or if the
> +requested size does not match the supported ones, the huge pages supports will be
> +disabled.
> +@end deftp
> +
> @node Dynamic Linking Tunables
> @section Dynamic Linking Tunables
> @cindex dynamic linking tunables
> diff --git a/sysdeps/generic/malloc-hugepages.c b/sysdeps/generic/malloc-hugepages.c
> index 262bcdbeb8..e5f5c1ec98 100644
> --- a/sysdeps/generic/malloc-hugepages.c
> +++ b/sysdeps/generic/malloc-hugepages.c
> @@ -29,3 +29,9 @@ __malloc_thp_mode (void)
> {
> return malloc_thp_mode_not_supported;
> }
> +
> +/* Return the default transparent huge page size. */
> +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
> +{
> + return (struct malloc_hugepage_config_t) { 0, 0 };
> +}
> diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h
> index 664cda9b67..27f7adfea5 100644
> --- a/sysdeps/generic/malloc-hugepages.h
> +++ b/sysdeps/generic/malloc-hugepages.h
> @@ -34,4 +34,16 @@ enum malloc_thp_mode_t
>
> enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden;
>
> +struct malloc_hugepage_config_t
> +{
> + size_t pagesize;
> + int flags;
> +};
> +
> +/* Returned the support huge page size from the requested PAGESIZE along
> + with the requires extra mmap flags. Returning a 0 value for pagesize
> + disables its usage. */
> +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
> + attribute_hidden;
> +
> #endif /* _MALLOC_HUGEPAGES_H */
> diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c
> index 66589127cd..0eb0c764ad 100644
> --- a/sysdeps/unix/sysv/linux/malloc-hugepages.c
> +++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c
> @@ -17,8 +17,10 @@
> not, see <https://www.gnu.org/licenses/>. */
>
> #include <intprops.h>
> +#include <dirent.h>
> #include <malloc-hugepages.h>
> #include <not-cancel.h>
> +#include <sys/mman.h>
>
> size_t
> __malloc_default_thp_pagesize (void)
> @@ -74,3 +76,126 @@ __malloc_thp_mode (void)
> }
> return malloc_thp_mode_not_supported;
> }
> +
> +static size_t
> +malloc_default_hugepage_size (void)
> +{
> + int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY);
> + if (fd == -1)
> + return 0;
> +
> + char buf[512];
> + off64_t off = 0;
> + while (1)
> + {
> + ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off);
> + if (r < 0)
> + break;
> + buf[r - 1] = '\0';
> +
> + const char *s = strstr (buf, "Hugepagesize:");
> + if (s == NULL)
> + {
> + char *nl = strrchr (buf, '\n');
> + if (nl == NULL)
> + break;
> + off += (nl + 1) - buf;
> + continue;
> + }
> +
> + /* The default huge page size is in the form:
> + Hugepagesize: NUMBER kB */
> + size_t hpsize = 0;
> + s += sizeof ("Hugepagesize: ") - 1;
> + for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++)
> + {
> + if (s[i] == ' ')
> + continue;
> + hpsize *= 10;
> + hpsize += s[i] - '0';
> + }
> + return hpsize * 1024;
> + }
> +
> + __close_nocancel (fd);
> +
> + return 0;
> +}
> +
> +static inline struct malloc_hugepage_config_t
> +make_malloc_hugepage_config (size_t pagesize)
> +{
> + int flags = MAP_HUGETLB | (__builtin_ctzll (pagesize) << MAP_HUGE_SHIFT);
> + return (struct malloc_hugepage_config_t) { pagesize, flags };
> +}
> +
> +struct malloc_hugepage_config_t
> +__malloc_hugepage_config (size_t requested)
> +{
> + if (requested == 1)
> + {
> + size_t pagesize = malloc_default_hugepage_size ();
> + if (pagesize != 0)
> + return make_malloc_hugepage_config (pagesize);
> + }
> +
> + int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages",
> + O_RDONLY | O_DIRECTORY, 0);
> + if (dirfd == -1)
> + return (struct malloc_hugepage_config_t) { 0, 0 };
> +
> + bool found = false;
> +
> + char buffer[1024];
> + while (true)
> + {
> +#if !IS_IN(libc)
> +# define __getdents64 getdents64
> +#endif
> + ssize_t ret = __getdents64 (dirfd, buffer, sizeof (buffer));
> + if (ret == -1)
> + break;
> + else if (ret == 0)
> + break;
> +
> + char *begin = buffer, *end = buffer + ret;
> + while (begin != end)
> + {
> + unsigned short int d_reclen;
> + memcpy (&d_reclen, begin + offsetof (struct dirent64, d_reclen),
> + sizeof (d_reclen));
> + const char *dname = begin + offsetof (struct dirent64, d_name);
> + begin += d_reclen;
> +
> + if (dname[0] == '.'
> + || strncmp (dname, "hugepages-", sizeof ("hugepages-") - 1) != 0)
> + continue;
> +
> + /* Each entry represents a supported huge page in the form of:
> + hugepages-<size>kB. */
> + size_t hpsize = 0;
> + const char *sizestr = dname + sizeof ("hugepages-") - 1;
> + for (int i = 0; sizestr[i] >= '0' && sizestr[i] <= '9'; i++)
> + {
> + hpsize *= 10;
> + hpsize += sizestr[i] - '0';
> + }
> + hpsize *= 1024;
> +
> + if (hpsize == requested)
> + {
> + found = true;
> + break;
> + }
> + }
> + if (found)
> + break;
> + }
> +
> + __close_nocancel (dirfd);
> +
> + if (found)
> + return make_malloc_hugepage_config (requested);
> +
> + return (struct malloc_hugepage_config_t) { 0, 0 };
> +}
>
On 18/08/2021 22:03, Siddhesh Poyarekar wrote:
> On 8/18/21 7:50 PM, Adhemerval Zanella via Libc-alpha wrote:
>> A new tunable, 'glibc.malloc.mmap_hugetlb', adds support to use Huge Page
>> support directly with mmap() calls. The required supported sizes and
>> flags for mmap() are provided by an arch-specific internal hook
>> malloc_hp_config().
>>
>> Currently it first try mmap() using the huge page size and fallback to
>> default page size and sbrk() call if kernel returns MMAP_FAILED.
>>
>> The default malloc_hp_config() implementation does not enable it even
>> if the tunable is set.
>>
>> Checked on x86_64-linux-gnu.
>> ---
>> NEWS | 4 +
>> elf/dl-tunables.list | 4 +
>> elf/tst-rtld-list-tunables.exp | 1 +
>> malloc/arena.c | 2 +
>> malloc/malloc.c | 35 +++++-
>> manual/tunables.texi | 14 +++
>> sysdeps/generic/malloc-hugepages.c | 6 +
>> sysdeps/generic/malloc-hugepages.h | 12 ++
>> sysdeps/unix/sysv/linux/malloc-hugepages.c | 125 +++++++++++++++++++++
>> 9 files changed, 200 insertions(+), 3 deletions(-)
>>
>> diff --git a/NEWS b/NEWS
>> index 9b2345d08c..412bf3e6f8 100644
>> --- a/NEWS
>> +++ b/NEWS
>> @@ -14,6 +14,10 @@ Major new features:
>> It might improve performance with Transparent Huge Pages madvise mode
>> depending of the workload.
>> +* On Linux, a new tunable, glibc.malloc.mmap_hugetlb, can be used to
>> + instruct malloc to try use Huge Pages when allocate memory with mmap()
>> + calls (through the use of MAP_HUGETLB).
>> +
>> Deprecated and removed features, and other changes affecting compatibility:
>> [Add deprecations, removals and changes affecting compatibility here]
>> diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
>> index 67df6dbc2c..209c2d8592 100644
>> --- a/elf/dl-tunables.list
>> +++ b/elf/dl-tunables.list
>> @@ -97,6 +97,10 @@ glibc {
>> minval: 0
>> maxval: 1
>> }
>> + mmap_hugetlb {
>> + type: SIZE_T
>> + minval: 0
>> + }
>> }
>> cpu {
>> hwcap_mask {
>> diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
>> index d8109fa31c..49f033ce91 100644
>> --- a/elf/tst-rtld-list-tunables.exp
>> +++ b/elf/tst-rtld-list-tunables.exp
>> @@ -1,6 +1,7 @@
>> glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+)
>> glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+)
>> glibc.malloc.check: 0 (min: 0, max: 3)
>> +glibc.malloc.mmap_hugetlb: 0x0 (min: 0x0, max: 0x[f]+)
>> glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
>> glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+)
>> glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+)
>> diff --git a/malloc/arena.c b/malloc/arena.c
>> index 81bff54303..4efb5581c1 100644
>> --- a/malloc/arena.c
>> +++ b/malloc/arena.c
>> @@ -232,6 +232,7 @@ TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
>> #endif
>> TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t)
>> TUNABLE_CALLBACK_FNDECL (set_thp_madvise, int32_t)
>> +TUNABLE_CALLBACK_FNDECL (set_mmap_hugetlb, size_t)
>> #else
>> /* Initialization routine. */
>> #include <string.h>
>> @@ -333,6 +334,7 @@ ptmalloc_init (void)
>> # endif
>> TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast));
>> TUNABLE_GET (thp_madvise, int32_t, TUNABLE_CALLBACK (set_thp_madvise));
>> + TUNABLE_GET (mmap_hugetlb, size_t, TUNABLE_CALLBACK (set_mmap_hugetlb));
>> #else
>> if (__glibc_likely (_environ != NULL))
>> {
>> diff --git a/malloc/malloc.c b/malloc/malloc.c
>> index 4bfcea286f..8cf2d6855e 100644
>> --- a/malloc/malloc.c
>> +++ b/malloc/malloc.c
>> @@ -1884,6 +1884,10 @@ struct malloc_par
>> #if HAVE_TUNABLES
>> /* Transparent Large Page support. */
>> INTERNAL_SIZE_T thp_pagesize;
>> + /* A value different than 0 means to align mmap allocation to hp_pagesize
>> + add hp_flags on flags. */
>> + INTERNAL_SIZE_T hp_pagesize;
>> + int hp_flags;
>> #endif
>> /* Memory map support */
>> @@ -2415,7 +2419,8 @@ do_check_malloc_state (mstate av)
>> */
>> static void *
>> -sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
>> +sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av,
>> + bool set_thp)
>> {
>> long int size;
>> @@ -2442,7 +2447,8 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
>> if (mm == MAP_FAILED)
>> return mm;
>> - sysmadvise_thp (mm, size);
>> + if (set_thp)
>> + sysmadvise_thp (mm, size);
>
> If MAP_HUGEPAGE is set in extra_flags then we don't need madvise; there's no need for set_thp.
Alright, we can use it instead. I just added the flag to avoid the extra
#ifdef MAP_HUGETLB.
>
>> /*
>> The offset to the start of the mmapped region is stored in the prev_size
>> @@ -2531,7 +2537,18 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
>> && (mp_.n_mmaps < mp_.n_mmaps_max)))
>> {
>> try_mmap:
>> - char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
>> + char *mm;
>> +#if HAVE_TUNABLES
>> + if (mp_.hp_pagesize > 0)
>> + {
>> + /* There is no need to isse the THP madvise call if Huge Pages are
>> + used directly. */
>> + mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av, false);
>> + if (mm != MAP_FAILED)
>> + return mm;
>> + }
>> +#endif
>> + mm = sysmalloc_mmap (nb, pagesize, 0, av, true);
>
> A single tunable ought to allow you to do all this in just sysmalloc_mmap.
>
>> if (mm != MAP_FAILED)
>> return mm;
>> tried_mmap = true;
>> @@ -5405,6 +5422,18 @@ do_set_thp_madvise (int32_t value)
>> }
>> return 0;
>> }
>> +
>> +static __always_inline int
>> +do_set_mmap_hugetlb (size_t value)
>> +{
>> + if (value > 0)
>> + {
>> + struct malloc_hugepage_config_t cfg = __malloc_hugepage_config (value);
>> + mp_.hp_pagesize = cfg.pagesize;
>> + mp_.hp_flags = cfg.flags;
>
> Instead of making a struct to pass it, you could just pass &mp.hp_pagesize and &mp.hp_flags. Also, with a single tunable, you do this only when value > 1. For value == 0, you set the default THP pagesize and set flags to 0.
>
>> + }
>> + return 0;
>> +}
>> #endif
>> int
I don't have a strong opinion here; using pointers should work as well.
>> diff --git a/manual/tunables.texi b/manual/tunables.texi
>> index 93c46807f9..4da6a02778 100644
>> --- a/manual/tunables.texi
>> +++ b/manual/tunables.texi
>> @@ -279,6 +279,20 @@ The default value of this tunable is @code{0}, which disable its usage.
>> Setting to a positive value enable the @code{madvise} call.
>> @end deftp
>> +@deftp Tunable glibc.malloc.mmap_hugetlb
>> +This tunable enable the use of Huge Pages when the system supports it (currently
>> +only Linux). It is done by aligning the memory size and passing the required
>> +flags (@code{MAP_HUGETLB} on Linux) when issuing the @code{mmap} to allocate
>> +memory from the system.
>> +
>> +The default value of this tunable is @code{0}, which disable its usage.
>> +The special value @code{1} will try to gather the system default huge page size,
>> +while a value larger than @code{1} will try to match it with the supported system
>> +huge page size. If either no default huge page size could be obtained or if the
>> +requested size does not match the supported ones, the huge pages supports will be
>> +disabled.
>> +@end deftp
>> +
>> @node Dynamic Linking Tunables
>> @section Dynamic Linking Tunables
>> @cindex dynamic linking tunables
>> diff --git a/sysdeps/generic/malloc-hugepages.c b/sysdeps/generic/malloc-hugepages.c
>> index 262bcdbeb8..e5f5c1ec98 100644
>> --- a/sysdeps/generic/malloc-hugepages.c
>> +++ b/sysdeps/generic/malloc-hugepages.c
>> @@ -29,3 +29,9 @@ __malloc_thp_mode (void)
>> {
>> return malloc_thp_mode_not_supported;
>> }
>> +
>> +/* Return the default transparent huge page size. */
>> +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
>> +{
>> + return (struct malloc_hugepage_config_t) { 0, 0 };
>> +}
>> diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h
>> index 664cda9b67..27f7adfea5 100644
>> --- a/sysdeps/generic/malloc-hugepages.h
>> +++ b/sysdeps/generic/malloc-hugepages.h
>> @@ -34,4 +34,16 @@ enum malloc_thp_mode_t
>> enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden;
>> +struct malloc_hugepage_config_t
>> +{
>> + size_t pagesize;
>> + int flags;
>> +};
>> +
>> +/* Returned the support huge page size from the requested PAGESIZE along
>> + with the requires extra mmap flags. Returning a 0 value for pagesize
>> + disables its usage. */
>> +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
>> + attribute_hidden;
>> +
>> #endif /* _MALLOC_HUGEPAGES_H */
>> diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c
>> index 66589127cd..0eb0c764ad 100644
>> --- a/sysdeps/unix/sysv/linux/malloc-hugepages.c
>> +++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c
>> @@ -17,8 +17,10 @@
>> not, see <https://www.gnu.org/licenses/>. */
>> #include <intprops.h>
>> +#include <dirent.h>
>> #include <malloc-hugepages.h>
>> #include <not-cancel.h>
>> +#include <sys/mman.h>
>> size_t
>> __malloc_default_thp_pagesize (void)
>> @@ -74,3 +76,126 @@ __malloc_thp_mode (void)
>> }
>> return malloc_thp_mode_not_supported;
>> }
>> +
>> +static size_t
>> +malloc_default_hugepage_size (void)
>> +{
>> + int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY);
>> + if (fd == -1)
>> + return 0;
>> +
>> + char buf[512];
>> + off64_t off = 0;
>> + while (1)
>> + {
>> + ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off);
>> + if (r < 0)
>> + break;
>> + buf[r - 1] = '\0';
>> +
>> + const char *s = strstr (buf, "Hugepagesize:");
>> + if (s == NULL)
>> + {
>> + char *nl = strrchr (buf, '\n');
>> + if (nl == NULL)
>> + break;
>> + off += (nl + 1) - buf;
>> + continue;
>> + }
>> +
>> + /* The default huge page size is in the form:
>> + Hugepagesize: NUMBER kB */
>> + size_t hpsize = 0;
>> + s += sizeof ("Hugepagesize: ") - 1;
>> + for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++)
>> + {
>> + if (s[i] == ' ')
>> + continue;
>> + hpsize *= 10;
>> + hpsize += s[i] - '0';
>> + }
>> + return hpsize * 1024;
>> + }
>> +
>> + __close_nocancel (fd);
>> +
>> + return 0;
>> +}
>> +
>> +static inline struct malloc_hugepage_config_t
>> +make_malloc_hugepage_config (size_t pagesize)
>> +{
>> + int flags = MAP_HUGETLB | (__builtin_ctzll (pagesize) << MAP_HUGE_SHIFT);
>> + return (struct malloc_hugepage_config_t) { pagesize, flags };
>> +}
>> +
>> +struct malloc_hugepage_config_t
>> +__malloc_hugepage_config (size_t requested)
>> +{
>> + if (requested == 1)
>> + {
>> + size_t pagesize = malloc_default_hugepage_size ();
>> + if (pagesize != 0)
>> + return make_malloc_hugepage_config (pagesize);
>> + }
>> +
>> + int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages",
>> + O_RDONLY | O_DIRECTORY, 0);
>> + if (dirfd == -1)
>> + return (struct malloc_hugepage_config_t) { 0, 0 };
>> +
>> + bool found = false;
>> +
>> + char buffer[1024];
>> + while (true)
>> + {
>> +#if !IS_IN(libc)
>> +# define __getdents64 getdents64
>> +#endif
>> + ssize_t ret = __getdents64 (dirfd, buffer, sizeof (buffer));
>> + if (ret == -1)
>> + break;
>> + else if (ret == 0)
>> + break;
>> +
>> + char *begin = buffer, *end = buffer + ret;
>> + while (begin != end)
>> + {
>> + unsigned short int d_reclen;
>> + memcpy (&d_reclen, begin + offsetof (struct dirent64, d_reclen),
>> + sizeof (d_reclen));
>> + const char *dname = begin + offsetof (struct dirent64, d_name);
>> + begin += d_reclen;
>> +
>> + if (dname[0] == '.'
>> + || strncmp (dname, "hugepages-", sizeof ("hugepages-") - 1) != 0)
>> + continue;
>> +
>> + /* Each entry represents a supported huge page in the form of:
>> + hugepages-<size>kB. */
>> + size_t hpsize = 0;
>> + const char *sizestr = dname + sizeof ("hugepages-") - 1;
>> + for (int i = 0; sizestr[i] >= '0' && sizestr[i] <= '9'; i++)
>> + {
>> + hpsize *= 10;
>> + hpsize += sizestr[i] - '0';
>> + }
>> + hpsize *= 1024;
>> +
>> + if (hpsize == requested)
>> + {
>> + found = true;
>> + break;
>> + }
>> + }
>> + if (found)
>> + break;
>> + }
>> +
>> + __close_nocancel (dirfd);
>> +
>> + if (found)
>> + return make_malloc_hugepage_config (requested);
>> +
>> + return (struct malloc_hugepage_config_t) { 0, 0 };
>> +}
>>
>
Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org> writes:
> A new tunable, 'glibc.malloc.mmap_hugetlb', adds support to use Huge Page
> support directly with mmap() calls. The required supported sizes and
> flags for mmap() are provided by an arch-specific internal hook
> malloc_hp_config().
>
> Currently it first try mmap() using the huge page size and fallback to
> default page size and sbrk() call if kernel returns MMAP_FAILED.
>
> The default malloc_hp_config() implementation does not enable it even
> if the tunable is set.
>
> Checked on x86_64-linux-gnu.
> ---
> NEWS | 4 +
> elf/dl-tunables.list | 4 +
> elf/tst-rtld-list-tunables.exp | 1 +
> malloc/arena.c | 2 +
> malloc/malloc.c | 35 +++++-
> manual/tunables.texi | 14 +++
> sysdeps/generic/malloc-hugepages.c | 6 +
> sysdeps/generic/malloc-hugepages.h | 12 ++
> sysdeps/unix/sysv/linux/malloc-hugepages.c | 125 +++++++++++++++++++++
> 9 files changed, 200 insertions(+), 3 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index 9b2345d08c..412bf3e6f8 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -14,6 +14,10 @@ Major new features:
> It might improve performance with Transparent Huge Pages madvise mode
> depending of the workload.
>
> +* On Linux, a new tunable, glibc.malloc.mmap_hugetlb, can be used to
> + instruct malloc to try use Huge Pages when allocate memory with mmap()
> + calls (through the use of MAP_HUGETLB).
> +
> Deprecated and removed features, and other changes affecting compatibility:
>
> [Add deprecations, removals and changes affecting compatibility here]
> diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
> index 67df6dbc2c..209c2d8592 100644
> --- a/elf/dl-tunables.list
> +++ b/elf/dl-tunables.list
> @@ -97,6 +97,10 @@ glibc {
> minval: 0
> maxval: 1
> }
> + mmap_hugetlb {
> + type: SIZE_T
> + minval: 0
> + }
> }
> cpu {
> hwcap_mask {
> diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
> index d8109fa31c..49f033ce91 100644
> --- a/elf/tst-rtld-list-tunables.exp
> +++ b/elf/tst-rtld-list-tunables.exp
> @@ -1,6 +1,7 @@
> glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+)
> glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+)
> glibc.malloc.check: 0 (min: 0, max: 3)
> +glibc.malloc.mmap_hugetlb: 0x0 (min: 0x0, max: 0x[f]+)
> glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
> glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+)
> glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+)
> diff --git a/malloc/arena.c b/malloc/arena.c
> index 81bff54303..4efb5581c1 100644
> --- a/malloc/arena.c
> +++ b/malloc/arena.c
> @@ -232,6 +232,7 @@ TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
> #endif
> TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t)
> TUNABLE_CALLBACK_FNDECL (set_thp_madvise, int32_t)
> +TUNABLE_CALLBACK_FNDECL (set_mmap_hugetlb, size_t)
> #else
> /* Initialization routine. */
> #include <string.h>
> @@ -333,6 +334,7 @@ ptmalloc_init (void)
> # endif
> TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast));
> TUNABLE_GET (thp_madvise, int32_t, TUNABLE_CALLBACK (set_thp_madvise));
> + TUNABLE_GET (mmap_hugetlb, size_t, TUNABLE_CALLBACK (set_mmap_hugetlb));
> #else
> if (__glibc_likely (_environ != NULL))
> {
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 4bfcea286f..8cf2d6855e 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -1884,6 +1884,10 @@ struct malloc_par
> #if HAVE_TUNABLES
> /* Transparent Large Page support. */
> INTERNAL_SIZE_T thp_pagesize;
> + /* A value different than 0 means to align mmap allocation to hp_pagesize
> + add hp_flags on flags. */
> + INTERNAL_SIZE_T hp_pagesize;
> + int hp_flags;
> #endif
>
> /* Memory map support */
> @@ -2415,7 +2419,8 @@ do_check_malloc_state (mstate av)
> */
>
> static void *
> -sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
> +sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av,
> + bool set_thp)
> {
> long int size;
>
> @@ -2442,7 +2447,8 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
> if (mm == MAP_FAILED)
> return mm;
>
> - sysmadvise_thp (mm, size);
> + if (set_thp)
> + sysmadvise_thp (mm, size);
>
> /*
> The offset to the start of the mmapped region is stored in the prev_size
> @@ -2531,7 +2537,18 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
> && (mp_.n_mmaps < mp_.n_mmaps_max)))
> {
> try_mmap:
> - char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
> + char *mm;
> +#if HAVE_TUNABLES
> + if (mp_.hp_pagesize > 0)
> + {
> + /* There is no need to isse the THP madvise call if Huge Pages are
> + used directly. */
> + mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av, false);
> + if (mm != MAP_FAILED)
> + return mm;
> + }
> +#endif
> + mm = sysmalloc_mmap (nb, pagesize, 0, av, true);
> if (mm != MAP_FAILED)
> return mm;
> tried_mmap = true;
> @@ -5405,6 +5422,18 @@ do_set_thp_madvise (int32_t value)
> }
> return 0;
> }
> +
> +static __always_inline int
> +do_set_mmap_hugetlb (size_t value)
> +{
> + if (value > 0)
> + {
> + struct malloc_hugepage_config_t cfg = __malloc_hugepage_config (value);
> + mp_.hp_pagesize = cfg.pagesize;
> + mp_.hp_flags = cfg.flags;
> + }
> + return 0;
> +}
> #endif
>
> int
> diff --git a/manual/tunables.texi b/manual/tunables.texi
> index 93c46807f9..4da6a02778 100644
> --- a/manual/tunables.texi
> +++ b/manual/tunables.texi
> @@ -279,6 +279,20 @@ The default value of this tunable is @code{0}, which disable its usage.
> Setting to a positive value enable the @code{madvise} call.
> @end deftp
>
> +@deftp Tunable glibc.malloc.mmap_hugetlb
> +This tunable enable the use of Huge Pages when the system supports it (currently
> +only Linux). It is done by aligning the memory size and passing the required
> +flags (@code{MAP_HUGETLB} on Linux) when issuing the @code{mmap} to allocate
> +memory from the system.
> +
> +The default value of this tunable is @code{0}, which disable its usage.
> +The special value @code{1} will try to gather the system default huge page size,
> +while a value larger than @code{1} will try to match it with the supported system
> +huge page size. If either no default huge page size could be obtained or if the
> +requested size does not match the supported ones, the huge pages supports will be
> +disabled.
> +@end deftp
> +
> @node Dynamic Linking Tunables
> @section Dynamic Linking Tunables
> @cindex dynamic linking tunables
> diff --git a/sysdeps/generic/malloc-hugepages.c b/sysdeps/generic/malloc-hugepages.c
> index 262bcdbeb8..e5f5c1ec98 100644
> --- a/sysdeps/generic/malloc-hugepages.c
> +++ b/sysdeps/generic/malloc-hugepages.c
> @@ -29,3 +29,9 @@ __malloc_thp_mode (void)
> {
> return malloc_thp_mode_not_supported;
> }
> +
> +/* Return the default transparent huge page size. */
> +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
> +{
> + return (struct malloc_hugepage_config_t) { 0, 0 };
> +}
> diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h
> index 664cda9b67..27f7adfea5 100644
> --- a/sysdeps/generic/malloc-hugepages.h
> +++ b/sysdeps/generic/malloc-hugepages.h
> @@ -34,4 +34,16 @@ enum malloc_thp_mode_t
>
> enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden;
>
> +struct malloc_hugepage_config_t
> +{
> + size_t pagesize;
> + int flags;
> +};
> +
> +/* Returned the support huge page size from the requested PAGESIZE along
> + with the requires extra mmap flags. Returning a 0 value for pagesize
> + disables its usage. */
> +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
> + attribute_hidden;
> +
> #endif /* _MALLOC_HUGEPAGES_H */
> diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c
> index 66589127cd..0eb0c764ad 100644
> --- a/sysdeps/unix/sysv/linux/malloc-hugepages.c
> +++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c
> @@ -17,8 +17,10 @@
> not, see <https://www.gnu.org/licenses/>. */
>
> #include <intprops.h>
> +#include <dirent.h>
> #include <malloc-hugepages.h>
> #include <not-cancel.h>
> +#include <sys/mman.h>
>
> size_t
> __malloc_default_thp_pagesize (void)
> @@ -74,3 +76,126 @@ __malloc_thp_mode (void)
> }
> return malloc_thp_mode_not_supported;
> }
> +
> +static size_t
> +malloc_default_hugepage_size (void)
> +{
> + int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY);
> + if (fd == -1)
> + return 0;
> +
> + char buf[512];
> + off64_t off = 0;
> + while (1)
> + {
> + ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off);
> + if (r < 0)
> + break;
> + buf[r - 1] = '\0';
> +
> + const char *s = strstr (buf, "Hugepagesize:");
> + if (s == NULL)
> + {
> + char *nl = strrchr (buf, '\n');
> + if (nl == NULL)
> + break;
> + off += (nl + 1) - buf;
> + continue;
> + }
> +
> + /* The default huge page size is in the form:
> + Hugepagesize: NUMBER kB */
> + size_t hpsize = 0;
> + s += sizeof ("Hugepagesize: ") - 1;
> + for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++)
> + {
> + if (s[i] == ' ')
> + continue;
> + hpsize *= 10;
> + hpsize += s[i] - '0';
> + }
> + return hpsize * 1024;
> + }
> +
> + __close_nocancel (fd);
> +
> + return 0;
> +}
> +
> +static inline struct malloc_hugepage_config_t
> +make_malloc_hugepage_config (size_t pagesize)
> +{
> + int flags = MAP_HUGETLB | (__builtin_ctzll (pagesize) << MAP_HUGE_SHIFT);
> + return (struct malloc_hugepage_config_t) { pagesize, flags };
> +}
> +
> +struct malloc_hugepage_config_t
> +__malloc_hugepage_config (size_t requested)
> +{
> + if (requested == 1)
> + {
> + size_t pagesize = malloc_default_hugepage_size ();
> + if (pagesize != 0)
> + return make_malloc_hugepage_config (pagesize);
> + }
> +
> + int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages",
> + O_RDONLY | O_DIRECTORY, 0);
> + if (dirfd == -1)
> + return (struct malloc_hugepage_config_t) { 0, 0 };
> +
> + bool found = false;
> +
> + char buffer[1024];
> + while (true)
> + {
> +#if !IS_IN(libc)
> +# define __getdents64 getdents64
> +#endif
> + ssize_t ret = __getdents64 (dirfd, buffer, sizeof (buffer));
> + if (ret == -1)
> + break;
> + else if (ret == 0)
> + break;
> +
> + char *begin = buffer, *end = buffer + ret;
> + while (begin != end)
> + {
> + unsigned short int d_reclen;
> + memcpy (&d_reclen, begin + offsetof (struct dirent64, d_reclen),
> + sizeof (d_reclen));
> + const char *dname = begin + offsetof (struct dirent64, d_name);
> + begin += d_reclen;
> +
> + if (dname[0] == '.'
> + || strncmp (dname, "hugepages-", sizeof ("hugepages-") - 1) != 0)
> + continue;
> +
> + /* Each entry represents a supported huge page in the form of:
> + hugepages-<size>kB. */
> + size_t hpsize = 0;
> + const char *sizestr = dname + sizeof ("hugepages-") - 1;
> + for (int i = 0; sizestr[i] >= '0' && sizestr[i] <= '9'; i++)
> + {
> + hpsize *= 10;
> + hpsize += sizestr[i] - '0';
> + }
> + hpsize *= 1024;
> +
> + if (hpsize == requested)
> + {
> + found = true;
> + break;
> + }
> + }
> + if (found)
> + break;
> + }
> +
> + __close_nocancel (dirfd);
> +
> + if (found)
> + return make_malloc_hugepage_config (requested);
> +
> + return (struct malloc_hugepage_config_t) { 0, 0 };
> +}
Hi Adhemerval,
I tested this patchset on a POWER9, and I'm seeing the following test
failures when running make check with glibc.malloc.mmap_hugetlb=1:
malloc/tst-free-errno
malloc/tst-free-errno-malloc-check
malloc/tst-free-errno-mcheck
posix/tst-exec
posix/tst-exec-static
posix/tst-spawn
posix/tst-spawn-static
posix/tst-spawn5
I'm attaching a summary of the contents of the .out files for each test.
$ failing="malloc/tst-free-errno malloc/tst-free-errno-malloc-check malloc/tst-free-errno-mcheck posix/tst-exec posix/tst-exec-static posix/tst-spawn posix/tst-spawn-static posix/tst-spawn5"
$
$ for t in $failing; do echo "~> $t"; { make test t=$t; GLIBC_TUNABLES="glibc.malloc.mmap_hugetlb=1" make test t=$t; } | grep -Ei "^fail|pass"; cat $t.out; echo; done
~> malloc/tst-free-errno
double free or corruption (out)
PASS: malloc/tst-free-errno
FAIL: malloc/tst-free-errno
Didn't expect signal from child: got `Aborted'
~> malloc/tst-free-errno-malloc-check
PASS: malloc/tst-free-errno-malloc-check
FAIL: malloc/tst-free-errno-malloc-check
error: xmmap.c:28: mmap of 16908288 bytes, prot=0x3, flags=0x32: Device or resource busy
error: 1 test failures
~> malloc/tst-free-errno-mcheck
memory clobbered past end of allocated block
PASS: malloc/tst-free-errno-mcheck
FAIL: malloc/tst-free-errno-mcheck
Didn't expect signal from child: got `Aborted'
~> posix/tst-exec
/home/mscastanho/build/glibc/posix/tst-exec: file 1 (4) is not closed
PASS: posix/tst-exec
FAIL: posix/tst-exec
~> posix/tst-exec-static
/home/mscastanho/build/glibc/posix/tst-exec-static: file 1 (4) is not closed
PASS: posix/tst-exec-static
FAIL: posix/tst-exec-static
~> posix/tst-spawn
PASS: posix/tst-spawn
FAIL: posix/tst-spawn
tst-spawn.c:127: numeric comparison failure
left: 0 (0x0); from: lseek (fd1, 0, SEEK_CUR)
right: -1 (0xffffffffffffffff); from: (off_t) -1
error: 1 test failures
tst-spawn.c:244: numeric comparison failure
left: 1 (0x1); from: WEXITSTATUS (status)
right: 0 (0x0); from: 0
tst-spawn.c:127: numeric comparison failure
left: 0 (0x0); from: lseek (fd1, 0, SEEK_CUR)
right: -1 (0xffffffffffffffff); from: (off_t) -1
error: 1 test failures
tst-spawn.c:258: numeric comparison failure
left: 1 (0x1); from: WEXITSTATUS (status)
right: 0 (0x0); from: 0
error: 2 test failures
~> posix/tst-spawn-static
PASS: posix/tst-spawn-static
FAIL: posix/tst-spawn-static
tst-spawn.c:127: numeric comparison failure
left: 0 (0x0); from: lseek (fd1, 0, SEEK_CUR)
right: -1 (0xffffffffffffffff); from: (off_t) -1
error: 1 test failures
tst-spawn.c:244: numeric comparison failure
left: 1 (0x1); from: WEXITSTATUS (status)
right: 0 (0x0); from: 0
tst-spawn.c:127: numeric comparison failure
left: 0 (0x0); from: lseek (fd1, 0, SEEK_CUR)
right: -1 (0xffffffffffffffff); from: (off_t) -1
error: 1 test failures
tst-spawn.c:258: numeric comparison failure
left: 1 (0x1); from: WEXITSTATUS (status)
right: 0 (0x0); from: 0
error: 2 test failures
~> posix/tst-spawn5
PASS: posix/tst-spawn5
FAIL: posix/tst-spawn5
error: tst-spawn5.c:128: unexpected open file descriptor 54: /proc/meminfo
tst-spawn5.c:182: numeric comparison failure
left: 1 (0x1); from: WEXITSTATUS (status)
right: 0 (0x0); from: 0
error: tst-spawn5.c:128: unexpected open file descriptor 54: /proc/meminfo
tst-spawn5.c:182: numeric comparison failure
left: 1 (0x1); from: WEXITSTATUS (status)
right: 0 (0x0); from: 0
error: tst-spawn5.c:128: unexpected open file descriptor 5: /proc/meminfo
tst-spawn5.c:182: numeric comparison failure
left: 1 (0x1); from: WEXITSTATUS (status)
right: 0 (0x0); from: 0
error: tst-spawn5.c:128: unexpected open file descriptor 4: /proc/meminfo
tst-spawn5.c:182: numeric comparison failure
left: 1 (0x1); from: WEXITSTATUS (status)
right: 0 (0x0); from: 0
error: tst-spawn5.c:128: unexpected open file descriptor 6: /proc/meminfo
tst-spawn5.c:182: numeric comparison failure
left: 1 (0x1); from: WEXITSTATUS (status)
right: 0 (0x0); from: 0
error: 5 test failures
--
Matheus Castanho
On 19/08/2021 14:58, Matheus Castanho wrote:
> Hi Adhemerval,
>
> I tested this patchset on a POWER9, and I'm seeing the following test
> failures when running make check with glibc.malloc.mmap_hugetlb=1:
Thanks for checking on this.
>
> malloc/tst-free-errno
> malloc/tst-free-errno-malloc-check
> malloc/tst-free-errno-mcheck
These ones I couldn't really reproduce on the gcc farm POWER machines,
a power9 with a 2M default huge page size and a power8 with a 16M default.
Neither had any pages allocated in the pool. I don't have admin access,
so I can't change the pool size to check what is happening.
I also tested on my x86_64 environment without any pages in the pool,
with 4 pages in the pool and with 10 pages.
If you could provide the stack trace from where we get the
"Didn't expect signal from child: got `Aborted'" it would be useful.
It could also be something related to the /proc/sys/vm/max_map_count
value, since mmap seems to be failing for some reason.
> posix/tst-exec
> posix/tst-exec-static
> posix/tst-spawn
> posix/tst-spawn-static
> posix/tst-spawn5
These are due to an oversight in 'malloc_default_hugepage_size()', which
does not close the file descriptor on success. I have fixed it.
Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:
> On 19/08/2021 14:58, Matheus Castanho wrote:
>> Hi Adhemerval,
>>
>> I tested this patchset on a POWER9, and I'm seeing the following test
>> failures when running make check with glibc.malloc.mmap_hugetlb=1:
>
> Thanks for checking on this.
>
>>
>> malloc/tst-free-errno
>> malloc/tst-free-errno-malloc-check
>> malloc/tst-free-errno-mcheck
>
> These one I couldn't really reproduce it on gcc farm power machines,
> a power9 with 2M huge page default and power8 with 16M default. Both
> didn't have any page allocated in the poll. I don't have admin access
> so I can change the pool size to check what is happening.
>
> I also tested on my x86_64 environment without any pages in the poll,
> with 4 pages in the pool and with 10 pages.
>
I confirm that without pages in the pool the tests pass correctly. Only
when I add them to the pool things start failing. In this case I'm
reserving 500 16 MB pages:
$ grep -i hugepages /proc/meminfo
AnonHugePages: 0 kB
ShmemHugePages: 0 kB
FileHugePages: 0 kB
HugePages_Total: 500
HugePages_Free: 500
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 16384 kB
> If you could the stacktrace from where we get the
> "Didn't expect signal from child: got `Aborted'" it would be useful.
>
This is what GDB is showing me when the abort happens:
#0 0x00007ffff7dccf00 in __pthread_kill_internal (threadid=<optimized out>, signo=<optimized out>) at pthread_kill.c:44
#1 0x00007ffff7d6e26c in __GI_raise (sig=<optimized out>) at ../sysdeps/posix/raise.c:26
#2 0x00007ffff7d50490 in __GI_abort () at abort.c:79
#3 0x00007ffff7dba770 in __libc_message (action=<optimized out>, fmt=<optimized out>) at ../sysdeps/posix/libc_fatal.c:155
#4 0x00007ffff7ddc4e8 in malloc_printerr (str=<optimized out>, str@entry=0x7ffff7efdc90 "double free or corruption (out)") at malloc.c:5654
#5 0x00007ffff7ddefe8 in _int_free (av=0x7ffff7f60e30 <main_arena>, p=0x7ffff80203d0, have_lock=<optimized out>, have_lock@entry=0) at malloc.c:4555
#6 0x00007ffff7de2160 in __GI___libc_free (mem=<optimized out>) at malloc.c:3358
#7 0x0000000010001ee4 in do_test () at tst-free-errno.c:123
#8 0x0000000010002730 in run_test_function (argc=argc@entry=1, argv=argv@entry=0x7fffffffede0, config=config@entry=0x7fffffffe950) at support_test_main.c:232
#9 0x00000000100032fc in support_test_main (argc=1, argv=0x7fffffffede0, config=0x7fffffffe950) at support_test_main.c:431
#10 0x00000000100019d0 in main (argc=<optimized out>, argv=<optimized out>) at ../support/test-driver.c:168
#11 0x00007ffff7d50818 in __libc_start_call_main (main=main@entry=0x10001980 <main>, argc=argc@entry=1, argv=argv@entry=0x7fffffffede0, auxvec=auxvec@entry=0x7fffffffef68) at ../sysdeps/nptl/libc_start_call_main.h:58
#12 0x00007ffff7d50a00 in generic_start_main (fini=<optimized out>, stack_end=<optimized out>, rtld_fini=<optimized out>, init=<optimized out>, auxvec=<optimized out>, argv=<optimized out>, argc=<optimized out>, main=<optimized out>) at ../csu/libc-start.c:409
#13 __libc_start_main_impl (argc=1, argv=0x7fffffffede0, ev=<optimized out>, auxvec=0x7fffffffef68, rtld_fini=<optimized out>, stinfo=<optimized out>, stack_on_entry=<optimized out>) at ../sysdeps/unix/sysv/linux/powerpc/libc-start.c:98
#14 0x0000000000000000 in ?? ()
> It could be also something related to /proc/sys/vm/max_map_count
> value, since it mmap seems to be failing for some reason.
>
This is what the machine I'm using now has:
$ cat /proc/sys/vm/max_map_count
65530
>> posix/tst-exec
>> posix/tst-exec-static
>> posix/tst-spawn
>> posix/tst-spawn-static
>> posix/tst-spawn5
>
> These are an overlook at 'malloc_default_hugepage_size()' where it
> does not close the file descriptor on success. I have fixed it.
Ok, thanks!
--
Matheus Castanho
@@ -14,6 +14,10 @@ Major new features:
It might improve performance with Transparent Huge Pages madvise mode
depending of the workload.
+* On Linux, a new tunable, glibc.malloc.mmap_hugetlb, can be used to
+ instruct malloc to try to use Huge Pages when allocating memory with
+ mmap() calls (through the use of MAP_HUGETLB).
+
Deprecated and removed features, and other changes affecting compatibility:
[Add deprecations, removals and changes affecting compatibility here]
@@ -97,6 +97,10 @@ glibc {
minval: 0
maxval: 1
}
+ mmap_hugetlb {
+ type: SIZE_T
+ minval: 0
+ }
}
cpu {
hwcap_mask {
@@ -1,6 +1,7 @@
glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+)
glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+)
glibc.malloc.check: 0 (min: 0, max: 3)
+glibc.malloc.mmap_hugetlb: 0x0 (min: 0x0, max: 0x[f]+)
glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+)
glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+)
@@ -232,6 +232,7 @@ TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
#endif
TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t)
TUNABLE_CALLBACK_FNDECL (set_thp_madvise, int32_t)
+TUNABLE_CALLBACK_FNDECL (set_mmap_hugetlb, size_t)
#else
/* Initialization routine. */
#include <string.h>
@@ -333,6 +334,7 @@ ptmalloc_init (void)
# endif
TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast));
TUNABLE_GET (thp_madvise, int32_t, TUNABLE_CALLBACK (set_thp_madvise));
+ TUNABLE_GET (mmap_hugetlb, size_t, TUNABLE_CALLBACK (set_mmap_hugetlb));
#else
if (__glibc_likely (_environ != NULL))
{
@@ -1884,6 +1884,10 @@ struct malloc_par
#if HAVE_TUNABLES
/* Transparent Large Page support. */
INTERNAL_SIZE_T thp_pagesize;
+ /* A value different than 0 means to align mmap allocations to hp_pagesize
+ and add hp_flags to the mmap flags. */
+ INTERNAL_SIZE_T hp_pagesize;
+ int hp_flags;
#endif
/* Memory map support */
@@ -2415,7 +2419,8 @@ do_check_malloc_state (mstate av)
*/
static void *
-sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
+sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av,
+ bool set_thp)
{
long int size;
@@ -2442,7 +2447,8 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
if (mm == MAP_FAILED)
return mm;
- sysmadvise_thp (mm, size);
+ if (set_thp)
+ sysmadvise_thp (mm, size);
/*
The offset to the start of the mmapped region is stored in the prev_size
@@ -2531,7 +2537,18 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
&& (mp_.n_mmaps < mp_.n_mmaps_max)))
{
try_mmap:
- char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
+ char *mm;
+#if HAVE_TUNABLES
+ if (mp_.hp_pagesize > 0)
+ {
+ /* There is no need to issue the THP madvise call if Huge Pages are
+ used directly. */
+ mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av, false);
+ if (mm != MAP_FAILED)
+ return mm;
+ }
+#endif
+ mm = sysmalloc_mmap (nb, pagesize, 0, av, true);
if (mm != MAP_FAILED)
return mm;
tried_mmap = true;
@@ -5405,6 +5422,18 @@ do_set_thp_madvise (int32_t value)
}
return 0;
}
+
+/* Tunable callback for glibc.malloc.mmap_hugetlb.  A VALUE of 0 leaves
+   mp_.hp_pagesize and mp_.hp_flags untouched; any other VALUE is handed to
+   __malloc_hugepage_config and the page size and mmap flags it returns are
+   stored in mp_.  Always returns 0.  */
+static __always_inline int
+do_set_mmap_hugetlb (size_t value)
+{
+  if (value == 0)
+    return 0;
+
+  struct malloc_hugepage_config_t hp = __malloc_hugepage_config (value);
+  mp_.hp_pagesize = hp.pagesize;
+  mp_.hp_flags = hp.flags;
+  return 0;
+}
#endif
int
@@ -279,6 +279,20 @@ The default value of this tunable is @code{0}, which disable its usage.
Setting to a positive value enable the @code{madvise} call.
@end deftp
+@deftp Tunable glibc.malloc.mmap_hugetlb
+This tunable enables the use of Huge Pages when the system supports it (currently
+only Linux). It is done by aligning the memory size and passing the required
+flags (@code{MAP_HUGETLB} on Linux) when issuing the @code{mmap} call to allocate
+memory from the system.
+
+The default value of this tunable is @code{0}, which disables its usage.
+The special value @code{1} will try to use the system default huge page size,
+while a value larger than @code{1} will try to match it against the huge page
+sizes supported by the system. If either no default huge page size could be
+obtained or the requested size does not match any supported one, huge page
+support will be disabled.
+@end deftp
+
@node Dynamic Linking Tunables
@section Dynamic Linking Tunables
@cindex dynamic linking tunables
@@ -29,3 +29,9 @@ __malloc_thp_mode (void)
{
return malloc_thp_mode_not_supported;
}
+
+/* Generic fallback: REQUESTED is ignored and a zero page size with no
+   extra mmap flags is returned, so huge page usage stays disabled.  */
+struct malloc_hugepage_config_t
+__malloc_hugepage_config (size_t requested)
+{
+  const struct malloc_hugepage_config_t disabled = { 0, 0 };
+  return disabled;
+}
@@ -34,4 +34,16 @@ enum malloc_thp_mode_t
enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden;
+/* Huge page settings returned by __malloc_hugepage_config. */
+struct malloc_hugepage_config_t
+{
+ /* Huge page size to use; 0 disables huge page usage. */
+ size_t pagesize;
+ /* Extra mmap flags required for that page size. */
+ int flags;
+};
+
+/* Return the supported huge page size matching the REQUESTED size, along
+ with the extra mmap flags it requires. Returning a 0 value for pagesize
+ disables its usage. */
+struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested)
+ attribute_hidden;
+
#endif /* _MALLOC_HUGEPAGES_H */
@@ -17,8 +17,10 @@
not, see <https://www.gnu.org/licenses/>. */
#include <intprops.h>
+#include <dirent.h>
#include <malloc-hugepages.h>
#include <not-cancel.h>
+#include <sys/mman.h>
size_t
__malloc_default_thp_pagesize (void)
@@ -74,3 +76,126 @@ __malloc_thp_mode (void)
}
return malloc_thp_mode_not_supported;
}
+
+/* Return the system default huge page size in bytes, taken from the
+   "Hugepagesize:" entry of /proc/meminfo.  Return 0 if the file cannot be
+   opened or read, or if the entry is not found.  */
+static size_t
+malloc_default_hugepage_size (void)
+{
+  int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY);
+  if (fd == -1)
+    return 0;
+
+  size_t hpsize = 0;
+
+  char buf[512];
+  off64_t off = 0;
+  while (1)
+    {
+      ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off);
+      /* Stop on error and on end of file.  A zero-length read must not
+         reach the termination below, where it would index before BUF.  */
+      if (r <= 0)
+        break;
+      /* Terminate right after the bytes read, so the last one is kept.
+         The read size leaves room for this byte.  */
+      buf[r] = '\0';
+
+      const char *s = strstr (buf, "Hugepagesize:");
+      if (s == NULL)
+        {
+          /* Resume after the last complete line, so an entry split across
+             two reads is seen whole by the next one.  */
+          char *nl = strrchr (buf, '\n');
+          if (nl == NULL)
+            break;
+          off += (nl + 1) - buf;
+          continue;
+        }
+
+      /* The default huge page size is in the form:
+         Hugepagesize: NUMBER kB  */
+      s += sizeof ("Hugepagesize: ") - 1;
+      for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++)
+        {
+          if (s[i] == ' ')
+            continue;
+          hpsize *= 10;
+          hpsize += s[i] - '0';
+        }
+      /* The value is reported in kB; convert it to bytes.  */
+      hpsize *= 1024;
+      break;
+    }
+
+  /* Single exit path: the descriptor is closed on success as well as on
+     failure, so it is not leaked into the process.  */
+  __close_nocancel (fd);
+
+  return hpsize;
+}
+
+/* Build the configuration for huge pages of PAGESIZE bytes.  The mmap flags
+   are MAP_HUGETLB combined with the number of trailing zero bits of
+   PAGESIZE (its base-2 logarithm when PAGESIZE is a power of two) shifted
+   left by MAP_HUGE_SHIFT.  PAGESIZE must be nonzero, since __builtin_ctzll
+   is undefined for 0.  */
+static inline struct malloc_hugepage_config_t
+make_malloc_hugepage_config (size_t pagesize)
+{
+  const int log2_size = __builtin_ctzll (pagesize);
+  struct malloc_hugepage_config_t cfg;
+  cfg.pagesize = pagesize;
+  cfg.flags = MAP_HUGETLB | (log2_size << MAP_HUGE_SHIFT);
+  return cfg;
+}
+
+/* Select the huge page size to use from the tunable value REQUESTED.
+   The value 1 asks for the system default huge page size.  Any other
+   nonzero value is a size in bytes that must equal one of the sizes listed
+   as hugepages-<size>kB under /sys/kernel/mm/hugepages.  Return that size
+   together with its mmap flags, or { 0, 0 } when no usable size is
+   found.  */
+struct malloc_hugepage_config_t
+__malloc_hugepage_config (size_t requested)
+{
+  const struct malloc_hugepage_config_t disabled = { 0, 0 };
+
+  /* A zero request cannot name a huge page size and must never reach
+     make_malloc_hugepage_config, which is undefined for 0.  */
+  if (requested == 0)
+    return disabled;
+
+  if (requested == 1)
+    {
+      /* Do not fall through to the directory scan: no entry there can
+         describe a huge page of 1 byte.  */
+      size_t pagesize = malloc_default_hugepage_size ();
+      return pagesize != 0 ? make_malloc_hugepage_config (pagesize) : disabled;
+    }
+
+  int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages",
+                                 O_RDONLY | O_DIRECTORY, 0);
+  if (dirfd == -1)
+    return disabled;
+
+#if !IS_IN(libc)
+# define __getdents64 getdents64
+#endif
+
+  bool found = false;
+  char buffer[1024];
+  while (!found)
+    {
+      ssize_t ret = __getdents64 (dirfd, buffer, sizeof (buffer));
+      /* A negative value is an error, 0 is the end of the directory.  */
+      if (ret <= 0)
+        break;
+
+      char *begin = buffer, *end = buffer + ret;
+      while (begin != end)
+        {
+          /* The record length is copied out with memcpy instead of being
+             read through a struct dirent64 pointer into BUFFER.  */
+          unsigned short int d_reclen;
+          memcpy (&d_reclen, begin + offsetof (struct dirent64, d_reclen),
+                  sizeof (d_reclen));
+          const char *dname = begin + offsetof (struct dirent64, d_name);
+          begin += d_reclen;
+
+          /* This also skips "." and "..".  */
+          if (strncmp (dname, "hugepages-", sizeof ("hugepages-") - 1) != 0)
+            continue;
+
+          /* Each entry represents a supported huge page in the form of:
+             hugepages-<size>kB.  */
+          size_t hpsize = 0;
+          const char *sizestr = dname + sizeof ("hugepages-") - 1;
+          for (int i = 0; sizestr[i] >= '0' && sizestr[i] <= '9'; i++)
+            {
+              hpsize *= 10;
+              hpsize += sizestr[i] - '0';
+            }
+          /* The directory name gives the size in kB; compare in bytes.  */
+          hpsize *= 1024;
+
+          if (hpsize == requested)
+            {
+              found = true;
+              break;
+            }
+        }
+    }
+
+  __close_nocancel (dirfd);
+
+  return found ? make_malloc_hugepage_config (requested) : disabled;
+}