nptl: Disable THP on thread stack if it incurs large RSS usage

Message ID 20230420172436.2013698-1-adhemerval.zanella@linaro.org
State Dropped
Series nptl: Disable THP on thread stack if it incurs large RSS usage

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Adhemerval Zanella April 20, 2023, 5:24 p.m. UTC
  If Transparent Huge Page (THP) is set to 'always', the thread stack might
be backed by Huge Pages depending on the address assigned by the kernel and
the resulting guard position.  If the guard page is within the same large
page that might be used by the stack itself, changing the stack permission
will make the allocated range no longer be served with THP.  The kernel
will then revert back to using the default page size.

In this case, besides the additional work, the kernel will potentially need
to keep all the pages, since it cannot distinguish which ones were actually
touched by the process.  This results in larger RSS usage than just madvising
the range to not use huge pages.

The glibc.pthread.stack_hugetlb tunable is still useful for the case where
the caller either sets up no guard page or a guard page that is a multiple of
the default THP size.  In this case, the kernel might potentially back the
stack with THP, but whether THP is beneficial would depend on the thread's
stack usage profile.

Checked on x86_64-linux-gnu.
---
 nptl/allocatestack.c                       | 34 ++++++++++++++++
 sysdeps/generic/malloc-hugepages.h         |  1 +
 sysdeps/unix/sysv/linux/malloc-hugepages.c | 46 ++++++++++++++++++----
 3 files changed, 74 insertions(+), 7 deletions(-)
  

Comments

Wilco Dijkstra May 3, 2023, 12:42 p.m. UTC | #1
Hi Adhemerval,

> +static __always_inline int
> +advise_thp (void *mem, size_t size, char *guard)
> +{
> +  enum malloc_thp_mode_t thpmode = __malloc_thp_mode ();
> +  if (thpmode != malloc_thp_mode_always)
> +    return 0;
> +
> +  unsigned long int thpsize = __malloc_default_thp_pagesize ();
> +  if (PTR_ALIGN_DOWN (mem, thpsize) != PTR_ALIGN_DOWN (guard, thpsize))
> +    return 0;
> +
> +  return __madvise (mem, size, MADV_NOHUGEPAGE);
> +}

This still doesn't make sense since if _STACK_GROWS_DOWN, mem == guard, so
this will always execute the madvise. As I mentioned, I couldn't find evidence that
the claimed scenario of a huge page allocated, written to and then split due to the
mprotect exists.
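
For reference, the guard placement that makes mem == guard is a one-liner
(condensed from nptl/allocatestack.c, written here from memory rather than
quoted verbatim):

static inline char *
guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
		size_t pagesize_m1)
{
#if _STACK_GROWS_DOWN
  /* The guard occupies the lowest pages, i.e. it starts at MEM itself, so
     PTR_ALIGN_DOWN (mem, thpsize) == PTR_ALIGN_DOWN (guard, thpsize)
     trivially holds.  */
  return mem;
#else
  return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
#endif
}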

So the real issue is that the current stack allocation code randomly (based on
alignment from previous mmap calls) uses huge pages even for small stacks.

Cheers,
Wilco
  
Adhemerval Zanella May 15, 2023, 5:57 p.m. UTC | #2
On 03/05/23 09:42, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>> +static __always_inline int
>> +advise_thp (void *mem, size_t size, char *guard)
>> +{
>> +  enum malloc_thp_mode_t thpmode = __malloc_thp_mode ();
>> +  if (thpmode != malloc_thp_mode_always)
>> +    return 0;
>> +
>> +  unsigned long int thpsize = __malloc_default_thp_pagesize ();
>> +  if (PTR_ALIGN_DOWN (mem, thpsize) != PTR_ALIGN_DOWN (guard, thpsize))
>> +    return 0;
>> +
>> +  return __madvise (mem, size, MADV_NOHUGEPAGE);
>> +}
> 
> This still doesn't make sense since if _STACK_GROWS_DOWN, mem == guard, so
> this will always execute the madvise. 

Yes, if THP is set to 'always' this is exactly the idea of this patch, since
afaiu the kernel might still back the stack with large pages if the
requested size is smaller than the default THP size.  It is only an issue if
the guard page address is not aligned to the default THP size, which will
potentially trigger the issues Cupertino has brought up (since we do not know
beforehand which mapping flags were used on the pages that fulfill the
allocation).

> As I mentioned, I couldn't find evidence that
> the claimed scenario of a huge page allocated, written to and then split due to the
> mprotect exists.

I adapted Cupertino's original test to allow specifying both the thread stack
and guard size on the command line.  Just:

$ gcc -std=gnu11 -O2 -g -I. -D_LARGEFILE64_SOURCE=1 -D_GNU_SOURCE   -c -o tststackalloc.o tststackalloc.c
$ echo "always" | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
$ ./tststackalloc -m
[...]
[statm] RSS: 65964 pages (270188544 bytes = 257 MB)
[smaps] RSS: 270327808 bytes = 257 MB
[...]
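
(For scale: the test creates NOOF_THREADS = 128 threads, and 128 fully
THP-backed 2 MiB stacks are 128 * 2 MiB = 256 MiB, roughly matching the
~257 MB RSS above.)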

With either the new tunable or this patch:

$ ld-linux-x86-64.so.2 --library-path . ./tststackalloc  -m
[...]
[statm] RSS: 531 pages (2174976 bytes = 2 MB)
[smaps] RSS: 3002368 bytes = 2 MB
[...]

> 
> So the real issue is that the current stack allocation code randomly (based on
> alignment from previous mmap calls) uses huge pages even for small stacks.

Keep in mind this heuristic is only enabled if THP is set to 'always', meaning
the kernel will try to back *all* the stack with large pages.  The issue is
when the *guard* page is within a large page.
// Compile & run:
//    gcc -Wall -g -o tststackalloc tststackalloc.c -lpthread
//    ./tststackalloc -m    # Attempt to use huge pages for stacks -> RSS bloat
//    ./tststackalloc       # Do not attempt to use huge pages -> No RSS bloat
//    (-s <bytes> and -g <bytes> set the total stack size and guard size)

#include <pthread.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <assert.h>
#include <ctype.h>

// Number of threads to create
#define NOOF_THREADS (128)

// Size of a small page (hard-coded)
#define SMALL_PAGE_SIZE (4*1024)
static size_t small_page_size = 0;

// Size of a huge page (hard-coded)
#define HUGE_PAGE_SIZE (2*1024*1024)
static size_t huge_page_size = 0;

// Total size of the thread stack, including the guard page(s)
#define STACK_SIZE_TOTAL (HUGE_PAGE_SIZE)

// Size of the guard page(s)
#define GUARD_SIZE (SMALL_PAGE_SIZE)

// When enabled (set to non-zero), tries to align thread stacks on
// huge page boundaries, making them eligible for huge pages
static int huge_page_align_stacks;

static volatile int exit_thread = 0;

unsigned long int
default_thp_pagesize (void)
{
  int fd = open (
    "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", O_RDONLY);
  if (fd == -1)
    return 0;   /* Let the caller fall back to a hard-coded default.  */

  char str[64];
  ssize_t s = read (fd, str, sizeof (str));
  close (fd);
  if (s < 0)
    return 0;

  unsigned long int r = 0;
  for (ssize_t i = 0; i < s; i++)
    {
      if (str[i] == '\n')
        break;
      r *= 10;
      r += str[i] - '0';
    }
  return r;
}

static void *
start (void *arg)
{
  while (!exit_thread)
    sleep (1);
  return NULL;
}

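/* Return the next '\n'-terminated line read from FD, refilling BUFFER as
   needed; *CP and *RE track the current parse position and the end of
   valid data.  Overlong lines are truncated.  Returns NULL on EOF or
   read error.  */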
static char *
next_line (int fd, char *const buffer, char **cp, char **re,
           char *const buffer_end)
{
  char *res = *cp;
  char *nl = memchr (*cp, '\n', *re - *cp);
  if (nl == NULL)
    {
      if (*cp != buffer)
        {
          if (*re == buffer_end)
            {
              memmove (buffer, *cp, *re - *cp);
              *re = buffer + (*re - *cp);
              *cp = buffer;

              ssize_t n = read (fd, *re, buffer_end - *re);
              if (n < 0)
                return NULL;

              *re += n;

              nl = memchr (*cp, '\n', *re - *cp);
              while (nl == NULL && *re == buffer_end)
                {
                  /* Truncate too long lines.  */
                  *re = buffer + 3 * (buffer_end - buffer) / 4;
                  n = read (fd, *re, buffer_end - *re);
                  if (n < 0)
                    return NULL;

                  nl = memchr (*re, '\n', n);
                  **re = '\n';
                  *re += n;
                }
            }
          else
            nl = memchr (*cp, '\n', *re - *cp);

          res = *cp;
        }

      if (nl == NULL)
        nl = *re - 1;
    }

  *cp = nl + 1;
  assert (*cp <= *re);

  return res == *re ? NULL : res;
}

static void
read_proc_file (const char *fname, void (*closure)(const char *, size_t *),
                size_t *arg)
{
  int fd = open (fname, O_RDONLY | O_CLOEXEC);
  assert (fd != -1);

  enum { buffer_size = 1024 };
  char buffer[buffer_size];
  char *buffer_end = buffer + buffer_size;
  char *cp = buffer_end;
  char *re = buffer_end;

  char *l;
  while ((l = next_line (fd, buffer, &cp, &re, buffer_end)) != NULL)
    closure (l, arg);

  close (fd);
}

static void
parse_statm_line (const char *line, size_t *ret)
{
  long int rss;
  assert (sscanf (line, "%*d %ld", &rss) == 1);
  *ret = rss;
}

static void
parse_statm (void)
{
  size_t rss = 0;
  read_proc_file ("/proc/self/statm", parse_statm_line, &rss);

  fprintf (stderr, "[statm] RSS: %zd pages (%zd bytes = %zd MB)\n", rss,
	   rss * small_page_size, rss * small_page_size / 1024 / 1024);
}

static void
parse_smaps_line (const char *line, size_t *total)
{
  static const char field[] = "Rss:";
  size_t fieldlen = strlen (field);
  if (strncmp (line, field, fieldlen) != 0)
    return;

  // skip spaces
  for (line += fieldlen; isspace (*line); line++);

  enum { numberlen = 32 };
  char number[numberlen];
  size_t i;
  for (i = 0; isdigit (line[i]) && i < numberlen - 1; i++)
    number[i] = line[i];
  number[i] = '\0';

  // Assume kB.
  errno = 0;
  long int value = strtol (number, NULL, 10);
  assert (value != LONG_MIN && value != LONG_MAX && errno != ERANGE);

  *total += value * 1024;
}

static void
parse_smaps (void)
{
  size_t rss = 0;
  read_proc_file ("/proc/self/smaps", parse_smaps_line, &rss);

  fprintf (stderr, "[smaps] RSS: %zd bytes = %zd MB\n", rss,
	   rss / (1024 * 1024));
}


static inline uintptr_t
align_down (uintptr_t value, uintptr_t alignment)
{
  return value & ~(alignment - 1);
}

// Do a series of small, single page mmap calls to attempt to set
// everything up so that the next mmap call (glibc allocating the
// stack) returns a 2MB aligned range. The kernel "expands" vmas from
// higher to lower addresses (subsequent calls return ranges starting
// at lower addresses), so this function keeps calling mmap until a
// huge page aligned address is returned.  The next range (the stack)
// will then end on that same address.
static void
align_next_on (uintptr_t alignment)
{
  uintptr_t p;
  do
    {
      void *m = mmap (NULL, small_page_size, PROT_NONE,
		      MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, -1, 0);
      assert (m != MAP_FAILED);
      p = (uintptr_t) m;
    }
  while (p != align_down (p, huge_page_size));
}

static size_t
parse_size_t (const char *value)
{
  char *endptr;
  errno = 0;
  size_t n = strtoull (value, &endptr, 10);
  if (errno == ERANGE)
    {
      fprintf (stderr, "error: invalid %s value\n", value);
      exit (EXIT_FAILURE);
    }
  return n;
}

int
main (int argc, char *argv[])
{
  int opt;

  size_t guard_size = GUARD_SIZE;
  size_t stack_size = STACK_SIZE_TOTAL;

  while ((opt = getopt (argc, argv, "g:s:m")) != -1)
    {
      switch (opt)
	{
	case 'g':
	  guard_size = parse_size_t (optarg);
	  break;
	case 's':
	  stack_size = parse_size_t (optarg);
	  break;
	case 'm':
	  huge_page_align_stacks = 1;
	  break;
	default:
	  fprintf (stderr, "Usage: %s [-s stacksize] [-g guardsize] [-m]\n",
		   argv[0]);
	  exit (EXIT_FAILURE);
	}
    }

  huge_page_size = default_thp_pagesize ();
  if (huge_page_size == 0)
    huge_page_size = HUGE_PAGE_SIZE;

  {
    long int sz = sysconf (_SC_PAGESIZE);
    if (sz == -1)
      small_page_size = SMALL_PAGE_SIZE;
    else
      small_page_size = sz;
  }

  pthread_t t[NOOF_THREADS];
  pthread_attr_t attr;
  int i;

  void *dummy = malloc (1024);
  free (dummy);

  fprintf (stderr, "Page size: %zd kB, %zd MB huge pages\n",
	   small_page_size / 1024, huge_page_size / (1024 * 1024));
  fprintf (stderr, "Stack size: %zd kB, guard size: %zd kB\n",
	   stack_size / 1024, guard_size / 1024);
  if (huge_page_align_stacks)
    {
      fprintf (stderr,
	       "Will attempt to align allocations to make stacks eligible for huge pages\n");
    }
  pid_t pid = getpid ();
  fprintf (stderr, "pid: %d (/proc/%d/smaps)\n", pid, pid);

  pthread_attr_init (&attr);
  pthread_attr_setstacksize (&attr, stack_size);
  pthread_attr_setguardsize (&attr, guard_size);

  fprintf (stderr, "Creating %d threads...\n", NOOF_THREADS);
  for (i = 0; i < NOOF_THREADS; i++)
    {
      if (huge_page_align_stacks)
	{
	  // align (next) allocation on huge page boundary
	  align_next_on (huge_page_size);
	}
      pthread_create (&t[i], &attr, start, NULL);
    }
  sleep (1);

  parse_statm ();
  parse_smaps ();

  fprintf (stderr, "Press enter to exit...\n");
  getchar ();

  exit_thread = 1;
  for (i = 0; i < NOOF_THREADS; i++)
    pthread_join (t[i], NULL);
  return 0;
}
  
Cupertino Miranda May 16, 2023, 2:30 p.m. UTC | #3
Hi Adhemerval,

Apologies for such a late reply.
For what it is worth, the patch LGTM.

Regards,
Cupertino

Adhemerval Zanella writes:

> If Transparent Huge Page (THP) is set to 'always', the thread stack might
> be backed by Huge Pages depending on the address assigned by the kernel and
> the resulting guard position.  If the guard page is within the same large
> page that might be used by the stack itself, changing the stack permission
> will make the allocated range no longer be served with THP.  The kernel
> will then revert back to using the default page size.
>
> In this case, besides the additional work, the kernel will potentially need
> to keep all the pages, since it cannot distinguish which ones were actually
> touched by the process.  This results in larger RSS usage than just madvising
> the range to not use huge pages.
>
> The glibc.pthread.stack_hugetlb tunable is still useful for the case where
> the caller either sets up no guard page or a guard page that is a multiple of
> the default THP size.  In this case, the kernel might potentially back the
> stack with THP, but whether THP is beneficial would depend on the thread's
> stack usage profile.
>
> Checked on x86_64-linux-gnu.
> ---
>  nptl/allocatestack.c                       | 34 ++++++++++++++++
>  sysdeps/generic/malloc-hugepages.h         |  1 +
>  sysdeps/unix/sysv/linux/malloc-hugepages.c | 46 ++++++++++++++++++----
>  3 files changed, 74 insertions(+), 7 deletions(-)
>
> diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
> index f9d8cdfd08..1eb34f816c 100644
> --- a/nptl/allocatestack.c
> +++ b/nptl/allocatestack.c
> @@ -33,6 +33,7 @@
>  #include <nptl-stack.h>
>  #include <libc-lock.h>
>  #include <tls-internal.h>
> +#include <malloc-hugepages.h>
>
>  /* Default alignment of stack.  */
>  #ifndef STACK_ALIGN
> @@ -206,6 +207,31 @@ advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
>  #endif
>  }
>
> +/* If Transparent Huge Page (THP) is set to 'always', the thread stack might
> +   be backed by Huge Pages depending on the address assigned by the kernel and
> +   the resulting guard position.  If the guard page is within the same large
> +   page that might be used by the stack itself, changing the stack permission
> +   will make the allocated range no longer be served with THP.  The kernel will
> +   then revert back to using the default page size.
> +
> +   In this case, besides the additional work, the kernel will potentially need
> +   to keep all the pages, since it cannot distinguish which ones were actually
> +   touched by the process.  This results in larger RSS usage than just madvising
> +   the range to not use huge pages.  */
> +static __always_inline int
> +advise_thp (void *mem, size_t size, char *guard)
> +{
> +  enum malloc_thp_mode_t thpmode = __malloc_thp_mode ();
> +  if (thpmode != malloc_thp_mode_always)
> +    return 0;
> +
> +  unsigned long int thpsize = __malloc_default_thp_pagesize ();
> +  if (PTR_ALIGN_DOWN (mem, thpsize) != PTR_ALIGN_DOWN (guard, thpsize))
> +    return 0;
> +
> +  return __madvise (mem, size, MADV_NOHUGEPAGE);
> +}
> +
>  /* Returns a usable stack for a new thread either by allocating a
>     new stack or reusing a cached stack of sufficient size.
>     ATTR must be non-NULL and point to a valid pthread_attr.
> @@ -396,6 +422,14 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>  	    {
>  	      char *guard = guard_position (mem, size, guardsize, pd,
>  					    pagesize_m1);
> +
> +	      if (__glibc_unlikely (__nptl_stack_hugetlb == 1))
> +		{
> +		  int r = advise_thp (mem, size, guard);
> +		  if (r != 0)
> +		    return r;
> +		}
> +
>  	      if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
>  		{
>  		  __munmap (mem, size);
> diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h
> index d68b85630c..21d4844bc4 100644
> --- a/sysdeps/generic/malloc-hugepages.h
> +++ b/sysdeps/generic/malloc-hugepages.h
> @@ -26,6 +26,7 @@ unsigned long int __malloc_default_thp_pagesize (void) attribute_hidden;
>
>  enum malloc_thp_mode_t
>  {
> +  malloc_thp_mode_unknown,
>    malloc_thp_mode_always,
>    malloc_thp_mode_madvise,
>    malloc_thp_mode_never,
> diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c
> index 2f316474c1..e7877f098e 100644
> --- a/sysdeps/unix/sysv/linux/malloc-hugepages.c
> +++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c
> @@ -22,19 +22,33 @@
>  #include <not-cancel.h>
>  #include <sys/mman.h>
>
> +/* __malloc_default_thp_pagesize is called only in single-thread mode, either
> +   in malloc initialization or pthread creation.  */
> +static unsigned long int thp_pagesize = -1;
> +
>  unsigned long int
>  __malloc_default_thp_pagesize (void)
>  {
> +  unsigned long int size = atomic_load_relaxed (&thp_pagesize);
> +  if (size != -1)
> +    return size;
> +
>    int fd = __open64_nocancel (
>      "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", O_RDONLY);
>    if (fd == -1)
> -    return 0;
> +    {
> +      atomic_store_relaxed (&thp_pagesize, 0);
> +      return 0;
> +    }
>
>    char str[INT_BUFSIZE_BOUND (unsigned long int)];
>    ssize_t s = __read_nocancel (fd, str, sizeof (str));
>    __close_nocancel (fd);
>    if (s < 0)
> -    return 0;
> +    {
> +      atomic_store_relaxed (&thp_pagesize, 0);
> +      return 0;
> +    }
>
>    unsigned long int r = 0;
>    for (ssize_t i = 0; i < s; i++)
> @@ -44,16 +58,28 @@ __malloc_default_thp_pagesize (void)
>        r *= 10;
>        r += str[i] - '0';
>      }
> +  atomic_store_relaxed (&thp_pagesize, r);
>    return r;
>  }
>
> +/* The __malloc_thp_mode is called only in single-thread mode, either in
> +   malloc initialization or pthread creation.  */
> +static enum malloc_thp_mode_t thp_mode = malloc_thp_mode_unknown;
> +
>  enum malloc_thp_mode_t
>  __malloc_thp_mode (void)
>  {
> +  enum malloc_thp_mode_t mode = atomic_load_relaxed (&thp_mode);
> +  if (mode != malloc_thp_mode_unknown)
> +    return mode;
> +
>    int fd = __open64_nocancel ("/sys/kernel/mm/transparent_hugepage/enabled",
>  			      O_RDONLY);
>    if (fd == -1)
> -    return malloc_thp_mode_not_supported;
> +    {
> +      atomic_store_relaxed (&thp_mode, malloc_thp_mode_not_supported);
> +      return malloc_thp_mode_not_supported;
> +    }
>
>    static const char mode_always[]  = "[always] madvise never\n";
>    static const char mode_madvise[] = "always [madvise] never\n";
> @@ -69,13 +95,19 @@ __malloc_thp_mode (void)
>    if (s == sizeof (mode_always) - 1)
>      {
>        if (strcmp (str, mode_always) == 0)
> -	return malloc_thp_mode_always;
> +	mode = malloc_thp_mode_always;
>        else if (strcmp (str, mode_madvise) == 0)
> -	return malloc_thp_mode_madvise;
> +	mode = malloc_thp_mode_madvise;
>        else if (strcmp (str, mode_never) == 0)
> -	return malloc_thp_mode_never;
> +	mode = malloc_thp_mode_never;
> +      else
> +	mode = malloc_thp_mode_not_supported;
>      }
> -  return malloc_thp_mode_not_supported;
> +  else
> +    mode = malloc_thp_mode_not_supported;
> +
> +  atomic_store_relaxed (&thp_mode, mode);
> +  return mode;
>  }
>
>  static size_t
  
Wilco Dijkstra May 16, 2023, 3:38 p.m. UTC | #4
Hi Adhemerval,

>> This still doesn't make sense since if _STACK_GROWS_DOWN, mem == guard, so
>> this will always execute the madvise. 

And if !_STACK_GROWS_DOWN, we never execute the madvise. So I don't believe
this is correct, even if it behaves like a nop in some cases.

> Yes, if THP is set to 'always' this is exactly the idea of this patch, since
> afaiu the kernel might still back the stack with large pages if the
> requested size is smaller than the default THP size.

If a mmap start/end range does not align to a huge page, you get small pages
at the ends because a huge page does not fit.
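
A sketch of that arithmetic (2 MiB THP assumed; thp_eligible_range is a
hypothetical helper for illustration, not glibc code):

#include <stdint.h>

#define THP_SIZE (2UL * 1024 * 1024)

/* Compute the subrange of [start, end) that can be backed by huge pages:
   round start up and end down to a THP boundary; the unaligned head and
   tail are necessarily backed by default pages.  */
static void
thp_eligible_range (uintptr_t start, uintptr_t end,
		    uintptr_t *thp_start, uintptr_t *thp_end)
{
  *thp_start = (start + THP_SIZE - 1) & ~(THP_SIZE - 1);
  *thp_end = end & ~(THP_SIZE - 1);
  /* If *thp_start >= *thp_end, no huge page fits in this mapping.  */
}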

> It is only an issue if
> the guard page address is not aligned to the default THP size, which will
> potentially trigger the issues Cupertino has brought up (since we do not know
> beforehand which mapping flags were used on the pages that fulfill the allocation).

I don't see the claimed issue happen. What happens is that if you request
huge pages, you get them. And that is what increases the RSS size.

>> As I mentioned, I couldn't find evidence that
>> the claimed scenario of a huge page allocated, written to and then split due to the
>> mprotect exists.
>
> I adapted Cupertino's original test to allow specifying both the thread stack
> and guard size on the command line.  Just:

The RSS size difference is not evidence of an issue - you asked for huge pages
and you got them! I verified they are definitely huge pages by counting the TLB
misses when accessing the stack.

>> So the real issue is that the current stack allocation code randomly (based on
>> alignment from previous mmap calls) uses huge pages even for small stacks.
>
> Keep in mind this heuristic is only enabled if THP is set to 'always', meaning
> the kernel will try to back *all* the stack with large pages.  The issue is
> when the *guard* page is within a large page.

Why would that be an issue? In that case you can't get a large page.

The question is, under what circumstances are huge pages in stacks beneficial and
in which cases are they not?  If we have a good answer to that, then we can automatically
do the right thing without needing a tunable.

Cheers,
Wilco
  
Adhemerval Zanella May 16, 2023, 4:35 p.m. UTC | #5
On 16/05/23 12:38, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>>> This still doesn't make sense since if _STACK_GROWS_DOWN, mem == guard, so
>>> this will always execute the madvise. 
> 
> And if !_STACK_GROWS_DOWN, we never execute the madvise. So I don't believe
> this is correct, even if it behaves like a nop in some cases.
> 
>> Yes, if THP is set to 'always' this is exactly the idea of this patch, since
>> afaiu the kernel might still back the stack with large pages if the
>> requested size is smaller than the default THP size.
> 
> If a mmap start/end range does not align to a huge page, you get small pages
> at the ends because a huge page does not fit.

Yeap, this is why Cupertino's original testcase adds a way to force mmap to
fill the VMA so as to trigger a THP-allocated stack.

> 
>> It is only an issue if
>> the guard page address is not aligned to the default THP size, which will
>> potentially trigger the issues Cupertino has brought up (since we do not know
>> beforehand which mapping flags were used on the pages that fulfill the allocation).
> 
> I don't see the claimed issue happen. What happens is that if you request
> huge pages, you get them. And that is what increases the RSS size.
> 
>>> As I mentioned, I couldn't find evidence that
>>> the claimed scenario of a huge page allocated, written to and then split due to the
>>> mprotect exists.
>>
>> I adapted Cupertino's original test to allow specifying both the thread stack
>> and guard size on the command line.  Just:
> 
> The RSS size difference is not evidence of an issue - you asked for huge pages
> and you got them! I verified they are definitely huge pages by counting the TLB
> misses when accessing the stack.
> 
>>> So the real issue is that the current stack allocation code randomly (based on
>>> alignment from previous mmap calls) uses huge pages even for small stacks.
>>
>> Keep in mind this heuristic is only enabled if THP is set to 'always', meaning
>> the kernel will try to back *all* the stack with large pages.  The issue is
>> when the *guard* page is within a large page.
> 
> Why would that be an issue? In that case you can't get a large page.
> 
> The question is, under what circumstances are huge pages in stacks beneficial and
> in which cases are they not?  If we have a good answer to that, then we can automatically
> do the right thing without needing a tunable.
> 

Afaiu the issue is not whether huge pages in stacks are beneficial, but rather
when the kernel will fall back to default pages anyway, which will just waste
cycles.  Another option would be to tune the default stack and guard size
to avoid this issue, but this might require some more heuristics to find a
good spot to avoid too much VMA waste.
  
Wilco Dijkstra May 17, 2023, 12:49 p.m. UTC | #6
Hi Adhemerval,

>> The question is, under what circumstances are huge pages in stacks beneficial and
>> in which cases are they not?  If we have a good answer to that, then we can automatically
>> do the right thing without needing a tunable.
>> 
> 
> Afaiu the issue is not whether huge pages in stacks are beneficial, but rather
> when the kernel will fall back to default pages anyway, which will just waste
> cycles.  Another option would be to tune the default stack and guard size
> to avoid this issue, but this might require some more heuristics to find a
> good spot to avoid too much VMA waste.

The kernel won't fall back to standard pages - like I said, it really allocates a huge
page when it can (based on alignment of the stack) and that is what causes the
increase of RSS size. But that's not evidence of an issue.

So the real question is when do huge pages make sense for stacks?

Cheers,
Wilco
  
Cupertino Miranda May 17, 2023, 1:12 p.m. UTC | #7
Wilco Dijkstra writes:

> Hi Adhemerval,
>
>>> The question is, under what circumstances are huge pages in stacks beneficial and
>>> in which cases are they not?  If we have a good answer to that, then we can automatically
>>> do the right thing without needing a tunable.
>>>
>>
>> Afaiu the issue is not whether huge pages in stacks are beneficial, but rather
>> when the kernel will fall back to default pages anyway, which will just waste
>> cycles.  Another option would be to tune the default stack and guard size
>> to avoid this issue, but this might require some more heuristics to find a
>> good spot to avoid too much VMA waste.
>
> The kernel won't fall back to standard pages - like I said, it really allocates a huge
> page when it can (based on alignment of the stack) and that is what causes the
> increase of RSS size. But that's not evidence of an issue.
>
> So the real question is when do huge pages make sense for stacks?
The dual question of when huge pages make sense is when they don't make
sense.  In Adhemerval's current proposal he is really focusing on the dual
question, identifying a specific case where they do not make sense.

IMO, huge pages make sense for the stack whenever the programmer or the
user of the system thinks so and profiles them as a benefit.  It really
depends on the use case.  If the stack allocation is big enough to make
use of huge pages, it is definitely a benefit to use them to reduce TLB
pressure.

Regarding precisely identifying when huge pages are beneficial: this is
really an unpredictable, case-by-case problem, and for that matter hard
to predict.  I don't think you can easily find a heuristic that will
properly detect that.

Cheers,
Cupertino

>
> Cheers,
> Wilco
  
Adhemerval Zanella May 17, 2023, 1:20 p.m. UTC | #8
On 17/05/23 09:49, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>>> The question is, under what circumstances are huge pages in stacks beneficial and
>>> in which cases are they not?  If we have a good answer to that, then we can automatically
>>> do the right thing without needing a tunable.
>>>
>>
>> Afaiu the issue is not whether huge pages in stacks are beneficial, but rather
>> when the kernel will fall back to default pages anyway, which will just waste
>> cycles.  Another option would be to tune the default stack and guard size
>> to avoid this issue, but this might require some more heuristics to find a
>> good spot to avoid too much VMA waste.
> 
> The kernel won't fall back to standard pages - like I said, it really allocates a huge
> page when it can (based on alignment of the stack) and that is what causes the
> increase of RSS size. But that's not evidence of an issue.

AFAIU the issue is that after the stack is allocated with huge pages, the
kernel needs to fall back to standard pages because the guard 'page'
will also be within the same huge page allocated for the stack.

My understanding is, once the kernel needs to fall back to default pages,
it allocates *all* of the large page range.  This is what the RSS increase
makes me believe; I am not sure if there is a technical limitation to just
making the range COW (since at the time of the guard protection setup,
the page has not been touched yet).

> 
> So the real question is when do huge pages make sense for stacks?

But that's not what the patch is trying to do, it only tries to mitigate
a specific corner case where THP will be ineffective.  I agree with
Cupertino that this question is really hard to answer and it will be
really dependent on the workload and/or runtime characteristics; we will
need to plug in kernel feedback to have some answer.
  
Wilco Dijkstra May 17, 2023, 2:22 p.m. UTC | #9
Hi Adhemerval,

> AFAIU the issue is that after the stack is allocated with huge pages, the
> kernel needs to fall back to standard pages because the guard 'page'
> will also be within the same huge page allocated for the stack.

The stack allocation explicitly never overlaps with the guard page, ie. there
is no such fallback. All that matters is the mapped address range of the
stack - if this fits huge pages, you'll get them.
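
(Concretely, assuming _STACK_GROWS_DOWN and glibc's guard placement at the
low end of the mapping: the mapping is [mem, mem + size), the guard occupies
[mem, mem + guardsize), and the usable stack sits above it; mprotect only
ever touches the guard subrange, and THP eligibility is decided purely by
whether the mapped range contains a huge-page-aligned, huge-page-sized block.)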

> My understanding is, once the kernel needs to fall back to default pages,
> it allocates *all* of the large page range.  This is what the RSS increase
> makes me believe; I am not sure if there is a technical limitation to just
> making the range COW (since at the time of the guard protection setup,
> the page has not been touched yet).

That's not what happens. The RSS size increases because you actually get
a huge page (as requested). There is no fallback to standard pages.

>> So the real question is when do huge pages make sense for stacks?
>
> But that's not what the patch is trying to do, it only tries to mitigate
> a specific corner case where THP will be ineffective.  I agree with

So far there is no evidence this corner case exists, but even ignoring that,
the expression used is incorrect.

> Cupertino that this question is really hard to answer and it will be
> really dependent on the workload and/or runtime characteristics; we will
> need to plug in kernel feedback to have some answer.

It should be feasible to run benchmarks to get an idea whether huge stack pages
help or not. And similarly whether the RSS increase is worth it or not.

Cheers,
Wilco
  
Adhemerval Zanella May 17, 2023, 4:50 p.m. UTC | #10
On 17/05/23 11:22, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>> AFAIU the issue is that after the stack is allocated with huge pages, the
>> kernel needs to fall back to standard pages because the guard 'page'
>> will also be within the same huge page allocated for the stack.
> 
> The stack allocation explicitly never overlaps with the guard page, ie. there
> is no such fallback. All that matters is the mapped address range of the
> stack - if this fits huge pages, you'll get them.
> 
>> My understanding is, once the kernel needs to fall back to default pages,
>> it allocates *all* of the large page range.  This is what the RSS increase
>> makes me believe; I am not sure if there is a technical limitation to just
>> making the range COW (since at the time of the guard protection setup,
>> the page has not been touched yet).
> 
> That's not what happens. The RSS size increases because you actually get
> a huge page (as requested). There is no fallback to standard pages.

But the threads themselves do not end up using all the VMA region allocated
for them.  Using the test program you can see it:

$ cat /proc/meminfo  | grep AnonHugePages
AnonHugePages:     43008 kB
$ ./tststackalloc &
[...]
[statm] RSS: 1049 pages (4296704 bytes = 4 MB)
[smaps] RSS: 5033984 bytes = 4 MB
[...]
$ cat /proc/meminfo  | grep AnonHugePages
AnonHugePages:     45056 kB

So even if the stack is not aligned to the default large page, THP will still
back the thread allocation.  The issue is, if the mmap is also aligned
to the THP size, the guard setup will trigger the issue that increases RSS.
This seems to be the same conclusion OpenJDK and some kernel discussions
have reached as well [1] [2].

> 
>>> So the real question is when do huge pages make sense for stacks?
>>
>> But that's not what the patch is trying to do, it only tries to mitigate
>> a specific corner case where THP will be ineffective.  I agree with
> 
> So far there is no evidence this corner case exists, but even ignoring that,
> the expression used is incorrect.
> 
>> Cupertino that this question is really hard to answer and it will be
>> really dependent on the workload and/or runtime characteristics; we will
>> need to plug in kernel feedback to have some answer.
> 
> It should be feasible to run benchmarks to get an idea whether huge stack pages
> help or not. And similarly whether the RSS increase is worth it or not.

Another option, hinted at in both discussions and brought up by Florian as
well, is to add a pthread extension to force huge pages to be disabled
(something like pthread_attr_setflags to make it extensible).

[1] https://bugs.openjdk.org/browse/JDK-8303215?page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel&showAll=true
[2] https://lore.kernel.org/linux-mm/278ec047-4c5d-ab71-de36-094dbed4067c@redhat.com/T/
  
Wilco Dijkstra May 17, 2023, 6:16 p.m. UTC | #11
Hi Adhemerval,

> But the threads themselves do not end up using all the VMA region allocated
> for them.  Using the test program you can see it:

Obviously that is due to there being a 2MB gap between each stack. After the
huge page there is a default page for the guard and then empty space for the
alignment till the next stack.

> So even if the stack is not aligned to the default large page, THP will still
> back the thread allocation.

THP can only be used if the mmap covers an aligned huge page. If you
allocate lots of 2MB stacks then you get threads * guardsize / 2MB number
of huge pages since some stacks will become aligned.
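
(For example, with a 4 KiB guard the mappings are 2 MiB + 4 KiB each, so
successive start addresses drift by 4 KiB modulo 2 MiB, and roughly one
stack in 512 lands on a 2 MiB boundary: 2 MiB / 4 KiB = 512.)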

> The issue is, if the mmap is also aligned
> to the THP size, the guard setup will trigger the issue that increases RSS.

No it won't. If it is aligned it will get a huge page, and the guard will be before
the huge page (since we allocate 2MB + guardsize).

> This seems to be the same conclusion OpenJDK and some kernel discussions
> have reached as well [1] [2].

If that is what they claim, they are wrong. This is not how THP or our stack
allocation works. You can see the allocation of the THP and guard pages
if you print the smaps (build with -DPRINT_PROC_SMAPS).

Cheers,
Wilco
  
Adhemerval Zanella May 18, 2023, 1:04 p.m. UTC | #12
On 17/05/23 15:16, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>> But the threads themselves do not end up using all the VMA region allocated
>> for them.  Using the test program you can see it:
> 
> Obviously that is due to there being a 2MB gap between each stack. After the
> huge page there is a default page for the guard and then empty space for the
> alignment till the next stack.
> 
>> So even if the stack is not aligned to the default large page, THP will still
>> back the thread allocation.
> 
> THP can only be used if the mmap covers an aligned huge page. If you
> allocate lots of 2MB stacks then you get threads * guardsize / 2MB number
> of huge pages since some stacks will become aligned.
> 
>> The issue is, if the mmap is also aligned
>> to the THP size, the guard setup will trigger the issue that increases RSS.
> 
> No it won't. If it is aligned it will get a huge page, and the guard will be before
> the huge page (since we allocate 2MB + guardsize).
> 
>> This seems to be the same conclusion OpenJDK and some kernel discussions
>> have reached as well [1] [2].
> 
> If that is what they claim, they are wrong. This is not how THP or our stack
> allocation works. You can see the allocation of the THP and guard pages
> if you print the smaps (build with -DPRINT_PROC_SMAPS).

Right, I was using the wrong assumption then on how THP acts (mostly due to the
two threads discussion).  Running more tests I see your point and it seems to be
what is happening in fact.  I will drop this patch since it really does not
make much sense.

So, do you think adding a pthread extension to control it would be an improvement,
or would working towards a heuristic to check whether THP would indeed be valuable
be better?
  
Wilco Dijkstra May 23, 2023, 9:48 a.m. UTC | #13
Hi Adhemerval,

> Right, I was using the wrong assumption then on how THP acts (mostly due to the
> two threads discussion).  Running more tests I see your point and it seems to be
> what is happening in fact.  I will drop this patch since it really does not
> make much sense.
>
> So, do you think adding a pthread extension to control it would be an improvement,
> or would working towards a heuristic to check whether THP would indeed be valuable
> be better?

Adding something to pthread to make it more configurable seems reasonable, but
like with tunables, I bet most applications won't ever use this. So adding heuristics
that work well for most applications would be best.

It seems Linux doesn't support aligning large mmaps, and as a result the code in
alloc_new_heap looks very messy and complex. We'd need something similar to
ensure deterministic alignment of the top of the stack for the cases where we
decide to use huge pages. We could reserve a number of default pages for the
initial stack so you avoid increasing RSS size if you use a small amount of stack.
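
A minimal sketch of the over-allocate-and-trim approach (mmap_aligned is a
hypothetical helper with error handling elided, not the actual alloc_new_heap
code):

#include <stdint.h>
#include <sys/mman.h>

static void *
mmap_aligned (size_t size, size_t align)
{
  /* Reserve enough so that an aligned block of SIZE fits somewhere
     inside, then unmap the unaligned head and the leftover tail.  */
  size_t map_size = size + align;
  char *p = mmap (NULL, map_size, PROT_READ | PROT_WRITE,
		  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (p == MAP_FAILED)
    return NULL;
  uintptr_t aligned = ((uintptr_t) p + align - 1) & ~((uintptr_t) align - 1);
  size_t head = aligned - (uintptr_t) p;
  if (head != 0)
    munmap (p, head);
  size_t tail = map_size - head - size;
  if (tail != 0)
    munmap ((char *) aligned + size, tail);
  return (void *) aligned;
}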

However even switching off THP for all small stacks (eg. default size or smaller)
would avoid the random use of THP without needing to set the tunable.

Cheers,
Wilco
  
Cristian Rodríguez Jan. 31, 2024, 2:03 a.m. UTC | #14
On Tue, May 23, 2023 at 5:48 AM Wilco Dijkstra via Libc-alpha <
libc-alpha@sourceware.org> wrote:

>
> However even switching off THP for all small stacks (eg. default size or smaller)
> would avoid the random use of THP without needing to set the tunable.
>
> Cheers,
> Wilco


Didn't find the exact message to reply to, but this one is close enough.
MAP_STACK implies no transparent hugepages since Linux kernel commit
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c4608d1bf7c6536d1a3d233eb21e50678681564e
  
Florian Weimer Jan. 31, 2024, 7:54 a.m. UTC | #15
* Cristian Rodríguez:

> On Tue, May 23, 2023 at 5:48 AM Wilco Dijkstra via Libc-alpha <libc-alpha@sourceware.org>
> wrote:
>
>> However even switching off THP for all small stacks (eg. default size or smaller)
>> would avoid the random use of THP without needing to set the tunable.
>>
>> Cheers,
>> Wilco
>
>  
> Didn't find the exact message to reply to, but this one is close enough.  MAP_STACK implies no
> transparent hugepages since Linux kernel commit
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c4608d1bf7c6536d1a3d233eb21e50678681564e

It's an odd choice for fixing the regression.  I commented on the
linux-mm thread.  It's also surprising to see such changes being made
without larger discussion.

Thanks,
Florian
  
Adhemerval Zanella Jan. 31, 2024, 11:30 a.m. UTC | #16
On 31/01/24 04:54, Florian Weimer wrote:
> * Cristian Rodríguez:
> 
>> On Tue, May 23, 2023 at 5:48 AM Wilco Dijkstra via Libc-alpha <libc-alpha@sourceware.org>
>> wrote:
>>
>>> However even switching off THP for all small stacks (eg. default size or smaller)
>>> would avoid the random use of THP without needing to set the tunable.
>>>
>>> Cheers,
>>> Wilco
>>
>>  
>> Didn't find the exact message to reply to, but this one is close enough.  MAP_STACK implies no
>> transparent hugepages since Linux kernel commit
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c4608d1bf7c6536d1a3d233eb21e50678681564e
> 
> It's an odd choice for fixing the regression.  I commented on the
> linux-mm thread.  It's also surprising to see such changes being made
> without larger discussion.

Do you have a link for this discussion?
  
Florian Weimer Jan. 31, 2024, 11:43 a.m. UTC | #17
* Adhemerval Zanella Netto:

> On 31/01/24 04:54, Florian Weimer wrote:
>> * Cristian Rodríguez:
>> 
>>> On Tue, May 23, 2023 at 5:48 AM Wilco Dijkstra via Libc-alpha <libc-alpha@sourceware.org>
>>> wrote:
>>>
>>>> However even switching off THP for all small stacks (eg. default size or smaller)
>>>> would avoid the random use of THP without needing to set the tunable.
>>>>
>>>> Cheers,
>>>> Wilco
>>>
>>>  
>>> Didn't find the exact message to reply to, but this one is close enough.  MAP_STACK implies no
>>> transparent hugepages since Linux kernel commit
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c4608d1bf7c6536d1a3d233eb21e50678681564e
>> 
>> It's an odd choice for fixing the regression.  I commented on the
>> linux-mm thread.  It's also surprising to see such changes being made
>> without larger discussion.
>
> Do you have a link for this discussion?

My message is here:

  <https://lore.kernel.org/all/878r46ym4b.fsf@oldenburg.str.redhat.com/>

Thanks,
Florian
  
Cristian Rodríguez Jan. 31, 2024, 3:18 p.m. UTC | #18
On Wed, Jan 31, 2024 at 8:30 AM Adhemerval Zanella Netto <
adhemerval.zanella@linaro.org> wrote:

>
>
> On 31/01/24 04:54, Florian Weimer wrote:
> > * Cristian Rodríguez:
> >
> >> On Tue, May 23, 2023 at 5:48 AM Wilco Dijkstra via Libc-alpha <
> libc-alpha@sourceware.org>
> >> wrote:
> >>
> >>> However even switching off THP for all small stacks (eg. default size or smaller)
> >>> would avoid the random use of THP without needing to set the tunable.
> >>>
> >>> Cheers,
> >>> Wilco
> >>
> >>
> >> Didn't find the exact message to reply to, but this one is close enough.
> >> MAP_STACK implies no transparent hugepages since Linux kernel commit
> >> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c4608d1bf7c6536d1a3d233eb21e50678681564e
> >
> > It's an odd choice for fixing the regression.  I commented on the
> > linux-mm thread.  It's also surprising to see such changes being made
> > without larger discussion.
>
> Do you have a link for this discussion?
>

Doesn't this also imply that the kernel expects the MAP_GROWSDOWN flag
(#if _STACK_GROWS_DOWN) to do the right thing elsewhere?
  
Cristian Rodríguez Feb. 1, 2024, 1:26 a.m. UTC | #19
On Wed, Jan 31, 2024 at 12:18 PM Cristian Rodríguez <cristian@rodriguez.im>
wrote:

>
>
> On Wed, Jan 31, 2024 at 8:30 AM Adhemerval Zanella Netto <
> adhemerval.zanella@linaro.org> wrote:
>
>>
>>
>> On 31/01/24 04:54, Florian Weimer wrote:
>> > * Cristian Rodríguez:
>> >
>> >> On Tue, May 23, 2023 at 5:48 AM Wilco Dijkstra via Libc-alpha <
>> libc-alpha@sourceware.org>
>> >> wrote:
>> >>
>> >>> However even switching off THP for all small stacks (eg. default size or smaller)
>> >>> would avoid the random use of THP without needing to set the tunable.
>> >>>
>> >>> Cheers,
>> >>> Wilco
>> >>
>> >>
>> >> Didn't find the exact message to reply to, but this one is close enough.
>> >> MAP_STACK implies no transparent hugepages since Linux kernel commit
>> >> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c4608d1bf7c6536d1a3d233eb21e50678681564e
>> >
>> > It's an odd choice for fixing the regression.  I commented on the
>> > linux-mm thread.  It's also surprising to see such changes being made
>> > without larger discussion.
>>
>> Do you have a link for this discussion?
>>
>
> Doesn't this also imply that the kernel expects the MAP_GROWSDOWN flag
> (#if _STACK_GROWS_DOWN) to do the right thing elsewhere?
>
>
Older kernel versions will need MAP_GROWS* to have THP behaviour identical to
newer ones, it seems to me.
  
Cristian Rodríguez March 12, 2024, 12:55 a.m. UTC | #20
> > Do you have a link for this discussion?
>
> My message is here:
>
>   <https://lore.kernel.org/all/878r46ym4b.fsf@oldenburg.str.redhat.com/>
>
> Thanks,
> Florian


Kernel 6.8 was released and the change remains in place.  IMHO libc
should behave as if MADV_NOHUGEPAGE was set for old kernels, so the
behaviour is consistent between old and new.
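
A minimal sketch of that idea (hypothetical helper and placement, not actual
glibc code; the madvise would go where allocate_stack maps a new stack):

#include <sys/mman.h>

/* Opt a freshly mapped thread stack out of THP so that kernels predating
   the MAP_STACK change behave like Linux >= 6.8.  */
static int
disable_thp_on_stack (void *mem, size_t size)
{
#ifdef MADV_NOHUGEPAGE
  return madvise (mem, size, MADV_NOHUGEPAGE);
#else
  return 0;
#endif
}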
  

Patch

diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index f9d8cdfd08..1eb34f816c 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -33,6 +33,7 @@ 
 #include <nptl-stack.h>
 #include <libc-lock.h>
 #include <tls-internal.h>
+#include <malloc-hugepages.h>
 
 /* Default alignment of stack.  */
 #ifndef STACK_ALIGN
@@ -206,6 +207,31 @@  advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
 #endif
 }
 
+/* If Transparent Huge Page (THP) is set to 'always', the thread stack might
+   be backed by Huge Pages depending on the address assigned by the kernel and
+   the resulting guard position.  If the guard page is within the same large
+   page that might be used by the stack itself, changing the stack permission
+   will make the allocated range no longer be served with THP.  The kernel will
+   then revert back to using the default page size.
+
+   In this case, besides the additional work, the kernel will potentially need
+   to keep all the pages, since it cannot distinguish which ones were actually
+   touched by the process.  This results in larger RSS usage than just madvising
+   the range to not use huge pages.  */
+static __always_inline int
+advise_thp (void *mem, size_t size, char *guard)
+{
+  enum malloc_thp_mode_t thpmode = __malloc_thp_mode ();
+  if (thpmode != malloc_thp_mode_always)
+    return 0;
+
+  unsigned long int thpsize = __malloc_default_thp_pagesize ();
+  if (PTR_ALIGN_DOWN (mem, thpsize) != PTR_ALIGN_DOWN (guard, thpsize))
+    return 0;
+
+  return __madvise (mem, size, MADV_NOHUGEPAGE);
+}
+
 /* Returns a usable stack for a new thread either by allocating a
    new stack or reusing a cached stack of sufficient size.
    ATTR must be non-NULL and point to a valid pthread_attr.
@@ -396,6 +422,14 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 	    {
 	      char *guard = guard_position (mem, size, guardsize, pd,
 					    pagesize_m1);
+
+	      if (__glibc_unlikely (__nptl_stack_hugetlb == 1))
+		{
+		  int r = advise_thp (mem, size, guard);
+		  if (r != 0)
+		    return r;
+		}
+
 	      if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
 		{
 		  __munmap (mem, size);
diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h
index d68b85630c..21d4844bc4 100644
--- a/sysdeps/generic/malloc-hugepages.h
+++ b/sysdeps/generic/malloc-hugepages.h
@@ -26,6 +26,7 @@  unsigned long int __malloc_default_thp_pagesize (void) attribute_hidden;
 
 enum malloc_thp_mode_t
 {
+  malloc_thp_mode_unknown,
   malloc_thp_mode_always,
   malloc_thp_mode_madvise,
   malloc_thp_mode_never,
diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c
index 2f316474c1..e7877f098e 100644
--- a/sysdeps/unix/sysv/linux/malloc-hugepages.c
+++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c
@@ -22,19 +22,33 @@ 
 #include <not-cancel.h>
 #include <sys/mman.h>
 
+/* __malloc_default_thp_pagesize is called only in single-thread mode, either
+   in malloc initialization or pthread creation.  */
+static unsigned long int thp_pagesize = -1;
+
 unsigned long int
 __malloc_default_thp_pagesize (void)
 {
+  unsigned long int size = atomic_load_relaxed (&thp_pagesize);
+  if (size != -1)
+    return size;
+
   int fd = __open64_nocancel (
     "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", O_RDONLY);
   if (fd == -1)
-    return 0;
+    {
+      atomic_store_relaxed (&thp_pagesize, 0);
+      return 0;
+    }
 
   char str[INT_BUFSIZE_BOUND (unsigned long int)];
   ssize_t s = __read_nocancel (fd, str, sizeof (str));
   __close_nocancel (fd);
   if (s < 0)
-    return 0;
+    {
+      atomic_store_relaxed (&thp_pagesize, 0);
+      return 0;
+    }
 
   unsigned long int r = 0;
   for (ssize_t i = 0; i < s; i++)
@@ -44,16 +58,28 @@  __malloc_default_thp_pagesize (void)
       r *= 10;
       r += str[i] - '0';
     }
+  atomic_store_relaxed (&thp_pagesize, r);
   return r;
 }
 
+/* The __malloc_thp_mode is called only in single-thread mode, either in
+   malloc initialization or pthread creation.  */
+static enum malloc_thp_mode_t thp_mode = malloc_thp_mode_unknown;
+
 enum malloc_thp_mode_t
 __malloc_thp_mode (void)
 {
+  enum malloc_thp_mode_t mode = atomic_load_relaxed (&thp_mode);
+  if (mode != malloc_thp_mode_unknown)
+    return mode;
+
   int fd = __open64_nocancel ("/sys/kernel/mm/transparent_hugepage/enabled",
 			      O_RDONLY);
   if (fd == -1)
-    return malloc_thp_mode_not_supported;
+    {
+      atomic_store_relaxed (&thp_mode, malloc_thp_mode_not_supported);
+      return malloc_thp_mode_not_supported;
+    }
 
   static const char mode_always[]  = "[always] madvise never\n";
   static const char mode_madvise[] = "always [madvise] never\n";
@@ -69,13 +95,19 @@  __malloc_thp_mode (void)
   if (s == sizeof (mode_always) - 1)
     {
       if (strcmp (str, mode_always) == 0)
-	return malloc_thp_mode_always;
+	mode = malloc_thp_mode_always;
       else if (strcmp (str, mode_madvise) == 0)
-	return malloc_thp_mode_madvise;
+	mode = malloc_thp_mode_madvise;
       else if (strcmp (str, mode_never) == 0)
-	return malloc_thp_mode_never;
+	mode = malloc_thp_mode_never;
+      else
+	mode = malloc_thp_mode_not_supported;
     }
-  return malloc_thp_mode_not_supported;
+  else
+    mode = malloc_thp_mode_not_supported;
+
+  atomic_store_relaxed (&thp_mode, mode);
+  return mode;
 }
 
 static size_t