[[PATCH] v2 1/1] malloc: support transparent huge pages

Message ID 20200923221300.162450-2-nmanthey@conp-solutions.com
State Not applicable
Headers
Series [[PATCH] v2 1/1] malloc: support transparent huge pages |

Commit Message

Norbert Manthey Sept. 23, 2020, 10:13 p.m. UTC
  There exist use cases where huge pages would help to reduce TLB pressure,
but other applications running on the system should not be backed by huge
pages by default. Hence, the configuration option inside the kernel is
typically set to be controlled by the madvise syscall.

This change extends the memory allocation functions to use the madvise
system call, in case the requested area to be allocated contains at least
one huge page. To make this change more effective, the threshold to use
mmap as allocation is set to 2M, the typical huge page size.

The new feature has to be requested explicitly. Currently, either the
environment variable MALLOC_THP_ALWAYS_ or the related glibc tunable has
to be set in order to enable the feature. Otherwise, the default
configuration will be used.

When allocating memory, the brk system call is used. However, the used
granularity is a page size, typically 4K. To avoid falling below a larger
default page size, this change only switches to 2M as the page size
if this increases the page size to be used. For some environments, the
size of huge pages can be different. Therefore, the huge page size can
be modified at compile time using the THP_HUGE_PAGESIZE compile time
variable.

To improve the effectiveness of using huge pages, calls to brk are aligned
to the page size to be used. This alignment of allocations with brk can be
activated separately via the environment variable MALLOC_ALLOC_2M_ALIGNED_,
or the glibc tunable, respectively.

The functions grow_heap and new_heap have not been modified to use
transparent huge pages, yet.

Signed-off-by: Norbert Manthey <nmanthey@conp-solutions.com>
---
 elf/dl-tunables.list |  12 ++++
 malloc/arena.c       |  19 +++---
 malloc/malloc.c      | 159 ++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 170 insertions(+), 20 deletions(-)
  

Patch

diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index 35634ef24d..f5f80eeb77 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -90,6 +90,18 @@  glibc {
       minval: 0
       security_level: SXID_IGNORE
     }
+    thp_always {
+      type: INT_32
+      minval: 0
+      maxval: 1
+      env_alias: MALLOC_THP_ALWAYS_
+    }
+    alloc_2M_aligned {
+      type: INT_32
+      minval: 0
+      maxval: 1
+      env_alias: MALLOC_ALLOC_2M_ALIGNED_
+    }
   }
   cpu {
     hwcap_mask {
diff --git a/malloc/arena.c b/malloc/arena.c
index cecdb7f4c4..bf23150290 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -19,11 +19,6 @@ 
 
 #include <stdbool.h>
 
-#if HAVE_TUNABLES
-# define TUNABLE_NAMESPACE malloc
-#endif
-#include <elf/dl-tunables.h>
-
 /* Compile-time constants.  */
 
 #define HEAP_MIN_SIZE (32 * 1024)
@@ -456,6 +451,8 @@  new_heap (size_t size, size_t top_pad)
   char *p1, *p2;
   unsigned long ul;
   heap_info *h;
+  const size_t mmap_pagesize = sys_thp_mmap_pagesize > pagesize ? sys_thp_mmap_pagesize : pagesize;
+  int extra_mmap_flags = (21 << MAP_HUGE_SHIFT);
 
   if (size + top_pad < HEAP_MIN_SIZE)
     size = HEAP_MIN_SIZE;
@@ -465,7 +462,7 @@  new_heap (size_t size, size_t top_pad)
     return 0;
   else
     size = HEAP_MAX_SIZE;
-  size = ALIGN_UP (size, pagesize);
+  size = ALIGN_UP (size, mmap_pagesize);
 
   /* A memory region aligned to a multiple of HEAP_MAX_SIZE is needed.
      No swap space needs to be reserved for the following large
@@ -475,7 +472,7 @@  new_heap (size_t size, size_t top_pad)
   if (aligned_heap_area)
     {
       p2 = (char *) MMAP (aligned_heap_area, HEAP_MAX_SIZE, PROT_NONE,
-                          MAP_NORESERVE);
+                          MAP_NORESERVE | extra_mmap_flags);
       aligned_heap_area = NULL;
       if (p2 != MAP_FAILED && ((unsigned long) p2 & (HEAP_MAX_SIZE - 1)))
         {
@@ -485,7 +482,7 @@  new_heap (size_t size, size_t top_pad)
     }
   if (p2 == MAP_FAILED)
     {
-      p1 = (char *) MMAP (0, HEAP_MAX_SIZE << 1, PROT_NONE, MAP_NORESERVE);
+      p1 = (char *) MMAP (0, HEAP_MAX_SIZE << 1, PROT_NONE, MAP_NORESERVE | extra_mmap_flags);
       if (p1 != MAP_FAILED)
         {
           p2 = (char *) (((unsigned long) p1 + (HEAP_MAX_SIZE - 1))
@@ -501,7 +498,7 @@  new_heap (size_t size, size_t top_pad)
         {
           /* Try to take the chance that an allocation of only HEAP_MAX_SIZE
              is already aligned. */
-          p2 = (char *) MMAP (0, HEAP_MAX_SIZE, PROT_NONE, MAP_NORESERVE);
+          p2 = (char *) MMAP (0, HEAP_MAX_SIZE, PROT_NONE, MAP_NORESERVE | extra_mmap_flags);
           if (p2 == MAP_FAILED)
             return 0;
 
@@ -517,6 +514,10 @@  new_heap (size_t size, size_t top_pad)
       __munmap (p2, HEAP_MAX_SIZE);
       return 0;
     }
+
+  /* use huge pages */
+  systhp(p2, size);
+
   h = (heap_info *) p2;
   h->size = size;
   h->mprotect_size = size;
diff --git a/malloc/malloc.c b/malloc/malloc.c
index cd9933b4e5..7a38a15dda 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -327,6 +327,11 @@  __malloc_assert (const char *assertion, const char *file, unsigned int line,
 # define MAX_TCACHE_COUNT UINT16_MAX
 #endif
 
+#if HAVE_TUNABLES
+# define TUNABLE_NAMESPACE malloc
+#endif
+# include <elf/dl-tunables.h>
+
 /* Safe-Linking:
    Use randomness from ASLR (mmap_base) to protect single-linked lists
    of Fast-Bins and TCache.  That is, mask the "next" pointers of the
@@ -458,7 +463,7 @@  void *(*__morecore)(ptrdiff_t) = __default_morecore;
    thus avoid running out of kernel resources.  */
 
 #ifndef MMAP_AS_MORECORE_SIZE
-#define MMAP_AS_MORECORE_SIZE (1024 * 1024)
+#define MMAP_AS_MORECORE_SIZE (2 * 1024 * 1024)
 #endif
 
 /*
@@ -1895,6 +1900,89 @@  free_perturb (char *p, size_t n)
 
 #include <stap-probe.h>
 
+/* ----------- Routines dealing with transparent huge pages ----------- */
+
+/* huge page size used for THP; may be overridden at compile time */
+#ifndef THP_HUGE_PAGESIZE
+#define THP_HUGE_PAGESIZE 0x200000
+# endif
+
+const static int sys_thp_pagesize = THP_HUGE_PAGESIZE; /* page size to be used */
+
+/* THP can be forced on at compile time via SYS_THP_ALWAYS_ENABLED; off by default */
+#ifndef SYS_THP_ALWAYS_ENABLED
+static int sys_thp_initialized = 0; /* set to 1 once systhp_initialize has run */
+static int sys_thp_engaged = 0; /* shall we use THP and align 2M pages? */
+static int sys_thp_mmap_pagesize = 0; /* by default, do not set any extra page size */
+#else
+static int sys_thp_initialized = 1; /* pre-set, so systhp_initialize skips its lookup */
+static int sys_thp_engaged = 1; /* THP use is forced on */
+static int sys_thp_mmap_pagesize = THP_HUGE_PAGESIZE; /* start out with the huge page size */
+#endif
+static int sys_alloc_2M_aligned = 0; /* by default, do not change allocation schema */
+
+/*
+   One-time setup: read tunables thp_always (-> sys_thp_engaged, gates the
+   madvise(..., MADV_HUGEPAGE) in systhp) and alloc_2M_aligned.  If THP is
+   engaged or GLIBC_THP_2M_FRIEDNLY is set in the environment, set
+   sys_thp_mmap_pagesize to sys_thp_pagesize.  Returns sys_thp_engaged.
+   NOTE(review): sys_alloc_2M_aligned is stored but not consulted here; the
+   getenv check decides alignment instead -- confirm which is intended.
+ */
+static int
+systhp_initialize(void)
+{
+  if (!sys_thp_initialized)
+  {
+    // TODO FIXME: use GLIBC_TUNABLES instead of this!
+    // TODO: add coverletter 'cat /sys/kernel/mm/transparent_hugepage/enabled'
+    // TODO: repeat the experiment with "always" - how about NUMA?
+    sys_thp_engaged = TUNABLE_GET (thp_always, int32_t, NULL);  // (getenv("GLIBC_THP_ALWAYS") != NULL);
+    sys_thp_initialized = 1;
+
+    sys_alloc_2M_aligned = TUNABLE_GET (alloc_2M_aligned, int32_t, NULL);
+    /* align to 2M if using sys_thp, or when trying to be THP friendly */
+    if(sys_thp_engaged || getenv("GLIBC_THP_2M_FRIEDNLY") != 0)
+      sys_thp_mmap_pagesize = sys_thp_pagesize;
+  }
+  return sys_thp_engaged;
+}
+
+/*
+   Advise the kernel to back [p, p + size) with huge pages; 0 if skipped.
+ */
+static int
+systhp(void* p, INTERNAL_SIZE_T size)
+{
+  /* do not consider areas smaller than a huge page */
+  if(size < sys_thp_pagesize)
+    return 0;
+
+  /* nothing to do unless THP use was enabled (see sys_thp_engaged) */
+  if (!sys_thp_engaged)
+    return 0;
+
+  /* p is not 2M aligned; NOTE(review): 0x1fffff assumes 2M huge pages */
+  if(((unsigned long)p & 0x1fffff) != 0)
+  {
+    /* round p up and the end of the area down to huge page boundaries */
+    unsigned long q = ALIGN_UP ((unsigned long)p, sys_thp_pagesize);
+    unsigned long top = (unsigned long)p + size;
+    top = ALIGN_DOWN(top, sys_thp_pagesize);
+
+    /* abort if requested area does not contain a huge page */
+    if(top <= q)
+      return 0;
+
+    /* shrink the area to the aligned part before advising */
+    p = (void *)q;
+    size = top - q;
+  }
+
+  /* request huge pages and hand the __madvise result back to the caller */
+  return __madvise (p, size, MADV_HUGEPAGE);
+}
+
 /* ------------------- Support for multiple arenas -------------------- */
 #include "arena.c"
 
@@ -2293,6 +2381,14 @@  sysmalloc (INTERNAL_SIZE_T nb, mstate av)
   size_t pagesize = GLRO (dl_pagesize);
   bool tried_mmap = false;
 
+  size_t mmap_pagesize;
+  int extra_mmap_flags = (21 << MAP_HUGE_SHIFT);
+
+  systhp_initialize();
+  mmap_pagesize = sys_thp_mmap_pagesize > pagesize ? sys_thp_mmap_pagesize : pagesize;
+
+  long align_size;      /* size to use to align brk (top of heap) */
+  char *aligned_2m_brk; /* value of updated brk prior to alignment */
 
   /*
      If have mmap, and the request size meets the mmap threshold, and
@@ -2317,15 +2413,15 @@  sysmalloc (INTERNAL_SIZE_T nb, mstate av)
          need for further alignments unless we have have high alignment.
        */
       if (MALLOC_ALIGNMENT == 2 * SIZE_SZ)
-        size = ALIGN_UP (nb + SIZE_SZ, pagesize);
+        size = ALIGN_UP (nb + SIZE_SZ, mmap_pagesize);
       else
-        size = ALIGN_UP (nb + SIZE_SZ + MALLOC_ALIGN_MASK, pagesize);
+        size = ALIGN_UP (nb + SIZE_SZ + MALLOC_ALIGN_MASK, mmap_pagesize);
       tried_mmap = true;
 
       /* Don't try if size wraps around 0 */
       if ((unsigned long) (size) > (unsigned long) (nb))
         {
-          mm = (char *) (MMAP (0, size, PROT_READ | PROT_WRITE, 0));
+          mm = (char *) (MMAP (0, size, PROT_READ | PROT_WRITE, extra_mmap_flags));
 
           if (mm != MAP_FAILED)
             {
@@ -2337,6 +2433,9 @@  sysmalloc (INTERNAL_SIZE_T nb, mstate av)
                  address argument for later munmap in free() and realloc().
                */
 
+              /* use huge pages */
+              systhp(mm, size);
+
               if (MALLOC_ALIGNMENT == 2 * SIZE_SZ)
                 {
                   /* For glibc, chunk2mem increases the address by 2*SIZE_SZ and
@@ -2475,7 +2574,7 @@  sysmalloc (INTERNAL_SIZE_T nb, mstate av)
          previous calls. Otherwise, we correct to page-align below.
        */
 
-      size = ALIGN_UP (size, pagesize);
+      size = ALIGN_UP (size, mmap_pagesize);
 
       /*
          Don't try to call MORECORE if argument is so big as to appear
@@ -2491,6 +2590,29 @@  sysmalloc (INTERNAL_SIZE_T nb, mstate av)
 
       if (brk != (char *) (MORECORE_FAILURE))
         {
+	  /*
+	     Try to align heap top to 2M page size. This allows to use huge
+	     pages for any future MORECORE call.
+	   */
+	  if(sys_thp_mmap_pagesize > 0 && ((unsigned long)brk & 0x1fffff) != 0)
+	  {
+	    align_size = sys_thp_pagesize - ((unsigned long)brk & 0x1fffff);
+
+	    aligned_2m_brk = (char *) (MORECORE (align_size));
+            LIBC_PROBE (memory_sbrk_more, 2, brk, align_size);
+
+	    assert((((unsigned long)aligned_2m_brk + align_size) & 0x1fffff) == 0); /* make sure top is now aligned */
+
+	    /* ignore failures for now */
+	    if (aligned_2m_brk != (char *) (MORECORE_FAILURE))
+	    {
+	      size += align_size;
+	    }
+	  }
+
+          /* use huge pages */
+          systhp(brk, size);
+
           /* Call the `morecore' hook if necessary.  */
           void (*hook) (void) = atomic_forced_read (__after_morecore_hook);
           if (__builtin_expect (hook != NULL, 0))
@@ -2509,7 +2631,7 @@  sysmalloc (INTERNAL_SIZE_T nb, mstate av)
 
           /* Cannot merge with old top, so add its size back in */
           if (contiguous (av))
-            size = ALIGN_UP (size + old_size, pagesize);
+            size = ALIGN_UP (size + old_size, mmap_pagesize);
 
           /* If we are relying on mmap as backup, then use larger units */
           if ((unsigned long) (size) < (unsigned long) (MMAP_AS_MORECORE_SIZE))
@@ -2518,10 +2640,14 @@  sysmalloc (INTERNAL_SIZE_T nb, mstate av)
           /* Don't try if size wraps around 0 */
           if ((unsigned long) (size) > (unsigned long) (nb))
             {
-              char *mbrk = (char *) (MMAP (0, size, PROT_READ | PROT_WRITE, 0));
+              char *mbrk = (char *) (MMAP (0, size, PROT_READ | PROT_WRITE, extra_mmap_flags));
 
               if (mbrk != MAP_FAILED)
                 {
+
+                  /* use huge pages */
+                  systhp(mbrk, size);
+
                   /* We do not need, and cannot use, another sbrk call to find end */
                   brk = mbrk;
                   snd_brk = brk + size;
@@ -2613,7 +2739,7 @@  sysmalloc (INTERNAL_SIZE_T nb, mstate av)
 
                   /* Extend the end address to hit a page boundary */
                   end_misalign = (INTERNAL_SIZE_T) (brk + size + correction);
-                  correction += (ALIGN_UP (end_misalign, pagesize)) - end_misalign;
+                  correction += (ALIGN_UP (end_misalign, mmap_pagesize)) - end_misalign;
 
                   assert (correction >= 0);
                   snd_brk = (char *) (MORECORE (correction));
@@ -2635,6 +2761,7 @@  sysmalloc (INTERNAL_SIZE_T nb, mstate av)
                     }
                   else
                     {
+		      systhp(snd_brk, correction);
                       /* Call the `morecore' hook if necessary.  */
                       void (*hook) (void) = atomic_forced_read (__after_morecore_hook);
                       if (__builtin_expect (hook != NULL, 0))
@@ -2765,16 +2892,20 @@  systrim (size_t pad, mstate av)
   char *new_brk;         /* address returned by post-check sbrk call */
   size_t pagesize;
   long top_area;
+  size_t mmap_pagesize;
 
   pagesize = GLRO (dl_pagesize);
   top_size = chunksize (av->top);
 
+  systhp_initialize();
+  mmap_pagesize = sys_thp_mmap_pagesize > pagesize ? sys_thp_mmap_pagesize : pagesize;
+
   top_area = top_size - MINSIZE - 1;
   if (top_area <= pad)
     return 0;
 
   /* Release in pagesize units and round down to the nearest page.  */
-  extra = ALIGN_DOWN(top_area - pad, pagesize);
+  extra = ALIGN_DOWN(top_area - pad, mmap_pagesize);
 
   if (extra == 0)
     return 0;
@@ -2865,6 +2996,8 @@  mremap_chunk (mchunkptr p, size_t new_size)
   INTERNAL_SIZE_T offset = prev_size (p);
   INTERNAL_SIZE_T size = chunksize (p);
   char *cp;
+  const size_t mmap_pagesize = sys_thp_mmap_pagesize > pagesize ? sys_thp_mmap_pagesize : pagesize;
+  const int extra_mmap_flags = (21 << MAP_HUGE_SHIFT);
 
   assert (chunk_is_mmapped (p));
 
@@ -2876,18 +3009,22 @@  mremap_chunk (mchunkptr p, size_t new_size)
     malloc_printerr("mremap_chunk(): invalid pointer");
 
   /* Note the extra SIZE_SZ overhead as in mmap_chunk(). */
-  new_size = ALIGN_UP (new_size + offset + SIZE_SZ, pagesize);
+  new_size = ALIGN_UP (new_size + offset + SIZE_SZ, mmap_pagesize);
 
   /* No need to remap if the number of pages does not change.  */
   if (total_size == new_size)
     return p;
 
   cp = (char *) __mremap ((char *) block, total_size, new_size,
-                          MREMAP_MAYMOVE);
+                          MREMAP_MAYMOVE | extra_mmap_flags);
 
   if (cp == MAP_FAILED)
     return 0;
 
+  /* use huge pages */
+  systhp(cp, new_size);
+
+
   p = (mchunkptr) (cp + offset);
 
   assert (aligned_OK (chunk2mem (p)));