[v4] malloc: Optimize small memory zeroing for calloc

Message ID 20241129003931.4082573-1-hjl.tools@gmail.com
State Superseded
Headers
Series [v4] malloc: Optimize small memory zeroing for calloc |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed

Commit Message

H.J. Lu Nov. 29, 2024, 12:39 a.m. UTC
  Add calloc-clear-memory.h to clear memory for calloc.  Use overlapping
stores with 1 branch, instead of up to 3.  Unroll 17 times on x86-64 to
support 32-byte vector stores as well as x32 when compiling glibc with
-march=x86-64-v3.

Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
---
 malloc/malloc-internal.h              |  1 +
 malloc/malloc.c                       | 35 +--------------
 sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++
 sysdeps/x86_64/calloc-clear-memory.h  | 61 +++++++++++++++++++++++++++
 4 files changed, 111 insertions(+), 34 deletions(-)
 create mode 100644 sysdeps/generic/calloc-clear-memory.h
 create mode 100644 sysdeps/x86_64/calloc-clear-memory.h
  

Comments

Wangyang Guo Nov. 29, 2024, 7:49 a.m. UTC | #1
On 11/29/2024 8:39 AM, H.J. Lu wrote:

> Add calloc-clear-memory.h to clear memory for calloc.  Use overlapping
> stores with 1 branch, instead of up to 3.  Unroll 17 times on x86-64 to
> support 32-byte vector stores as well as x32 when compiling glibc with
> -march=x86-64-v3.
>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
>   malloc/malloc-internal.h              |  1 +
>   malloc/malloc.c                       | 35 +--------------
>   sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++
>   sysdeps/x86_64/calloc-clear-memory.h  | 61 +++++++++++++++++++++++++++
>   4 files changed, 111 insertions(+), 34 deletions(-)
>   create mode 100644 sysdeps/generic/calloc-clear-memory.h
>   create mode 100644 sysdeps/x86_64/calloc-clear-memory.h
Here is the performance data:

Test Platform: Xeon-8380
Ratio: New / Original time_per_iteration (Lower is Better)
Original Commit: "Add tcache path for calloc"

Without ISA level 3, CFLAGS="-march=x86-64-v2 -O3"
Threads# | Ratio
-----------|------
1 thread | 0.986
4 threads | 0.962

With ISA level 3, CFLAGS="-march=x86-64-v3 -O3"
Threads# | Ratio
-----------|------
1 thread | 1.006 -- within variant
4 threads | 0.969
  
Wilco Dijkstra Nov. 29, 2024, 6:50 p.m. UTC | #2
Hi H.J.,

+static __always_inline void *
+clear_memory (void *mem, unsigned long clearsize)
+{
+  /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
+     that contents have an odd number of INTERNAL_SIZE_T-sized words;
+     minimally 3 words.  */
+  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
+  unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
+
+  if (nclears > 9)
+    return memset (d, 0, clearsize);
+
+  /* Use overlapping stores with 1 branch, instead of up to 3.  */
+  *(d + 0) = 0;
+  *(d + 1) = 0;
+  *(d + 2) = 0;
+  *(d + nclears - 2) = 0;
+  *(d + nclears - 2 + 1) = 0;
+  if (nclears > 3)

Should be nclears > 5, right?

+    {
+      *(d + 3) = 0;
+      *(d + 3 + 1) = 0;
+      *(d + nclears - 4) = 0;
+      *(d + nclears - 4 + 1) = 0;
+    }
+  else if (nclears < 3)
+    __builtin_unreachable ();
+
+  return mem;
+}

This interface makes much more sense indeed. Still, it's hard to see how
targets could do much better than memset
  
H.J. Lu Nov. 30, 2024, 4:07 a.m. UTC | #3
On Sat, Nov 30, 2024, 2:51 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:

> Hi H.J.,
>
> +static __always_inline void *
> +clear_memory (void *mem, unsigned long clearsize)
> +{
> +  /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
> +     that contents have an odd number of INTERNAL_SIZE_T-sized words;
> +     minimally 3 words.  */
> +  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
> +  unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> +
> +  if (nclears > 9)
> +    return memset (d, 0, clearsize);
> +
> +  /* Use overlapping stores with 1 branch, instead of up to 3.  */
> +  *(d + 0) = 0;
> +  *(d + 1) = 0;
> +  *(d + 2) = 0;
> +  *(d + nclears - 2) = 0;
> +  *(d + nclears - 2 + 1) = 0;
> +  if (nclears > 3)
>
> Should be nclears > 5, right?
>

Will fix.


> +    {
> +      *(d + 3) = 0;
> +      *(d + 3 + 1) = 0;
> +      *(d + nclears - 4) = 0;
> +      *(d + nclears - 4 + 1) = 0;
> +    }
> +  else if (nclears < 3)
> +    __builtin_unreachable ();
> +
> +  return mem;
> +}
>
> This interface makes much more sense indeed. Still, it's hard to see how
> targets could do much better than memset
>

memset needs 1 indirect branch and more conditional
branches.

H.J.

>
  

Patch

diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
index cba03433fe..3349e2d1fe 100644
--- a/malloc/malloc-internal.h
+++ b/malloc/malloc-internal.h
@@ -23,6 +23,7 @@ 
 #include <malloc-sysdep.h>
 #include <malloc-size.h>
 #include <malloc-hugepages.h>
+#include <calloc-clear-memory.h>
 
 /* Called in the parent process before a fork.  */
 void __malloc_fork_lock_parent (void) attribute_hidden;
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 81ddd2c3a8..df36b423a8 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3755,8 +3755,6 @@  __libc_calloc (size_t n, size_t elem_size)
   INTERNAL_SIZE_T sz, oldtopsize;
   void *mem;
   unsigned long clearsize;
-  unsigned long nclears;
-  INTERNAL_SIZE_T *d;
   ptrdiff_t bytes;
 
   if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
@@ -3853,39 +3851,8 @@  __libc_calloc (size_t n, size_t elem_size)
     }
 #endif
 
-  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
-     contents have an odd number of INTERNAL_SIZE_T-sized words;
-     minimally 3.  */
-  d = (INTERNAL_SIZE_T *) mem;
   clearsize = csz - SIZE_SZ;
-  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
-  assert (nclears >= 3);
-
-  if (nclears > 9)
-    return memset (d, 0, clearsize);
-
-  else
-    {
-      *(d + 0) = 0;
-      *(d + 1) = 0;
-      *(d + 2) = 0;
-      if (nclears > 4)
-        {
-          *(d + 3) = 0;
-          *(d + 4) = 0;
-          if (nclears > 6)
-            {
-              *(d + 5) = 0;
-              *(d + 6) = 0;
-              if (nclears > 8)
-                {
-                  *(d + 7) = 0;
-                  *(d + 8) = 0;
-                }
-            }
-        }
-    }
-
+  clear_memory (mem, clearsize);
   return mem;
 }
 #endif /* IS_IN (libc) */
diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
new file mode 100644
index 0000000000..baf254977a
--- /dev/null
+++ b/sysdeps/generic/calloc-clear-memory.h
@@ -0,0 +1,48 @@ 
+/* Clear a block of memory for calloc.  Generic version.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+static __always_inline void *
+clear_memory (void *mem, unsigned long clearsize)
+{
+  /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
+     that contents have an odd number of INTERNAL_SIZE_T-sized words;
+     minimally 3 words.  */
+  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
+  unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
+
+  if (nclears > 9)
+    return memset (d, 0, clearsize);
+
+  /* Use overlapping stores with 1 branch, instead of up to 3.  */
+  *(d + 0) = 0;
+  *(d + 1) = 0;
+  *(d + 2) = 0;
+  *(d + nclears - 2) = 0;
+  *(d + nclears - 2 + 1) = 0;
+  if (nclears > 3)
+    {
+      *(d + 3) = 0;
+      *(d + 3 + 1) = 0;
+      *(d + nclears - 4) = 0;
+      *(d + nclears - 4 + 1) = 0;
+    }
+  else if (nclears < 3)
+    __builtin_unreachable ();
+
+  return mem;
+}
diff --git a/sysdeps/x86_64/calloc-clear-memory.h b/sysdeps/x86_64/calloc-clear-memory.h
new file mode 100644
index 0000000000..a4a22efd09
--- /dev/null
+++ b/sysdeps/x86_64/calloc-clear-memory.h
@@ -0,0 +1,61 @@ 
+/* Clear a block of memory for calloc.  X64-64 version.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+static __always_inline void *
+clear_memory (void *mem, unsigned long clearsize)
+{
+  /* Unroll clear memory size up to 17 * INTERNAL_SIZE_T bytes.  We know
+     that contents have an odd number of INTERNAL_SIZE_T-sized words;
+     minimally 3 words.  */
+  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
+  unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
+
+  if (nclears > 17)
+    memset (mem, 0, clearsize);
+
+  /* Use overlapping stores with 2 branch, instead of up to 6.  */
+  *(d + 0) = 0;
+  *(d + 1) = 0;
+  *(d + 2) = 0;
+  if (nclears > 3)
+    {
+      *(d + 1) = 0;
+      *(d + 2) = 0;
+      *(d + 3) = 0;
+      *(d + 4) = 0;
+      *(d + nclears - 4) = 0;
+      *(d + nclears - 4 + 1) = 0;
+      *(d + nclears - 4 + 2) = 0;
+      *(d + nclears - 4 + 3) = 0;
+      if (nclears > 9)
+	{
+	  *(d + 5) = 0;
+	  *(d + 5 + 1) = 0;
+	  *(d + 5 + 2) = 0;
+	  *(d + 5 + 3) = 0;
+	  *(d + nclears - 8) = 0;
+	  *(d + nclears - 8 + 1) = 0;
+	  *(d + nclears - 8 + 2) = 0;
+	  *(d + nclears - 8 + 3) = 0;
+	}
+    }
+  else if (nclears < 3)
+    __builtin_unreachable ();
+
+  return mem;
+}