[v4] malloc: Optimize small memory zeroing for calloc
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_glibc_build--master-arm |
success
|
Build passed
|
redhat-pt-bot/TryBot-32bit |
success
|
Build for i686
|
linaro-tcwg-bot/tcwg_glibc_check--master-arm |
success
|
Test passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 |
success
|
Test passed
|
Commit Message
Add calloc-clear-memory.h to clear memory for calloc. Use overlapping
stores with 1 branch, instead of up to 3. Unroll 17 times on x86-64 to
support 32-byte vector stores as well as x32 when compiling glibc with
-march=x86-64-v3.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
---
malloc/malloc-internal.h | 1 +
malloc/malloc.c | 35 +--------------
sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++
sysdeps/x86_64/calloc-clear-memory.h | 61 +++++++++++++++++++++++++++
4 files changed, 111 insertions(+), 34 deletions(-)
create mode 100644 sysdeps/generic/calloc-clear-memory.h
create mode 100644 sysdeps/x86_64/calloc-clear-memory.h
Comments
On 11/29/2024 8:39 AM, H.J. Lu wrote:
> Add calloc-clear-memory.h to clear memory for calloc. Use overlapping
> stores with 1 branch, instead of up to 3. Unroll 17 times on x86-64 to
> support 32-byte vector stores as well as x32 when compiling glibc with
> -march=x86-64-v3.
>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> malloc/malloc-internal.h | 1 +
> malloc/malloc.c | 35 +--------------
> sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++
> sysdeps/x86_64/calloc-clear-memory.h | 61 +++++++++++++++++++++++++++
> 4 files changed, 111 insertions(+), 34 deletions(-)
> create mode 100644 sysdeps/generic/calloc-clear-memory.h
> create mode 100644 sysdeps/x86_64/calloc-clear-memory.h
Here is the performance data:
Test Platform: Xeon-8380
Ratio: New / Original time_per_iteration (Lower is Better)
Original Commit: "Add tcache path for calloc"
Without ISA level 3, CFLAGS="-march=x86-64-v2 -O3"
Threads# | Ratio
-----------|------
1 thread | 0.986
4 threads | 0.962
With ISA level 3, CFLAGS="-march=x86-64-v3 -O3"
Threads# | Ratio
-----------|------
1 thread | 1.006 -- within variant
4 threads | 0.969
Hi H.J.,
+static __always_inline void *
+clear_memory (void *mem, unsigned long clearsize)
+{
+ /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
+ that contents have an odd number of INTERNAL_SIZE_T-sized words;
+ minimally 3 words. */
+ INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
+ unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
+
+ if (nclears > 9)
+ return memset (d, 0, clearsize);
+
+ /* Use overlapping stores with 1 branch, instead of up to 3. */
+ *(d + 0) = 0;
+ *(d + 1) = 0;
+ *(d + 2) = 0;
+ *(d + nclears - 2) = 0;
+ *(d + nclears - 2 + 1) = 0;
+ if (nclears > 3)
Should be nclears > 5, right?
+ {
+ *(d + 3) = 0;
+ *(d + 3 + 1) = 0;
+ *(d + nclears - 4) = 0;
+ *(d + nclears - 4 + 1) = 0;
+ }
+ else if (nclears < 3)
+ __builtin_unreachable ();
+
+ return mem;
+}
This interface makes much more sense indeed. Still, it's hard to see how
targets could do much better than memset
On Sat, Nov 30, 2024, 2:51 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> Hi H.J.,
>
> +static __always_inline void *
> +clear_memory (void *mem, unsigned long clearsize)
> +{
> + /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
> + that contents have an odd number of INTERNAL_SIZE_T-sized words;
> + minimally 3 words. */
> + INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
> + unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> +
> + if (nclears > 9)
> + return memset (d, 0, clearsize);
> +
> + /* Use overlapping stores with 1 branch, instead of up to 3. */
> + *(d + 0) = 0;
> + *(d + 1) = 0;
> + *(d + 2) = 0;
> + *(d + nclears - 2) = 0;
> + *(d + nclears - 2 + 1) = 0;
> + if (nclears > 3)
>
> Should be nclears > 5, right?
>
Will fix.
> + {
> + *(d + 3) = 0;
> + *(d + 3 + 1) = 0;
> + *(d + nclears - 4) = 0;
> + *(d + nclears - 4 + 1) = 0;
> + }
> + else if (nclears < 3)
> + __builtin_unreachable ();
> +
> + return mem;
> +}
>
> This interface makes much more sense indeed. Still, it's hard to see how
> targets could do much better than memset
>
memset needs 1 indirect branch and more conditional
branches.
H.J.
>
@@ -23,6 +23,7 @@
#include <malloc-sysdep.h>
#include <malloc-size.h>
#include <malloc-hugepages.h>
+#include <calloc-clear-memory.h>
/* Called in the parent process before a fork. */
void __malloc_fork_lock_parent (void) attribute_hidden;
@@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
INTERNAL_SIZE_T sz, oldtopsize;
void *mem;
unsigned long clearsize;
- unsigned long nclears;
- INTERNAL_SIZE_T *d;
ptrdiff_t bytes;
if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
@@ -3853,39 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
}
#endif
- /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
- contents have an odd number of INTERNAL_SIZE_T-sized words;
- minimally 3. */
- d = (INTERNAL_SIZE_T *) mem;
clearsize = csz - SIZE_SZ;
- nclears = clearsize / sizeof (INTERNAL_SIZE_T);
- assert (nclears >= 3);
-
- if (nclears > 9)
- return memset (d, 0, clearsize);
-
- else
- {
- *(d + 0) = 0;
- *(d + 1) = 0;
- *(d + 2) = 0;
- if (nclears > 4)
- {
- *(d + 3) = 0;
- *(d + 4) = 0;
- if (nclears > 6)
- {
- *(d + 5) = 0;
- *(d + 6) = 0;
- if (nclears > 8)
- {
- *(d + 7) = 0;
- *(d + 8) = 0;
- }
- }
- }
- }
-
+ clear_memory (mem, clearsize);
return mem;
}
#endif /* IS_IN (libc) */
new file mode 100644
@@ -0,0 +1,48 @@
+/* Clear a block of memory for calloc. Generic version.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+static __always_inline void *
+clear_memory (void *mem, unsigned long clearsize)
+{
+ /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
+ that contents have an odd number of INTERNAL_SIZE_T-sized words;
+ minimally 3 words. */
+ INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
+ unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
+
+ if (nclears > 9)
+ return memset (d, 0, clearsize);
+
+ /* Use overlapping stores with 1 branch, instead of up to 3. */
+ *(d + 0) = 0;
+ *(d + 1) = 0;
+ *(d + 2) = 0;
+ *(d + nclears - 2) = 0;
+ *(d + nclears - 2 + 1) = 0;
+ if (nclears > 3)
+ {
+ *(d + 3) = 0;
+ *(d + 3 + 1) = 0;
+ *(d + nclears - 4) = 0;
+ *(d + nclears - 4 + 1) = 0;
+ }
+ else if (nclears < 3)
+ __builtin_unreachable ();
+
+ return mem;
+}
new file mode 100644
@@ -0,0 +1,61 @@
+/* Clear a block of memory for calloc. X64-64 version.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+static __always_inline void *
+clear_memory (void *mem, unsigned long clearsize)
+{
+ /* Unroll clear memory size up to 17 * INTERNAL_SIZE_T bytes. We know
+ that contents have an odd number of INTERNAL_SIZE_T-sized words;
+ minimally 3 words. */
+ INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
+ unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
+
+ if (nclears > 17)
+ memset (mem, 0, clearsize);
+
+ /* Use overlapping stores with 2 branch, instead of up to 6. */
+ *(d + 0) = 0;
+ *(d + 1) = 0;
+ *(d + 2) = 0;
+ if (nclears > 3)
+ {
+ *(d + 1) = 0;
+ *(d + 2) = 0;
+ *(d + 3) = 0;
+ *(d + 4) = 0;
+ *(d + nclears - 4) = 0;
+ *(d + nclears - 4 + 1) = 0;
+ *(d + nclears - 4 + 2) = 0;
+ *(d + nclears - 4 + 3) = 0;
+ if (nclears > 9)
+ {
+ *(d + 5) = 0;
+ *(d + 5 + 1) = 0;
+ *(d + 5 + 2) = 0;
+ *(d + 5 + 3) = 0;
+ *(d + nclears - 8) = 0;
+ *(d + nclears - 8 + 1) = 0;
+ *(d + nclears - 8 + 2) = 0;
+ *(d + nclears - 8 + 3) = 0;
+ }
+ }
+ else if (nclears < 3)
+ __builtin_unreachable ();
+
+ return mem;
+}