[v5] malloc: Optimize small memory clearing for calloc
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 |
success
|
Test passed
|
linaro-tcwg-bot/tcwg_glibc_build--master-arm |
success
|
Build passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-arm |
success
|
Test passed
|
redhat-pt-bot/TryBot-32bit |
success
|
Build for i686
|
Commit Message
Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if 8byte
sizes) for calloc. Use overlapping stores with 1 branch, instead of up to
3 branches. On x860-64, it is faster than memset since calling memset
needs 1 indirect branch, 1 broadcast, and up to 4 branches.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
---
malloc/malloc-internal.h | 1 +
malloc/malloc.c | 36 +-------------------
sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
3 files changed, 50 insertions(+), 35 deletions(-)
create mode 100644 sysdeps/generic/calloc-clear-memory.h
Comments
On 30/11/24 01:21, H.J. Lu wrote:
> Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if 8byte
> sizes) for calloc. Use overlapping stores with 1 branch, instead of up to
> 3 branches. On x860-64, it is faster than memset since calling memset
> needs 1 indirect branch, 1 broadcast, and up to 4 branches.
>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> malloc/malloc-internal.h | 1 +
> malloc/malloc.c | 36 +-------------------
> sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
> 3 files changed, 50 insertions(+), 35 deletions(-)
> create mode 100644 sysdeps/generic/calloc-clear-memory.h
>
> diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
> index cba03433fe..3349e2d1fe 100644
> --- a/malloc/malloc-internal.h
> +++ b/malloc/malloc-internal.h
> @@ -23,6 +23,7 @@
> #include <malloc-sysdep.h>
> #include <malloc-size.h>
> #include <malloc-hugepages.h>
> +#include <calloc-clear-memory.h>
>
> /* Called in the parent process before a fork. */
> void __malloc_fork_lock_parent (void) attribute_hidden;
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 287fa0904d..95775a3515 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
> INTERNAL_SIZE_T sz, oldtopsize;
> void *mem;
> unsigned long clearsize;
> - unsigned long nclears;
> - INTERNAL_SIZE_T *d;
> ptrdiff_t bytes;
>
> if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> @@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
> }
> #endif
>
> - /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
> - contents have an odd number of INTERNAL_SIZE_T-sized words;
> - minimally 3. */
> - d = (INTERNAL_SIZE_T *) mem;
> clearsize = csz - SIZE_SZ;
> - nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> - assert (nclears >= 3);
> -
> - if (nclears > 9)
> - return memset (d, 0, clearsize);
> -
> - else
> - {
> - *(d + 0) = 0;
> - *(d + 1) = 0;
> - *(d + 2) = 0;
> - if (nclears > 4)
> - {
> - *(d + 3) = 0;
> - *(d + 4) = 0;
> - if (nclears > 6)
> - {
> - *(d + 5) = 0;
> - *(d + 6) = 0;
> - if (nclears > 8)
> - {
> - *(d + 7) = 0;
> - *(d + 8) = 0;
> - }
> - }
> - }
> - }
> -
> - return mem;
> + return clear_memory (mem, clearsize);
> }
> #endif /* IS_IN (libc) */
>
> diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
> new file mode 100644
> index 0000000000..9a2727b772
> --- /dev/null
> +++ b/sysdeps/generic/calloc-clear-memory.h
> @@ -0,0 +1,48 @@
> +/* Clear a block of memory for calloc. Generic version.
> + Copyright (C) 2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +static __always_inline void *
> +clear_memory (void *mem, unsigned long clearsize)
> +{
> + /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
> + that contents have an odd number of INTERNAL_SIZE_T-sized words;
> + minimally 3 words. */
> + INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
I think this strictly UB and it might generate some issues on architecture
with strict alignment requirement (like sparc and some riscv chips). I
think we will need to use either some struct helper with __attribute__((packed))
or memcpy to avoid it.
> + unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> +
> + if (nclears > 9)
> + return memset (d, 0, clearsize);
> +
> + /* Use overlapping stores with 1 branch, instead of up to 3. */
> + *(d + 0) = 0;
> + *(d + 1) = 0;
> + *(d + 2) = 0;
> + *(d + nclears - 2) = 0;
> + *(d + nclears - 2 + 1) = 0;
> + if (nclears > 5)
> + {
> + *(d + 3) = 0;
> + *(d + 3 + 1) = 0;
> + *(d + nclears - 4) = 0;
> + *(d + nclears - 4 + 1) = 0;
> + }
> + else if (nclears < 3)
> + __builtin_unreachable ();
> +
> + return mem;
> +}
On Mon, Dec 2, 2024 at 7:56 PM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 30/11/24 01:21, H.J. Lu wrote:
> > Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if 8byte
> > sizes) for calloc. Use overlapping stores with 1 branch, instead of up to
> > 3 branches. On x860-64, it is faster than memset since calling memset
> > needs 1 indirect branch, 1 broadcast, and up to 4 branches.
> >
> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> > malloc/malloc-internal.h | 1 +
> > malloc/malloc.c | 36 +-------------------
> > sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
> > 3 files changed, 50 insertions(+), 35 deletions(-)
> > create mode 100644 sysdeps/generic/calloc-clear-memory.h
> >
> > diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
> > index cba03433fe..3349e2d1fe 100644
> > --- a/malloc/malloc-internal.h
> > +++ b/malloc/malloc-internal.h
> > @@ -23,6 +23,7 @@
> > #include <malloc-sysdep.h>
> > #include <malloc-size.h>
> > #include <malloc-hugepages.h>
> > +#include <calloc-clear-memory.h>
> >
> > /* Called in the parent process before a fork. */
> > void __malloc_fork_lock_parent (void) attribute_hidden;
> > diff --git a/malloc/malloc.c b/malloc/malloc.c
> > index 287fa0904d..95775a3515 100644
> > --- a/malloc/malloc.c
> > +++ b/malloc/malloc.c
> > @@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
> > INTERNAL_SIZE_T sz, oldtopsize;
> > void *mem;
> > unsigned long clearsize;
> > - unsigned long nclears;
> > - INTERNAL_SIZE_T *d;
> > ptrdiff_t bytes;
> >
> > if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> > @@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
> > }
> > #endif
> >
> > - /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
> > - contents have an odd number of INTERNAL_SIZE_T-sized words;
> > - minimally 3. */
> > - d = (INTERNAL_SIZE_T *) mem;
> > clearsize = csz - SIZE_SZ;
> > - nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> > - assert (nclears >= 3);
> > -
> > - if (nclears > 9)
> > - return memset (d, 0, clearsize);
> > -
> > - else
> > - {
> > - *(d + 0) = 0;
> > - *(d + 1) = 0;
> > - *(d + 2) = 0;
> > - if (nclears > 4)
> > - {
> > - *(d + 3) = 0;
> > - *(d + 4) = 0;
> > - if (nclears > 6)
> > - {
> > - *(d + 5) = 0;
> > - *(d + 6) = 0;
> > - if (nclears > 8)
> > - {
> > - *(d + 7) = 0;
> > - *(d + 8) = 0;
> > - }
> > - }
> > - }
> > - }
> > -
> > - return mem;
> > + return clear_memory (mem, clearsize);
> > }
> > #endif /* IS_IN (libc) */
> >
> > diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
> > new file mode 100644
> > index 0000000000..9a2727b772
> > --- /dev/null
> > +++ b/sysdeps/generic/calloc-clear-memory.h
> > @@ -0,0 +1,48 @@
> > +/* Clear a block of memory for calloc. Generic version.
> > + Copyright (C) 2024 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +static __always_inline void *
> > +clear_memory (void *mem, unsigned long clearsize)
> > +{
> > + /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
> > + that contents have an odd number of INTERNAL_SIZE_T-sized words;
> > + minimally 3 words. */
> > + INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
>
> I think this strictly UB and it might generate some issues on architecture
> with strict alignment requirement (like sparc and some riscv chips). I
> think we will need to use either some struct helper with __attribute__((packed))
> or memcpy to avoid it.
I think we should use the original code by default. The unaligned
version is opt-in.
> > + unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> > +
> > + if (nclears > 9)
> > + return memset (d, 0, clearsize);
> > +
> > + /* Use overlapping stores with 1 branch, instead of up to 3. */
> > + *(d + 0) = 0;
> > + *(d + 1) = 0;
> > + *(d + 2) = 0;
> > + *(d + nclears - 2) = 0;
> > + *(d + nclears - 2 + 1) = 0;
> > + if (nclears > 5)
> > + {
> > + *(d + 3) = 0;
> > + *(d + 3 + 1) = 0;
> > + *(d + nclears - 4) = 0;
> > + *(d + nclears - 4 + 1) = 0;
> > + }
> > + else if (nclears < 3)
> > + __builtin_unreachable ();
> > +
> > + return mem;
> > +}
>
* Adhemerval Zanella Netto:
>> +static __always_inline void *
>> +clear_memory (void *mem, unsigned long clearsize)
>> +{
>> + /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
>> + that contents have an odd number of INTERNAL_SIZE_T-sized words;
>> + minimally 3 words. */
>> + INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
>
> I think this strictly UB and it might generate some issues on architecture
> with strict alignment requirement (like sparc and some riscv chips). I
> think we will need to use either some struct helper with __attribute__((packed))
> or memcpy to avoid it.
I think everything is properly aligned? The “overlapping” comment is a
bit misleading, it's about multiple stores to the same locations, not
partially overlapping stores.
Thanks,
Florian
On 02/12/24 09:05, Florian Weimer wrote:
> * Adhemerval Zanella Netto:
>
>>> +static __always_inline void *
>>> +clear_memory (void *mem, unsigned long clearsize)
>>> +{
>>> + /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
>>> + that contents have an odd number of INTERNAL_SIZE_T-sized words;
>>> + minimally 3 words. */
>>> + INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
>>
>> I think this strictly UB and it might generate some issues on architecture
>> with strict alignment requirement (like sparc and some riscv chips). I
>> think we will need to use either some struct helper with __attribute__((packed))
>> or memcpy to avoid it.
>
> I think everything is properly aligned? The “overlapping” comment is a
> bit misleading, it's about multiple stores to the same locations, not
> partially overlapping stores.
So maybe then define 'mem' as 'INTERNAL_SIZE_T *'?
On Fri, Nov 29, 2024 at 8:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if 8byte
> sizes) for calloc. Use overlapping stores with 1 branch, instead of up to
> 3 branches. On x860-64, it is faster than memset since calling memset
> needs 1 indirect branch, 1 broadcast, and up to 4 branches.
>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
> malloc/malloc-internal.h | 1 +
> malloc/malloc.c | 36 +-------------------
> sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
> 3 files changed, 50 insertions(+), 35 deletions(-)
> create mode 100644 sysdeps/generic/calloc-clear-memory.h
>
> diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
> index cba03433fe..3349e2d1fe 100644
> --- a/malloc/malloc-internal.h
> +++ b/malloc/malloc-internal.h
> @@ -23,6 +23,7 @@
> #include <malloc-sysdep.h>
> #include <malloc-size.h>
> #include <malloc-hugepages.h>
> +#include <calloc-clear-memory.h>
>
> /* Called in the parent process before a fork. */
> void __malloc_fork_lock_parent (void) attribute_hidden;
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 287fa0904d..95775a3515 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
> INTERNAL_SIZE_T sz, oldtopsize;
> void *mem;
> unsigned long clearsize;
> - unsigned long nclears;
> - INTERNAL_SIZE_T *d;
> ptrdiff_t bytes;
>
> if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> @@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
> }
> #endif
>
> - /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
> - contents have an odd number of INTERNAL_SIZE_T-sized words;
> - minimally 3. */
> - d = (INTERNAL_SIZE_T *) mem;
> clearsize = csz - SIZE_SZ;
> - nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> - assert (nclears >= 3);
> -
> - if (nclears > 9)
> - return memset (d, 0, clearsize);
> -
> - else
> - {
> - *(d + 0) = 0;
> - *(d + 1) = 0;
> - *(d + 2) = 0;
> - if (nclears > 4)
> - {
> - *(d + 3) = 0;
> - *(d + 4) = 0;
> - if (nclears > 6)
> - {
> - *(d + 5) = 0;
> - *(d + 6) = 0;
> - if (nclears > 8)
> - {
> - *(d + 7) = 0;
> - *(d + 8) = 0;
> - }
> - }
> - }
> - }
> -
> - return mem;
> + return clear_memory (mem, clearsize);
> }
> #endif /* IS_IN (libc) */
>
> diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
> new file mode 100644
> index 0000000000..9a2727b772
> --- /dev/null
> +++ b/sysdeps/generic/calloc-clear-memory.h
> @@ -0,0 +1,48 @@
> +/* Clear a block of memory for calloc. Generic version.
> + Copyright (C) 2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +static __always_inline void *
> +clear_memory (void *mem, unsigned long clearsize)
> +{
> + /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
> + that contents have an odd number of INTERNAL_SIZE_T-sized words;
> + minimally 3 words. */
> + INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
> + unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> +
> + if (nclears > 9)
> + return memset (d, 0, clearsize);
> +
> + /* Use overlapping stores with 1 branch, instead of up to 3. */
> + *(d + 0) = 0;
> + *(d + 1) = 0;
> + *(d + 2) = 0;
> + *(d + nclears - 2) = 0;
> + *(d + nclears - 2 + 1) = 0;
> + if (nclears > 5)
> + {
> + *(d + 3) = 0;
> + *(d + 3 + 1) = 0;
> + *(d + nclears - 4) = 0;
> + *(d + nclears - 4 + 1) = 0;
> + }
> + else if (nclears < 3)
> + __builtin_unreachable ();
> +
Should be before the first deref?
I.e at the top
```
if(nclears < 3) __builtin_unreachable();`
*(d + 0) = 0;
...
```
> + return mem;
> +}
> --
> 2.47.1
>
On Tue, Dec 3, 2024 at 10:00 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Nov 29, 2024 at 8:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if 8byte
> > sizes) for calloc. Use overlapping stores with 1 branch, instead of up to
> > 3 branches. On x860-64, it is faster than memset since calling memset
> > needs 1 indirect branch, 1 broadcast, and up to 4 branches.
> >
> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> > malloc/malloc-internal.h | 1 +
> > malloc/malloc.c | 36 +-------------------
> > sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
> > 3 files changed, 50 insertions(+), 35 deletions(-)
> > create mode 100644 sysdeps/generic/calloc-clear-memory.h
> >
> > diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
> > index cba03433fe..3349e2d1fe 100644
> > --- a/malloc/malloc-internal.h
> > +++ b/malloc/malloc-internal.h
> > @@ -23,6 +23,7 @@
> > #include <malloc-sysdep.h>
> > #include <malloc-size.h>
> > #include <malloc-hugepages.h>
> > +#include <calloc-clear-memory.h>
> >
> > /* Called in the parent process before a fork. */
> > void __malloc_fork_lock_parent (void) attribute_hidden;
> > diff --git a/malloc/malloc.c b/malloc/malloc.c
> > index 287fa0904d..95775a3515 100644
> > --- a/malloc/malloc.c
> > +++ b/malloc/malloc.c
> > @@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
> > INTERNAL_SIZE_T sz, oldtopsize;
> > void *mem;
> > unsigned long clearsize;
> > - unsigned long nclears;
> > - INTERNAL_SIZE_T *d;
> > ptrdiff_t bytes;
> >
> > if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> > @@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
> > }
> > #endif
> >
> > - /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
> > - contents have an odd number of INTERNAL_SIZE_T-sized words;
> > - minimally 3. */
> > - d = (INTERNAL_SIZE_T *) mem;
> > clearsize = csz - SIZE_SZ;
> > - nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> > - assert (nclears >= 3);
> > -
> > - if (nclears > 9)
> > - return memset (d, 0, clearsize);
> > -
> > - else
> > - {
> > - *(d + 0) = 0;
> > - *(d + 1) = 0;
> > - *(d + 2) = 0;
> > - if (nclears > 4)
> > - {
> > - *(d + 3) = 0;
> > - *(d + 4) = 0;
> > - if (nclears > 6)
> > - {
> > - *(d + 5) = 0;
> > - *(d + 6) = 0;
> > - if (nclears > 8)
> > - {
> > - *(d + 7) = 0;
> > - *(d + 8) = 0;
> > - }
> > - }
> > - }
> > - }
> > -
> > - return mem;
> > + return clear_memory (mem, clearsize);
> > }
> > #endif /* IS_IN (libc) */
> >
> > diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
> > new file mode 100644
> > index 0000000000..9a2727b772
> > --- /dev/null
> > +++ b/sysdeps/generic/calloc-clear-memory.h
> > @@ -0,0 +1,48 @@
> > +/* Clear a block of memory for calloc. Generic version.
> > + Copyright (C) 2024 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +static __always_inline void *
> > +clear_memory (void *mem, unsigned long clearsize)
> > +{
> > + /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
> > + that contents have an odd number of INTERNAL_SIZE_T-sized words;
> > + minimally 3 words. */
> > + INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
> > + unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> > +
> > + if (nclears > 9)
> > + return memset (d, 0, clearsize);
> > +
> > + /* Use overlapping stores with 1 branch, instead of up to 3. */
> > + *(d + 0) = 0;
> > + *(d + 1) = 0;
> > + *(d + 2) = 0;
> > + *(d + nclears - 2) = 0;
> > + *(d + nclears - 2 + 1) = 0;
> > + if (nclears > 5)
> > + {
> > + *(d + 3) = 0;
> > + *(d + 3 + 1) = 0;
> > + *(d + nclears - 4) = 0;
> > + *(d + nclears - 4 + 1) = 0;
> > + }
> > + else if (nclears < 3)
> > + __builtin_unreachable ();
> > +
> Should be before the first deref?
>
> I.e at the top
> ```
> if(nclears < 3) __builtin_unreachable();`
This should never happen in production, just like malloc
memory is always aligned to INTERNAL_SIZE_T. If
nclears < 3, something really went wrong in malloc which
should be caught during glibc development.
> *(d + 0) = 0;
> ...
> ```
> > + return mem;
> > +}
> > --
> > 2.47.1
> >
On Mon, Dec 2, 2024 at 6:00 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:
> On Fri, Nov 29, 2024 at 8:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if
> 8byte
> > sizes) for calloc. Use overlapping stores with 1 branch, instead of up
> to
> > 3 branches. On x860-64, it is faster than memset since calling memset
> > needs 1 indirect branch, 1 broadcast, and up to 4 branches.
> >
> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> > malloc/malloc-internal.h | 1 +
> > malloc/malloc.c | 36 +-------------------
> > sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
> > 3 files changed, 50 insertions(+), 35 deletions(-)
> > create mode 100644 sysdeps/generic/calloc-clear-memory.h
> >
> > diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
> > index cba03433fe..3349e2d1fe 100644
> > --- a/malloc/malloc-internal.h
> > +++ b/malloc/malloc-internal.h
> > @@ -23,6 +23,7 @@
> > #include <malloc-sysdep.h>
> > #include <malloc-size.h>
> > #include <malloc-hugepages.h>
> > +#include <calloc-clear-memory.h>
> >
> > /* Called in the parent process before a fork. */
> > void __malloc_fork_lock_parent (void) attribute_hidden;
> > diff --git a/malloc/malloc.c b/malloc/malloc.c
> > index 287fa0904d..95775a3515 100644
> > --- a/malloc/malloc.c
> > +++ b/malloc/malloc.c
> > @@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
> > INTERNAL_SIZE_T sz, oldtopsize;
> > void *mem;
> > unsigned long clearsize;
> > - unsigned long nclears;
> > - INTERNAL_SIZE_T *d;
> > ptrdiff_t bytes;
> >
> > if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> > @@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
> > }
> > #endif
> >
> > - /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
> > - contents have an odd number of INTERNAL_SIZE_T-sized words;
> > - minimally 3. */
> > - d = (INTERNAL_SIZE_T *) mem;
> > clearsize = csz - SIZE_SZ;
> > - nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> > - assert (nclears >= 3);
> > -
> > - if (nclears > 9)
> > - return memset (d, 0, clearsize);
> > -
> > - else
> > - {
> > - *(d + 0) = 0;
> > - *(d + 1) = 0;
> > - *(d + 2) = 0;
> > - if (nclears > 4)
> > - {
> > - *(d + 3) = 0;
> > - *(d + 4) = 0;
> > - if (nclears > 6)
> > - {
> > - *(d + 5) = 0;
> > - *(d + 6) = 0;
> > - if (nclears > 8)
> > - {
> > - *(d + 7) = 0;
> > - *(d + 8) = 0;
> > - }
> > - }
> > - }
> > - }
> > -
> > - return mem;
> > + return clear_memory (mem, clearsize);
> > }
> > #endif /* IS_IN (libc) */
> >
> > diff --git a/sysdeps/generic/calloc-clear-memory.h
> b/sysdeps/generic/calloc-clear-memory.h
> > new file mode 100644
> > index 0000000000..9a2727b772
> > --- /dev/null
> > +++ b/sysdeps/generic/calloc-clear-memory.h
> > @@ -0,0 +1,48 @@
> > +/* Clear a block of memory for calloc. Generic version.
> > + Copyright (C) 2024 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +static __always_inline void *
> > +clear_memory (void *mem, unsigned long clearsize)
> > +{
> > + /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
> > + that contents have an odd number of INTERNAL_SIZE_T-sized words;
> > + minimally 3 words. */
> > + INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
> > + unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> > +
> > + if (nclears > 9)
> > + return memset (d, 0, clearsize);
> > +
> > + /* Use overlapping stores with 1 branch, instead of up to 3. */
> > + *(d + 0) = 0;
> > + *(d + 1) = 0;
> > + *(d + 2) = 0;
> > + *(d + nclears - 2) = 0;
> > + *(d + nclears - 2 + 1) = 0;
>
For nclears = 8.
In existing code, *(d+7) will not be set.
In new code, *(d+7) will be set.
Not sure if I'm missing something here.
> > + if (nclears > 5)
> > + {
> > + *(d + 3) = 0;
> > + *(d + 3 + 1) = 0;
> > + *(d + nclears - 4) = 0;
> > + *(d + nclears - 4 + 1) = 0;
> > + }
> > + else if (nclears < 3)
> > + __builtin_unreachable ();
> > +
> Should be before the first deref?
>
> I.e at the top
> ```
> if(nclears < 3) __builtin_unreachable();`
> *(d + 0) = 0;
> ...
> ```
> > + return mem;
> > +}
> > --
> > 2.47.1
> >
>
On Tue, Dec 3, 2024 at 10:13 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Mon, Dec 2, 2024 at 6:00 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> On Fri, Nov 29, 2024 at 8:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>> >
>> > Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if 8byte
>> > sizes) for calloc. Use overlapping stores with 1 branch, instead of up to
>> > 3 branches. On x860-64, it is faster than memset since calling memset
>> > needs 1 indirect branch, 1 broadcast, and up to 4 branches.
>> >
>> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
>> > ---
>> > malloc/malloc-internal.h | 1 +
>> > malloc/malloc.c | 36 +-------------------
>> > sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
>> > 3 files changed, 50 insertions(+), 35 deletions(-)
>> > create mode 100644 sysdeps/generic/calloc-clear-memory.h
>> >
>> > diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
>> > index cba03433fe..3349e2d1fe 100644
>> > --- a/malloc/malloc-internal.h
>> > +++ b/malloc/malloc-internal.h
>> > @@ -23,6 +23,7 @@
>> > #include <malloc-sysdep.h>
>> > #include <malloc-size.h>
>> > #include <malloc-hugepages.h>
>> > +#include <calloc-clear-memory.h>
>> >
>> > /* Called in the parent process before a fork. */
>> > void __malloc_fork_lock_parent (void) attribute_hidden;
>> > diff --git a/malloc/malloc.c b/malloc/malloc.c
>> > index 287fa0904d..95775a3515 100644
>> > --- a/malloc/malloc.c
>> > +++ b/malloc/malloc.c
>> > @@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
>> > INTERNAL_SIZE_T sz, oldtopsize;
>> > void *mem;
>> > unsigned long clearsize;
>> > - unsigned long nclears;
>> > - INTERNAL_SIZE_T *d;
>> > ptrdiff_t bytes;
>> >
>> > if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
>> > @@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
>> > }
>> > #endif
>> >
>> > - /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
>> > - contents have an odd number of INTERNAL_SIZE_T-sized words;
>> > - minimally 3. */
>> > - d = (INTERNAL_SIZE_T *) mem;
>> > clearsize = csz - SIZE_SZ;
>> > - nclears = clearsize / sizeof (INTERNAL_SIZE_T);
>> > - assert (nclears >= 3);
>> > -
>> > - if (nclears > 9)
>> > - return memset (d, 0, clearsize);
>> > -
>> > - else
>> > - {
>> > - *(d + 0) = 0;
>> > - *(d + 1) = 0;
>> > - *(d + 2) = 0;
>> > - if (nclears > 4)
>> > - {
>> > - *(d + 3) = 0;
>> > - *(d + 4) = 0;
>> > - if (nclears > 6)
>> > - {
>> > - *(d + 5) = 0;
>> > - *(d + 6) = 0;
>> > - if (nclears > 8)
>> > - {
>> > - *(d + 7) = 0;
>> > - *(d + 8) = 0;
>> > - }
>> > - }
>> > - }
>> > - }
>> > -
>> > - return mem;
>> > + return clear_memory (mem, clearsize);
>> > }
>> > #endif /* IS_IN (libc) */
>> >
>> > diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
>> > new file mode 100644
>> > index 0000000000..9a2727b772
>> > --- /dev/null
>> > +++ b/sysdeps/generic/calloc-clear-memory.h
>> > @@ -0,0 +1,48 @@
>> > +/* Clear a block of memory for calloc. Generic version.
>> > + Copyright (C) 2024 Free Software Foundation, Inc.
>> > + This file is part of the GNU C Library.
>> > +
>> > + The GNU C Library is free software; you can redistribute it and/or
>> > + modify it under the terms of the GNU Lesser General Public
>> > + License as published by the Free Software Foundation; either
>> > + version 2.1 of the License, or (at your option) any later version.
>> > +
>> > + The GNU C Library is distributed in the hope that it will be useful,
>> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> > + Lesser General Public License for more details.
>> > +
>> > + You should have received a copy of the GNU Lesser General Public
>> > + License along with the GNU C Library; if not, see
>> > + <https://www.gnu.org/licenses/>. */
>> > +
>> > +static __always_inline void *
>> > +clear_memory (void *mem, unsigned long clearsize)
>> > +{
>> > + /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
>> > + that contents have an odd number of INTERNAL_SIZE_T-sized words;
>> > + minimally 3 words. */
>> > + INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
>> > + unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
>> > +
>> > + if (nclears > 9)
>> > + return memset (d, 0, clearsize);
>> > +
>> > + /* Use overlapping stores with 1 branch, instead of up to 3. */
>> > + *(d + 0) = 0;
>> > + *(d + 1) = 0;
>> > + *(d + 2) = 0;
>> > + *(d + nclears - 2) = 0;
>> > + *(d + nclears - 2 + 1) = 0;
>
>
> For nclears = 8.
>
> In existing code, *(d+7) will not be set.
>
> In new code, *(d+7) will be set.
>
> Not sure if I'm missing something here.
The code comments say:
/* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
that contents have an odd number of INTERNAL_SIZE_T-sized words;
minimally 3 words. */
nclears == 8 never happens.
>
>>
>> > + if (nclears > 5)
>> > + {
>> > + *(d + 3) = 0;
>> > + *(d + 3 + 1) = 0;
>> > + *(d + nclears - 4) = 0;
>> > + *(d + nclears - 4 + 1) = 0;
>> > + }
>> > + else if (nclears < 3)
>> > + __builtin_unreachable ();
>> > +
>> Should be before the first deref?
>>
>> I.e at the top
>> ```
>> if(nclears < 3) __builtin_unreachable();`
>> *(d + 0) = 0;
>> ...
>> ```
>> > + return mem;
>> > +}
>> > --
>> > 2.47.1
>> >
@@ -23,6 +23,7 @@
#include <malloc-sysdep.h>
#include <malloc-size.h>
#include <malloc-hugepages.h>
+#include <calloc-clear-memory.h>
/* Called in the parent process before a fork. */
void __malloc_fork_lock_parent (void) attribute_hidden;
@@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
INTERNAL_SIZE_T sz, oldtopsize;
void *mem;
unsigned long clearsize;
- unsigned long nclears;
- INTERNAL_SIZE_T *d;
ptrdiff_t bytes;
if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
@@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
}
#endif
- /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
- contents have an odd number of INTERNAL_SIZE_T-sized words;
- minimally 3. */
- d = (INTERNAL_SIZE_T *) mem;
clearsize = csz - SIZE_SZ;
- nclears = clearsize / sizeof (INTERNAL_SIZE_T);
- assert (nclears >= 3);
-
- if (nclears > 9)
- return memset (d, 0, clearsize);
-
- else
- {
- *(d + 0) = 0;
- *(d + 1) = 0;
- *(d + 2) = 0;
- if (nclears > 4)
- {
- *(d + 3) = 0;
- *(d + 4) = 0;
- if (nclears > 6)
- {
- *(d + 5) = 0;
- *(d + 6) = 0;
- if (nclears > 8)
- {
- *(d + 7) = 0;
- *(d + 8) = 0;
- }
- }
- }
- }
-
- return mem;
+ return clear_memory (mem, clearsize);
}
#endif /* IS_IN (libc) */
new file mode 100644
@@ -0,0 +1,48 @@
+/* Clear a block of memory for calloc. Generic version.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+static __always_inline void *
+clear_memory (void *mem, unsigned long clearsize)
+{
+ /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes. We know
+ that contents have an odd number of INTERNAL_SIZE_T-sized words;
+ minimally 3 words. */
+ INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
+ unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
+
+ if (nclears > 9)
+ return memset (d, 0, clearsize);
+
+ /* Use overlapping stores with 1 branch, instead of up to 3. */
+ *(d + 0) = 0;
+ *(d + 1) = 0;
+ *(d + 2) = 0;
+ *(d + nclears - 2) = 0;
+ *(d + nclears - 2 + 1) = 0;
+ if (nclears > 5)
+ {
+ *(d + 3) = 0;
+ *(d + 3 + 1) = 0;
+ *(d + nclears - 4) = 0;
+ *(d + nclears - 4 + 1) = 0;
+ }
+ else if (nclears < 3)
+ __builtin_unreachable ();
+
+ return mem;
+}