[v5] malloc: Optimize small memory clearing for calloc

Message ID 20241130042111.663276-1-hjl.tools@gmail.com
State Superseded
Headers
Series [v5] malloc: Optimize small memory clearing for calloc |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed
redhat-pt-bot/TryBot-32bit success Build for i686

Commit Message

H.J. Lu Nov. 30, 2024, 4:21 a.m. UTC
  Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if 8byte
sizes) for calloc.  Use overlapping stores with 1 branch, instead of up to
3 branches.  On x860-64, it is faster than memset since calling memset
needs 1 indirect branch, 1 broadcast, and up to 4 branches.

Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
---
 malloc/malloc-internal.h              |  1 +
 malloc/malloc.c                       | 36 +-------------------
 sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 35 deletions(-)
 create mode 100644 sysdeps/generic/calloc-clear-memory.h
  

Comments

Adhemerval Zanella Netto Dec. 2, 2024, 11:56 a.m. UTC | #1
On 30/11/24 01:21, H.J. Lu wrote:
> Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if 8byte
> sizes) for calloc.  Use overlapping stores with 1 branch, instead of up to
> 3 branches.  On x860-64, it is faster than memset since calling memset
> needs 1 indirect branch, 1 broadcast, and up to 4 branches.
> 
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
>  malloc/malloc-internal.h              |  1 +
>  malloc/malloc.c                       | 36 +-------------------
>  sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
>  3 files changed, 50 insertions(+), 35 deletions(-)
>  create mode 100644 sysdeps/generic/calloc-clear-memory.h
> 
> diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
> index cba03433fe..3349e2d1fe 100644
> --- a/malloc/malloc-internal.h
> +++ b/malloc/malloc-internal.h
> @@ -23,6 +23,7 @@
>  #include <malloc-sysdep.h>
>  #include <malloc-size.h>
>  #include <malloc-hugepages.h>
> +#include <calloc-clear-memory.h>
>  
>  /* Called in the parent process before a fork.  */
>  void __malloc_fork_lock_parent (void) attribute_hidden;
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 287fa0904d..95775a3515 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
>    INTERNAL_SIZE_T sz, oldtopsize;
>    void *mem;
>    unsigned long clearsize;
> -  unsigned long nclears;
> -  INTERNAL_SIZE_T *d;
>    ptrdiff_t bytes;
>  
>    if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> @@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
>      }
>  #endif
>  
> -  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
> -     contents have an odd number of INTERNAL_SIZE_T-sized words;
> -     minimally 3.  */
> -  d = (INTERNAL_SIZE_T *) mem;
>    clearsize = csz - SIZE_SZ;
> -  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> -  assert (nclears >= 3);
> -
> -  if (nclears > 9)
> -    return memset (d, 0, clearsize);
> -
> -  else
> -    {
> -      *(d + 0) = 0;
> -      *(d + 1) = 0;
> -      *(d + 2) = 0;
> -      if (nclears > 4)
> -        {
> -          *(d + 3) = 0;
> -          *(d + 4) = 0;
> -          if (nclears > 6)
> -            {
> -              *(d + 5) = 0;
> -              *(d + 6) = 0;
> -              if (nclears > 8)
> -                {
> -                  *(d + 7) = 0;
> -                  *(d + 8) = 0;
> -                }
> -            }
> -        }
> -    }
> -
> -  return mem;
> +  return clear_memory (mem, clearsize);
>  }
>  #endif /* IS_IN (libc) */
>  
> diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
> new file mode 100644
> index 0000000000..9a2727b772
> --- /dev/null
> +++ b/sysdeps/generic/calloc-clear-memory.h
> @@ -0,0 +1,48 @@
> +/* Clear a block of memory for calloc.  Generic version.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +static __always_inline void *
> +clear_memory (void *mem, unsigned long clearsize)
> +{
> +  /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
> +     that contents have an odd number of INTERNAL_SIZE_T-sized words;
> +     minimally 3 words.  */
> +  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;

I think this strictly UB and it might generate some issues on architecture 
with strict alignment requirement (like sparc and some riscv chips). I
think we will need to use either some struct helper with __attribute__((packed))
or memcpy to avoid it.

> +  unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> +
> +  if (nclears > 9)
> +    return memset (d, 0, clearsize);
> +
> +  /* Use overlapping stores with 1 branch, instead of up to 3.  */
> +  *(d + 0) = 0;
> +  *(d + 1) = 0;
> +  *(d + 2) = 0;
> +  *(d + nclears - 2) = 0;
> +  *(d + nclears - 2 + 1) = 0;
> +  if (nclears > 5)
> +    {
> +      *(d + 3) = 0;
> +      *(d + 3 + 1) = 0;
> +      *(d + nclears - 4) = 0;
> +      *(d + nclears - 4 + 1) = 0;
> +    }
> +  else if (nclears < 3)
> +    __builtin_unreachable ();
> +
> +  return mem;
> +}
  
H.J. Lu Dec. 2, 2024, 12:05 p.m. UTC | #2
On Mon, Dec 2, 2024 at 7:56 PM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 30/11/24 01:21, H.J. Lu wrote:
> > Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if 8byte
> > sizes) for calloc.  Use overlapping stores with 1 branch, instead of up to
> > 3 branches.  On x860-64, it is faster than memset since calling memset
> > needs 1 indirect branch, 1 broadcast, and up to 4 branches.
> >
> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> >  malloc/malloc-internal.h              |  1 +
> >  malloc/malloc.c                       | 36 +-------------------
> >  sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
> >  3 files changed, 50 insertions(+), 35 deletions(-)
> >  create mode 100644 sysdeps/generic/calloc-clear-memory.h
> >
> > diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
> > index cba03433fe..3349e2d1fe 100644
> > --- a/malloc/malloc-internal.h
> > +++ b/malloc/malloc-internal.h
> > @@ -23,6 +23,7 @@
> >  #include <malloc-sysdep.h>
> >  #include <malloc-size.h>
> >  #include <malloc-hugepages.h>
> > +#include <calloc-clear-memory.h>
> >
> >  /* Called in the parent process before a fork.  */
> >  void __malloc_fork_lock_parent (void) attribute_hidden;
> > diff --git a/malloc/malloc.c b/malloc/malloc.c
> > index 287fa0904d..95775a3515 100644
> > --- a/malloc/malloc.c
> > +++ b/malloc/malloc.c
> > @@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
> >    INTERNAL_SIZE_T sz, oldtopsize;
> >    void *mem;
> >    unsigned long clearsize;
> > -  unsigned long nclears;
> > -  INTERNAL_SIZE_T *d;
> >    ptrdiff_t bytes;
> >
> >    if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> > @@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
> >      }
> >  #endif
> >
> > -  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
> > -     contents have an odd number of INTERNAL_SIZE_T-sized words;
> > -     minimally 3.  */
> > -  d = (INTERNAL_SIZE_T *) mem;
> >    clearsize = csz - SIZE_SZ;
> > -  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> > -  assert (nclears >= 3);
> > -
> > -  if (nclears > 9)
> > -    return memset (d, 0, clearsize);
> > -
> > -  else
> > -    {
> > -      *(d + 0) = 0;
> > -      *(d + 1) = 0;
> > -      *(d + 2) = 0;
> > -      if (nclears > 4)
> > -        {
> > -          *(d + 3) = 0;
> > -          *(d + 4) = 0;
> > -          if (nclears > 6)
> > -            {
> > -              *(d + 5) = 0;
> > -              *(d + 6) = 0;
> > -              if (nclears > 8)
> > -                {
> > -                  *(d + 7) = 0;
> > -                  *(d + 8) = 0;
> > -                }
> > -            }
> > -        }
> > -    }
> > -
> > -  return mem;
> > +  return clear_memory (mem, clearsize);
> >  }
> >  #endif /* IS_IN (libc) */
> >
> > diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
> > new file mode 100644
> > index 0000000000..9a2727b772
> > --- /dev/null
> > +++ b/sysdeps/generic/calloc-clear-memory.h
> > @@ -0,0 +1,48 @@
> > +/* Clear a block of memory for calloc.  Generic version.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +static __always_inline void *
> > +clear_memory (void *mem, unsigned long clearsize)
> > +{
> > +  /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
> > +     that contents have an odd number of INTERNAL_SIZE_T-sized words;
> > +     minimally 3 words.  */
> > +  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
>
> I think this strictly UB and it might generate some issues on architecture
> with strict alignment requirement (like sparc and some riscv chips). I
> think we will need to use either some struct helper with __attribute__((packed))
> or memcpy to avoid it.

I think we should use the original code by default.  The unaligned
version is opt-in.

> > +  unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> > +
> > +  if (nclears > 9)
> > +    return memset (d, 0, clearsize);
> > +
> > +  /* Use overlapping stores with 1 branch, instead of up to 3.  */
> > +  *(d + 0) = 0;
> > +  *(d + 1) = 0;
> > +  *(d + 2) = 0;
> > +  *(d + nclears - 2) = 0;
> > +  *(d + nclears - 2 + 1) = 0;
> > +  if (nclears > 5)
> > +    {
> > +      *(d + 3) = 0;
> > +      *(d + 3 + 1) = 0;
> > +      *(d + nclears - 4) = 0;
> > +      *(d + nclears - 4 + 1) = 0;
> > +    }
> > +  else if (nclears < 3)
> > +    __builtin_unreachable ();
> > +
> > +  return mem;
> > +}
>
  
Florian Weimer Dec. 2, 2024, 12:05 p.m. UTC | #3
* Adhemerval Zanella Netto:

>> +static __always_inline void *
>> +clear_memory (void *mem, unsigned long clearsize)
>> +{
>> +  /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
>> +     that contents have an odd number of INTERNAL_SIZE_T-sized words;
>> +     minimally 3 words.  */
>> +  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
>
> I think this strictly UB and it might generate some issues on architecture 
> with strict alignment requirement (like sparc and some riscv chips). I
> think we will need to use either some struct helper with __attribute__((packed))
> or memcpy to avoid it.

I think everything is properly aligned?  The “overlapping” comment is a
bit misleading, it's about multiple stores to the same locations, not
partially overlapping stores.

Thanks,
Florian
  
Adhemerval Zanella Netto Dec. 2, 2024, 12:17 p.m. UTC | #4
On 02/12/24 09:05, Florian Weimer wrote:
> * Adhemerval Zanella Netto:
> 
>>> +static __always_inline void *
>>> +clear_memory (void *mem, unsigned long clearsize)
>>> +{
>>> +  /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
>>> +     that contents have an odd number of INTERNAL_SIZE_T-sized words;
>>> +     minimally 3 words.  */
>>> +  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
>>
>> I think this strictly UB and it might generate some issues on architecture 
>> with strict alignment requirement (like sparc and some riscv chips). I
>> think we will need to use either some struct helper with __attribute__((packed))
>> or memcpy to avoid it.
> 
> I think everything is properly aligned?  The “overlapping” comment is a
> bit misleading, it's about multiple stores to the same locations, not
> partially overlapping stores.

So maybe then define 'mem' as 'INTERNAL_SIZE_T *'?
  
Noah Goldstein Dec. 3, 2024, 2 a.m. UTC | #5
On Fri, Nov 29, 2024 at 8:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if 8byte
> sizes) for calloc.  Use overlapping stores with 1 branch, instead of up to
> 3 branches.  On x860-64, it is faster than memset since calling memset
> needs 1 indirect branch, 1 broadcast, and up to 4 branches.
>
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> ---
>  malloc/malloc-internal.h              |  1 +
>  malloc/malloc.c                       | 36 +-------------------
>  sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
>  3 files changed, 50 insertions(+), 35 deletions(-)
>  create mode 100644 sysdeps/generic/calloc-clear-memory.h
>
> diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
> index cba03433fe..3349e2d1fe 100644
> --- a/malloc/malloc-internal.h
> +++ b/malloc/malloc-internal.h
> @@ -23,6 +23,7 @@
>  #include <malloc-sysdep.h>
>  #include <malloc-size.h>
>  #include <malloc-hugepages.h>
> +#include <calloc-clear-memory.h>
>
>  /* Called in the parent process before a fork.  */
>  void __malloc_fork_lock_parent (void) attribute_hidden;
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 287fa0904d..95775a3515 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
>    INTERNAL_SIZE_T sz, oldtopsize;
>    void *mem;
>    unsigned long clearsize;
> -  unsigned long nclears;
> -  INTERNAL_SIZE_T *d;
>    ptrdiff_t bytes;
>
>    if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> @@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
>      }
>  #endif
>
> -  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
> -     contents have an odd number of INTERNAL_SIZE_T-sized words;
> -     minimally 3.  */
> -  d = (INTERNAL_SIZE_T *) mem;
>    clearsize = csz - SIZE_SZ;
> -  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> -  assert (nclears >= 3);
> -
> -  if (nclears > 9)
> -    return memset (d, 0, clearsize);
> -
> -  else
> -    {
> -      *(d + 0) = 0;
> -      *(d + 1) = 0;
> -      *(d + 2) = 0;
> -      if (nclears > 4)
> -        {
> -          *(d + 3) = 0;
> -          *(d + 4) = 0;
> -          if (nclears > 6)
> -            {
> -              *(d + 5) = 0;
> -              *(d + 6) = 0;
> -              if (nclears > 8)
> -                {
> -                  *(d + 7) = 0;
> -                  *(d + 8) = 0;
> -                }
> -            }
> -        }
> -    }
> -
> -  return mem;
> +  return clear_memory (mem, clearsize);
>  }
>  #endif /* IS_IN (libc) */
>
> diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
> new file mode 100644
> index 0000000000..9a2727b772
> --- /dev/null
> +++ b/sysdeps/generic/calloc-clear-memory.h
> @@ -0,0 +1,48 @@
> +/* Clear a block of memory for calloc.  Generic version.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +static __always_inline void *
> +clear_memory (void *mem, unsigned long clearsize)
> +{
> +  /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
> +     that contents have an odd number of INTERNAL_SIZE_T-sized words;
> +     minimally 3 words.  */
> +  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
> +  unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> +
> +  if (nclears > 9)
> +    return memset (d, 0, clearsize);
> +
> +  /* Use overlapping stores with 1 branch, instead of up to 3.  */
> +  *(d + 0) = 0;
> +  *(d + 1) = 0;
> +  *(d + 2) = 0;
> +  *(d + nclears - 2) = 0;
> +  *(d + nclears - 2 + 1) = 0;
> +  if (nclears > 5)
> +    {
> +      *(d + 3) = 0;
> +      *(d + 3 + 1) = 0;
> +      *(d + nclears - 4) = 0;
> +      *(d + nclears - 4 + 1) = 0;
> +    }
> +  else if (nclears < 3)
> +    __builtin_unreachable ();
> +
Should be before the first deref?

I.e at the top
```
if(nclears < 3) __builtin_unreachable();`
*(d + 0) = 0;
...
```
> +  return mem;
> +}
> --
> 2.47.1
>
  
H.J. Lu Dec. 3, 2024, 2:08 a.m. UTC | #6
On Tue, Dec 3, 2024 at 10:00 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Nov 29, 2024 at 8:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if 8byte
> > sizes) for calloc.  Use overlapping stores with 1 branch, instead of up to
> > 3 branches.  On x860-64, it is faster than memset since calling memset
> > needs 1 indirect branch, 1 broadcast, and up to 4 branches.
> >
> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> >  malloc/malloc-internal.h              |  1 +
> >  malloc/malloc.c                       | 36 +-------------------
> >  sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
> >  3 files changed, 50 insertions(+), 35 deletions(-)
> >  create mode 100644 sysdeps/generic/calloc-clear-memory.h
> >
> > diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
> > index cba03433fe..3349e2d1fe 100644
> > --- a/malloc/malloc-internal.h
> > +++ b/malloc/malloc-internal.h
> > @@ -23,6 +23,7 @@
> >  #include <malloc-sysdep.h>
> >  #include <malloc-size.h>
> >  #include <malloc-hugepages.h>
> > +#include <calloc-clear-memory.h>
> >
> >  /* Called in the parent process before a fork.  */
> >  void __malloc_fork_lock_parent (void) attribute_hidden;
> > diff --git a/malloc/malloc.c b/malloc/malloc.c
> > index 287fa0904d..95775a3515 100644
> > --- a/malloc/malloc.c
> > +++ b/malloc/malloc.c
> > @@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
> >    INTERNAL_SIZE_T sz, oldtopsize;
> >    void *mem;
> >    unsigned long clearsize;
> > -  unsigned long nclears;
> > -  INTERNAL_SIZE_T *d;
> >    ptrdiff_t bytes;
> >
> >    if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> > @@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
> >      }
> >  #endif
> >
> > -  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
> > -     contents have an odd number of INTERNAL_SIZE_T-sized words;
> > -     minimally 3.  */
> > -  d = (INTERNAL_SIZE_T *) mem;
> >    clearsize = csz - SIZE_SZ;
> > -  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> > -  assert (nclears >= 3);
> > -
> > -  if (nclears > 9)
> > -    return memset (d, 0, clearsize);
> > -
> > -  else
> > -    {
> > -      *(d + 0) = 0;
> > -      *(d + 1) = 0;
> > -      *(d + 2) = 0;
> > -      if (nclears > 4)
> > -        {
> > -          *(d + 3) = 0;
> > -          *(d + 4) = 0;
> > -          if (nclears > 6)
> > -            {
> > -              *(d + 5) = 0;
> > -              *(d + 6) = 0;
> > -              if (nclears > 8)
> > -                {
> > -                  *(d + 7) = 0;
> > -                  *(d + 8) = 0;
> > -                }
> > -            }
> > -        }
> > -    }
> > -
> > -  return mem;
> > +  return clear_memory (mem, clearsize);
> >  }
> >  #endif /* IS_IN (libc) */
> >
> > diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
> > new file mode 100644
> > index 0000000000..9a2727b772
> > --- /dev/null
> > +++ b/sysdeps/generic/calloc-clear-memory.h
> > @@ -0,0 +1,48 @@
> > +/* Clear a block of memory for calloc.  Generic version.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +static __always_inline void *
> > +clear_memory (void *mem, unsigned long clearsize)
> > +{
> > +  /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
> > +     that contents have an odd number of INTERNAL_SIZE_T-sized words;
> > +     minimally 3 words.  */
> > +  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
> > +  unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> > +
> > +  if (nclears > 9)
> > +    return memset (d, 0, clearsize);
> > +
> > +  /* Use overlapping stores with 1 branch, instead of up to 3.  */
> > +  *(d + 0) = 0;
> > +  *(d + 1) = 0;
> > +  *(d + 2) = 0;
> > +  *(d + nclears - 2) = 0;
> > +  *(d + nclears - 2 + 1) = 0;
> > +  if (nclears > 5)
> > +    {
> > +      *(d + 3) = 0;
> > +      *(d + 3 + 1) = 0;
> > +      *(d + nclears - 4) = 0;
> > +      *(d + nclears - 4 + 1) = 0;
> > +    }
> > +  else if (nclears < 3)
> > +    __builtin_unreachable ();
> > +
> Should be before the first deref?
>
> I.e at the top
> ```
> if(nclears < 3) __builtin_unreachable();`

This should never happen in production, just like malloc
memory is always aligned to INTERNAL_SIZE_T.  If
nclears < 3, something really went wrong in malloc which
should be caught during glibc development.

> *(d + 0) = 0;
> ...
> ```
> > +  return mem;
> > +}
> > --
> > 2.47.1
> >
  
Sunil Pandey Dec. 3, 2024, 2:12 a.m. UTC | #7
On Mon, Dec 2, 2024 at 6:00 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> On Fri, Nov 29, 2024 at 8:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if
> 8byte
> > sizes) for calloc.  Use overlapping stores with 1 branch, instead of up
> to
> > 3 branches.  On x860-64, it is faster than memset since calling memset
> > needs 1 indirect branch, 1 broadcast, and up to 4 branches.
> >
> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> > ---
> >  malloc/malloc-internal.h              |  1 +
> >  malloc/malloc.c                       | 36 +-------------------
> >  sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
> >  3 files changed, 50 insertions(+), 35 deletions(-)
> >  create mode 100644 sysdeps/generic/calloc-clear-memory.h
> >
> > diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
> > index cba03433fe..3349e2d1fe 100644
> > --- a/malloc/malloc-internal.h
> > +++ b/malloc/malloc-internal.h
> > @@ -23,6 +23,7 @@
> >  #include <malloc-sysdep.h>
> >  #include <malloc-size.h>
> >  #include <malloc-hugepages.h>
> > +#include <calloc-clear-memory.h>
> >
> >  /* Called in the parent process before a fork.  */
> >  void __malloc_fork_lock_parent (void) attribute_hidden;
> > diff --git a/malloc/malloc.c b/malloc/malloc.c
> > index 287fa0904d..95775a3515 100644
> > --- a/malloc/malloc.c
> > +++ b/malloc/malloc.c
> > @@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
> >    INTERNAL_SIZE_T sz, oldtopsize;
> >    void *mem;
> >    unsigned long clearsize;
> > -  unsigned long nclears;
> > -  INTERNAL_SIZE_T *d;
> >    ptrdiff_t bytes;
> >
> >    if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
> > @@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
> >      }
> >  #endif
> >
> > -  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
> > -     contents have an odd number of INTERNAL_SIZE_T-sized words;
> > -     minimally 3.  */
> > -  d = (INTERNAL_SIZE_T *) mem;
> >    clearsize = csz - SIZE_SZ;
> > -  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> > -  assert (nclears >= 3);
> > -
> > -  if (nclears > 9)
> > -    return memset (d, 0, clearsize);
> > -
> > -  else
> > -    {
> > -      *(d + 0) = 0;
> > -      *(d + 1) = 0;
> > -      *(d + 2) = 0;
> > -      if (nclears > 4)
> > -        {
> > -          *(d + 3) = 0;
> > -          *(d + 4) = 0;
> > -          if (nclears > 6)
> > -            {
> > -              *(d + 5) = 0;
> > -              *(d + 6) = 0;
> > -              if (nclears > 8)
> > -                {
> > -                  *(d + 7) = 0;
> > -                  *(d + 8) = 0;
> > -                }
> > -            }
> > -        }
> > -    }
> > -
> > -  return mem;
> > +  return clear_memory (mem, clearsize);
> >  }
> >  #endif /* IS_IN (libc) */
> >
> > diff --git a/sysdeps/generic/calloc-clear-memory.h
> b/sysdeps/generic/calloc-clear-memory.h
> > new file mode 100644
> > index 0000000000..9a2727b772
> > --- /dev/null
> > +++ b/sysdeps/generic/calloc-clear-memory.h
> > @@ -0,0 +1,48 @@
> > +/* Clear a block of memory for calloc.  Generic version.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +static __always_inline void *
> > +clear_memory (void *mem, unsigned long clearsize)
> > +{
> > +  /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
> > +     that contents have an odd number of INTERNAL_SIZE_T-sized words;
> > +     minimally 3 words.  */
> > +  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
> > +  unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
> > +
> > +  if (nclears > 9)
> > +    return memset (d, 0, clearsize);
> > +
> > +  /* Use overlapping stores with 1 branch, instead of up to 3.  */
> > +  *(d + 0) = 0;
> > +  *(d + 1) = 0;
> > +  *(d + 2) = 0;
> > +  *(d + nclears - 2) = 0;
> > +  *(d + nclears - 2 + 1) = 0;
>

For nclears = 8.

In existing code, *(d+7) will not be set.

In new code, *(d+7)  will be set.

Not sure if I'm missing something here.



> > +  if (nclears > 5)
> > +    {
> > +      *(d + 3) = 0;
> > +      *(d + 3 + 1) = 0;
> > +      *(d + nclears - 4) = 0;
> > +      *(d + nclears - 4 + 1) = 0;
> > +    }
> > +  else if (nclears < 3)
> > +    __builtin_unreachable ();
> > +
> Should be before the first deref?
>
> I.e at the top
> ```
> if(nclears < 3) __builtin_unreachable();`
> *(d + 0) = 0;
> ...
> ```
> > +  return mem;
> > +}
> > --
> > 2.47.1
> >
>
  
H.J. Lu Dec. 3, 2024, 2:16 a.m. UTC | #8
On Tue, Dec 3, 2024 at 10:13 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
>
>
> On Mon, Dec 2, 2024 at 6:00 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> On Fri, Nov 29, 2024 at 8:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>> >
>> > Add calloc-clear-memory.h to clear memory size up to 36 bytes (72 if 8byte
>> > sizes) for calloc.  Use overlapping stores with 1 branch, instead of up to
>> > 3 branches.  On x860-64, it is faster than memset since calling memset
>> > needs 1 indirect branch, 1 broadcast, and up to 4 branches.
>> >
>> > Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
>> > ---
>> >  malloc/malloc-internal.h              |  1 +
>> >  malloc/malloc.c                       | 36 +-------------------
>> >  sysdeps/generic/calloc-clear-memory.h | 48 +++++++++++++++++++++++++++
>> >  3 files changed, 50 insertions(+), 35 deletions(-)
>> >  create mode 100644 sysdeps/generic/calloc-clear-memory.h
>> >
>> > diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
>> > index cba03433fe..3349e2d1fe 100644
>> > --- a/malloc/malloc-internal.h
>> > +++ b/malloc/malloc-internal.h
>> > @@ -23,6 +23,7 @@
>> >  #include <malloc-sysdep.h>
>> >  #include <malloc-size.h>
>> >  #include <malloc-hugepages.h>
>> > +#include <calloc-clear-memory.h>
>> >
>> >  /* Called in the parent process before a fork.  */
>> >  void __malloc_fork_lock_parent (void) attribute_hidden;
>> > diff --git a/malloc/malloc.c b/malloc/malloc.c
>> > index 287fa0904d..95775a3515 100644
>> > --- a/malloc/malloc.c
>> > +++ b/malloc/malloc.c
>> > @@ -3755,8 +3755,6 @@ __libc_calloc (size_t n, size_t elem_size)
>> >    INTERNAL_SIZE_T sz, oldtopsize;
>> >    void *mem;
>> >    unsigned long clearsize;
>> > -  unsigned long nclears;
>> > -  INTERNAL_SIZE_T *d;
>> >    ptrdiff_t bytes;
>> >
>> >    if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
>> > @@ -3853,40 +3851,8 @@ __libc_calloc (size_t n, size_t elem_size)
>> >      }
>> >  #endif
>> >
>> > -  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
>> > -     contents have an odd number of INTERNAL_SIZE_T-sized words;
>> > -     minimally 3.  */
>> > -  d = (INTERNAL_SIZE_T *) mem;
>> >    clearsize = csz - SIZE_SZ;
>> > -  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
>> > -  assert (nclears >= 3);
>> > -
>> > -  if (nclears > 9)
>> > -    return memset (d, 0, clearsize);
>> > -
>> > -  else
>> > -    {
>> > -      *(d + 0) = 0;
>> > -      *(d + 1) = 0;
>> > -      *(d + 2) = 0;
>> > -      if (nclears > 4)
>> > -        {
>> > -          *(d + 3) = 0;
>> > -          *(d + 4) = 0;
>> > -          if (nclears > 6)
>> > -            {
>> > -              *(d + 5) = 0;
>> > -              *(d + 6) = 0;
>> > -              if (nclears > 8)
>> > -                {
>> > -                  *(d + 7) = 0;
>> > -                  *(d + 8) = 0;
>> > -                }
>> > -            }
>> > -        }
>> > -    }
>> > -
>> > -  return mem;
>> > +  return clear_memory (mem, clearsize);
>> >  }
>> >  #endif /* IS_IN (libc) */
>> >
>> > diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
>> > new file mode 100644
>> > index 0000000000..9a2727b772
>> > --- /dev/null
>> > +++ b/sysdeps/generic/calloc-clear-memory.h
>> > @@ -0,0 +1,48 @@
>> > +/* Clear a block of memory for calloc.  Generic version.
>> > +   Copyright (C) 2024 Free Software Foundation, Inc.
>> > +   This file is part of the GNU C Library.
>> > +
>> > +   The GNU C Library is free software; you can redistribute it and/or
>> > +   modify it under the terms of the GNU Lesser General Public
>> > +   License as published by the Free Software Foundation; either
>> > +   version 2.1 of the License, or (at your option) any later version.
>> > +
>> > +   The GNU C Library is distributed in the hope that it will be useful,
>> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> > +   Lesser General Public License for more details.
>> > +
>> > +   You should have received a copy of the GNU Lesser General Public
>> > +   License along with the GNU C Library; if not, see
>> > +   <https://www.gnu.org/licenses/>.  */
>> > +
>> > +static __always_inline void *
>> > +clear_memory (void *mem, unsigned long clearsize)
>> > +{
>> > +  /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
>> > +     that contents have an odd number of INTERNAL_SIZE_T-sized words;
>> > +     minimally 3 words.  */
>> > +  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
>> > +  unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
>> > +
>> > +  if (nclears > 9)
>> > +    return memset (d, 0, clearsize);
>> > +
>> > +  /* Use overlapping stores with 1 branch, instead of up to 3.  */
>> > +  *(d + 0) = 0;
>> > +  *(d + 1) = 0;
>> > +  *(d + 2) = 0;
>> > +  *(d + nclears - 2) = 0;
>> > +  *(d + nclears - 2 + 1) = 0;
>
>
> For nclears = 8.
>
> In existing code, *(d+7) will not be set.
>
> In new code, *(d+7)  will be set.
>
> Not sure if I'm missing something here.

The code comments say:

 /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
     that contents have an odd number of INTERNAL_SIZE_T-sized words;
     minimally 3 words.  */

nclears == 8 never happens.

>
>>
>> > +  if (nclears > 5)
>> > +    {
>> > +      *(d + 3) = 0;
>> > +      *(d + 3 + 1) = 0;
>> > +      *(d + nclears - 4) = 0;
>> > +      *(d + nclears - 4 + 1) = 0;
>> > +    }
>> > +  else if (nclears < 3)
>> > +    __builtin_unreachable ();
>> > +
>> Should be before the first deref?
>>
>> I.e at the top
>> ```
>> if(nclears < 3) __builtin_unreachable();`
>> *(d + 0) = 0;
>> ...
>> ```
>> > +  return mem;
>> > +}
>> > --
>> > 2.47.1
>> >
  

Patch

diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
index cba03433fe..3349e2d1fe 100644
--- a/malloc/malloc-internal.h
+++ b/malloc/malloc-internal.h
@@ -23,6 +23,7 @@ 
 #include <malloc-sysdep.h>
 #include <malloc-size.h>
 #include <malloc-hugepages.h>
+#include <calloc-clear-memory.h>
 
 /* Called in the parent process before a fork.  */
 void __malloc_fork_lock_parent (void) attribute_hidden;
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 287fa0904d..95775a3515 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3755,8 +3755,6 @@  __libc_calloc (size_t n, size_t elem_size)
   INTERNAL_SIZE_T sz, oldtopsize;
   void *mem;
   unsigned long clearsize;
-  unsigned long nclears;
-  INTERNAL_SIZE_T *d;
   ptrdiff_t bytes;
 
   if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
@@ -3853,40 +3851,8 @@  __libc_calloc (size_t n, size_t elem_size)
     }
 #endif
 
-  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
-     contents have an odd number of INTERNAL_SIZE_T-sized words;
-     minimally 3.  */
-  d = (INTERNAL_SIZE_T *) mem;
   clearsize = csz - SIZE_SZ;
-  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
-  assert (nclears >= 3);
-
-  if (nclears > 9)
-    return memset (d, 0, clearsize);
-
-  else
-    {
-      *(d + 0) = 0;
-      *(d + 1) = 0;
-      *(d + 2) = 0;
-      if (nclears > 4)
-        {
-          *(d + 3) = 0;
-          *(d + 4) = 0;
-          if (nclears > 6)
-            {
-              *(d + 5) = 0;
-              *(d + 6) = 0;
-              if (nclears > 8)
-                {
-                  *(d + 7) = 0;
-                  *(d + 8) = 0;
-                }
-            }
-        }
-    }
-
-  return mem;
+  return clear_memory (mem, clearsize);
 }
 #endif /* IS_IN (libc) */
 
diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
new file mode 100644
index 0000000000..9a2727b772
--- /dev/null
+++ b/sysdeps/generic/calloc-clear-memory.h
@@ -0,0 +1,48 @@ 
+/* Clear a block of memory for calloc.  Generic version.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+static __always_inline void *
+clear_memory (void *mem, unsigned long clearsize)
+{
+  /* Unroll clear memory size up to 9 * INTERNAL_SIZE_T bytes.  We know
+     that contents have an odd number of INTERNAL_SIZE_T-sized words;
+     minimally 3 words.  */
+  INTERNAL_SIZE_T *d = (INTERNAL_SIZE_T *) mem;
+  unsigned long nclears = clearsize / sizeof (INTERNAL_SIZE_T);
+
+  if (nclears > 9)
+    return memset (d, 0, clearsize);
+
+  /* Use overlapping stores with 1 branch, instead of up to 3.  */
+  *(d + 0) = 0;
+  *(d + 1) = 0;
+  *(d + 2) = 0;
+  *(d + nclears - 2) = 0;
+  *(d + nclears - 2 + 1) = 0;
+  if (nclears > 5)
+    {
+      *(d + 3) = 0;
+      *(d + 3 + 1) = 0;
+      *(d + nclears - 4) = 0;
+      *(d + nclears - 4 + 1) = 0;
+    }
+  else if (nclears < 3)
+    __builtin_unreachable ();
+
+  return mem;
+}