Linux: Avoid calling malloc indirectly from __get_nprocs

Message ID 87a6n7jvbe.fsf@oldenburg.str.redhat.com
State Committed
Headers
Series Linux: Avoid calling malloc indirectly from __get_nprocs |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Florian Weimer June 30, 2021, 2:12 p.m. UTC
  malloc initialization depends on __get_nprocs, so using
scratch buffers in __get_nprocs may result in infinite recursion.

Tested on i686-linux-gnu, x86_64-linux-gnu.  build-many-glibcs.py is
still running, but looks good so far.

(Exposure of this bug may have been a side effect of the malloc
initialization rework, not sure.)

Thanks,
Florian
---
 sysdeps/unix/sysv/linux/getsysstats.c | 83 ++++++++++++++++++++++++-----------
 1 file changed, 57 insertions(+), 26 deletions(-)
  

Comments

Richard W.M. Jones June 30, 2021, 2:18 p.m. UTC | #1
For context:
https://bugzilla.redhat.com/show_bug.cgi?id=1975693#c15

qemu uses a seccomp filter to block (probably incorrectly)
sched_getaffinity, so this system call fails.  Currently glibc
believes this means that the buffer is too small and doubles it until
it runs out of stack/memory and crashes.

A patch is going to be submitted to qemu to remove the seccomp filter
from this syscall, but it seems like glibc still shouldn't assume
sched_getaffinity is always callable.

Rich.
  
Siddhesh Poyarekar June 30, 2021, 2:53 p.m. UTC | #2
On 6/30/21 7:42 PM, Florian Weimer via Libc-alpha wrote:
> malloc initialization depends on __get_nprocs, so using
> scratch buffers in __get_nprocs may result in infinite recursion.
> 
> Tested on i686-linux-gnu, x86_64-linux-gnu.  build-many-glibcs.py is
> still running, but looks good so far.
> 
> (Exposure of this bug may have been a side effect of the malloc
> initialization rework, not sure.)

Richard identified[1] the commit that introduced scratch buffers into 
__get_nprocs.

[1] 
https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=903bc7dcc2acafc40be11639767e10a2de712649

> Thanks,
> Florian
> ---
>   sysdeps/unix/sysv/linux/getsysstats.c | 83 ++++++++++++++++++++++++-----------
>   1 file changed, 57 insertions(+), 26 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/getsysstats.c b/sysdeps/unix/sysv/linux/getsysstats.c
> index 2e15f0039e..1391e360b8 100644
> --- a/sysdeps/unix/sysv/linux/getsysstats.c
> +++ b/sysdeps/unix/sysv/linux/getsysstats.c
> @@ -17,42 +17,73 @@
>      License along with the GNU C Library; if not, see
>      <https://www.gnu.org/licenses/>.  */
>   
> +#include <array_length.h>
>   #include <dirent.h>
> +#include <errno.h>
> +#include <ldsodefs.h>
> +#include <limits.h>
>   #include <not-cancel.h>
> -#include <scratch_buffer.h>
>   #include <stdio.h>
>   #include <stdio_ext.h>
> +#include <sys/mman.h>
>   #include <sys/sysinfo.h>
>   #include <sysdep.h>

Remove scratch buffer and get in mman.  OK.

>   
> +/* Compute the population count of the entire array.  */
> +static int
> +__get_nprocs_count (const unsigned long int *array, size_t length)
> +{
> +  int count = 0;
> +  for (size_t i = 0; i < length; ++i)
> +    if (__builtin_add_overflow (count,  __builtin_popcountl (array[i]),
> +				&count))
> +      return INT_MAX;
> +  return count;
> +}

Use popcount instead of __sched_cpucount to count CPUs, using saturated 
addition.  OK.

> +
> +/* __get_nprocs with a large buffer.  */
> +static int
> +__get_nprocs_large (void)
> +{
> +  /* This code cannot use scratch_buffer because it is used during
> +     malloc initialization.  */
> +  size_t pagesize = GLRO (dl_pagesize);
> +  unsigned long int *page = __mmap (0, pagesize, PROT_READ | PROT_WRITE,
> +				    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> +  if (page == MAP_FAILED)
> +    return 2;
> +  int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, pagesize, page);
> +  int count;
> +  if (r > 0)
> +    count = __get_nprocs_count (page, pagesize / sizeof (unsigned long int));
> +  else if (r == -EINVAL)
> +    /* One page is still not enough to store the bits.  A more-or-less
> +       arbitrary value.  This assumes t hat such large systems never
> +       happen in practice.  */
> +    count = GLRO (dl_pagesize) * CHAR_BIT;
> +  else
> +    count = 2;
> +  __munmap (page, GLRO (dl_pagesize));
> +  return count;
> +}
> +

For larger requests, get a page to write to. 32K processors ought to be 
enough for anybody.  OK :)

Maybe there's an opportunity to consolidate the syscall and error checks 
in __get_nprocs_large and __get_nprocs into, say, __get_nprocs_impl into 
a single function, static __always_inline int __get_nprocs_impl 
(unsigned long *, size_t).  It's a minor suggestion though, feel free to 
ignore if you like it this way.

>   int
>   __get_nprocs (void)
>   {
> -  struct scratch_buffer set;
> -  scratch_buffer_init (&set);
> -
> -  int r;
> -  while (true)
> -    {
> -      /* The possible error are EFAULT for an invalid buffer or ESRCH for
> -	 invalid pid, none could happen.  */
> -      r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, set.length,
> -                                 set.data);
> -      if (r > 0)
> -        break;
> -
> -      if (!scratch_buffer_grow (&set))
> -        /* Default to an SMP system in case we cannot obtain an accurate
> -           number.  */
> -        return 2;
> -    }
> -
> -  /* The scratch_buffer is aligned to max_align_t.  */
> -  r = __sched_cpucount (r, (const cpu_set_t *) set.data);
> -
> -  scratch_buffer_free (&set);
> -
> -  return r;
> +  /* Fast path for most systems.  The kernel expects a buffer size
> +     that is a multiple of 8.  */
> +  unsigned long int small_buffer[1024 / CHAR_BIT / sizeof (unsigned long int)];

cpu_set_t would have been more readable but maybe clumsy to use with 
popcount.

> +  int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0,
> +				 sizeof (small_buffer), small_buffer);
> +  if (r > 0)
> +    return __get_nprocs_count (small_buffer, r / sizeof (unsigned long int));

Punt counting to __get_nprocs_count.  OK.

> +  else if (r == -EINVAL)
> +    /* The kernel requests a larger buffer to store the data.  */
> +    return __get_nprocs_large ();
> +  else
> +    /* Some other error.  2 is conservative (not a uniprocessor
> +       system, so atomics are needed). */
> +    return 2;
>   }
>   libc_hidden_def (__get_nprocs)
>   weak_alias (__get_nprocs, get_nprocs)

LGTM.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
  
Carlos O'Donell June 30, 2021, 3:20 p.m. UTC | #3
On 6/30/21 10:12 AM, Florian Weimer via Libc-alpha wrote:
> malloc initialization depends on __get_nprocs, so using
> scratch buffers in __get_nprocs may result in infinite recursion.

LGTM.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>

In our implementation we have:

Configured processors:
_SC_NPROCESSORS_CONF -> __get_nprocs_conf -> /sys/devices/system/cpu

Online processors:
_SC_NPROCESSORS_ONLN -> __get_nprocs -> sched_getaffinity.

Note: the JVM tries sched_getaffinity first, and if that syscall fails
it calls sysconf(_SC_NPROCESSORS_ONLN).

So in this case we end up with 2 being returned because rather than EINVAL
we see ENOSYS or EPERM depending on syscall seccomp blocking.

I think 2 is sufficiently conservative to trigger the right behaviour.

The upperbound of page size * bits-per-char seems acceptable for this case
to give some parallelism and a good-enough answer to the sysconf query
or the malloc code. I wish we had another method, but such a method involves
looking into sysfs to find something we can use with reliable semantics.

> Tested on i686-linux-gnu, x86_64-linux-gnu.  build-many-glibcs.py is
> still running, but looks good so far.
> 
> (Exposure of this bug may have been a side effect of the malloc
> initialization rework, not sure.)
> 
> Thanks,
> Florian
> ---
>  sysdeps/unix/sysv/linux/getsysstats.c | 83 ++++++++++++++++++++++++-----------
>  1 file changed, 57 insertions(+), 26 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/getsysstats.c b/sysdeps/unix/sysv/linux/getsysstats.c
> index 2e15f0039e..1391e360b8 100644
> --- a/sysdeps/unix/sysv/linux/getsysstats.c
> +++ b/sysdeps/unix/sysv/linux/getsysstats.c
> @@ -17,42 +17,73 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>  
> +#include <array_length.h>
>  #include <dirent.h>
> +#include <errno.h>
> +#include <ldsodefs.h>
> +#include <limits.h>
>  #include <not-cancel.h>
> -#include <scratch_buffer.h>
>  #include <stdio.h>
>  #include <stdio_ext.h>
> +#include <sys/mman.h>
>  #include <sys/sysinfo.h>
>  #include <sysdep.h>
>  
> +/* Compute the population count of the entire array.  */
> +static int
> +__get_nprocs_count (const unsigned long int *array, size_t length)
> +{
> +  int count = 0;
> +  for (size_t i = 0; i < length; ++i)
> +    if (__builtin_add_overflow (count,  __builtin_popcountl (array[i]),
> +				&count))
> +      return INT_MAX;
> +  return count;
> +}

OK.

> +
> +/* __get_nprocs with a large buffer.  */
> +static int
> +__get_nprocs_large (void)
> +{
> +  /* This code cannot use scratch_buffer because it is used during
> +     malloc initialization.  */
> +  size_t pagesize = GLRO (dl_pagesize);
> +  unsigned long int *page = __mmap (0, pagesize, PROT_READ | PROT_WRITE,
> +				    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> +  if (page == MAP_FAILED)
> +    return 2;
> +  int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, pagesize, page);
> +  int count;
> +  if (r > 0)
> +    count = __get_nprocs_count (page, pagesize / sizeof (unsigned long int));
> +  else if (r == -EINVAL)
> +    /* One page is still not enough to store the bits.  A more-or-less
> +       arbitrary value.  This assumes that such large systems never
> +       happen in practice.  */
> +    count = GLRO (dl_pagesize) * CHAR_BIT;

OK. Conservative on -EINVAL (not EPERM/ENOSYS).

> +  else
> +    count = 2;

OK. Conservative on EPERM/ENOSYS. Note: JVM's conservative is also 2 (os::processor_count()).

> +  __munmap (page, GLRO (dl_pagesize));
> +  return count;
> +}
> +
>  int
>  __get_nprocs (void)
>  {
> -  struct scratch_buffer set;
> -  scratch_buffer_init (&set);
> -
> -  int r;
> -  while (true)
> -    {
> -      /* The possible error are EFAULT for an invalid buffer or ESRCH for
> -	 invalid pid, none could happen.  */
> -      r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, set.length,
> -                                 set.data);
> -      if (r > 0)
> -        break;
> -
> -      if (!scratch_buffer_grow (&set))
> -        /* Default to an SMP system in case we cannot obtain an accurate
> -           number.  */
> -        return 2;
> -    }
> -
> -  /* The scratch_buffer is aligned to max_align_t.  */
> -  r = __sched_cpucount (r, (const cpu_set_t *) set.data);
> -
> -  scratch_buffer_free (&set);
> -
> -  return r;
> +  /* Fast path for most systems.  The kernel expects a buffer size
> +     that is a multiple of 8.  */
> +  unsigned long int small_buffer[1024 / CHAR_BIT / sizeof (unsigned long int)];
> +  int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0,
> +				 sizeof (small_buffer), small_buffer);
> +  if (r > 0)
> +    return __get_nprocs_count (small_buffer, r / sizeof (unsigned long int));
> +  else if (r == -EINVAL)
> +    /* The kernel requests a larger buffer to store the data.  */
> +    return __get_nprocs_large ();

OK. Retry once for failure due to sizing.

> +  else
> +    /* Some other error.  2 is conservative (not a uniprocessor
> +       system, so atomics are needed). */
> +    return 2;

OK. Acceptable for EPERM/ENOSYS.

>  }
>  libc_hidden_def (__get_nprocs)
>  weak_alias (__get_nprocs, get_nprocs)
>
  
Adhemerval Zanella June 30, 2021, 5:20 p.m. UTC | #4
On 30/06/2021 11:12, Florian Weimer wrote:
> malloc initialization depends on __get_nprocs, so using
> scratch buffers in __get_nprocs may result in infinite recursion.
> 
> Tested on i686-linux-gnu, x86_64-linux-gnu.  build-many-glibcs.py is
> still running, but looks good so far.
> 
> (Exposure of this bug may have been a side effect of the malloc
> initialization rework, not sure.)

Some comments below.

> 
> Thanks,
> Florian
> ---
>  sysdeps/unix/sysv/linux/getsysstats.c | 83 ++++++++++++++++++++++++-----------
>  1 file changed, 57 insertions(+), 26 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/getsysstats.c b/sysdeps/unix/sysv/linux/getsysstats.c
> index 2e15f0039e..1391e360b8 100644
> --- a/sysdeps/unix/sysv/linux/getsysstats.c
> +++ b/sysdeps/unix/sysv/linux/getsysstats.c
> @@ -17,42 +17,73 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>  
> +#include <array_length.h>
>  #include <dirent.h>
> +#include <errno.h>
> +#include <ldsodefs.h>
> +#include <limits.h>
>  #include <not-cancel.h>
> -#include <scratch_buffer.h>
>  #include <stdio.h>
>  #include <stdio_ext.h>
> +#include <sys/mman.h>
>  #include <sys/sysinfo.h>
>  #include <sysdep.h>
>  
> +/* Compute the population count of the entire array.  */
> +static int
> +__get_nprocs_count (const unsigned long int *array, size_t length)
> +{
> +  int count = 0;
> +  for (size_t i = 0; i < length; ++i)
> +    if (__builtin_add_overflow (count,  __builtin_popcountl (array[i]),
> +				&count))
> +      return INT_MAX;
> +  return count;
> +}

Could we avoid replicate the same logic over different files? We have a
countbits() on posix/sched_cpucount.c, so I think it would better to move
it on a sched.h and use it instead (there is no need to really handle
overflow here, it would require a *very* large buffer...).

> +
> +/* __get_nprocs with a large buffer.  */
> +static int
> +__get_nprocs_large (void)
> +{
> +  /* This code cannot use scratch_buffer because it is used during
> +     malloc initialization.  */
> +  size_t pagesize = GLRO (dl_pagesize);
> +  unsigned long int *page = __mmap (0, pagesize, PROT_READ | PROT_WRITE,
> +				    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> +  if (page == MAP_FAILED)
> +    return 2;
> +  int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, pagesize, page);
> +  int count;
> +  if (r > 0)
> +    count = __get_nprocs_count (page, pagesize / sizeof (unsigned long int));
> +  else if (r == -EINVAL)
> +    /* One page is still not enough to store the bits.  A more-or-less
> +       arbitrary value.  This assumes that such large systems never
> +       happen in practice.  */
> +    count = GLRO (dl_pagesize) * CHAR_BIT;
> +  else
> +    count = 2;
> +  __munmap (page, GLRO (dl_pagesize));

Maybe use pagesize here since you are defining it.

> +  return count;
> +}
> +
>  int
>  __get_nprocs (void)
>  {
> -  struct scratch_buffer set;
> -  scratch_buffer_init (&set);
> -
> -  int r;
> -  while (true)
> -    {
> -      /* The possible error are EFAULT for an invalid buffer or ESRCH for
> -	 invalid pid, none could happen.  */
> -      r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, set.length,
> -                                 set.data);
> -      if (r > 0)
> -        break;
> -
> -      if (!scratch_buffer_grow (&set))
> -        /* Default to an SMP system in case we cannot obtain an accurate
> -           number.  */
> -        return 2;
> -    }
> -
> -  /* The scratch_buffer is aligned to max_align_t.  */
> -  r = __sched_cpucount (r, (const cpu_set_t *) set.data);
> -
> -  scratch_buffer_free (&set);
> -
> -  return r;
> +  /* Fast path for most systems.  The kernel expects a buffer size
> +     that is a multiple of 8.  */
> +  unsigned long int small_buffer[1024 / CHAR_BIT / sizeof (unsigned long int)];

Use __cpu_mask instead of 'unsigned long int'.  

> +  int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0,
> +				 sizeof (small_buffer), small_buffer);
> +  if (r > 0)
> +    return __get_nprocs_count (small_buffer, r / sizeof (unsigned long int));
> +  else if (r == -EINVAL)
> +    /* The kernel requests a larger buffer to store the data.  */
> +    return __get_nprocs_large ();
> +  else
> +    /* Some other error.  2 is conservative (not a uniprocessor
> +       system, so atomics are needed). */
> +    return 2;
>  }
>  libc_hidden_def (__get_nprocs)
>  weak_alias (__get_nprocs, get_nprocs)
> 
I would prefer that since now we don't iterate increasing the buffer size for
sched_getaffinity we go for a simplified version and use a large buffer instead.

Linux currently supports at maximum of 4096 cpus for most architectures:

$ find -iname Kconfig | xargs git grep -A10 -w NR_CPUS | grep -w range
arch/alpha/Kconfig-	range 2 32
arch/arc/Kconfig-	range 2 4096
arch/arm/Kconfig-	range 2 16 if DEBUG_KMAP_LOCAL
arch/arm/Kconfig-	range 2 32 if !DEBUG_KMAP_LOCAL
arch/arm64/Kconfig-	range 2 4096
arch/csky/Kconfig-	range 2 32
arch/hexagon/Kconfig-	range 2 6 if SMP
arch/ia64/Kconfig-	range 2 4096
arch/mips/Kconfig-	range 2 256
arch/openrisc/Kconfig-	range 2 32
arch/parisc/Kconfig-	range 2 32
arch/riscv/Kconfig-	range 2 32
arch/s390/Kconfig-	range 2 512
arch/sh/Kconfig-	range 2 32
arch/sparc/Kconfig-	range 2 32 if SPARC32
arch/sparc/Kconfig-	range 2 4096 if SPARC64
arch/um/Kconfig-	range 1 1
arch/x86/Kconfig-# [NR_CPUS_RANGE_BEGIN ... NR_CPUS_RANGE_END] range.
arch/x86/Kconfig-	range NR_CPUS_RANGE_BEGIN NR_CPUS_RANGE_END
arch/xtensa/Kconfig-	range 2 32

With x86 supporting 8192:

arch/x86/Kconfig
 976 config NR_CPUS_RANGE_END
 977         int
 978         depends on X86_64
 979         default 8192 if  SMP && CPUMASK_OFFSTACK
 980         default  512 if  SMP && !CPUMASK_OFFSTACK
 981         default    1 if !SMP

So using a buffer for 16384 entries seems more than enough.

  enum { max_cpus = 16384 };
  __cpu_mask buffer[max_cpus / CHAR_BIT / sizeof (__cpu_mask)];
  int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, sizeof buffer, buffer);
  if (r < 0)
    return 2;
  else if (r == -EINVAL)
    return max_cpus;

  int count = 0;
  for (size_t i = 0; i < max_cpus; ++i)
    count += countbits (buffer[i]);
  return count;
  
Florian Weimer July 6, 2021, 12:50 p.m. UTC | #5
* Adhemerval Zanella:

>> +/* Compute the population count of the entire array.  */
>> +static int
>> +__get_nprocs_count (const unsigned long int *array, size_t length)
>> +{
>> +  int count = 0;
>> +  for (size_t i = 0; i < length; ++i)
>> +    if (__builtin_add_overflow (count,  __builtin_popcountl (array[i]),
>> +				&count))
>> +      return INT_MAX;
>> +  return count;
>> +}
>
> Could we avoid replicate the same logic over different files? We have a
> countbits() on posix/sched_cpucount.c, so I think it would better to move
> it on a sched.h and use it instead (there is no need to really handle
> overflow here, it would require a *very* large buffer...).

Probably, yes.

Should I send a patch?  I was busy with other stuff last week, but I
should be able to work on fixing this now.

>> +/* __get_nprocs with a large buffer.  */
>> +static int
>> +__get_nprocs_large (void)
>> +{
>> +  /* This code cannot use scratch_buffer because it is used during
>> +     malloc initialization.  */
>> +  size_t pagesize = GLRO (dl_pagesize);
>> +  unsigned long int *page = __mmap (0, pagesize, PROT_READ | PROT_WRITE,
>> +				    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
>> +  if (page == MAP_FAILED)
>> +    return 2;
>> +  int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, pagesize, page);
>> +  int count;
>> +  if (r > 0)
>> +    count = __get_nprocs_count (page, pagesize / sizeof (unsigned long int));
>> +  else if (r == -EINVAL)
>> +    /* One page is still not enough to store the bits.  A more-or-less
>> +       arbitrary value.  This assumes that such large systems never
>> +       happen in practice.  */
>> +    count = GLRO (dl_pagesize) * CHAR_BIT;
>> +  else
>> +    count = 2;
>> +  __munmap (page, GLRO (dl_pagesize));
>
> Maybe use pagesize here since you are defining it.

Right.

> I would prefer that since now we don't iterate increasing the buffer
> size for sched_getaffinity we go for a simplified version and use a
> large buffer instead.
>
> Linux currently supports at maximum of 4096 cpus for most
> architectures:

I'm not sure if that's a good idea.  I think some distributions patched
the defaults in the past.  The limit isn't really set in stone.

Thanks,
Florian
  
Adhemerval Zanella July 6, 2021, 1 p.m. UTC | #6
On 06/07/2021 09:50, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>>> +/* Compute the population count of the entire array.  */
>>> +static int
>>> +__get_nprocs_count (const unsigned long int *array, size_t length)
>>> +{
>>> +  int count = 0;
>>> +  for (size_t i = 0; i < length; ++i)
>>> +    if (__builtin_add_overflow (count,  __builtin_popcountl (array[i]),
>>> +				&count))
>>> +      return INT_MAX;
>>> +  return count;
>>> +}
>>
>> Could we avoid replicate the same logic over different files? We have a
>> countbits() on posix/sched_cpucount.c, so I think it would better to move
>> it on a sched.h and use it instead (there is no need to really handle
>> overflow here, it would require a *very* large buffer...).
> 
> Probably, yes.
> 
> Should I send a patch?  I was busy with other stuff last week, but I
> should be able to work on fixing this now.

I have this one in my backlog; I will probably send a patch this week.

> 
>>> +/* __get_nprocs with a large buffer.  */
>>> +static int
>>> +__get_nprocs_large (void)
>>> +{
>>> +  /* This code cannot use scratch_buffer because it is used during
>>> +     malloc initialization.  */
>>> +  size_t pagesize = GLRO (dl_pagesize);
>>> +  unsigned long int *page = __mmap (0, pagesize, PROT_READ | PROT_WRITE,
>>> +				    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
>>> +  if (page == MAP_FAILED)
>>> +    return 2;
>>> +  int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, pagesize, page);
>>> +  int count;
>>> +  if (r > 0)
>>> +    count = __get_nprocs_count (page, pagesize / sizeof (unsigned long int));
>>> +  else if (r == -EINVAL)
>>> +    /* One page is still not enough to store the bits.  A more-or-less
>>> +       arbitrary value.  This assumes that such large systems never
>>> +       happen in practice.  */
>>> +    count = GLRO (dl_pagesize) * CHAR_BIT;
>>> +  else
>>> +    count = 2;
>>> +  __munmap (page, GLRO (dl_pagesize));
>>
>> Maybe use pagesize here since you are defining it.
> 
> Right.
> 
>> I would prefer that since now we don't iterate increasing the buffer
>> size for sched_getaffinity we go for a simplified version and use a
>> large buffer instead.
>>
>> Linux currently supports at maximum of 4096 cpus for most
>> architectures:
> 
> I'm not sure if that's a good idea.  I think some distributions patched
> the defaults in the past.  The limit isn't really set in stone.

But we are still using an artificial limit here, although higher.  Couldn't
we do the other way around: make malloc use sched_getaffinity with a
limited buffer and use either the saturated value or 2 if the syscall failed,
and get back to use a scratch_buffer on libc provided symbol (which does not
have any limit and simpler to handle than stack plus mmap)? For malloc it 
does not seem critical to get the fully correct CPU number for large systems.
  

Patch

diff --git a/sysdeps/unix/sysv/linux/getsysstats.c b/sysdeps/unix/sysv/linux/getsysstats.c
index 2e15f0039e..1391e360b8 100644
--- a/sysdeps/unix/sysv/linux/getsysstats.c
+++ b/sysdeps/unix/sysv/linux/getsysstats.c
@@ -17,42 +17,73 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
+#include <array_length.h>
 #include <dirent.h>
+#include <errno.h>
+#include <ldsodefs.h>
+#include <limits.h>
 #include <not-cancel.h>
-#include <scratch_buffer.h>
 #include <stdio.h>
 #include <stdio_ext.h>
+#include <sys/mman.h>
 #include <sys/sysinfo.h>
 #include <sysdep.h>
 
+/* Compute the population count of the entire array.  */
+static int
+__get_nprocs_count (const unsigned long int *array, size_t length)
+{
+  int count = 0;
+  for (size_t i = 0; i < length; ++i)
+    if (__builtin_add_overflow (count,  __builtin_popcountl (array[i]),
+				&count))
+      return INT_MAX;
+  return count;
+}
+
+/* __get_nprocs with a large buffer.  */
+static int
+__get_nprocs_large (void)
+{
+  /* This code cannot use scratch_buffer because it is used during
+     malloc initialization.  */
+  size_t pagesize = GLRO (dl_pagesize);
+  unsigned long int *page = __mmap (0, pagesize, PROT_READ | PROT_WRITE,
+				    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+  if (page == MAP_FAILED)
+    return 2;
+  int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, pagesize, page);
+  int count;
+  if (r > 0)
+    count = __get_nprocs_count (page, pagesize / sizeof (unsigned long int));
+  else if (r == -EINVAL)
+    /* One page is still not enough to store the bits.  A more-or-less
+       arbitrary value.  This assumes that such large systems never
+       happen in practice.  */
+    count = GLRO (dl_pagesize) * CHAR_BIT;
+  else
+    count = 2;
+  __munmap (page, GLRO (dl_pagesize));
+  return count;
+}
+
 int
 __get_nprocs (void)
 {
-  struct scratch_buffer set;
-  scratch_buffer_init (&set);
-
-  int r;
-  while (true)
-    {
-      /* The possible error are EFAULT for an invalid buffer or ESRCH for
-	 invalid pid, none could happen.  */
-      r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, set.length,
-                                 set.data);
-      if (r > 0)
-        break;
-
-      if (!scratch_buffer_grow (&set))
-        /* Default to an SMP system in case we cannot obtain an accurate
-           number.  */
-        return 2;
-    }
-
-  /* The scratch_buffer is aligned to max_align_t.  */
-  r = __sched_cpucount (r, (const cpu_set_t *) set.data);
-
-  scratch_buffer_free (&set);
-
-  return r;
+  /* Fast path for most systems.  The kernel expects a buffer size
+     that is a multiple of 8.  */
+  unsigned long int small_buffer[1024 / CHAR_BIT / sizeof (unsigned long int)];
+  int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0,
+				 sizeof (small_buffer), small_buffer);
+  if (r > 0)
+    return __get_nprocs_count (small_buffer, r / sizeof (unsigned long int));
+  else if (r == -EINVAL)
+    /* The kernel requests a larger buffer to store the data.  */
+    return __get_nprocs_large ();
+  else
+    /* Some other error.  2 is conservative (not a uniprocessor
+       system, so atomics are needed). */
+    return 2;
 }
 libc_hidden_def (__get_nprocs)
 weak_alias (__get_nprocs, get_nprocs)