[1/4] Remove architecture specific sched_cpucount optimizations

Message ID 20210329182520.323665-1-adhemerval.zanella@linaro.org
State Committed
Headers
Series [1/4] Remove architecture specific sched_cpucount optimizations |

Commit Message

Adhemerval Zanella March 29, 2021, 6:25 p.m. UTC
  And replace the generic algorithm with the Brian Kernighan’s one.
GCC optimize it with popcnt if the architecture supports, so there
is no need to add the extra POPCNT define to enable it.

This is really a micro-optimization that only adds complexity:
recent ABIs already support it (x86-64-v2 or power64le) and it
simplifies the code for internal usage, since i686 does not allow an
internal iFUNC call.

Checked on x86_64-linux-gnu, aarch64-linux-gnu, and
powerpc64le-linux-gnu.
---
 posix/sched_cpucount.c                       | 28 +++------------
 sysdeps/i386/i686/multiarch/sched_cpucount.c |  1 -
 sysdeps/ia64/sched_cpucount.c                | 20 -----------
 sysdeps/powerpc/sched_cpucount.c             | 22 ------------
 sysdeps/x86_64/multiarch/sched_cpucount.c    | 36 --------------------
 sysdeps/x86_64/sched_cpucount.c              | 25 --------------
 6 files changed, 4 insertions(+), 128 deletions(-)
 delete mode 100644 sysdeps/i386/i686/multiarch/sched_cpucount.c
 delete mode 100644 sysdeps/ia64/sched_cpucount.c
 delete mode 100644 sysdeps/powerpc/sched_cpucount.c
 delete mode 100644 sysdeps/x86_64/multiarch/sched_cpucount.c
 delete mode 100644 sysdeps/x86_64/sched_cpucount.c
  

Comments

Adhemerval Zanella May 5, 2021, 4:52 p.m. UTC | #1
Ping.

On 29/03/2021 15:25, Adhemerval Zanella wrote:
> And replace the generic algorithm with the Brian Kernighan’s one.
> GCC optimize it with popcnt if the architecture supports, so there
> is no need to add the extra POPCNT define to enable it.
> 
> This is really a micro-optimization that only adds complexity:
> recent ABIs already support it (x86-64-v2 or power64le) and it
> simplifies the code for internal usage, since i686 does not allow an
> internal iFUNC call.
> 
> Checked on x86_64-linux-gnu, aarch64-linux-gnu, and
> powerpc64le-linux-gnu.
> ---
>  posix/sched_cpucount.c                       | 28 +++------------
>  sysdeps/i386/i686/multiarch/sched_cpucount.c |  1 -
>  sysdeps/ia64/sched_cpucount.c                | 20 -----------
>  sysdeps/powerpc/sched_cpucount.c             | 22 ------------
>  sysdeps/x86_64/multiarch/sched_cpucount.c    | 36 --------------------
>  sysdeps/x86_64/sched_cpucount.c              | 25 --------------
>  6 files changed, 4 insertions(+), 128 deletions(-)
>  delete mode 100644 sysdeps/i386/i686/multiarch/sched_cpucount.c
>  delete mode 100644 sysdeps/ia64/sched_cpucount.c
>  delete mode 100644 sysdeps/powerpc/sched_cpucount.c
>  delete mode 100644 sysdeps/x86_64/multiarch/sched_cpucount.c
>  delete mode 100644 sysdeps/x86_64/sched_cpucount.c
> 
> diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
> index b0ca4ea7bc..529286e777 100644
> --- a/posix/sched_cpucount.c
> +++ b/posix/sched_cpucount.c
> @@ -22,31 +22,11 @@ int
>  __sched_cpucount (size_t setsize, const cpu_set_t *setp)
>  {
>    int s = 0;
> -  const __cpu_mask *p = setp->__bits;
> -  const __cpu_mask *end = &setp->__bits[setsize / sizeof (__cpu_mask)];
> -
> -  while (p < end)
> +  for (int i = 0; i < setsize / sizeof (__cpu_mask); i++)
>      {
> -      __cpu_mask l = *p++;
> -
> -#ifdef POPCNT
> -      s += POPCNT (l);
> -#else
> -      if (l == 0)
> -	continue;
> -
> -      _Static_assert (sizeof (l) == sizeof (unsigned int)
> -		      || sizeof (l) == sizeof (unsigned long)
> -		      || sizeof (l) == sizeof (unsigned long long),
> -		      "sizeof (__cpu_mask");
> -      if (sizeof (__cpu_mask) == sizeof (unsigned int))
> -	s += __builtin_popcount (l);
> -      else if (sizeof (__cpu_mask) == sizeof (unsigned long))
> -	s += __builtin_popcountl (l);
> -      else
> -	s += __builtin_popcountll (l);
> -#endif
> +      __cpu_mask si = setp->__bits[i];
> +      /* Clear the least significant bit set.  */
> +      for (; si != 0; si &= si - 1, s++);
>      }
> -
>    return s;
>  }
> diff --git a/sysdeps/i386/i686/multiarch/sched_cpucount.c b/sysdeps/i386/i686/multiarch/sched_cpucount.c
> deleted file mode 100644
> index 7db31b02f8..0000000000
> --- a/sysdeps/i386/i686/multiarch/sched_cpucount.c
> +++ /dev/null
> @@ -1 +0,0 @@
> -#include <sysdeps/x86_64/multiarch/sched_cpucount.c>
> diff --git a/sysdeps/ia64/sched_cpucount.c b/sysdeps/ia64/sched_cpucount.c
> deleted file mode 100644
> index 8440864b02..0000000000
> --- a/sysdeps/ia64/sched_cpucount.c
> +++ /dev/null
> @@ -1,20 +0,0 @@
> -/* Copyright (C) 2007-2021 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#define POPCNT(l) __builtin_popcountl (l)
> -
> -#include <posix/sched_cpucount.c>
> diff --git a/sysdeps/powerpc/sched_cpucount.c b/sysdeps/powerpc/sched_cpucount.c
> deleted file mode 100644
> index 8f00e3dbc8..0000000000
> --- a/sysdeps/powerpc/sched_cpucount.c
> +++ /dev/null
> @@ -1,22 +0,0 @@
> -/* Copyright (C) 2007-2021 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifdef _ARCH_PWR5
> -# define POPCNT(l) __builtin_popcountl (l)
> -#endif
> -
> -#include <posix/sched_cpucount.c>
> diff --git a/sysdeps/x86_64/multiarch/sched_cpucount.c b/sysdeps/x86_64/multiarch/sched_cpucount.c
> deleted file mode 100644
> index 5180a11434..0000000000
> --- a/sysdeps/x86_64/multiarch/sched_cpucount.c
> +++ /dev/null
> @@ -1,36 +0,0 @@
> -/* Count bits in CPU set.  x86-64 multi-arch version.
> -   This file is part of the GNU C Library.
> -   Copyright (C) 2008-2021 Free Software Foundation, Inc.
> -   Contributed by Ulrich Drepper <drepper@redhat.com>.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <sched.h>
> -#include "init-arch.h"
> -
> -#define __sched_cpucount static generic_cpucount
> -#include <posix/sched_cpucount.c>
> -#undef __sched_cpucount
> -
> -#define POPCNT(l) \
> -  ({ __cpu_mask r; \
> -     asm ("popcnt %1, %0" : "=r" (r) : "0" (l));\
> -     r; })
> -#define __sched_cpucount static popcount_cpucount
> -#include <posix/sched_cpucount.c>
> -#undef __sched_cpucount
> -
> -libc_ifunc (__sched_cpucount,
> -	    CPU_FEATURE_USABLE (POPCNT) ? popcount_cpucount : generic_cpucount);
> diff --git a/sysdeps/x86_64/sched_cpucount.c b/sysdeps/x86_64/sched_cpucount.c
> deleted file mode 100644
> index 5a27336d6d..0000000000
> --- a/sysdeps/x86_64/sched_cpucount.c
> +++ /dev/null
> @@ -1,25 +0,0 @@
> -/* Copyright (C) 2007-2021 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#ifdef __amdfam10
> -# define POPCNT(l) \
> -  ({ __cpu_mask r;							      \
> -     asm ("popcntq %1, %0" : "=r" (r) : "0" (l));			      \
> -     r; })
> -#endif
> -
> -#include <posix/sched_cpucount.c>
>
  
Florian Weimer May 5, 2021, 5:28 p.m. UTC | #2
* Adhemerval Zanella via Libc-alpha:

> diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
> index b0ca4ea7bc..529286e777 100644
> --- a/posix/sched_cpucount.c
> +++ b/posix/sched_cpucount.c
> @@ -22,31 +22,11 @@ int
>  __sched_cpucount (size_t setsize, const cpu_set_t *setp)
>  {
>    int s = 0;
> +  for (int i = 0; i < setsize / sizeof (__cpu_mask); i++)
>      {
> +      __cpu_mask si = setp->__bits[i];
> +      /* Clear the least significant bit set.  */
> +      for (; si != 0; si &= si - 1, s++);
>      }
> -
>    return s;
>  }

Why “si”?  It think si &= si - 1 clears the *most* significant bit in
si.  If you agree, please update the comment.

The rest looks okay to me.

Thanks,
Florian
  
Paul Eggert May 5, 2021, 6:25 p.m. UTC | #3
On 5/5/21 10:28 AM, Florian Weimer via Libc-alpha wrote:
>> diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
>> index b0ca4ea7bc..529286e777 100644
>> --- a/posix/sched_cpucount.c
>> +++ b/posix/sched_cpucount.c
>> @@ -22,31 +22,11 @@ int
>>   __sched_cpucount (size_t setsize, const cpu_set_t *setp)
>>   {
>>     int s = 0;
>> +  for (int i = 0; i < setsize / sizeof (__cpu_mask); i++)
>>       {
>> +      __cpu_mask si = setp->__bits[i];
>> +      /* Clear the least significant bit set.  */
>> +      for (; si != 0; si &= si - 1, s++);
>>       }
>> -
>>     return s;
>>   }
> Why “si”?  It think si &= si - 1 clears the*most*  significant bit in
> si.  If you agree, please update the comment.

Better yet, define a static function 'popcount' that uses Kernighan's 
trick and call that function. As things stand it's not obvious what the 
code is doing, regardless of which bit it's clearing. The function's 
comment should explain why it's not using __builtin_popcount.
  
Noah Goldstein May 5, 2021, 7:52 p.m. UTC | #4
On Wed, May 5, 2021 at 12:32 PM Paul Eggert <eggert@cs.ucla.edu> wrote:
>
> On 5/5/21 10:28 AM, Florian Weimer via Libc-alpha wrote:
> >> diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
> >> index b0ca4ea7bc..529286e777 100644
> >> --- a/posix/sched_cpucount.c
> >> +++ b/posix/sched_cpucount.c
> >> @@ -22,31 +22,11 @@ int
> >>   __sched_cpucount (size_t setsize, const cpu_set_t *setp)
> >>   {
> >>     int s = 0;
> >> +  for (int i = 0; i < setsize / sizeof (__cpu_mask); i++)
> >>       {
> >> +      __cpu_mask si = setp->__bits[i];
> >> +      /* Clear the least significant bit set.  */
> >> +      for (; si != 0; si &= si - 1, s++);
> >>       }
> >> -
> >>     return s;
> >>   }
> > Why “si”?  It think si &= si - 1 clears the*most*  significant bit in
> > si.  If you agree, please update the comment.
>
> Better yet, define a static function 'popcount' that uses Kernighan's
> trick and call that function. As things stand it's not obvious what the
> code is doing, regardless of which bit it's clearing. The function's
> comment should explain why it's not using __builtin_popcount.

It looks to me like GCC is still having a bit of trouble with the new
implementation. With skylake as a target it seems to be throwing
in a cmovcc in the outer loop:

https://godbolt.org/z/ocn1KWGPx
  
Adhemerval Zanella May 6, 2021, 12:22 p.m. UTC | #5
On 05/05/2021 16:52, Noah Goldstein via Libc-alpha wrote:
> On Wed, May 5, 2021 at 12:32 PM Paul Eggert <eggert@cs.ucla.edu> wrote:
>>
>> On 5/5/21 10:28 AM, Florian Weimer via Libc-alpha wrote:
>>>> diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
>>>> index b0ca4ea7bc..529286e777 100644
>>>> --- a/posix/sched_cpucount.c
>>>> +++ b/posix/sched_cpucount.c
>>>> @@ -22,31 +22,11 @@ int
>>>>   __sched_cpucount (size_t setsize, const cpu_set_t *setp)
>>>>   {
>>>>     int s = 0;
>>>> +  for (int i = 0; i < setsize / sizeof (__cpu_mask); i++)
>>>>       {
>>>> +      __cpu_mask si = setp->__bits[i];
>>>> +      /* Clear the least significant bit set.  */
>>>> +      for (; si != 0; si &= si - 1, s++);
>>>>       }
>>>> -
>>>>     return s;
>>>>   }
>>> Why “si”?  It think si &= si - 1 clears the*most*  significant bit in
>>> si.  If you agree, please update the comment.
>>
>> Better yet, define a static function 'popcount' that uses Kernighan's
>> trick and call that function. As things stand it's not obvious what the
>> code is doing, regardless of which bit it's clearing. The function's
>> comment should explain why it's not using __builtin_popcount.
> 
> It looks to me like GCC is still having a bit of trouble with the new
> implementation. With skylake as a target it seems to be throwing
> in a cmovcc in the outer loop:
> 
> https://godbolt.org/z/ocn1KWGPx
> 

It seems that come from using a signed int as the counter, I got a slight
better version using unsigned: https://godbolt.org/z/4MrMq1zKb

It is not as better as the builtin, but do we really need that
micro-optimization in this specific implementation? This is exactly what 
added the current complexity and using a open-coded popcount implementation
is slight better on architectures that do not provided such instruction
(where builtin_popcount will issue a call to __popcountsi2 and gcc seems
to aiming for open code such scenarios [1]).

In any case I would go for simplicity (and the builtin requires know the
underlying type size, which requires the ugly size of check to call the
correct one).

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92135
  
Adhemerval Zanella May 6, 2021, 1:33 p.m. UTC | #6
On 05/05/2021 15:25, Paul Eggert wrote:
> On 5/5/21 10:28 AM, Florian Weimer via Libc-alpha wrote:
>>> diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
>>> index b0ca4ea7bc..529286e777 100644
>>> --- a/posix/sched_cpucount.c
>>> +++ b/posix/sched_cpucount.c
>>> @@ -22,31 +22,11 @@ int
>>>   __sched_cpucount (size_t setsize, const cpu_set_t *setp)
>>>   {
>>>     int s = 0;
>>> +  for (int i = 0; i < setsize / sizeof (__cpu_mask); i++)
>>>       {
>>> +      __cpu_mask si = setp->__bits[i];
>>> +      /* Clear the least significant bit set.  */
>>> +      for (; si != 0; si &= si - 1, s++);
>>>       }
>>> -
>>>     return s;
>>>   }
>> Why “si”?  It think si &= si - 1 clears the*most*  significant bit in
>> si.  If you agree, please update the comment.
> 
> Better yet, define a static function 'popcount' that uses Kernighan's trick and call that function. As things stand it's not obvious what the code is doing, regardless of which bit it's clearing. The function's comment should explain why it's not using __builtin_popcount.

What about the below:

diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
index b0ca4ea7bc..6cb5c4337e 100644
--- a/posix/sched_cpucount.c
+++ b/posix/sched_cpucount.c
@@ -17,36 +17,21 @@
 
 #include <sched.h>
 
+/* Counting bits set, Brian Kernighan's way  */
+static inline unsigned int
+countbits (__cpu_mask v)
+{
+  unsigned int s = 0;
+  for (; v != 0; s++)
+    v &= v - 1;
+  return s;
+}
 
 int
 __sched_cpucount (size_t setsize, const cpu_set_t *setp)
 {
   int s = 0;
-  const __cpu_mask *p = setp->__bits;
-  const __cpu_mask *end = &setp->__bits[setsize / sizeof (__cpu_mask)];
-
-  while (p < end)
-    {
-      __cpu_mask l = *p++;
-
-#ifdef POPCNT
-      s += POPCNT (l);
-#else
-      if (l == 0)
-	continue;
-
-      _Static_assert (sizeof (l) == sizeof (unsigned int)
-		      || sizeof (l) == sizeof (unsigned long)
-		      || sizeof (l) == sizeof (unsigned long long),
-		      "sizeof (__cpu_mask");
-      if (sizeof (__cpu_mask) == sizeof (unsigned int))
-	s += __builtin_popcount (l);
-      else if (sizeof (__cpu_mask) == sizeof (unsigned long))
-	s += __builtin_popcountl (l);
-      else
-	s += __builtin_popcountll (l);
-#endif
-    }
-
+  for (int i = 0; i < setsize / sizeof (__cpu_mask); i++)
+    s += countbits (setp->__bits[i]);
   return s;
 }
diff --git a/sysdeps/i386/i686/multiarch/sched_cpucount.c b/sysdeps/i386/i686/multiarch/sched_cpucount.c
deleted file mode 100644
index 7db31b02f8..0000000000
--- a/sysdeps/i386/i686/multiarch/sched_cpucount.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <sysdeps/x86_64/multiarch/sched_cpucount.c>
diff --git a/sysdeps/ia64/sched_cpucount.c b/sysdeps/ia64/sched_cpucount.c
deleted file mode 100644
index 8440864b02..0000000000
--- a/sysdeps/ia64/sched_cpucount.c
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (C) 2007-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#define POPCNT(l) __builtin_popcountl (l)
-
-#include <posix/sched_cpucount.c>
diff --git a/sysdeps/powerpc/sched_cpucount.c b/sysdeps/powerpc/sched_cpucount.c
deleted file mode 100644
index 8f00e3dbc8..0000000000
--- a/sysdeps/powerpc/sched_cpucount.c
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (C) 2007-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifdef _ARCH_PWR5
-# define POPCNT(l) __builtin_popcountl (l)
-#endif
-
-#include <posix/sched_cpucount.c>
diff --git a/sysdeps/x86_64/multiarch/sched_cpucount.c b/sysdeps/x86_64/multiarch/sched_cpucount.c
deleted file mode 100644
index 5180a11434..0000000000
--- a/sysdeps/x86_64/multiarch/sched_cpucount.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Count bits in CPU set.  x86-64 multi-arch version.
-   This file is part of the GNU C Library.
-   Copyright (C) 2008-2021 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sched.h>
-#include "init-arch.h"
-
-#define __sched_cpucount static generic_cpucount
-#include <posix/sched_cpucount.c>
-#undef __sched_cpucount
-
-#define POPCNT(l) \
-  ({ __cpu_mask r; \
-     asm ("popcnt %1, %0" : "=r" (r) : "0" (l));\
-     r; })
-#define __sched_cpucount static popcount_cpucount
-#include <posix/sched_cpucount.c>
-#undef __sched_cpucount
-
-libc_ifunc (__sched_cpucount,
-	    CPU_FEATURE_USABLE (POPCNT) ? popcount_cpucount : generic_cpucount);
diff --git a/sysdeps/x86_64/sched_cpucount.c b/sysdeps/x86_64/sched_cpucount.c
deleted file mode 100644
index 5a27336d6d..0000000000
--- a/sysdeps/x86_64/sched_cpucount.c
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (C) 2007-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifdef __amdfam10
-# define POPCNT(l) \
-  ({ __cpu_mask r;							      \
-     asm ("popcntq %1, %0" : "=r" (r) : "0" (l));			      \
-     r; })
-#endif
-
-#include <posix/sched_cpucount.c>
  
Florian Weimer May 6, 2021, 1:43 p.m. UTC | #7
* Adhemerval Zanella:

> On 05/05/2021 15:25, Paul Eggert wrote:
>> On 5/5/21 10:28 AM, Florian Weimer via Libc-alpha wrote:
>>>> diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
>>>> index b0ca4ea7bc..529286e777 100644
>>>> --- a/posix/sched_cpucount.c
>>>> +++ b/posix/sched_cpucount.c
>>>> @@ -22,31 +22,11 @@ int
>>>>   __sched_cpucount (size_t setsize, const cpu_set_t *setp)
>>>>   {
>>>>     int s = 0;
>>>> +  for (int i = 0; i < setsize / sizeof (__cpu_mask); i++)
>>>>       {
>>>> +      __cpu_mask si = setp->__bits[i];
>>>> +      /* Clear the least significant bit set.  */
>>>> +      for (; si != 0; si &= si - 1, s++);
>>>>       }
>>>> -
>>>>     return s;
>>>>   }
>>> Why “si”?  It think si &= si - 1 clears the*most*  significant bit in
>>> si.  If you agree, please update the comment.
>> 
>> Better yet, define a static function 'popcount' that uses Kernighan's trick and call that function. As things stand it's not obvious what the code is doing, regardless of which bit it's clearing. The function's comment should explain why it's not using __builtin_popcount.
>
> What about the below:
>
> diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
> index b0ca4ea7bc..6cb5c4337e 100644
> --- a/posix/sched_cpucount.c
> +++ b/posix/sched_cpucount.c
> @@ -17,36 +17,21 @@
>  
>  #include <sched.h>
>  
> +/* Counting bits set, Brian Kernighan's way  */
> +static inline unsigned int
> +countbits (__cpu_mask v)
> +{
> +  unsigned int s = 0;
> +  for (; v != 0; s++)
> +    v &= v - 1;
> +  return s;
> +}

I get that choosing the exact matching builtin for the __cpu_mask type
isn't easy, but wouldn't it be safe to use __builtin_popcountll
unconditionally?

Thanks,
Florian
  
Adhemerval Zanella May 6, 2021, 4:16 p.m. UTC | #8
On 06/05/2021 10:43, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> On 05/05/2021 15:25, Paul Eggert wrote:
>>> On 5/5/21 10:28 AM, Florian Weimer via Libc-alpha wrote:
>>>>> diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
>>>>> index b0ca4ea7bc..529286e777 100644
>>>>> --- a/posix/sched_cpucount.c
>>>>> +++ b/posix/sched_cpucount.c
>>>>> @@ -22,31 +22,11 @@ int
>>>>>   __sched_cpucount (size_t setsize, const cpu_set_t *setp)
>>>>>   {
>>>>>     int s = 0;
>>>>> +  for (int i = 0; i < setsize / sizeof (__cpu_mask); i++)
>>>>>       {
>>>>> +      __cpu_mask si = setp->__bits[i];
>>>>> +      /* Clear the least significant bit set.  */
>>>>> +      for (; si != 0; si &= si - 1, s++);
>>>>>       }
>>>>> -
>>>>>     return s;
>>>>>   }
>>>> Why “si”?  It think si &= si - 1 clears the*most*  significant bit in
>>>> si.  If you agree, please update the comment.
>>>
>>> Better yet, define a static function 'popcount' that uses Kernighan's trick and call that function. As things stand it's not obvious what the code is doing, regardless of which bit it's clearing. The function's comment should explain why it's not using __builtin_popcount.
>>
>> What about the below:
>>
>> diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
>> index b0ca4ea7bc..6cb5c4337e 100644
>> --- a/posix/sched_cpucount.c
>> +++ b/posix/sched_cpucount.c
>> @@ -17,36 +17,21 @@
>>  
>>  #include <sched.h>
>>  
>> +/* Counting bits set, Brian Kernighan's way  */
>> +static inline unsigned int
>> +countbits (__cpu_mask v)
>> +{
>> +  unsigned int s = 0;
>> +  for (; v != 0; s++)
>> +    v &= v - 1;
>> +  return s;
>> +}
> 
> I get that choosing the exact matching builtin for the __cpu_mask type
> isn't easy, but wouldn't it be safe to use __builtin_popcountll
> unconditionally?

Using a open-coded routine is slight better for architectures that
do not have a popcount instruction, since __builtin_popcountll will
call the libgcc routine).  But I hardly think we need that amount
of micro-optimization for such routine.
  
Florian Weimer May 6, 2021, 4:42 p.m. UTC | #9
* Adhemerval Zanella:

>> I get that choosing the exact matching builtin for the __cpu_mask type
>> isn't easy, but wouldn't it be safe to use __builtin_popcountll
>> unconditionally?
>
> Using a open-coded routine is slight better for architectures that
> do not have a popcount instruction, since __builtin_popcountll will
> call the libgcc routine).  But I hardly think we need that amount
> of micro-optimization for such routine.

Ahh, and we'll get a check-localplt failure.  Hmm.  So mention that in
the comment?

Thanks,
Florian
  
Adhemerval Zanella May 6, 2021, 4:54 p.m. UTC | #10
On 06/05/2021 13:42, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>>> I get that choosing the exact matching builtin for the __cpu_mask type
>>> isn't easy, but wouldn't it be safe to use __builtin_popcountll
>>> unconditionally?
>>
>> Using a open-coded routine is slight better for architectures that
>> do not have a popcount instruction, since __builtin_popcountll will
>> call the libgcc routine).  But I hardly think we need that amount
>> of micro-optimization for such routine.
> 
> Ahh, and we'll get a check-localplt failure.  Hmm.  So mention that in
> the comment?
The check-localplt issue only occurs when using compiler generated symbols 
if the libgcc function itself calls a glibc symbol (for instance arm div
soft-fp might call raise). In this case the call will be local, the
issue is more performance-wise.
  
Paul Eggert May 6, 2021, 5:12 p.m. UTC | #11
On 5/6/21 6:33 AM, Adhemerval Zanella wrote:
> +/* Counting bits set, Brian Kernighan's way  */
> +static inline unsigned int
> +countbits (__cpu_mask v)
> +{
> +  unsigned int s = 0;
> +  for (; v != 0; s++)
> +    v &= v - 1;
> +  return s;
> +}

Looks OK, except please use 'int' rather than 'unsigned int' (twice) as 
we should prefer signed to unsigned where either will do and the caller 
wants plain int anyway.

More important, please add a comment saying why we're not using the GCC 
builtin (summarizing Florian's and your followup remarks) since it is a 
little odd to see code that simply replicates a compiler builtin (what's 
next, we'll forgo multiplication in favor of repeated addition? :-).
  
Adhemerval Zanella May 6, 2021, 5:51 p.m. UTC | #12
On 06/05/2021 14:12, Paul Eggert wrote:
> On 5/6/21 6:33 AM, Adhemerval Zanella wrote:
>> +/* Counting bits set, Brian Kernighan's way  */
>> +static inline unsigned int
>> +countbits (__cpu_mask v)
>> +{
>> +  unsigned int s = 0;
>> +  for (; v != 0; s++)
>> +    v &= v - 1;
>> +  return s;
>> +}
> 
> Looks OK, except please use 'int' rather than 'unsigned int' (twice) as we should prefer signed to unsigned where either will do and the caller wants plain int anyway.

The unsigned int is more to avoid the cmovcc on x86_64
https://godbolt.org/z/4MrMq1zKb.  But I do prefer int
in this case as well.

> 
> More important, please add a comment saying why we're not using the GCC builtin (summarizing Florian's and your followup remarks) since it is a little odd to see code that simply replicates a compiler builtin (what's next, we'll forgo multiplication in favor of repeated addition? :-).

This popcount is trick, even gcc is not really sure what do do
on every architecture [1].

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92135
  
Noah Goldstein May 6, 2021, 6:34 p.m. UTC | #13
On Thu, May 6, 2021 at 8:22 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 05/05/2021 16:52, Noah Goldstein via Libc-alpha wrote:
> > On Wed, May 5, 2021 at 12:32 PM Paul Eggert <eggert@cs.ucla.edu> wrote:
> >>
> >> On 5/5/21 10:28 AM, Florian Weimer via Libc-alpha wrote:
> >>>> diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
> >>>> index b0ca4ea7bc..529286e777 100644
> >>>> --- a/posix/sched_cpucount.c
> >>>> +++ b/posix/sched_cpucount.c
> >>>> @@ -22,31 +22,11 @@ int
> >>>>   __sched_cpucount (size_t setsize, const cpu_set_t *setp)
> >>>>   {
> >>>>     int s = 0;
> >>>> +  for (int i = 0; i < setsize / sizeof (__cpu_mask); i++)
> >>>>       {
> >>>> +      __cpu_mask si = setp->__bits[i];
> >>>> +      /* Clear the least significant bit set.  */
> >>>> +      for (; si != 0; si &= si - 1, s++);
> >>>>       }
> >>>> -
> >>>>     return s;
> >>>>   }
> >>> Why “si”?  It think si &= si - 1 clears the*most*  significant bit in
> >>> si.  If you agree, please update the comment.
> >>
> >> Better yet, define a static function 'popcount' that uses Kernighan's
> >> trick and call that function. As things stand it's not obvious what the
> >> code is doing, regardless of which bit it's clearing. The function's
> >> comment should explain why it's not using __builtin_popcount.
> >
> > It looks to me like GCC is still having a bit of trouble with the new
> > implementation. With skylake as a target it seems to be throwing
> > in a cmovcc in the outer loop:
> >
> > https://godbolt.org/z/ocn1KWGPx
> >
>
> It seems that come from using a signed int as the counter, I got a slight
> better version using unsigned: https://godbolt.org/z/4MrMq1zKb
>
> It is not as better as the builtin, but do we really need that
> micro-optimization in this specific implementation? This is exactly what
> added the current complexity and using a open-coded popcount implementation
> is slight better on architectures that do not provided such instruction
> (where builtin_popcount will issue a call to __popcountsi2 and gcc seems
> to aiming for open code such scenarios [1]).
>
> In any case I would go for simplicity (and the builtin requires know the
> underlying type size, which requires the ugly size of check to call the
> correct one).
>
> [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92135

I agree. Definetly not opposed to the patch.
  

Patch

diff --git a/posix/sched_cpucount.c b/posix/sched_cpucount.c
index b0ca4ea7bc..529286e777 100644
--- a/posix/sched_cpucount.c
+++ b/posix/sched_cpucount.c
@@ -22,31 +22,11 @@  int
 __sched_cpucount (size_t setsize, const cpu_set_t *setp)
 {
   int s = 0;
-  const __cpu_mask *p = setp->__bits;
-  const __cpu_mask *end = &setp->__bits[setsize / sizeof (__cpu_mask)];
-
-  while (p < end)
+  for (int i = 0; i < setsize / sizeof (__cpu_mask); i++)
     {
-      __cpu_mask l = *p++;
-
-#ifdef POPCNT
-      s += POPCNT (l);
-#else
-      if (l == 0)
-	continue;
-
-      _Static_assert (sizeof (l) == sizeof (unsigned int)
-		      || sizeof (l) == sizeof (unsigned long)
-		      || sizeof (l) == sizeof (unsigned long long),
-		      "sizeof (__cpu_mask");
-      if (sizeof (__cpu_mask) == sizeof (unsigned int))
-	s += __builtin_popcount (l);
-      else if (sizeof (__cpu_mask) == sizeof (unsigned long))
-	s += __builtin_popcountl (l);
-      else
-	s += __builtin_popcountll (l);
-#endif
+      __cpu_mask si = setp->__bits[i];
+      /* Clear the least significant bit set.  */
+      for (; si != 0; si &= si - 1, s++);
     }
-
   return s;
 }
diff --git a/sysdeps/i386/i686/multiarch/sched_cpucount.c b/sysdeps/i386/i686/multiarch/sched_cpucount.c
deleted file mode 100644
index 7db31b02f8..0000000000
--- a/sysdeps/i386/i686/multiarch/sched_cpucount.c
+++ /dev/null
@@ -1 +0,0 @@ 
-#include <sysdeps/x86_64/multiarch/sched_cpucount.c>
diff --git a/sysdeps/ia64/sched_cpucount.c b/sysdeps/ia64/sched_cpucount.c
deleted file mode 100644
index 8440864b02..0000000000
--- a/sysdeps/ia64/sched_cpucount.c
+++ /dev/null
@@ -1,20 +0,0 @@ 
-/* Copyright (C) 2007-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#define POPCNT(l) __builtin_popcountl (l)
-
-#include <posix/sched_cpucount.c>
diff --git a/sysdeps/powerpc/sched_cpucount.c b/sysdeps/powerpc/sched_cpucount.c
deleted file mode 100644
index 8f00e3dbc8..0000000000
--- a/sysdeps/powerpc/sched_cpucount.c
+++ /dev/null
@@ -1,22 +0,0 @@ 
-/* Copyright (C) 2007-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifdef _ARCH_PWR5
-# define POPCNT(l) __builtin_popcountl (l)
-#endif
-
-#include <posix/sched_cpucount.c>
diff --git a/sysdeps/x86_64/multiarch/sched_cpucount.c b/sysdeps/x86_64/multiarch/sched_cpucount.c
deleted file mode 100644
index 5180a11434..0000000000
--- a/sysdeps/x86_64/multiarch/sched_cpucount.c
+++ /dev/null
@@ -1,36 +0,0 @@ 
-/* Count bits in CPU set.  x86-64 multi-arch version.
-   This file is part of the GNU C Library.
-   Copyright (C) 2008-2021 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sched.h>
-#include "init-arch.h"
-
-#define __sched_cpucount static generic_cpucount
-#include <posix/sched_cpucount.c>
-#undef __sched_cpucount
-
-#define POPCNT(l) \
-  ({ __cpu_mask r; \
-     asm ("popcnt %1, %0" : "=r" (r) : "0" (l));\
-     r; })
-#define __sched_cpucount static popcount_cpucount
-#include <posix/sched_cpucount.c>
-#undef __sched_cpucount
-
-libc_ifunc (__sched_cpucount,
-	    CPU_FEATURE_USABLE (POPCNT) ? popcount_cpucount : generic_cpucount);
diff --git a/sysdeps/x86_64/sched_cpucount.c b/sysdeps/x86_64/sched_cpucount.c
deleted file mode 100644
index 5a27336d6d..0000000000
--- a/sysdeps/x86_64/sched_cpucount.c
+++ /dev/null
@@ -1,25 +0,0 @@ 
-/* Copyright (C) 2007-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifdef __amdfam10
-# define POPCNT(l) \
-  ({ __cpu_mask r;							      \
-     asm ("popcntq %1, %0" : "=r" (r) : "0" (l));			      \
-     r; })
-#endif
-
-#include <posix/sched_cpucount.c>