[v3,3/7] stdlib: Optimization qsort{_r} swap implementation (BZ #19305)

Message ID: 20210903171144.952737-4-adhemerval.zanella@linaro.org
State: Superseded
Series: Use introsort for qsort

Checks

Context: dj/TryBot-apply_patch
Check: success
Description: Patch applied to master at the time it was sent

Commit Message

Adhemerval Zanella Sept. 3, 2021, 5:11 p.m. UTC
  The optimization takes into consideration that the most common element
sizes are either 32 or 64 bits [1] and that inputs are aligned to the
word boundary.  This is similar to the optimization done on lib/sort.c
from Linux.

This patch adds an optimized swap operation to qsort based on the
previous msort one.  Instead of a byte-wise operation, three variants
are provided:

  1. Using uint32_t loads and stores.
  2. Using uint64_t loads and stores.
  3. A generic one with a temporary buffer and memcpy/mempcpy.

Options 1. and 2. are selected only if the architecture defines
_STRING_ARCH_unaligned or if the base pointer is aligned to the
required type.

It also fixes BZ#19305 by checking the input size against an element
count of 1 in addition to 0.

Checked on x86_64-linux-gnu.

[1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html
---
 stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 91 insertions(+), 18 deletions(-)
  

Comments

Noah Goldstein Oct. 13, 2021, 3:29 a.m. UTC | #1
On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha <
libc-alpha@sourceware.org> wrote:

> It optimizes take in consideration both the most common elements are
> either 32 or 64 bit in size [1] and inputs are aligned to the word
> boundary.  This is similar to the optimization done on lib/sort.c
> from Linux.
>
> This patchs adds an optimized swap operation on qsort based in previous
> msort one.  Instead of byte operation, three variants are provided:
>
>   1. Using uint32_t loads and stores.
>   2. Using uint64_t loads and stores.
>   3. Generic one with a temporary buffer and memcpy/mempcpy.
>
> The 1. and 2. options are selected only either if architecture defines
> _STRING_ARCH_unaligned or if base pointer is aligned to required type.
>
> It also fixes BZ#19305 by checking input size against number of
> elements 1 besides 0.
>
> Checked on x86_64-linux-gnu.
>
> [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html
> ---
>  stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 91 insertions(+), 18 deletions(-)
>
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 23f2d28314..59458d151b 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -24,20 +24,85 @@
>  #include <limits.h>
>  #include <stdlib.h>
>  #include <string.h>
> +#include <stdbool.h>
>
> -/* Byte-wise swap two items of size SIZE. */
> -#define SWAP(a, b, size)                                                     \
> -  do                                                                         \
> -    {                                                                        \
> -      size_t __size = (size);                                                \
> -      char *__a = (a), *__b = (b);                                           \
> -      do                                                                     \
> -       {                                                                     \
> -         char __tmp = *__a;                                                  \
> -         *__a++ = *__b;                                                      \
> -         *__b++ = __tmp;                                                     \
> -       } while (--__size > 0);                                               \
> -    } while (0)
> +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
> +   along the generic one as an optimization.  */
> +
> +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
> +
> +/* Return trues is elements can be copied used word load and sortes.
> +   The size must be a multiple of the alignment, and the base address.  */
> +static inline bool
> +is_aligned_to_copy (const void *base, size_t size, size_t align)
> +{
> +  unsigned char lsbits = size;
> +#if !_STRING_ARCH_unaligned
> +  lsbits |= (unsigned char)(uintptr_t) base;
> +#endif
> +  return (lsbits & (align - 1)) == 0;
> +}
> +
> +#define SWAP_WORDS_64 (swap_func_t)0
> +#define SWAP_WORDS_32 (swap_func_t)1
> +#define SWAP_BYTES    (swap_func_t)2
> +
> +static void
> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
> +{
> +  do
> +   {
> +     n -= 8;
> +     uint64_t t = *(uint64_t *)(a + n);
> +     *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
> +     *(uint64_t *)(b + n) = t;
> +   } while (n);
> +}
> +
> +static void
> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
> +{
> +  do
> +   {
> +     n -= 4;
> +     uint32_t t = *(uint32_t *)(a + n);
> +     *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
> +     *(uint32_t *)(b + n) = t;
> +   } while (n);
> +}
> +
> +static void
> +swap_bytes (void * restrict a, void * restrict b, size_t n)
> +{
> +  /* Use multiple small memcpys with constant size to enable inlining
> +     on most targets.  */
> +  enum { SWAP_GENERIC_SIZE = 32 };
> +  unsigned char tmp[SWAP_GENERIC_SIZE];
> +  while (n > SWAP_GENERIC_SIZE)
> +    {
> +      memcpy (tmp, a, SWAP_GENERIC_SIZE);
> +      a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
> +      b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
> +      n -= SWAP_GENERIC_SIZE;
> +    }
> +  memcpy (tmp, a, n);
> +  memcpy (a, b, n);
> +  memcpy (b, tmp, n);
> +}
> +
> +/* Replace the indirect call with a serie of if statements.  It should help
> +   the branch predictor.  */
>

1) Really? On Intel at least an indirect call that is always going to the
same place is certainly going to be predicted as well if not better than
2/3 branches + direct call.

2) If you're going to just test which swap function to use, why bother
initializing swap_func? Why not just use an int?
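(For concreteness, the int-selector alternative being asked about would look
roughly like the sketch below; the enum and its names are hypothetical, not
code from the patch or the thread.)

/* Sketch only: dispatch on a plain enum instead of sentinel function
   pointers; hypothetical names.  */
enum swap_kind { SWAP_KIND_WORDS_64, SWAP_KIND_WORDS_32, SWAP_KIND_BYTES };

static void
do_swap (void * restrict a, void * restrict b, size_t size,
         enum swap_kind kind)
{
  if (kind == SWAP_KIND_WORDS_64)
    swap_words_64 (a, b, size);
  else if (kind == SWAP_KIND_WORDS_32)
    swap_words_32 (a, b, size);
  else
    swap_bytes (a, b, size);
}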



> +static void
> +do_swap (void * restrict a, void * restrict b, size_t size,
> +        swap_func_t swap_func)
> +{
> +  if (swap_func == SWAP_WORDS_64)
> +    swap_words_64 (a, b, size);
> +  else if (swap_func == SWAP_WORDS_32)
> +    swap_words_32 (a, b, size);
> +  else
> +    swap_bytes (a, b, size);
> +}
>
>  /* Discontinue quicksort algorithm when partition gets below this size.
>     This particular magic number was chosen to work best on a Sun 4/260. */
> @@ -97,6 +162,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>      /* Avoid lossage with unsigned arithmetic below.  */
>      return;
>
> +  swap_func_t swap_func;
> +  if (is_aligned_to_copy (pbase, size, 8))
> +    swap_func = SWAP_WORDS_64;
> +  else if (is_aligned_to_copy (pbase, size, 4))
> +    swap_func = SWAP_WORDS_32;
> +  else
> +    swap_func = SWAP_BYTES;
> +
>    if (total_elems > MAX_THRESH)
>      {
>        char *lo = base_ptr;
> @@ -120,13 +193,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>           char *mid = lo + size * ((hi - lo) / size >> 1);
>
>           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
> -           SWAP (mid, lo, size);
> +           do_swap (mid, lo, size, swap_func);
>           if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
> -           SWAP (mid, hi, size);
> +           do_swap (mid, hi, size, swap_func);
>           else
>             goto jump_over;
>           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
> -           SWAP (mid, lo, size);
> +           do_swap (mid, lo, size, swap_func);
>         jump_over:;
>
>           left_ptr  = lo + size;
> @@ -145,7 +218,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>
>               if (left_ptr < right_ptr)
>                 {
> -                 SWAP (left_ptr, right_ptr, size);
> +                 do_swap (left_ptr, right_ptr, size, swap_func);
>                   if (mid == left_ptr)
>                     mid = right_ptr;
>                   else if (mid == right_ptr)
> @@ -217,7 +290,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>          tmp_ptr = run_ptr;
>
>      if (tmp_ptr != base_ptr)
> -      SWAP (tmp_ptr, base_ptr, size);
> +      do_swap (tmp_ptr, base_ptr, size, swap_func);
>
>      /* Insertion sort, running from left-hand-side up to right-hand-side.  */
>
> --
> 2.30.2
>
>
  
Noah Goldstein Oct. 13, 2021, 3:39 a.m. UTC | #2
On Tue, Oct 12, 2021 at 11:29 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

>
>
> On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha <
> libc-alpha@sourceware.org> wrote:
>
>> It optimizes take in consideration both the most common elements are
>> either 32 or 64 bit in size [1] and inputs are aligned to the word
>> boundary.  This is similar to the optimization done on lib/sort.c
>> from Linux.
>>
>> This patchs adds an optimized swap operation on qsort based in previous
>> msort one.  Instead of byte operation, three variants are provided:
>>
>>   1. Using uint32_t loads and stores.
>>   2. Using uint64_t loads and stores.
>>   3. Generic one with a temporary buffer and memcpy/mempcpy.
>>
>> The 1. and 2. options are selected only either if architecture defines
>> _STRING_ARCH_unaligned or if base pointer is aligned to required type.
>>
>> It also fixes BZ#19305 by checking input size against number of
>> elements 1 besides 0.
>>
>> Checked on x86_64-linux-gnu.
>>
>> [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html
>> ---
>>  stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
>>  1 file changed, 91 insertions(+), 18 deletions(-)
>>
>> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>> index 23f2d28314..59458d151b 100644
>> --- a/stdlib/qsort.c
>> +++ b/stdlib/qsort.c
>> @@ -24,20 +24,85 @@
>>  #include <limits.h>
>>  #include <stdlib.h>
>>  #include <string.h>
>> +#include <stdbool.h>
>>
>> -/* Byte-wise swap two items of size SIZE. */
>> -#define SWAP(a, b, size)                                                     \
>> -  do                                                                         \
>> -    {                                                                        \
>> -      size_t __size = (size);                                                \
>> -      char *__a = (a), *__b = (b);                                           \
>> -      do                                                                     \
>> -       {                                                                     \
>> -         char __tmp = *__a;                                                  \
>> -         *__a++ = *__b;                                                      \
>> -         *__b++ = __tmp;                                                     \
>> -       } while (--__size > 0);                                               \
>> -    } while (0)
>> +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
>> +   along the generic one as an optimization.  */
>> +
>> +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
>> +
>> +/* Return trues is elements can be copied used word load and sortes.
>> +   The size must be a multiple of the alignment, and the base address.  */
>> +static inline bool
>> +is_aligned_to_copy (const void *base, size_t size, size_t align)
>> +{
>> +  unsigned char lsbits = size;
>> +#if !_STRING_ARCH_unaligned
>> +  lsbits |= (unsigned char)(uintptr_t) base;
>> +#endif
>> +  return (lsbits & (align - 1)) == 0;
>> +}
>> +
>> +#define SWAP_WORDS_64 (swap_func_t)0
>> +#define SWAP_WORDS_32 (swap_func_t)1
>> +#define SWAP_BYTES    (swap_func_t)2
>> +
>> +static void
>> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
>> +{
>> +  do
>> +   {
>> +     n -= 8;
>> +     uint64_t t = *(uint64_t *)(a + n);
>> +     *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
>> +     *(uint64_t *)(b + n) = t;
>> +   } while (n);
>> +}
>> +
>> +static void
>> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
>> +{
>> +  do
>> +   {
>> +     n -= 4;
>> +     uint32_t t = *(uint32_t *)(a + n);
>> +     *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
>> +     *(uint32_t *)(b + n) = t;
>> +   } while (n);
>> +}
>>
>
I'm not certain swap_words_32 / swap_words_8 will be optimal for larger
key sizes.  Looking at GCC's implementation of swap_generic on modern
x86_64: https://godbolt.org/z/638h3Y9va
It's able to optimize the temporary buffer out of the loop and use xmm
registers, which will likely win out for larger sizes.

> +
>> +static void
>> +swap_bytes (void * restrict a, void * restrict b, size_t n)
>> +{
>> +  /* Use multiple small memcpys with constant size to enable inlining
>> +     on most targets.  */
>> +  enum { SWAP_GENERIC_SIZE = 32 };
>> +  unsigned char tmp[SWAP_GENERIC_SIZE];
>> +  while (n > SWAP_GENERIC_SIZE)
>> +    {
>> +      memcpy (tmp, a, SWAP_GENERIC_SIZE);
>> +      a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
>> +      b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
>> +      n -= SWAP_GENERIC_SIZE;
>> +    }
>> +  memcpy (tmp, a, n);
>> +  memcpy (a, b, n);
>> +  memcpy (b, tmp, n);
>> +}
>> +
>> +/* Replace the indirect call with a serie of if statements.  It should help
>> +   the branch predictor.  */
>>
>
> 1) Really? On Intel at least an indirect call that is always going to the
> same place
> is certainly going to be predicted as well if not better than 2/3
> branches + direct call.
>
> 2) If you're going to just test which swap function to use, why
> bother initializing
> swap_func? Why not just use an int?
>
>
>
>> +static void
>> +do_swap (void * restrict a, void * restrict b, size_t size,
>> +        swap_func_t swap_func)
>> +{
>> +  if (swap_func == SWAP_WORDS_64)
>> +    swap_words_64 (a, b, size);
>> +  else if (swap_func == SWAP_WORDS_32)
>> +    swap_words_32 (a, b, size);
>> +  else
>> +    swap_bytes (a, b, size);
>> +}
>>
>>  /* Discontinue quicksort algorithm when partition gets below this size.
>>     This particular magic number was chosen to work best on a Sun 4/260.
>> */
>> @@ -97,6 +162,14 @@ _quicksort (void *const pbase, size_t total_elems,
>> size_t size,
>>      /* Avoid lossage with unsigned arithmetic below.  */
>>      return;
>>
>> +  swap_func_t swap_func;
>> +  if (is_aligned_to_copy (pbase, size, 8))
>> +    swap_func = SWAP_WORDS_64;
>> +  else if (is_aligned_to_copy (pbase, size, 4))
>>
>
For many modern architectures that support fast unaligned loads/stores
(for example x86_64 SnB and newer) I don't think this check really makes
sense.


> +    swap_func = SWAP_WORDS_32;
>> +  else
>> +    swap_func = SWAP_BYTES;
>> +
>>    if (total_elems > MAX_THRESH)
>>      {
>>        char *lo = base_ptr;
>> @@ -120,13 +193,13 @@ _quicksort (void *const pbase, size_t total_elems,
>> size_t size,
>>           char *mid = lo + size * ((hi - lo) / size >> 1);
>>
>>           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
>> -           SWAP (mid, lo, size);
>> +           do_swap (mid, lo, size, swap_func);
>>           if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
>> -           SWAP (mid, hi, size);
>> +           do_swap (mid, hi, size, swap_func);
>>           else
>>             goto jump_over;
>>           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
>> -           SWAP (mid, lo, size);
>> +           do_swap (mid, lo, size, swap_func);
>>         jump_over:;
>>
>>           left_ptr  = lo + size;
>> @@ -145,7 +218,7 @@ _quicksort (void *const pbase, size_t total_elems,
>> size_t size,
>>
>>               if (left_ptr < right_ptr)
>>                 {
>> -                 SWAP (left_ptr, right_ptr, size);
>> +                 do_swap (left_ptr, right_ptr, size, swap_func);
>>                   if (mid == left_ptr)
>>                     mid = right_ptr;
>>                   else if (mid == right_ptr)
>> @@ -217,7 +290,7 @@ _quicksort (void *const pbase, size_t total_elems,
>> size_t size,
>>          tmp_ptr = run_ptr;
>>
>>      if (tmp_ptr != base_ptr)
>> -      SWAP (tmp_ptr, base_ptr, size);
>> +      do_swap (tmp_ptr, base_ptr, size, swap_func);
>>
>>      /* Insertion sort, running from left-hand-side up to
>> right-hand-side.  */
>>
>> --
>> 2.30.2
>>
>>
  
Adhemerval Zanella Oct. 15, 2021, 1:12 p.m. UTC | #3
On 13/10/2021 00:29, Noah Goldstein wrote:
>     +static void
>     +swap_bytes (void * restrict a, void * restrict b, size_t n)
>     +{
>     +  /* Use multiple small memcpys with constant size to enable inlining
>     +     on most targets.  */
>     +  enum { SWAP_GENERIC_SIZE = 32 };
>     +  unsigned char tmp[SWAP_GENERIC_SIZE];
>     +  while (n > SWAP_GENERIC_SIZE)
>     +    {
>     +      memcpy (tmp, a, SWAP_GENERIC_SIZE);
>     +      a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
>     +      b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
>     +      n -= SWAP_GENERIC_SIZE;
>     +    }
>     +  memcpy (tmp, a, n);
>     +  memcpy (a, b, n);
>     +  memcpy (b, tmp, n);
>     +}
>     +
>     +/* Replace the indirect call with a serie of if statements.  It should help
>     +   the branch predictor.  */
> 
>  
> 1) Really? On Intel at least an indirect call that is always going to the same place
> is certainly going to be predicted as well if not better than 2/3 branches + direct call.
> 

I shamelessly copied the same strategy the Linux kernel uses in its lib/sort.c
(8fb583c4258d08f0).  Maybe Linux's internal usage of its qsort() leads to
more predictable branches, and for this change I would prefer to work
well on different architectures rather than assume a specific one.

> 2) If you're going to just test which swap function to use, why bother initializing
> swap_func? Why not just use an int?

Indeed there is not much gain for glibc usage.  The kernel provides an API with
a user-defined swap_func, which is not our case.
  
Adhemerval Zanella Oct. 15, 2021, 1:29 p.m. UTC | #4
On 13/10/2021 00:39, Noah Goldstein wrote:
> 
> 
> On Tue, Oct 12, 2021 at 11:29 PM Noah Goldstein <goldstein.w.n@gmail.com <mailto:goldstein.w.n@gmail.com>> wrote:
> 
> 
> 
>     On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org <mailto:libc-alpha@sourceware.org>> wrote:
> 
>         It optimizes take in consideration both the most common elements are
>         either 32 or 64 bit in size [1] and inputs are aligned to the word
>         boundary.  This is similar to the optimization done on lib/sort.c
>         from Linux.
> 
>         This patchs adds an optimized swap operation on qsort based in previous
>         msort one.  Instead of byte operation, three variants are provided:
> 
>           1. Using uint32_t loads and stores.
>           2. Using uint64_t loads and stores.
>           3. Generic one with a temporary buffer and memcpy/mempcpy.
> 
>         The 1. and 2. options are selected only either if architecture defines
>         _STRING_ARCH_unaligned or if base pointer is aligned to required type.
> 
>         It also fixes BZ#19305 by checking input size against number of
>         elements 1 besides 0.
> 
>         Checked on x86_64-linux-gnu.
> 
>         [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html <https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html>
>         ---
>          stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
>          1 file changed, 91 insertions(+), 18 deletions(-)
> 
>         diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>         index 23f2d28314..59458d151b 100644
>         --- a/stdlib/qsort.c
>         +++ b/stdlib/qsort.c
>         @@ -24,20 +24,85 @@
>          #include <limits.h>
>          #include <stdlib.h>
>          #include <string.h>
>         +#include <stdbool.h>
> 
>         -/* Byte-wise swap two items of size SIZE. */
>         -#define SWAP(a, b, size)                                                     \
>         -  do                                                                         \
>         -    {                                                                        \
>         -      size_t __size = (size);                                                \
>         -      char *__a = (a), *__b = (b);                                           \
>         -      do                                                                     \
>         -       {                                                                     \
>         -         char __tmp = *__a;                                                  \
>         -         *__a++ = *__b;                                                      \
>         -         *__b++ = __tmp;                                                     \
>         -       } while (--__size > 0);                                               \
>         -    } while (0)
>         +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
>         +   along the generic one as an optimization.  */
>         +
>         +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
>         +
>         +/* Return trues is elements can be copied used word load and sortes.
>         +   The size must be a multiple of the alignment, and the base address.  */
>         +static inline bool
>         +is_aligned_to_copy (const void *base, size_t size, size_t align)
>         +{
>         +  unsigned char lsbits = size;
>         +#if !_STRING_ARCH_unaligned
>         +  lsbits |= (unsigned char)(uintptr_t) base;
>         +#endif
>         +  return (lsbits & (align - 1)) == 0;
>         +}
>         +
>         +#define SWAP_WORDS_64 (swap_func_t)0
>         +#define SWAP_WORDS_32 (swap_func_t)1
>         +#define SWAP_BYTES    (swap_func_t)2
>         +
>         +static void
>         +swap_words_64 (void * restrict a, void * restrict b, size_t n)
>         +{
>         +  do
>         +   {
>         +     n -= 8;
>         +     uint64_t t = *(uint64_t *)(a + n);
>         +     *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
>         +     *(uint64_t *)(b + n) = t;
>         +   } while (n);
>         +}
>         +
>         +static void
>         +swap_words_32 (void * restrict a, void * restrict b, size_t n)
>         +{
>         +  do
>         +   {
>         +     n -= 4;
>         +     uint32_t t = *(uint32_t *)(a + n);
>         +     *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
>         +     *(uint32_t *)(b + n) = t;
>         +   } while (n);
>         +}
> 
> 
> I'm not certain swap_words_32 / swap_words_8 will be optimal for larger
> key sizes. Looking at GCC's implementation of swap_generic on modern x86_64: 
> https://godbolt.org/z/638h3Y9va <https://godbolt.org/z/638h3Y9va>
> It's able to optimize the temporary buffer out of the loop and use xmm registers which
> will likely win out for larger sizes. 

It is probably not the most optimized code the compiler can generate, since
I tried to go for more generic code that should result in somewhat
better code in an architecture-agnostic way.  I am trying to mimic some
of the optimizations Linux did in 37d0ec34d111acfdb, and for swap_bytes I
used the same strategy as d3496c9f4f27d (Linux did not optimize
anything for the byte version).

I think it would be possible to tune it for a specific architecture
and/or compiler, but I would prefer to use a good enough algorithm that
works reasonably well on multiple architectures.
  
Noah Goldstein Oct. 15, 2021, 4:45 p.m. UTC | #5
On Fri, Oct 15, 2021 at 8:12 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 13/10/2021 00:29, Noah Goldstein wrote:
> >     +static void
> >     +swap_bytes (void * restrict a, void * restrict b, size_t n)
> >     +{
> >     +  /* Use multiple small memcpys with constant size to enable inlining
> >     +     on most targets.  */
> >     +  enum { SWAP_GENERIC_SIZE = 32 };
> >     +  unsigned char tmp[SWAP_GENERIC_SIZE];
> >     +  while (n > SWAP_GENERIC_SIZE)
> >     +    {
> >     +      memcpy (tmp, a, SWAP_GENERIC_SIZE);
> >     +      a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
> >     +      b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
> >     +      n -= SWAP_GENERIC_SIZE;
> >     +    }
> >     +  memcpy (tmp, a, n);
> >     +  memcpy (a, b, n);
> >     +  memcpy (b, tmp, n);
> >     +}
> >     +
> >     +/* Replace the indirect call with a serie of if statements.  It should help
> >     +   the branch predictor.  */
> >
> >
> > 1) Really? On Intel at least an indirect call that is always going to the same place
> > is certainly going to be predicted as well if not better than 2/3 branches + direct call.
> >
>
> I shamelessly copy the same strategy Linux kernel used on its lib/sort.c
> (8fb583c4258d08f0).  Maybe Linux internal usage of its qsort() leads to
> better predictable branch, and for this change I would prefer to work
> better on different architectures than assume an specific one.

If it's running in the kernel, Spectre mitigations can make indirect jumps
particularly expensive.  I don't think we have the same issue targeting
userland.

>
> > 2) If you're going to just test which swap function to use, why bother initializing
> > swap_func? Why not just use an int?
>
> Indeed this is no much gain on glibc usage.  The kernel provides a API to use
> used-defined swap_func, which is not our case.
>
  
Noah Goldstein Oct. 15, 2021, 5:17 p.m. UTC | #6
On Fri, Oct 15, 2021 at 8:29 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 13/10/2021 00:39, Noah Goldstein wrote:
> >
> >
> > On Tue, Oct 12, 2021 at 11:29 PM Noah Goldstein <goldstein.w.n@gmail.com <mailto:goldstein.w.n@gmail.com>> wrote:
> >
> >
> >
> >     On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org <mailto:libc-alpha@sourceware.org>> wrote:
> >
> >         It optimizes take in consideration both the most common elements are
> >         either 32 or 64 bit in size [1] and inputs are aligned to the word
> >         boundary.  This is similar to the optimization done on lib/sort.c
> >         from Linux.
> >
> >         This patchs adds an optimized swap operation on qsort based in previous
> >         msort one.  Instead of byte operation, three variants are provided:
> >
> >           1. Using uint32_t loads and stores.
> >           2. Using uint64_t loads and stores.
> >           3. Generic one with a temporary buffer and memcpy/mempcpy.
> >
> >         The 1. and 2. options are selected only either if architecture defines
> >         _STRING_ARCH_unaligned or if base pointer is aligned to required type.
> >
> >         It also fixes BZ#19305 by checking input size against number of
> >         elements 1 besides 0.
> >
> >         Checked on x86_64-linux-gnu.
> >
> >         [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html <https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html>
> >         ---
> >          stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
> >          1 file changed, 91 insertions(+), 18 deletions(-)
> >
> >         diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> >         index 23f2d28314..59458d151b 100644
> >         --- a/stdlib/qsort.c
> >         +++ b/stdlib/qsort.c
> >         @@ -24,20 +24,85 @@
> >          #include <limits.h>
> >          #include <stdlib.h>
> >          #include <string.h>
> >         +#include <stdbool.h>
> >
> >         -/* Byte-wise swap two items of size SIZE. */
> >         -#define SWAP(a, b, size)                                                     \
> >         -  do                                                                         \
> >         -    {                                                                        \
> >         -      size_t __size = (size);                                                \
> >         -      char *__a = (a), *__b = (b);                                           \
> >         -      do                                                                     \
> >         -       {                                                                     \
> >         -         char __tmp = *__a;                                                  \
> >         -         *__a++ = *__b;                                                      \
> >         -         *__b++ = __tmp;                                                     \
> >         -       } while (--__size > 0);                                               \
> >         -    } while (0)
> >         +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
> >         +   along the generic one as an optimization.  */
> >         +
> >         +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
> >         +
> >         +/* Return trues is elements can be copied used word load and sortes.
> >         +   The size must be a multiple of the alignment, and the base address.  */
> >         +static inline bool
> >         +is_aligned_to_copy (const void *base, size_t size, size_t align)
> >         +{
> >         +  unsigned char lsbits = size;
> >         +#if !_STRING_ARCH_unaligned
> >         +  lsbits |= (unsigned char)(uintptr_t) base;
> >         +#endif
> >         +  return (lsbits & (align - 1)) == 0;
> >         +}
> >         +
> >         +#define SWAP_WORDS_64 (swap_func_t)0
> >         +#define SWAP_WORDS_32 (swap_func_t)1
> >         +#define SWAP_BYTES    (swap_func_t)2
> >         +
> >         +static void
> >         +swap_words_64 (void * restrict a, void * restrict b, size_t n)
> >         +{
> >         +  do
> >         +   {
> >         +     n -= 8;
> >         +     uint64_t t = *(uint64_t *)(a + n);
> >         +     *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
> >         +     *(uint64_t *)(b + n) = t;
> >         +   } while (n);
> >         +}
> >         +
> >         +static void
> >         +swap_words_32 (void * restrict a, void * restrict b, size_t n)
> >         +{
> >         +  do
> >         +   {
> >         +     n -= 4;
> >         +     uint32_t t = *(uint32_t *)(a + n);
> >         +     *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
> >         +     *(uint32_t *)(b + n) = t;
> >         +   } while (n);
> >         +}
> >
> >
> > I'm not certain swap_words_32 / swap_words_8 will be optimal for larger
> > key sizes. Looking at GCC's implementation of swap_generic on modern x86_64:
> > https://godbolt.org/z/638h3Y9va <https://godbolt.org/z/638h3Y9va>
> > It's able to optimize the temporary buffer out of the loop and use xmm registers which
> > will likely win out for larger sizes.
>
> It is probably not the most optimized code compiler can generated since
> I tried to go for a more generic code that should results in a somewhat
> better code in a architecture agnostic code.  I trying to mimic some
> of optimization Linux did on 37d0ec34d111acfdb, and the swap_bytes I
> used the same strategy used on d3496c9f4f27d (Linux did not optimize
> anything for the byte version).
>
> I think it would be possible to tune it for an specific architecture
> and/or compiler, but I would prefer to use a good enough algorithm that
> work reasonable on multiple architectures.

That's fair. Although I still think there are some improvements.

Looking at the assembly for all three in fact it seems GCC optimizes all of them
to larger register copies: https://godbolt.org/z/bd9nnnoEY

The biggest difference seems to be the setup / register spills for
the generic version so for the common case of a relatively small key
the special case for 4/8 makes sense.

Have you checked that GCC is able to use the conditions for selecting
the swap function to optimize the functions themselves? In the godbolt
link I got reasonable value out of adding the invariants to swap_64/swap_32.

It also may be worth it to write a custom memcpy implementation for
size = 0..SWAP_GENERIC_SIZE so it can be inlined (and probably
more optimized than what generic memcpy can get away with).
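As a concrete reading of the "adding the invariants" suggestion (a sketch of
the idea only, not the code behind the godbolt link): swap_words_64 is only
selected when the size is a nonzero multiple of 8, and that fact can be
stated to the compiler so it is free to unroll or vectorize the loop, e.g.:

/* Sketch: swap_words_64 as in the patch, plus a hint encoding the
   selection invariant (N nonzero and a multiple of 8).  */
static void
swap_words_64 (void * restrict a, void * restrict b, size_t n)
{
  if (n == 0 || n % 8 != 0)
    __builtin_unreachable ();
  do
    {
      n -= 8;
      uint64_t t = *(uint64_t *) (a + n);
      *(uint64_t *) (a + n) = *(uint64_t *) (b + n);
      *(uint64_t *) (b + n) = t;
    }
  while (n);
}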
  
Adhemerval Zanella Oct. 15, 2021, 5:21 p.m. UTC | #7
On 15/10/2021 13:45, Noah Goldstein wrote:
> On Fri, Oct 15, 2021 at 8:12 AM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 13/10/2021 00:29, Noah Goldstein wrote:
>>>     +static void
>>>     +swap_bytes (void * restrict a, void * restrict b, size_t n)
>>>     +{
>>>     +  /* Use multiple small memcpys with constant size to enable inlining
>>>     +     on most targets.  */
>>>     +  enum { SWAP_GENERIC_SIZE = 32 };
>>>     +  unsigned char tmp[SWAP_GENERIC_SIZE];
>>>     +  while (n > SWAP_GENERIC_SIZE)
>>>     +    {
>>>     +      memcpy (tmp, a, SWAP_GENERIC_SIZE);
>>>     +      a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
>>>     +      b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
>>>     +      n -= SWAP_GENERIC_SIZE;
>>>     +    }
>>>     +  memcpy (tmp, a, n);
>>>     +  memcpy (a, b, n);
>>>     +  memcpy (b, tmp, n);
>>>     +}
>>>     +
>>>     +/* Replace the indirect call with a serie of if statements.  It should help
>>>     +   the branch predictor.  */
>>>
>>>
>>> 1) Really? On Intel at least an indirect call that is always going to the same place
>>> is certainly going to be predicted as well if not better than 2/3 branches + direct call.
>>>
>>
>> I shamelessly copy the same strategy Linux kernel used on its lib/sort.c
>> (8fb583c4258d08f0).  Maybe Linux internal usage of its qsort() leads to
>> better predictable branch, and for this change I would prefer to work
>> better on different architectures than assume an specific one.
> 
> If it's running in the Kernel spectre mitigations can make indirect jumps
> particularly expensive. I don't think we have the same issue targeting
> userland.

Indeed I didn't take this into consideration, and it does not seem to
help much in the glibc case.  I will use indirect calls.
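For illustration, the indirect-call variant would roughly amount to storing
real function pointers instead of the SWAP_WORDS_64/SWAP_WORDS_32/SWAP_BYTES
sentinels (a sketch only, not the v4 code; select_swap_func is a hypothetical
name):

/* Sketch: pick a swap function pointer once per qsort call and call
   through it directly at each swap site, dropping do_swap.  */
static swap_func_t
select_swap_func (const void *base, size_t size)
{
  if (is_aligned_to_copy (base, size, 8))
    return swap_words_64;
  if (is_aligned_to_copy (base, size, 4))
    return swap_words_32;
  return swap_bytes;
}

Call sites then become:  swap_func (mid, lo, size);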
  
Adhemerval Zanella Oct. 15, 2021, 5:34 p.m. UTC | #8
On 15/10/2021 14:17, Noah Goldstein wrote:
> On Fri, Oct 15, 2021 at 8:29 AM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 13/10/2021 00:39, Noah Goldstein wrote:
>>>
>>>
>>> On Tue, Oct 12, 2021 at 11:29 PM Noah Goldstein <goldstein.w.n@gmail.com <mailto:goldstein.w.n@gmail.com>> wrote:
>>>
>>>
>>>
>>>     On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org <mailto:libc-alpha@sourceware.org>> wrote:
>>>
>>>         It optimizes take in consideration both the most common elements are
>>>         either 32 or 64 bit in size [1] and inputs are aligned to the word
>>>         boundary.  This is similar to the optimization done on lib/sort.c
>>>         from Linux.
>>>
>>>         This patchs adds an optimized swap operation on qsort based in previous
>>>         msort one.  Instead of byte operation, three variants are provided:
>>>
>>>           1. Using uint32_t loads and stores.
>>>           2. Using uint64_t loads and stores.
>>>           3. Generic one with a temporary buffer and memcpy/mempcpy.
>>>
>>>         The 1. and 2. options are selected only either if architecture defines
>>>         _STRING_ARCH_unaligned or if base pointer is aligned to required type.
>>>
>>>         It also fixes BZ#19305 by checking input size against number of
>>>         elements 1 besides 0.
>>>
>>>         Checked on x86_64-linux-gnu.
>>>
>>>         [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html <https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html>
>>>         ---
>>>          stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
>>>          1 file changed, 91 insertions(+), 18 deletions(-)
>>>
>>>         diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>>>         index 23f2d28314..59458d151b 100644
>>>         --- a/stdlib/qsort.c
>>>         +++ b/stdlib/qsort.c
>>>         @@ -24,20 +24,85 @@
>>>          #include <limits.h>
>>>          #include <stdlib.h>
>>>          #include <string.h>
>>>         +#include <stdbool.h>
>>>
>>>         -/* Byte-wise swap two items of size SIZE. */
>>>         -#define SWAP(a, b, size)                                                     \
>>>         -  do                                                                         \
>>>         -    {                                                                        \
>>>         -      size_t __size = (size);                                                \
>>>         -      char *__a = (a), *__b = (b);                                           \
>>>         -      do                                                                     \
>>>         -       {                                                                     \
>>>         -         char __tmp = *__a;                                                  \
>>>         -         *__a++ = *__b;                                                      \
>>>         -         *__b++ = __tmp;                                                     \
>>>         -       } while (--__size > 0);                                               \
>>>         -    } while (0)
>>>         +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
>>>         +   along the generic one as an optimization.  */
>>>         +
>>>         +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
>>>         +
>>>         +/* Return trues is elements can be copied used word load and sortes.
>>>         +   The size must be a multiple of the alignment, and the base address.  */
>>>         +static inline bool
>>>         +is_aligned_to_copy (const void *base, size_t size, size_t align)
>>>         +{
>>>         +  unsigned char lsbits = size;
>>>         +#if !_STRING_ARCH_unaligned
>>>         +  lsbits |= (unsigned char)(uintptr_t) base;
>>>         +#endif
>>>         +  return (lsbits & (align - 1)) == 0;
>>>         +}
>>>         +
>>>         +#define SWAP_WORDS_64 (swap_func_t)0
>>>         +#define SWAP_WORDS_32 (swap_func_t)1
>>>         +#define SWAP_BYTES    (swap_func_t)2
>>>         +
>>>         +static void
>>>         +swap_words_64 (void * restrict a, void * restrict b, size_t n)
>>>         +{
>>>         +  do
>>>         +   {
>>>         +     n -= 8;
>>>         +     uint64_t t = *(uint64_t *)(a + n);
>>>         +     *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
>>>         +     *(uint64_t *)(b + n) = t;
>>>         +   } while (n);
>>>         +}
>>>         +
>>>         +static void
>>>         +swap_words_32 (void * restrict a, void * restrict b, size_t n)
>>>         +{
>>>         +  do
>>>         +   {
>>>         +     n -= 4;
>>>         +     uint32_t t = *(uint32_t *)(a + n);
>>>         +     *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
>>>         +     *(uint32_t *)(b + n) = t;
>>>         +   } while (n);
>>>         +}
>>>
>>>
>>> I'm not certain swap_words_32 / swap_words_8 will be optimal for larger
>>> key sizes. Looking at GCC's implementation of swap_generic on modern x86_64:
>>> https://godbolt.org/z/638h3Y9va <https://godbolt.org/z/638h3Y9va>
>>> It's able to optimize the temporary buffer out of the loop and use xmm registers which
>>> will likely win out for larger sizes.
>>
>> It is probably not the most optimized code compiler can generated since
>> I tried to go for a more generic code that should results in a somewhat
>> better code in a architecture agnostic code.  I trying to mimic some
>> of optimization Linux did on 37d0ec34d111acfdb, and the swap_bytes I
>> used the same strategy used on d3496c9f4f27d (Linux did not optimize
>> anything for the byte version).
>>
>> I think it would be possible to tune it for an specific architecture
>> and/or compiler, but I would prefer to use a good enough algorithm that
>> work reasonable on multiple architectures.
> 
> That's fair. Although I still think there are some improvements.
> 
> Looking at the assembly for all three in fact it seems GCC optimizes all of them
> to larger register copies: https://godbolt.org/z/bd9nnnoEY
> 
> The biggest difference seems to be the setup / register spills for
> the generic version so for the common case of a relatively small key
> the special case for 4/8 makes sense.
> 
> Have you checked that GCC is able to use the conditions for selecting
> the swap function to optimize the functions themselves? In the godbolt
> link I got reasonable value out of adding the invariants to swap_64/swap_32.

Not yet, but I think the generated code for both x86_64 and aarch64 seems
simple enough and should cover the common case (keys with size 4 or 8)
fast enough [1]. 

And since for v4 my plan is no longer to remove mergesort but to limit it
to a static buffer, it might be possible to use the same swap routines
for both mergesort and quicksort.

> 
> It also may be worth it to write a custom memcpy implementation for
> size = 0..SWAP_GENERIC_SIZE so it can be inlined (and probably
> more optimized than what generic memcpy can get away with).
> 

I *really* do not want to go this way.  My hope is that with small sizes
the compiler can inline the memcpy (which gcc does for most common
architectures).


[1] https://godbolt.org/z/v7e4xxqGa
  
Noah Goldstein Oct. 15, 2021, 5:45 p.m. UTC | #9
On Fri, Oct 15, 2021 at 12:34 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 15/10/2021 14:17, Noah Goldstein wrote:
> > On Fri, Oct 15, 2021 at 8:29 AM Adhemerval Zanella
> > <adhemerval.zanella@linaro.org> wrote:
> >>
> >>
> >>
> >> On 13/10/2021 00:39, Noah Goldstein wrote:
> >>>
> >>>
> >>> On Tue, Oct 12, 2021 at 11:29 PM Noah Goldstein <goldstein.w.n@gmail.com <mailto:goldstein.w.n@gmail.com>> wrote:
> >>>
> >>>
> >>>
> >>>     On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org <mailto:libc-alpha@sourceware.org>> wrote:
> >>>
> >>>         It optimizes take in consideration both the most common elements are
> >>>         either 32 or 64 bit in size [1] and inputs are aligned to the word
> >>>         boundary.  This is similar to the optimization done on lib/sort.c
> >>>         from Linux.
> >>>
> >>>         This patchs adds an optimized swap operation on qsort based in previous
> >>>         msort one.  Instead of byte operation, three variants are provided:
> >>>
> >>>           1. Using uint32_t loads and stores.
> >>>           2. Using uint64_t loads and stores.
> >>>           3. Generic one with a temporary buffer and memcpy/mempcpy.
> >>>
> >>>         The 1. and 2. options are selected only either if architecture defines
> >>>         _STRING_ARCH_unaligned or if base pointer is aligned to required type.
> >>>
> >>>         It also fixes BZ#19305 by checking input size against number of
> >>>         elements 1 besides 0.
> >>>
> >>>         Checked on x86_64-linux-gnu.
> >>>
> >>>         [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html <https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html>
> >>>         ---
> >>>          stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
> >>>          1 file changed, 91 insertions(+), 18 deletions(-)
> >>>
> >>>         diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> >>>         index 23f2d28314..59458d151b 100644
> >>>         --- a/stdlib/qsort.c
> >>>         +++ b/stdlib/qsort.c
> >>>         @@ -24,20 +24,85 @@
> >>>          #include <limits.h>
> >>>          #include <stdlib.h>
> >>>          #include <string.h>
> >>>         +#include <stdbool.h>
> >>>
> >>>         -/* Byte-wise swap two items of size SIZE. */
> >>>         -#define SWAP(a, b, size)                                                     \
> >>>         -  do                                                                         \
> >>>         -    {                                                                        \
> >>>         -      size_t __size = (size);                                                \
> >>>         -      char *__a = (a), *__b = (b);                                           \
> >>>         -      do                                                                     \
> >>>         -       {                                                                     \
> >>>         -         char __tmp = *__a;                                                  \
> >>>         -         *__a++ = *__b;                                                      \
> >>>         -         *__b++ = __tmp;                                                     \
> >>>         -       } while (--__size > 0);                                               \
> >>>         -    } while (0)
> >>>         +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
> >>>         +   along the generic one as an optimization.  */
> >>>         +
> >>>         +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
> >>>         +
> >>>         +/* Return trues is elements can be copied used word load and sortes.
> >>>         +   The size must be a multiple of the alignment, and the base address.  */
> >>>         +static inline bool
> >>>         +is_aligned_to_copy (const void *base, size_t size, size_t align)
> >>>         +{
> >>>         +  unsigned char lsbits = size;
> >>>         +#if !_STRING_ARCH_unaligned
> >>>         +  lsbits |= (unsigned char)(uintptr_t) base;
> >>>         +#endif
> >>>         +  return (lsbits & (align - 1)) == 0;
> >>>         +}
> >>>         +
> >>>         +#define SWAP_WORDS_64 (swap_func_t)0
> >>>         +#define SWAP_WORDS_32 (swap_func_t)1
> >>>         +#define SWAP_BYTES    (swap_func_t)2
> >>>         +
> >>>         +static void
> >>>         +swap_words_64 (void * restrict a, void * restrict b, size_t n)
> >>>         +{
> >>>         +  do
> >>>         +   {
> >>>         +     n -= 8;
> >>>         +     uint64_t t = *(uint64_t *)(a + n);
> >>>         +     *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
> >>>         +     *(uint64_t *)(b + n) = t;
> >>>         +   } while (n);
> >>>         +}
> >>>         +
> >>>         +static void
> >>>         +swap_words_32 (void * restrict a, void * restrict b, size_t n)
> >>>         +{
> >>>         +  do
> >>>         +   {
> >>>         +     n -= 4;
> >>>         +     uint32_t t = *(uint32_t *)(a + n);
> >>>         +     *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
> >>>         +     *(uint32_t *)(b + n) = t;
> >>>         +   } while (n);
> >>>         +}
> >>>
> >>>
> >>> I'm not certain swap_words_32 / swap_words_8 will be optimal for larger
> >>> key sizes. Looking at GCC's implementation of swap_generic on modern x86_64:
> >>> https://godbolt.org/z/638h3Y9va <https://godbolt.org/z/638h3Y9va>
> >>> It's able to optimize the temporary buffer out of the loop and use xmm registers which
> >>> will likely win out for larger sizes.
> >>
> >> It is probably not the most optimized code compiler can generated since
> >> I tried to go for a more generic code that should results in a somewhat
> >> better code in a architecture agnostic code.  I trying to mimic some
> >> of optimization Linux did on 37d0ec34d111acfdb, and the swap_bytes I
> >> used the same strategy used on d3496c9f4f27d (Linux did not optimize
> >> anything for the byte version).
> >>
> >> I think it would be possible to tune it for an specific architecture
> >> and/or compiler, but I would prefer to use a good enough algorithm that
> >> work reasonable on multiple architectures.
> >
> > That's fair. Although I still think there are some improvements.
> >
> > Looking at the assembly for all three in fact it seems GCC optimizes all of them
> > to larger register copies: https://godbolt.org/z/bd9nnnoEY
> >
> > The biggest difference seems to be the setup / register spills for
> > the generic version so for the common case of a relatively small key
> > the special case for 4/8 makes sense.
> >
> > Have you checked that GCC is able to use the conditions for selecting
> > the swap function to optimize the functions themselves? In the godbolt
> > link I got reasonable value out of adding the invariants to swap_64/swap_32.
>
> Not yet, but I think the generated code for both x86_64 and aarch64 seems
> simple enough and should cover the common case (keys with size 4 or 8)
> fast enough [1].

swap is in the inner loop.  It seems like a pretty critical component to have
fully optimized.  The aarch64 version looks good, but the x86_64 version seems
to be lacking.  I'm not arguing for an arch-specific version, but if the
directives can add value to x86_64 without detracting from aarch64, it seems
like a zero-cost improvement.

>
> And since for v4 my plan is not to remove mergesort anymore, but limit it
> to a static buffer; it might be possible to use the same swap routines
> for both mergesort and quicksort.
>
> >
> > It also may be worth it to write a custom memcpy implementation for
> > size = 0..SWAP_GENERIC_SIZE so it can be inlined (and probably
> > more optimized than what generic memcpy can get away with).
> >
>
> I *really* do not want to go for this way.  My hope is with small sizes
> compiler can inline the memcpy (which gcc does for most common
> architectures).

Since size is non-constant for the tail, I don't see how we are going to
avoid 3x memcpy calls.  Although that can be another patch if it shows
value.

>
>
> [1] https://godbolt.org/z/v7e4xxqGa
  
Adhemerval Zanella Oct. 15, 2021, 5:56 p.m. UTC | #10
On 15/10/2021 14:45, Noah Goldstein wrote:
> swap is in the inner loop. Seems like a pretty critical component to have fully
> optimized. The aarch64 version looks good, but the x86_64 version seems
> to be lacking. Not arguing for an arch specific version, but if the directives
> can add value to x86_64 without detracting from aarch64 seems like a zero
> cost improvement.
> 
> Since size is non-constant for the tail I don't see how we are going
> avoid 3x memcpy calls. Although that can be another patch if it
> gets values.
> 
>>
>>
>> [1] https://godbolt.org/z/v7e4xxqGa

Maybe use a byte copy in the tail to avoid memcpy [1]; another option
might be to tune SWAP_GENERIC_SIZE to make the tail less costly (16 should
be ok for most architectures, although some might do better with larger
values).

[1] https://godbolt.org/z/G76dcej16
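To make the byte-copy tail concrete (a sketch of the idea only, not
necessarily the code behind the godbolt link above): keep the memcpy loop
over SWAP_GENERIC_SIZE chunks and swap the sub-chunk remainder byte by byte,
so no out-of-line memcpy call is needed for a non-constant tail size.

/* Sketch: swap_bytes with a byte-wise tail instead of three memcpy
   calls with a non-constant size.  */
static void
swap_bytes (void * restrict a, void * restrict b, size_t n)
{
  enum { SWAP_GENERIC_SIZE = 32 };
  unsigned char tmp[SWAP_GENERIC_SIZE];
  while (n > SWAP_GENERIC_SIZE)
    {
      memcpy (tmp, a, SWAP_GENERIC_SIZE);
      a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
      b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
      n -= SWAP_GENERIC_SIZE;
    }
  /* Tail: N is now at most SWAP_GENERIC_SIZE; swap it byte by byte.  */
  unsigned char *ua = a, *ub = b;
  while (n-- != 0)
    {
      unsigned char t = ua[n];
      ua[n] = ub[n];
      ub[n] = t;
    }
}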
  

Patch

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 23f2d28314..59458d151b 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -24,20 +24,85 @@ 
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
 
-/* Byte-wise swap two items of size SIZE. */
-#define SWAP(a, b, size)						      \
-  do									      \
-    {									      \
-      size_t __size = (size);						      \
-      char *__a = (a), *__b = (b);					      \
-      do								      \
-	{								      \
-	  char __tmp = *__a;						      \
-	  *__a++ = *__b;						      \
-	  *__b++ = __tmp;						      \
-	} while (--__size > 0);						      \
-    } while (0)
+/* Swap SIZE bytes between addresses A and B.  These helpers are provided
+   alongside the generic one as an optimization.  */
+
+typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
+
+/* Return true if elements can be copied using word loads and stores.
+   The size must be a multiple of the alignment, and so must the base address.  */
+static inline bool
+is_aligned_to_copy (const void *base, size_t size, size_t align)
+{
+  unsigned char lsbits = size;
+#if !_STRING_ARCH_unaligned
+  lsbits |= (unsigned char)(uintptr_t) base;
+#endif
+  return (lsbits & (align - 1)) == 0;
+}
+
+#define SWAP_WORDS_64 (swap_func_t)0
+#define SWAP_WORDS_32 (swap_func_t)1
+#define SWAP_BYTES    (swap_func_t)2
+
+static void
+swap_words_64 (void * restrict a, void * restrict b, size_t n)
+{
+  do
+   {
+     n -= 8;
+     uint64_t t = *(uint64_t *)(a + n);
+     *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
+     *(uint64_t *)(b + n) = t;
+   } while (n);
+}
+
+static void
+swap_words_32 (void * restrict a, void * restrict b, size_t n)
+{
+  do
+   {
+     n -= 4;
+     uint32_t t = *(uint32_t *)(a + n);
+     *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
+     *(uint32_t *)(b + n) = t;
+   } while (n);
+}
+
+static void
+swap_bytes (void * restrict a, void * restrict b, size_t n)
+{
+  /* Use multiple small memcpys with constant size to enable inlining
+     on most targets.  */
+  enum { SWAP_GENERIC_SIZE = 32 };
+  unsigned char tmp[SWAP_GENERIC_SIZE];
+  while (n > SWAP_GENERIC_SIZE)
+    {
+      memcpy (tmp, a, SWAP_GENERIC_SIZE);
+      a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
+      b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
+      n -= SWAP_GENERIC_SIZE;
+    }
+  memcpy (tmp, a, n);
+  memcpy (a, b, n);
+  memcpy (b, tmp, n);
+}
+
+/* Replace the indirect call with a series of if statements.  It should help
+   the branch predictor.  */
+static void
+do_swap (void * restrict a, void * restrict b, size_t size,
+	 swap_func_t swap_func)
+{
+  if (swap_func == SWAP_WORDS_64)
+    swap_words_64 (a, b, size);
+  else if (swap_func == SWAP_WORDS_32)
+    swap_words_32 (a, b, size);
+  else
+    swap_bytes (a, b, size);
+}
 
 /* Discontinue quicksort algorithm when partition gets below this size.
    This particular magic number was chosen to work best on a Sun 4/260. */
@@ -97,6 +162,14 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
+  swap_func_t swap_func;
+  if (is_aligned_to_copy (pbase, size, 8))
+    swap_func = SWAP_WORDS_64;
+  else if (is_aligned_to_copy (pbase, size, 4))
+    swap_func = SWAP_WORDS_32;
+  else
+    swap_func = SWAP_BYTES;
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -120,13 +193,13 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
 	  char *mid = lo + size * ((hi - lo) / size >> 1);
 
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    do_swap (mid, lo, size, swap_func);
 	  if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
-	    SWAP (mid, hi, size);
+	    do_swap (mid, hi, size, swap_func);
 	  else
 	    goto jump_over;
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    do_swap (mid, lo, size, swap_func);
 	jump_over:;
 
 	  left_ptr  = lo + size;
@@ -145,7 +218,7 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
 
 	      if (left_ptr < right_ptr)
 		{
-		  SWAP (left_ptr, right_ptr, size);
+		  do_swap (left_ptr, right_ptr, size, swap_func);
 		  if (mid == left_ptr)
 		    mid = right_ptr;
 		  else if (mid == right_ptr)
@@ -217,7 +290,7 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
         tmp_ptr = run_ptr;
 
     if (tmp_ptr != base_ptr)
-      SWAP (tmp_ptr, base_ptr, size);
+      do_swap (tmp_ptr, base_ptr, size, swap_func);
 
     /* Insertion sort, running from left-hand-side up to right-hand-side.  */
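
A brief usage illustration (not part of the patch; it assumes the _quicksort
path is taken): with the selection logic above, sorting an array of int picks
the uint32_t swap, while an element size that is not a multiple of 4, such as
the 7-byte record below, falls back to the generic byte swap.

/* Illustration only.  */
#include <stdlib.h>
#include <string.h>

struct rec { char key[7]; };                 /* size 7: SWAP_BYTES.  */

static int
cmp_int (const void *a, const void *b)
{
  int x = *(const int *) a, y = *(const int *) b;
  return (x > y) - (x < y);
}

static int
cmp_rec (const void *a, const void *b)
{
  return memcmp (a, b, sizeof (struct rec));
}

int
main (void)
{
  int keys[] = { 4, 1, 3, 2 };               /* size 4, aligned: SWAP_WORDS_32.  */
  struct rec recs[4] = { { "ddd" }, { "aaa" }, { "ccc" }, { "bbb" } };

  qsort (keys, 4, sizeof keys[0], cmp_int);
  qsort (recs, 4, sizeof recs[0], cmp_rec);
  return 0;
}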