[v3,3/7] stdlib: Optimize qsort{_r} swap implementation (BZ #19305)
Checks
Context | Check | Description
dj/TryBot-apply_patch | success | Patch applied to master at the time it was sent
Commit Message
The optimization takes into consideration that the most common element
sizes are either 32 or 64 bits [1] and that inputs are aligned to the
word boundary.  This is similar to the optimization done on lib/sort.c
from Linux.
This patch adds an optimized swap operation to qsort, based on the
previous msort one.  Instead of a byte-wise operation, three variants
are provided:
1. Using uint32_t loads and stores.
2. Using uint64_t loads and stores.
3. A generic one with a temporary buffer and memcpy/mempcpy.
Options 1. and 2. are selected only if the architecture defines
_STRING_ARCH_unaligned or if the base pointer is aligned to the
required type.
It also fixes BZ#19305 by checking the input size against an element
count of 1 as well as 0.
Checked on x86_64-linux-gnu.
[1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html
---
stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 91 insertions(+), 18 deletions(-)
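For reference, a minimal sketch of the element-count guard the commit message
describes; the exact hunk is not visible in the diff quoted below, so the
placement and wording here are an assumption based on _quicksort's parameters
and the existing "Avoid lossage" comment:

  /* Hypothetical sketch, not the literal patch hunk: with one element
     (or none) there is nothing to sort, so return before the unsigned
     arithmetic below can misbehave (the BZ#19305 case).  */
  if (total_elems <= 1)
    /* Avoid lossage with unsigned arithmetic below.  */
    return;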
Comments
On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha <
libc-alpha@sourceware.org> wrote:
> It optimizes take in consideration both the most common elements are
> either 32 or 64 bit in size [1] and inputs are aligned to the word
> boundary. This is similar to the optimization done on lib/sort.c
> from Linux.
>
> This patchs adds an optimized swap operation on qsort based in previous
> msort one. Instead of byte operation, three variants are provided:
>
> 1. Using uint32_t loads and stores.
> 2. Using uint64_t loads and stores.
> 3. Generic one with a temporary buffer and memcpy/mempcpy.
>
> The 1. and 2. options are selected only either if architecture defines
> _STRING_ARCH_unaligned or if base pointer is aligned to required type.
>
> It also fixes BZ#19305 by checking input size against number of
> elements 1 besides 0.
>
> Checked on x86_64-linux-gnu.
>
> [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html
> ---
> stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 91 insertions(+), 18 deletions(-)
>
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 23f2d28314..59458d151b 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -24,20 +24,85 @@
> #include <limits.h>
> #include <stdlib.h>
> #include <string.h>
> +#include <stdbool.h>
>
> -/* Byte-wise swap two items of size SIZE. */
> -#define SWAP(a, b, size) \
> - do \
> - { \
> - size_t __size = (size); \
> - char *__a = (a), *__b = (b); \
> - do \
> - { \
> - char __tmp = *__a; \
> - *__a++ = *__b; \
> - *__b++ = __tmp; \
> - } while (--__size > 0); \
> - } while (0)
> +/* Swap SIZE bytes between addresses A and B. These helpers are provided
> + along the generic one as an optimization. */
> +
> +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
> +
> +/* Return trues is elements can be copied used word load and sortes.
> + The size must be a multiple of the alignment, and the base address. */
> +static inline bool
> +is_aligned_to_copy (const void *base, size_t size, size_t align)
> +{
> + unsigned char lsbits = size;
> +#if !_STRING_ARCH_unaligned
> + lsbits |= (unsigned char)(uintptr_t) base;
> +#endif
> + return (lsbits & (align - 1)) == 0;
> +}
> +
> +#define SWAP_WORDS_64 (swap_func_t)0
> +#define SWAP_WORDS_32 (swap_func_t)1
> +#define SWAP_BYTES (swap_func_t)2
> +
> +static void
> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
> +{
> + do
> + {
> + n -= 8;
> + uint64_t t = *(uint64_t *)(a + n);
> + *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
> + *(uint64_t *)(b + n) = t;
> + } while (n);
> +}
> +
> +static void
> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
> +{
> + do
> + {
> + n -= 4;
> + uint32_t t = *(uint32_t *)(a + n);
> + *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
> + *(uint32_t *)(b + n) = t;
> + } while (n);
> +}
> +
> +static void
> +swap_bytes (void * restrict a, void * restrict b, size_t n)
> +{
> + /* Use multiple small memcpys with constant size to enable inlining
> + on most targets. */
> + enum { SWAP_GENERIC_SIZE = 32 };
> + unsigned char tmp[SWAP_GENERIC_SIZE];
> + while (n > SWAP_GENERIC_SIZE)
> + {
> + memcpy (tmp, a, SWAP_GENERIC_SIZE);
> + a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
> + b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
> + n -= SWAP_GENERIC_SIZE;
> + }
> + memcpy (tmp, a, n);
> + memcpy (a, b, n);
> + memcpy (b, tmp, n);
> +}
> +
> +/* Replace the indirect call with a serie of if statements. It should help
> + the branch predictor. */
>
1) Really?  On Intel at least, an indirect call that is always going to the
same place is certainly going to be predicted as well as, if not better than,
2/3 branches + a direct call.
2) If you're going to test which swap function to use anyway, why bother
initializing swap_func?  Why not just use an int?
> +static void
> +do_swap (void * restrict a, void * restrict b, size_t size,
> + swap_func_t swap_func)
> +{
> + if (swap_func == SWAP_WORDS_64)
> + swap_words_64 (a, b, size);
> + else if (swap_func == SWAP_WORDS_32)
> + swap_words_32 (a, b, size);
> + else
> + swap_bytes (a, b, size);
> +}
>
> /* Discontinue quicksort algorithm when partition gets below this size.
> This particular magic number was chosen to work best on a Sun 4/260. */
> @@ -97,6 +162,14 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
> /* Avoid lossage with unsigned arithmetic below. */
> return;
>
> + swap_func_t swap_func;
> + if (is_aligned_to_copy (pbase, size, 8))
> + swap_func = SWAP_WORDS_64;
> + else if (is_aligned_to_copy (pbase, size, 4))
> + swap_func = SWAP_WORDS_32;
> + else
> + swap_func = SWAP_BYTES;
> +
> if (total_elems > MAX_THRESH)
> {
> char *lo = base_ptr;
> @@ -120,13 +193,13 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
> char *mid = lo + size * ((hi - lo) / size >> 1);
>
> if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
> - SWAP (mid, lo, size);
> + do_swap (mid, lo, size, swap_func);
> if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
> - SWAP (mid, hi, size);
> + do_swap (mid, hi, size, swap_func);
> else
> goto jump_over;
> if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
> - SWAP (mid, lo, size);
> + do_swap (mid, lo, size, swap_func);
> jump_over:;
>
> left_ptr = lo + size;
> @@ -145,7 +218,7 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
>
> if (left_ptr < right_ptr)
> {
> - SWAP (left_ptr, right_ptr, size);
> + do_swap (left_ptr, right_ptr, size, swap_func);
> if (mid == left_ptr)
> mid = right_ptr;
> else if (mid == right_ptr)
> @@ -217,7 +290,7 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
> tmp_ptr = run_ptr;
>
> if (tmp_ptr != base_ptr)
> - SWAP (tmp_ptr, base_ptr, size);
> + do_swap (tmp_ptr, base_ptr, size, swap_func);
>
> /* Insertion sort, running from left-hand-side up to
> right-hand-side. */
>
> --
> 2.30.2
>
>
On Tue, Oct 12, 2021 at 11:29 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:
>
>
> On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha <
> libc-alpha@sourceware.org> wrote:
>
>> It optimizes take in consideration both the most common elements are
>> either 32 or 64 bit in size [1] and inputs are aligned to the word
>> boundary. This is similar to the optimization done on lib/sort.c
>> from Linux.
>>
>> This patchs adds an optimized swap operation on qsort based in previous
>> msort one. Instead of byte operation, three variants are provided:
>>
>> 1. Using uint32_t loads and stores.
>> 2. Using uint64_t loads and stores.
>> 3. Generic one with a temporary buffer and memcpy/mempcpy.
>>
>> The 1. and 2. options are selected only either if architecture defines
>> _STRING_ARCH_unaligned or if base pointer is aligned to required type.
>>
>> It also fixes BZ#19305 by checking input size against number of
>> elements 1 besides 0.
>>
>> Checked on x86_64-linux-gnu.
>>
>> [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html
>> ---
>> stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
>> 1 file changed, 91 insertions(+), 18 deletions(-)
>>
>> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>> index 23f2d28314..59458d151b 100644
>> --- a/stdlib/qsort.c
>> +++ b/stdlib/qsort.c
>> @@ -24,20 +24,85 @@
>> #include <limits.h>
>> #include <stdlib.h>
>> #include <string.h>
>> +#include <stdbool.h>
>>
>> -/* Byte-wise swap two items of size SIZE. */
>> -#define SWAP(a, b, size) \
>> - do \
>> - { \
>> - size_t __size = (size); \
>> - char *__a = (a), *__b = (b); \
>> - do \
>> - { \
>> - char __tmp = *__a; \
>> - *__a++ = *__b; \
>> - *__b++ = __tmp; \
>> - } while (--__size > 0); \
>> - } while (0)
>> +/* Swap SIZE bytes between addresses A and B. These helpers are provided
>> + along the generic one as an optimization. */
>> +
>> +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
>> +
>> +/* Return trues is elements can be copied used word load and sortes.
>> + The size must be a multiple of the alignment, and the base address. */
>> +static inline bool
>> +is_aligned_to_copy (const void *base, size_t size, size_t align)
>> +{
>> + unsigned char lsbits = size;
>> +#if !_STRING_ARCH_unaligned
>> + lsbits |= (unsigned char)(uintptr_t) base;
>> +#endif
>> + return (lsbits & (align - 1)) == 0;
>> +}
>> +
>> +#define SWAP_WORDS_64 (swap_func_t)0
>> +#define SWAP_WORDS_32 (swap_func_t)1
>> +#define SWAP_BYTES (swap_func_t)2
>> +
>> +static void
>> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
>> +{
>> + do
>> + {
>> + n -= 8;
>> + uint64_t t = *(uint64_t *)(a + n);
>> + *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
>> + *(uint64_t *)(b + n) = t;
>> + } while (n);
>> +}
>> +
>> +static void
>> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
>> +{
>> + do
>> + {
>> + n -= 4;
>> + uint32_t t = *(uint32_t *)(a + n);
>> + *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
>> + *(uint32_t *)(b + n) = t;
>> + } while (n);
>> +}
>>
>
I'm not certain swap_words_32 / swap_words_8 will be optimal for larger
key sizes. Looking at GCC's implementation of swap_generic on modern
x86_64:
https://godbolt.org/z/638h3Y9va
It's able to optimize the temporary buffer out of the loop and use xmm
registers which
will likely win out for larger sizes.
> +
>> +static void
>> +swap_bytes (void * restrict a, void * restrict b, size_t n)
>> +{
>> + /* Use multiple small memcpys with constant size to enable inlining
>> + on most targets. */
>> + enum { SWAP_GENERIC_SIZE = 32 };
>> + unsigned char tmp[SWAP_GENERIC_SIZE];
>> + while (n > SWAP_GENERIC_SIZE)
>> + {
>> + memcpy (tmp, a, SWAP_GENERIC_SIZE);
>> + a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
>> + b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
>> + n -= SWAP_GENERIC_SIZE;
>> + }
>> + memcpy (tmp, a, n);
>> + memcpy (a, b, n);
>> + memcpy (b, tmp, n);
>> +}
>> +
>> +/* Replace the indirect call with a serie of if statements. It should help
>> + the branch predictor. */
>>
>
> 1) Really? On Intel at least an indirect call that is always going to the
> same place
> is certainly going to be predicted as well if not better than 2/3
> branches + direct call.
>
> 2) If you're going to just test which swap function to use, why
> bother initializing
> swap_func? Why not just use an int?
>
>
>
>> +static void
>> +do_swap (void * restrict a, void * restrict b, size_t size,
>> + swap_func_t swap_func)
>> +{
>> + if (swap_func == SWAP_WORDS_64)
>> + swap_words_64 (a, b, size);
>> + else if (swap_func == SWAP_WORDS_32)
>> + swap_words_32 (a, b, size);
>> + else
>> + swap_bytes (a, b, size);
>> +}
>>
>> /* Discontinue quicksort algorithm when partition gets below this size.
>> This particular magic number was chosen to work best on a Sun 4/260.
>> */
>> @@ -97,6 +162,14 @@ _quicksort (void *const pbase, size_t total_elems,
>> size_t size,
>> /* Avoid lossage with unsigned arithmetic below. */
>> return;
>>
>> + swap_func_t swap_func;
>> + if (is_aligned_to_copy (pbase, size, 8))
>> + swap_func = SWAP_WORDS_64;
>> + else if (is_aligned_to_copy (pbase, size, 4))
>>
>
For many modern architectures that support fast unaligned loads/stores
(for example x86_64 SnB and newer) I don't think this check really makes
sense.
> + swap_func = SWAP_WORDS_32;
>> + else
>> + swap_func = SWAP_BYTES;
>> +
>> if (total_elems > MAX_THRESH)
>> {
>> char *lo = base_ptr;
>> @@ -120,13 +193,13 @@ _quicksort (void *const pbase, size_t total_elems,
>> size_t size,
>> char *mid = lo + size * ((hi - lo) / size >> 1);
>>
>> if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
>> - SWAP (mid, lo, size);
>> + do_swap (mid, lo, size, swap_func);
>> if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
>> - SWAP (mid, hi, size);
>> + do_swap (mid, hi, size, swap_func);
>> else
>> goto jump_over;
>> if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
>> - SWAP (mid, lo, size);
>> + do_swap (mid, lo, size, swap_func);
>> jump_over:;
>>
>> left_ptr = lo + size;
>> @@ -145,7 +218,7 @@ _quicksort (void *const pbase, size_t total_elems,
>> size_t size,
>>
>> if (left_ptr < right_ptr)
>> {
>> - SWAP (left_ptr, right_ptr, size);
>> + do_swap (left_ptr, right_ptr, size, swap_func);
>> if (mid == left_ptr)
>> mid = right_ptr;
>> else if (mid == right_ptr)
>> @@ -217,7 +290,7 @@ _quicksort (void *const pbase, size_t total_elems,
>> size_t size,
>> tmp_ptr = run_ptr;
>>
>> if (tmp_ptr != base_ptr)
>> - SWAP (tmp_ptr, base_ptr, size);
>> + do_swap (tmp_ptr, base_ptr, size, swap_func);
>>
>> /* Insertion sort, running from left-hand-side up to
>> right-hand-side. */
>>
>> --
>> 2.30.2
>>
>>
On 13/10/2021 00:29, Noah Goldstein wrote:
> +static void
> +swap_bytes (void * restrict a, void * restrict b, size_t n)
> +{
> + /* Use multiple small memcpys with constant size to enable inlining
> + on most targets. */
> + enum { SWAP_GENERIC_SIZE = 32 };
> + unsigned char tmp[SWAP_GENERIC_SIZE];
> + while (n > SWAP_GENERIC_SIZE)
> + {
> + memcpy (tmp, a, SWAP_GENERIC_SIZE);
> + a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
> + b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
> + n -= SWAP_GENERIC_SIZE;
> + }
> + memcpy (tmp, a, n);
> + memcpy (a, b, n);
> + memcpy (b, tmp, n);
> +}
> +
> +/* Replace the indirect call with a serie of if statements. It should help
> + the branch predictor. */
>
>
> 1) Really? On Intel at least an indirect call that is always going to the same place
> is certainly going to be predicted as well if not better than 2/3 branches + direct call.
>
I shamelessly copied the same strategy the Linux kernel used on its lib/sort.c
(8fb583c4258d08f0).  Maybe Linux's internal usage of its qsort() leads to
more predictable branches, and for this change I would prefer to work well
on different architectures rather than assume a specific one.
> 2) If you're going to just test which swap function to use, why bother initializing
> swap_func? Why not just use an int?
Indeed, there is not much gain for the glibc usage.  The kernel provides an API
that takes a user-defined swap_func, which is not our case.
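(A minimal sketch of the int/enum-based dispatch being discussed; the names
below are hypothetical, and the SWAP_* sentinel macros from the patch would
become plain enumerators instead of casted function-pointer values:)

  /* Hypothetical alternative: select the swap strategy with a plain enum
     value rather than comparing sentinel function-pointer constants.  */
  enum swap_type_t { SWAP_WORDS_64, SWAP_WORDS_32, SWAP_BYTES };

  static void
  do_swap (void * restrict a, void * restrict b, size_t size,
           enum swap_type_t swap_type)
  {
    if (swap_type == SWAP_WORDS_64)
      swap_words_64 (a, b, size);
    else if (swap_type == SWAP_WORDS_32)
      swap_words_32 (a, b, size);
    else
      swap_bytes (a, b, size);
  }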
On 13/10/2021 00:39, Noah Goldstein wrote:
>
>
> On Tue, Oct 12, 2021 at 11:29 PM Noah Goldstein <goldstein.w.n@gmail.com <mailto:goldstein.w.n@gmail.com>> wrote:
>
>
>
> On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org <mailto:libc-alpha@sourceware.org>> wrote:
>
> It optimizes take in consideration both the most common elements are
> either 32 or 64 bit in size [1] and inputs are aligned to the word
> boundary. This is similar to the optimization done on lib/sort.c
> from Linux.
>
> This patchs adds an optimized swap operation on qsort based in previous
> msort one. Instead of byte operation, three variants are provided:
>
> 1. Using uint32_t loads and stores.
> 2. Using uint64_t loads and stores.
> 3. Generic one with a temporary buffer and memcpy/mempcpy.
>
> The 1. and 2. options are selected only either if architecture defines
> _STRING_ARCH_unaligned or if base pointer is aligned to required type.
>
> It also fixes BZ#19305 by checking input size against number of
> elements 1 besides 0.
>
> Checked on x86_64-linux-gnu.
>
> [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html <https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html>
> ---
> stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 91 insertions(+), 18 deletions(-)
>
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 23f2d28314..59458d151b 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -24,20 +24,85 @@
> #include <limits.h>
> #include <stdlib.h>
> #include <string.h>
> +#include <stdbool.h>
>
> -/* Byte-wise swap two items of size SIZE. */
> -#define SWAP(a, b, size) \
> - do \
> - { \
> - size_t __size = (size); \
> - char *__a = (a), *__b = (b); \
> - do \
> - { \
> - char __tmp = *__a; \
> - *__a++ = *__b; \
> - *__b++ = __tmp; \
> - } while (--__size > 0); \
> - } while (0)
> +/* Swap SIZE bytes between addresses A and B. These helpers are provided
> + along the generic one as an optimization. */
> +
> +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
> +
> +/* Return trues is elements can be copied used word load and sortes.
> + The size must be a multiple of the alignment, and the base address. */
> +static inline bool
> +is_aligned_to_copy (const void *base, size_t size, size_t align)
> +{
> + unsigned char lsbits = size;
> +#if !_STRING_ARCH_unaligned
> + lsbits |= (unsigned char)(uintptr_t) base;
> +#endif
> + return (lsbits & (align - 1)) == 0;
> +}
> +
> +#define SWAP_WORDS_64 (swap_func_t)0
> +#define SWAP_WORDS_32 (swap_func_t)1
> +#define SWAP_BYTES (swap_func_t)2
> +
> +static void
> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
> +{
> + do
> + {
> + n -= 8;
> + uint64_t t = *(uint64_t *)(a + n);
> + *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
> + *(uint64_t *)(b + n) = t;
> + } while (n);
> +}
> +
> +static void
> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
> +{
> + do
> + {
> + n -= 4;
> + uint32_t t = *(uint32_t *)(a + n);
> + *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
> + *(uint32_t *)(b + n) = t;
> + } while (n);
> +}
>
>
> I'm not certain swap_words_32 / swap_words_8 will be optimal for larger
> key sizes. Looking at GCC's implementation of swap_generic on modern x86_64:
> https://godbolt.org/z/638h3Y9va <https://godbolt.org/z/638h3Y9va>
> It's able to optimize the temporary buffer out of the loop and use xmm registers which
> will likely win out for larger sizes.
It is probably not the most optimized code the compiler can generate, since
I tried to go for more generic code that should result in somewhat better
code in an architecture-agnostic way.  I am trying to mimic some of the
optimizations Linux did in 37d0ec34d111acfdb, and for swap_bytes I used the
same strategy as d3496c9f4f27d (Linux did not optimize anything for the
byte version).
I think it would be possible to tune it for a specific architecture and/or
compiler, but I would prefer to use a good-enough algorithm that works
reasonably on multiple architectures.
On Fri, Oct 15, 2021 at 8:12 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 13/10/2021 00:29, Noah Goldstein wrote:
> > +static void
> > +swap_bytes (void * restrict a, void * restrict b, size_t n)
> > +{
> > + /* Use multiple small memcpys with constant size to enable inlining
> > + on most targets. */
> > + enum { SWAP_GENERIC_SIZE = 32 };
> > + unsigned char tmp[SWAP_GENERIC_SIZE];
> > + while (n > SWAP_GENERIC_SIZE)
> > + {
> > + memcpy (tmp, a, SWAP_GENERIC_SIZE);
> > + a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
> > + b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
> > + n -= SWAP_GENERIC_SIZE;
> > + }
> > + memcpy (tmp, a, n);
> > + memcpy (a, b, n);
> > + memcpy (b, tmp, n);
> > +}
> > +
> > +/* Replace the indirect call with a serie of if statements. It should help
> > + the branch predictor. */
> >
> >
> > 1) Really? On Intel at least an indirect call that is always going to the same place
> > is certainly going to be predicted as well if not better than 2/3 branches + direct call.
> >
>
> I shamelessly copy the same strategy Linux kernel used on its lib/sort.c
> (8fb583c4258d08f0). Maybe Linux internal usage of its qsort() leads to
> better predictable branch, and for this change I would prefer to work
> better on different architectures than assume an specific one.
If it's running in the kernel, Spectre mitigations can make indirect jumps
particularly expensive.  I don't think we have the same issue targeting
userland.
>
> > 2) If you're going to just test which swap function to use, why bother initializing
> > swap_func? Why not just use an int?
>
> Indeed this is no much gain on glibc usage. The kernel provides a API to use
> used-defined swap_func, which is not our case.
>
On Fri, Oct 15, 2021 at 8:29 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 13/10/2021 00:39, Noah Goldstein wrote:
> >
> >
> > On Tue, Oct 12, 2021 at 11:29 PM Noah Goldstein <goldstein.w.n@gmail.com <mailto:goldstein.w.n@gmail.com>> wrote:
> >
> >
> >
> > On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org <mailto:libc-alpha@sourceware.org>> wrote:
> >
> > It optimizes take in consideration both the most common elements are
> > either 32 or 64 bit in size [1] and inputs are aligned to the word
> > boundary. This is similar to the optimization done on lib/sort.c
> > from Linux.
> >
> > This patchs adds an optimized swap operation on qsort based in previous
> > msort one. Instead of byte operation, three variants are provided:
> >
> > 1. Using uint32_t loads and stores.
> > 2. Using uint64_t loads and stores.
> > 3. Generic one with a temporary buffer and memcpy/mempcpy.
> >
> > The 1. and 2. options are selected only either if architecture defines
> > _STRING_ARCH_unaligned or if base pointer is aligned to required type.
> >
> > It also fixes BZ#19305 by checking input size against number of
> > elements 1 besides 0.
> >
> > Checked on x86_64-linux-gnu.
> >
> > [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html <https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html>
> > ---
> > stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
> > 1 file changed, 91 insertions(+), 18 deletions(-)
> >
> > diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> > index 23f2d28314..59458d151b 100644
> > --- a/stdlib/qsort.c
> > +++ b/stdlib/qsort.c
> > @@ -24,20 +24,85 @@
> > #include <limits.h>
> > #include <stdlib.h>
> > #include <string.h>
> > +#include <stdbool.h>
> >
> > -/* Byte-wise swap two items of size SIZE. */
> > -#define SWAP(a, b, size) \
> > - do \
> > - { \
> > - size_t __size = (size); \
> > - char *__a = (a), *__b = (b); \
> > - do \
> > - { \
> > - char __tmp = *__a; \
> > - *__a++ = *__b; \
> > - *__b++ = __tmp; \
> > - } while (--__size > 0); \
> > - } while (0)
> > +/* Swap SIZE bytes between addresses A and B. These helpers are provided
> > + along the generic one as an optimization. */
> > +
> > +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
> > +
> > +/* Return trues is elements can be copied used word load and sortes.
> > + The size must be a multiple of the alignment, and the base address. */
> > +static inline bool
> > +is_aligned_to_copy (const void *base, size_t size, size_t align)
> > +{
> > + unsigned char lsbits = size;
> > +#if !_STRING_ARCH_unaligned
> > + lsbits |= (unsigned char)(uintptr_t) base;
> > +#endif
> > + return (lsbits & (align - 1)) == 0;
> > +}
> > +
> > +#define SWAP_WORDS_64 (swap_func_t)0
> > +#define SWAP_WORDS_32 (swap_func_t)1
> > +#define SWAP_BYTES (swap_func_t)2
> > +
> > +static void
> > +swap_words_64 (void * restrict a, void * restrict b, size_t n)
> > +{
> > + do
> > + {
> > + n -= 8;
> > + uint64_t t = *(uint64_t *)(a + n);
> > + *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
> > + *(uint64_t *)(b + n) = t;
> > + } while (n);
> > +}
> > +
> > +static void
> > +swap_words_32 (void * restrict a, void * restrict b, size_t n)
> > +{
> > + do
> > + {
> > + n -= 4;
> > + uint32_t t = *(uint32_t *)(a + n);
> > + *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
> > + *(uint32_t *)(b + n) = t;
> > + } while (n);
> > +}
> >
> >
> > I'm not certain swap_words_32 / swap_words_8 will be optimal for larger
> > key sizes. Looking at GCC's implementation of swap_generic on modern x86_64:
> > https://godbolt.org/z/638h3Y9va <https://godbolt.org/z/638h3Y9va>
> > It's able to optimize the temporary buffer out of the loop and use xmm registers which
> > will likely win out for larger sizes.
>
> It is probably not the most optimized code compiler can generated since
> I tried to go for a more generic code that should results in a somewhat
> better code in a architecture agnostic code. I trying to mimic some
> of optimization Linux did on 37d0ec34d111acfdb, and the swap_bytes I
> used the same strategy used on d3496c9f4f27d (Linux did not optimize
> anything for the byte version).
>
> I think it would be possible to tune it for an specific architecture
> and/or compiler, but I would prefer to use a good enough algorithm that
> work reasonable on multiple architectures.
That's fair. Although I still think there are some improvements.
Looking at the assembly for all three in fact it seems GCC optimizes all of them
to larger register copies: https://godbolt.org/z/bd9nnnoEY
The biggest difference seems to be the setup / register spills for
the generic version so for the common case of a relatively small key
the special case for 4/8 makes sense.
Have you checked that GCC is able to use the conditions for selecting
the swap function to optimize the functions themselves? In the godbolt
link I got reasonable value out of adding the invariants to swap_64/swap_32.
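(A sketch of that "invariants" idea; this is an assumption about what the
godbolt experiment did, using GCC's __builtin_unreachable to restate inside
the helper the conditions under which it is selected:)

  static void
  swap_words_64 (void * restrict a, void * restrict b, size_t n)
  {
    /* Hypothetical hint: this variant is only chosen when the size is a
       non-zero multiple of 8, so tell the compiler it can drop any
       residual-tail handling when widening the copies.  */
    if (n == 0 || n % 8 != 0)
      __builtin_unreachable ();
    do
      {
        n -= 8;
        uint64_t t = *(uint64_t *)(a + n);
        *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
        *(uint64_t *)(b + n) = t;
      } while (n);
  }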
It also may be worth it to write a custom memcpy implementation for
size = 0..SWAP_GENERIC_SIZE so it can be inlined (and probably
more optimized than what generic memcpy can get away with).
On 15/10/2021 13:45, Noah Goldstein wrote:
> On Fri, Oct 15, 2021 at 8:12 AM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 13/10/2021 00:29, Noah Goldstein wrote:
>>> +static void
>>> +swap_bytes (void * restrict a, void * restrict b, size_t n)
>>> +{
>>> + /* Use multiple small memcpys with constant size to enable inlining
>>> + on most targets. */
>>> + enum { SWAP_GENERIC_SIZE = 32 };
>>> + unsigned char tmp[SWAP_GENERIC_SIZE];
>>> + while (n > SWAP_GENERIC_SIZE)
>>> + {
>>> + memcpy (tmp, a, SWAP_GENERIC_SIZE);
>>> + a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
>>> + b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
>>> + n -= SWAP_GENERIC_SIZE;
>>> + }
>>> + memcpy (tmp, a, n);
>>> + memcpy (a, b, n);
>>> + memcpy (b, tmp, n);
>>> +}
>>> +
>>> +/* Replace the indirect call with a serie of if statements. It should help
>>> + the branch predictor. */
>>>
>>>
>>> 1) Really? On Intel at least an indirect call that is always going to the same place
>>> is certainly going to be predicted as well if not better than 2/3 branches + direct call.
>>>
>>
>> I shamelessly copy the same strategy Linux kernel used on its lib/sort.c
>> (8fb583c4258d08f0). Maybe Linux internal usage of its qsort() leads to
>> better predictable branch, and for this change I would prefer to work
>> better on different architectures than assume an specific one.
>
> If it's running in the Kernel spectre mitigations can make indirect jumps
> particularly expensive. I don't think we have the same issue targeting
> userland.
Indeed, I didn't take this into consideration, and it does not seem to
help much in the glibc case.  I will use indirect calls.
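(A minimal sketch of the direct function-pointer dispatch being agreed on
here; hypothetical, since the v4 code is not shown in this thread:)

  /* Hypothetical: point swap_func at the real helpers and call it at each
     swap site, instead of comparing against sentinel values in do_swap.  */
  swap_func_t swap_func;
  if (is_aligned_to_copy (pbase, size, 8))
    swap_func = swap_words_64;
  else if (is_aligned_to_copy (pbase, size, 4))
    swap_func = swap_words_32;
  else
    swap_func = swap_bytes;
  /* ... later, in the partitioning loop ... */
  swap_func (mid, lo, size);   /* indirect call */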
On 15/10/2021 14:17, Noah Goldstein wrote:
> On Fri, Oct 15, 2021 at 8:29 AM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 13/10/2021 00:39, Noah Goldstein wrote:
>>>
>>>
>>> On Tue, Oct 12, 2021 at 11:29 PM Noah Goldstein <goldstein.w.n@gmail.com <mailto:goldstein.w.n@gmail.com>> wrote:
>>>
>>>
>>>
>>> On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org <mailto:libc-alpha@sourceware.org>> wrote:
>>>
>>> It optimizes take in consideration both the most common elements are
>>> either 32 or 64 bit in size [1] and inputs are aligned to the word
>>> boundary. This is similar to the optimization done on lib/sort.c
>>> from Linux.
>>>
>>> This patchs adds an optimized swap operation on qsort based in previous
>>> msort one. Instead of byte operation, three variants are provided:
>>>
>>> 1. Using uint32_t loads and stores.
>>> 2. Using uint64_t loads and stores.
>>> 3. Generic one with a temporary buffer and memcpy/mempcpy.
>>>
>>> The 1. and 2. options are selected only either if architecture defines
>>> _STRING_ARCH_unaligned or if base pointer is aligned to required type.
>>>
>>> It also fixes BZ#19305 by checking input size against number of
>>> elements 1 besides 0.
>>>
>>> Checked on x86_64-linux-gnu.
>>>
>>> [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html <https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html>
>>> ---
>>> stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
>>> 1 file changed, 91 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>>> index 23f2d28314..59458d151b 100644
>>> --- a/stdlib/qsort.c
>>> +++ b/stdlib/qsort.c
>>> @@ -24,20 +24,85 @@
>>> #include <limits.h>
>>> #include <stdlib.h>
>>> #include <string.h>
>>> +#include <stdbool.h>
>>>
>>> -/* Byte-wise swap two items of size SIZE. */
>>> -#define SWAP(a, b, size) \
>>> - do \
>>> - { \
>>> - size_t __size = (size); \
>>> - char *__a = (a), *__b = (b); \
>>> - do \
>>> - { \
>>> - char __tmp = *__a; \
>>> - *__a++ = *__b; \
>>> - *__b++ = __tmp; \
>>> - } while (--__size > 0); \
>>> - } while (0)
>>> +/* Swap SIZE bytes between addresses A and B. These helpers are provided
>>> + along the generic one as an optimization. */
>>> +
>>> +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
>>> +
>>> +/* Return trues is elements can be copied used word load and sortes.
>>> + The size must be a multiple of the alignment, and the base address. */
>>> +static inline bool
>>> +is_aligned_to_copy (const void *base, size_t size, size_t align)
>>> +{
>>> + unsigned char lsbits = size;
>>> +#if !_STRING_ARCH_unaligned
>>> + lsbits |= (unsigned char)(uintptr_t) base;
>>> +#endif
>>> + return (lsbits & (align - 1)) == 0;
>>> +}
>>> +
>>> +#define SWAP_WORDS_64 (swap_func_t)0
>>> +#define SWAP_WORDS_32 (swap_func_t)1
>>> +#define SWAP_BYTES (swap_func_t)2
>>> +
>>> +static void
>>> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
>>> +{
>>> + do
>>> + {
>>> + n -= 8;
>>> + uint64_t t = *(uint64_t *)(a + n);
>>> + *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
>>> + *(uint64_t *)(b + n) = t;
>>> + } while (n);
>>> +}
>>> +
>>> +static void
>>> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
>>> +{
>>> + do
>>> + {
>>> + n -= 4;
>>> + uint32_t t = *(uint32_t *)(a + n);
>>> + *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
>>> + *(uint32_t *)(b + n) = t;
>>> + } while (n);
>>> +}
>>>
>>>
>>> I'm not certain swap_words_32 / swap_words_8 will be optimal for larger
>>> key sizes. Looking at GCC's implementation of swap_generic on modern x86_64:
>>> https://godbolt.org/z/638h3Y9va <https://godbolt.org/z/638h3Y9va>
>>> It's able to optimize the temporary buffer out of the loop and use xmm registers which
>>> will likely win out for larger sizes.
>>
>> It is probably not the most optimized code compiler can generated since
>> I tried to go for a more generic code that should results in a somewhat
>> better code in a architecture agnostic code. I trying to mimic some
>> of optimization Linux did on 37d0ec34d111acfdb, and the swap_bytes I
>> used the same strategy used on d3496c9f4f27d (Linux did not optimize
>> anything for the byte version).
>>
>> I think it would be possible to tune it for an specific architecture
>> and/or compiler, but I would prefer to use a good enough algorithm that
>> work reasonable on multiple architectures.
>
> That's fair. Although I still think there are some improvements.
>
> Looking at the assembly for all three in fact it seems GCC optimizes all of them
> to larger register copies: https://godbolt.org/z/bd9nnnoEY
>
> The biggest difference seems to be the setup / register spills for
> the generic version so for the common case of a relatively small key
> the special case for 4/8 makes sense.
>
> Have you checked that GCC is able to use the conditions for selecting
> the swap function to optimize the functions themselves? In the godbolt
> link I got reasonable value out of adding the invariants to swap_64/swap_32.
Not yet, but I think the generated code for both x86_64 and aarch64 seems
simple enough and should cover the common case (keys with size 4 or 8)
fast enough [1].
And since for v4 my plan is no longer to remove mergesort, but to limit it
to a static buffer, it might be possible to use the same swap routines
for both mergesort and quicksort.
>
> It also may be worth it to write a custom memcpy implementation for
> size = 0..SWAP_GENERIC_SIZE so it can be inlined (and probably
> more optimized than what generic memcpy can get away with).
>
I *really* do not want to go that way.  My hope is that with small sizes
the compiler can inline the memcpy (which gcc does for most common
architectures).
[1] https://godbolt.org/z/v7e4xxqGa
On Fri, Oct 15, 2021 at 12:34 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 15/10/2021 14:17, Noah Goldstein wrote:
> > On Fri, Oct 15, 2021 at 8:29 AM Adhemerval Zanella
> > <adhemerval.zanella@linaro.org> wrote:
> >>
> >>
> >>
> >> On 13/10/2021 00:39, Noah Goldstein wrote:
> >>>
> >>>
> >>> On Tue, Oct 12, 2021 at 11:29 PM Noah Goldstein <goldstein.w.n@gmail.com <mailto:goldstein.w.n@gmail.com>> wrote:
> >>>
> >>>
> >>>
> >>> On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org <mailto:libc-alpha@sourceware.org>> wrote:
> >>>
> >>> It optimizes take in consideration both the most common elements are
> >>> either 32 or 64 bit in size [1] and inputs are aligned to the word
> >>> boundary. This is similar to the optimization done on lib/sort.c
> >>> from Linux.
> >>>
> >>> This patchs adds an optimized swap operation on qsort based in previous
> >>> msort one. Instead of byte operation, three variants are provided:
> >>>
> >>> 1. Using uint32_t loads and stores.
> >>> 2. Using uint64_t loads and stores.
> >>> 3. Generic one with a temporary buffer and memcpy/mempcpy.
> >>>
> >>> The 1. and 2. options are selected only either if architecture defines
> >>> _STRING_ARCH_unaligned or if base pointer is aligned to required type.
> >>>
> >>> It also fixes BZ#19305 by checking input size against number of
> >>> elements 1 besides 0.
> >>>
> >>> Checked on x86_64-linux-gnu.
> >>>
> >>> [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html <https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html>
> >>> ---
> >>> stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
> >>> 1 file changed, 91 insertions(+), 18 deletions(-)
> >>>
> >>> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> >>> index 23f2d28314..59458d151b 100644
> >>> --- a/stdlib/qsort.c
> >>> +++ b/stdlib/qsort.c
> >>> @@ -24,20 +24,85 @@
> >>> #include <limits.h>
> >>> #include <stdlib.h>
> >>> #include <string.h>
> >>> +#include <stdbool.h>
> >>>
> >>> -/* Byte-wise swap two items of size SIZE. */
> >>> -#define SWAP(a, b, size) \
> >>> - do \
> >>> - { \
> >>> - size_t __size = (size); \
> >>> - char *__a = (a), *__b = (b); \
> >>> - do \
> >>> - { \
> >>> - char __tmp = *__a; \
> >>> - *__a++ = *__b; \
> >>> - *__b++ = __tmp; \
> >>> - } while (--__size > 0); \
> >>> - } while (0)
> >>> +/* Swap SIZE bytes between addresses A and B. These helpers are provided
> >>> + along the generic one as an optimization. */
> >>> +
> >>> +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
> >>> +
> >>> +/* Return trues is elements can be copied used word load and sortes.
> >>> + The size must be a multiple of the alignment, and the base address. */
> >>> +static inline bool
> >>> +is_aligned_to_copy (const void *base, size_t size, size_t align)
> >>> +{
> >>> + unsigned char lsbits = size;
> >>> +#if !_STRING_ARCH_unaligned
> >>> + lsbits |= (unsigned char)(uintptr_t) base;
> >>> +#endif
> >>> + return (lsbits & (align - 1)) == 0;
> >>> +}
> >>> +
> >>> +#define SWAP_WORDS_64 (swap_func_t)0
> >>> +#define SWAP_WORDS_32 (swap_func_t)1
> >>> +#define SWAP_BYTES (swap_func_t)2
> >>> +
> >>> +static void
> >>> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
> >>> +{
> >>> + do
> >>> + {
> >>> + n -= 8;
> >>> + uint64_t t = *(uint64_t *)(a + n);
> >>> + *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
> >>> + *(uint64_t *)(b + n) = t;
> >>> + } while (n);
> >>> +}
> >>> +
> >>> +static void
> >>> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
> >>> +{
> >>> + do
> >>> + {
> >>> + n -= 4;
> >>> + uint32_t t = *(uint32_t *)(a + n);
> >>> + *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
> >>> + *(uint32_t *)(b + n) = t;
> >>> + } while (n);
> >>> +}
> >>>
> >>>
> >>> I'm not certain swap_words_32 / swap_words_8 will be optimal for larger
> >>> key sizes. Looking at GCC's implementation of swap_generic on modern x86_64:
> >>> https://godbolt.org/z/638h3Y9va <https://godbolt.org/z/638h3Y9va>
> >>> It's able to optimize the temporary buffer out of the loop and use xmm registers which
> >>> will likely win out for larger sizes.
> >>
> >> It is probably not the most optimized code compiler can generated since
> >> I tried to go for a more generic code that should results in a somewhat
> >> better code in a architecture agnostic code. I trying to mimic some
> >> of optimization Linux did on 37d0ec34d111acfdb, and the swap_bytes I
> >> used the same strategy used on d3496c9f4f27d (Linux did not optimize
> >> anything for the byte version).
> >>
> >> I think it would be possible to tune it for an specific architecture
> >> and/or compiler, but I would prefer to use a good enough algorithm that
> >> work reasonable on multiple architectures.
> >
> > That's fair. Although I still think there are some improvements.
> >
> > Looking at the assembly for all three in fact it seems GCC optimizes all of them
> > to larger register copies: https://godbolt.org/z/bd9nnnoEY
> >
> > The biggest difference seems to be the setup / register spills for
> > the generic version so for the common case of a relatively small key
> > the special case for 4/8 makes sense.
> >
> > Have you checked that GCC is able to use the conditions for selecting
> > the swap function to optimize the functions themselves? In the godbolt
> > link I got reasonable value out of adding the invariants to swap_64/swap_32.
>
> Not yet, but I think the generated code for both x86_64 and aarch64 seems
> simple enough and should cover the common case (keys with size 4 or 8)
> fast enough [1].
swap is in the inner loop.  It seems like a pretty critical component to have
fully optimized.  The aarch64 version looks good, but the x86_64 version seems
to be lacking.  I'm not arguing for an arch-specific version, but if the
directives can add value on x86_64 without detracting from aarch64, that seems
like a zero-cost improvement.
>
> And since for v4 my plan is not to remove mergesort anymore, but limit it
> to a static buffer; it might be possible to use the same swap routines
> for both mergesort and quicksort.
>
> >
> > It also may be worth it to write a custom memcpy implementation for
> > size = 0..SWAP_GENERIC_SIZE so it can be inlined (and probably
> > more optimized than what generic memcpy can get away with).
> >
>
> I *really* do not want to go for this way. My hope is with small sizes
> compiler can inline the memcpy (which gcc does for most common
> architectures).
Since size is non-constant for the tail, I don't see how we are going to
avoid the 3x memcpy calls.  Although that can be another patch if it
proves valuable.
>
>
> [1] https://godbolt.org/z/v7e4xxqGa
On 15/10/2021 14:45, Noah Goldstein wrote:
> swap is in the inner loop. Seems like a pretty critical component to have fully
> optimized. The aarch64 version looks good, but the x86_64 version seems
> to be lacking. Not arguing for an arch specific version, but if the directives
> can add value to x86_64 without detracting from aarch64 seems like a zero
> cost improvement.
>
> Since size is non-constant for the tail I don't see how we are going
> avoid 3x memcpy calls. Although that can be another patch if it
> gets values.
>
>>
>>
>> [1] https://godbolt.org/z/v7e4xxqGa
Maybe use a byte copy in the tail to avoid the memcpy [1]; another option
might be to tune SWAP_GENERIC_SIZE to make the tail less costly (16 should
be ok for most architectures, although some might do better with larger
values).
[1] https://godbolt.org/z/G76dcej16
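(A sketch of the byte-copy tail mentioned above, based on [1]; the block loop
is the one from the patch, and the plain tail loop replacing the three
trailing memcpy calls is an assumption about the linked variant:)

  static void
  swap_bytes (void * restrict a, void * restrict b, size_t n)
  {
    enum { SWAP_GENERIC_SIZE = 32 };
    unsigned char tmp[SWAP_GENERIC_SIZE];
    while (n > SWAP_GENERIC_SIZE)
      {
        memcpy (tmp, a, SWAP_GENERIC_SIZE);
        a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
        b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
        n -= SWAP_GENERIC_SIZE;
      }
    /* Hypothetical tail: swap the remaining 0..SWAP_GENERIC_SIZE bytes with
       a plain loop so no out-of-line memcpy call is needed.  */
    unsigned char *ac = a, *bc = b;
    while (n-- != 0)
      {
        unsigned char t = ac[n];
        ac[n] = bc[n];
        bc[n] = t;
      }
  }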
@@ -24,20 +24,85 @@
#include <limits.h>
#include <stdlib.h>
#include <string.h>
+#include <stdbool.h>
-/* Byte-wise swap two items of size SIZE. */
-#define SWAP(a, b, size) \
- do \
- { \
- size_t __size = (size); \
- char *__a = (a), *__b = (b); \
- do \
- { \
- char __tmp = *__a; \
- *__a++ = *__b; \
- *__b++ = __tmp; \
- } while (--__size > 0); \
- } while (0)
+/* Swap SIZE bytes between addresses A and B. These helpers are provided
+ along the generic one as an optimization. */
+
+typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
+
+/* Return trues is elements can be copied used word load and sortes.
+ The size must be a multiple of the alignment, and the base address. */
+static inline bool
+is_aligned_to_copy (const void *base, size_t size, size_t align)
+{
+ unsigned char lsbits = size;
+#if !_STRING_ARCH_unaligned
+ lsbits |= (unsigned char)(uintptr_t) base;
+#endif
+ return (lsbits & (align - 1)) == 0;
+}
+
+#define SWAP_WORDS_64 (swap_func_t)0
+#define SWAP_WORDS_32 (swap_func_t)1
+#define SWAP_BYTES (swap_func_t)2
+
+static void
+swap_words_64 (void * restrict a, void * restrict b, size_t n)
+{
+ do
+ {
+ n -= 8;
+ uint64_t t = *(uint64_t *)(a + n);
+ *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
+ *(uint64_t *)(b + n) = t;
+ } while (n);
+}
+
+static void
+swap_words_32 (void * restrict a, void * restrict b, size_t n)
+{
+ do
+ {
+ n -= 4;
+ uint32_t t = *(uint32_t *)(a + n);
+ *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
+ *(uint32_t *)(b + n) = t;
+ } while (n);
+}
+
+static void
+swap_bytes (void * restrict a, void * restrict b, size_t n)
+{
+ /* Use multiple small memcpys with constant size to enable inlining
+ on most targets. */
+ enum { SWAP_GENERIC_SIZE = 32 };
+ unsigned char tmp[SWAP_GENERIC_SIZE];
+ while (n > SWAP_GENERIC_SIZE)
+ {
+ memcpy (tmp, a, SWAP_GENERIC_SIZE);
+ a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
+ b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
+ n -= SWAP_GENERIC_SIZE;
+ }
+ memcpy (tmp, a, n);
+ memcpy (a, b, n);
+ memcpy (b, tmp, n);
+}
+
+/* Replace the indirect call with a serie of if statements. It should help
+ the branch predictor. */
+static void
+do_swap (void * restrict a, void * restrict b, size_t size,
+ swap_func_t swap_func)
+{
+ if (swap_func == SWAP_WORDS_64)
+ swap_words_64 (a, b, size);
+ else if (swap_func == SWAP_WORDS_32)
+ swap_words_32 (a, b, size);
+ else
+ swap_bytes (a, b, size);
+}
/* Discontinue quicksort algorithm when partition gets below this size.
This particular magic number was chosen to work best on a Sun 4/260. */
@@ -97,6 +162,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
/* Avoid lossage with unsigned arithmetic below. */
return;
+ swap_func_t swap_func;
+ if (is_aligned_to_copy (pbase, size, 8))
+ swap_func = SWAP_WORDS_64;
+ else if (is_aligned_to_copy (pbase, size, 4))
+ swap_func = SWAP_WORDS_32;
+ else
+ swap_func = SWAP_BYTES;
+
if (total_elems > MAX_THRESH)
{
char *lo = base_ptr;
@@ -120,13 +193,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
char *mid = lo + size * ((hi - lo) / size >> 1);
if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
- SWAP (mid, lo, size);
+ do_swap (mid, lo, size, swap_func);
if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
- SWAP (mid, hi, size);
+ do_swap (mid, hi, size, swap_func);
else
goto jump_over;
if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
- SWAP (mid, lo, size);
+ do_swap (mid, lo, size, swap_func);
jump_over:;
left_ptr = lo + size;
@@ -145,7 +218,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
if (left_ptr < right_ptr)
{
- SWAP (left_ptr, right_ptr, size);
+ do_swap (left_ptr, right_ptr, size, swap_func);
if (mid == left_ptr)
mid = right_ptr;
else if (mid == right_ptr)
@@ -217,7 +290,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
tmp_ptr = run_ptr;
if (tmp_ptr != base_ptr)
- SWAP (tmp_ptr, base_ptr, size);
+ do_swap (tmp_ptr, base_ptr, size, swap_func);
/* Insertion sort, running from left-hand-side up to right-hand-side. */