[v6,1/6] stdlib: Optimize qsort{_r} swap implementation

Message ID 20230718141543.3925254-2-adhemerval.zanella@linaro.org
State Superseded
Series Use introsort for qsort

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed

Commit Message

Adhemerval Zanella Netto July 18, 2023, 2:15 p.m. UTC
  The optimization takes into account that the most common element
sizes are either 32 or 64 bits and that inputs are aligned on a word
boundary.  This is similar to what msort does.

For larger element sizes the swap operation uses memcpy/mempcpy with a
small fixed-size temporary buffer (so the compiler might inline the operations).

Checked on x86_64-linux-gnu.
---
 stdlib/qsort.c | 116 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 98 insertions(+), 18 deletions(-)
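
As a quick illustration (not part of the patch), this standalone sketch
reuses the patch's is_aligned logic to show how typical inputs are
classified; the swap-type names refer to the enum added below:

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Same check as the patch: BASE and SIZE must both be multiples of
   WORDSIZE for the word-sized swap paths to be safe.  */
static bool
is_aligned (const void *base, size_t size, size_t wordsize)
{
  return (((uintptr_t) base | size) & (wordsize - 1)) == 0;
}

int
main (void)
{
  /* A 16-byte aligned buffer, as malloc would typically return.  */
  _Alignas (16) unsigned char buf[64];

  printf ("%d\n", is_aligned (buf, 8, 8)); /* 1: SWAP_WORDS_64.  */
  printf ("%d\n", is_aligned (buf, 4, 8)); /* 0: size not a multiple of 8.  */
  printf ("%d\n", is_aligned (buf, 4, 4)); /* 1: SWAP_WORDS_32.  */
  printf ("%d\n", is_aligned (buf, 7, 4)); /* 0: falls back to SWAP_BYTES.  */
  return 0;
}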
  

Comments

Adhemerval Zanella Netto Aug. 22, 2023, 7:31 p.m. UTC | #1
Ping.

On 18/07/23 11:15, Adhemerval Zanella wrote:
> The optimization takes into account that the most common element
> sizes are either 32 or 64 bits and that inputs are aligned on a word
> boundary.  This is similar to what msort does.
> 
> For larger element sizes the swap operation uses memcpy/mempcpy with a
> small fixed-size temporary buffer (so the compiler might inline the operations).
> 
> Checked on x86_64-linux-gnu.
> ---
>  stdlib/qsort.c | 116 +++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 98 insertions(+), 18 deletions(-)
> 
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 728a0ed370..5bcc287c79 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -23,20 +23,92 @@
>  #include <limits.h>
>  #include <stdlib.h>
>  #include <string.h>
> +#include <stdbool.h>
>  
> -/* Byte-wise swap two items of size SIZE. */
> -#define SWAP(a, b, size)						      \
> -  do									      \
> -    {									      \
> -      size_t __size = (size);						      \
> -      char *__a = (a), *__b = (b);					      \
> -      do								      \
> -	{								      \
> -	  char __tmp = *__a;						      \
> -	  *__a++ = *__b;						      \
> -	  *__b++ = __tmp;						      \
> -	} while (--__size > 0);						      \
> -    } while (0)
> +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
> +   alongside the generic one as an optimization.  */
> +
> +enum swap_type_t
> +  {
> +    SWAP_WORDS_64,
> +    SWAP_WORDS_32,
> +    SWAP_BYTES
> +  };
> +
> +/* If this function returns true, elements can be safely copied using word
> +   loads and stores.  Otherwise, it might not be safe.  BASE (as an integer)
> +   must be a multiple of the word alignment.  SIZE must be a multiple of
> +   WORDSIZE.  Since WORDSIZE must be a multiple of the word alignment, and
> +   WORDSIZE is a power of two on all supported platforms, this function for
> +   speed merely checks that BASE and SIZE are both multiples of the word
> +   size.  */
> +static inline bool
> +is_aligned (const void *base, size_t size, size_t wordsize)
> +{
> +  return (((uintptr_t) base | size) & (wordsize - 1)) == 0;
> +}
> +
> +static inline void
> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
> +{
> +  typedef uint64_t __attribute__ ((__may_alias__)) u64_alias_t;
> +  do
> +   {
> +     n -= 8;
> +     u64_alias_t t = *(u64_alias_t *)(a + n);
> +     *(u64_alias_t *)(a + n) = *(u64_alias_t *)(b + n);
> +     *(u64_alias_t *)(b + n) = t;
> +   } while (n);
> +}
> +
> +static inline void
> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
> +{
> +  typedef uint32_t __attribute__ ((__may_alias__)) u32_alias_t;
> +  do
> +   {
> +     n -= 4;
> +     u32_alias_t t = *(u32_alias_t *)(a + n);
> +     *(u32_alias_t *)(a + n) = *(u32_alias_t *)(b + n);
> +     *(u32_alias_t *)(b + n) = t;
> +   } while (n);
> +}
> +
> +static inline void
> +swap_bytes (void * restrict a, void * restrict b, size_t n)
> +{
> +  /* Use multiple small memcpys with constant size to enable inlining
> +     on most targets.  */
> +  enum { SWAP_GENERIC_SIZE = 32 };
> +  unsigned char tmp[SWAP_GENERIC_SIZE];
> +  while (n > SWAP_GENERIC_SIZE)
> +    {
> +      memcpy (tmp, a, SWAP_GENERIC_SIZE);
> +      a = __mempcpy (a, b, SWAP_GENERIC_SIZE);
> +      b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE);
> +      n -= SWAP_GENERIC_SIZE;
> +    }
> +  while (n > 0)
> +    {
> +      unsigned char t = ((unsigned char *)a)[--n];
> +      ((unsigned char *)a)[n] = ((unsigned char *)b)[n];
> +      ((unsigned char *)b)[n] = t;
> +    }
> +}
> +
> +/* Replace the indirect call with a series of if statements.  It should help
> +   the branch predictor.  */
> +static void
> +do_swap (void * restrict a, void * restrict b, size_t size,
> +	 enum swap_type_t swap_func)
> +{
> +  if (swap_func == SWAP_WORDS_64)
> +    swap_words_64 (a, b, size);
> +  else if (swap_func == SWAP_WORDS_32)
> +    swap_words_32 (a, b, size);
> +  else
> +    swap_bytes (a, b, size);
> +}
>  
>  /* Discontinue quicksort algorithm when partition gets below this size.
>     This particular magic number was chosen to work best on a Sun 4/260. */
> @@ -96,6 +168,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>      /* Avoid lossage with unsigned arithmetic below.  */
>      return;
>  
> +  enum swap_type_t swap_type;
> +  if (is_aligned (pbase, size, 8))
> +    swap_type = SWAP_WORDS_64;
> +  else if (is_aligned (pbase, size, 4))
> +    swap_type = SWAP_WORDS_32;
> +  else
> +    swap_type = SWAP_BYTES;
> +
>    if (total_elems > MAX_THRESH)
>      {
>        char *lo = base_ptr;
> @@ -119,13 +199,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>  	  char *mid = lo + size * ((hi - lo) / size >> 1);
>  
>  	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
> -	    SWAP (mid, lo, size);
> +	    do_swap (mid, lo, size, swap_type);
>  	  if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
> -	    SWAP (mid, hi, size);
> +	    do_swap (mid, hi, size, swap_type);
>  	  else
>  	    goto jump_over;
>  	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
> -	    SWAP (mid, lo, size);
> +	    do_swap (mid, lo, size, swap_type);
>  	jump_over:;
>  
>  	  left_ptr  = lo + size;
> @@ -144,7 +224,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>  
>  	      if (left_ptr < right_ptr)
>  		{
> -		  SWAP (left_ptr, right_ptr, size);
> +		  do_swap (left_ptr, right_ptr, size, swap_type);
>  		  if (mid == left_ptr)
>  		    mid = right_ptr;
>  		  else if (mid == right_ptr)
> @@ -216,7 +296,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>          tmp_ptr = run_ptr;
>  
>      if (tmp_ptr != base_ptr)
> -      SWAP (tmp_ptr, base_ptr, size);
> +      do_swap (tmp_ptr, base_ptr, size, swap_type);
>  
>      /* Insertion sort, running from left-hand-side up to right-hand-side.  */
>
  
Noah Goldstein Aug. 29, 2023, 7:33 a.m. UTC | #2
On Tue, Jul 18, 2023 at 9:16 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> The optimization takes into account that the most common element
> sizes are either 32 or 64 bits and that inputs are aligned on a word
> boundary.  This is similar to what msort does.
>
> For larger element sizes the swap operation uses memcpy/mempcpy with a
> small fixed-size temporary buffer (so the compiler might inline the operations).
>
> Checked on x86_64-linux-gnu.
> ---
>  stdlib/qsort.c | 116 +++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 98 insertions(+), 18 deletions(-)
>
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 728a0ed370..5bcc287c79 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -23,20 +23,92 @@
>  #include <limits.h>
>  #include <stdlib.h>
>  #include <string.h>
> +#include <stdbool.h>
>
> -/* Byte-wise swap two items of size SIZE. */
> -#define SWAP(a, b, size)                                                     \
> -  do                                                                         \
> -    {                                                                        \
> -      size_t __size = (size);                                                \
> -      char *__a = (a), *__b = (b);                                           \
> -      do                                                                     \
> -       {                                                                     \
> -         char __tmp = *__a;                                                  \
> -         *__a++ = *__b;                                                      \
> -         *__b++ = __tmp;                                                     \
> -       } while (--__size > 0);                                               \
> -    } while (0)
> +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
> +   alongside the generic one as an optimization.  */
> +
> +enum swap_type_t
> +  {
> +    SWAP_WORDS_64,
> +    SWAP_WORDS_32,
> +    SWAP_BYTES
> +  };
> +
> +/* If this function returns true, elements can be safely copied using word
> +   loads and stores.  Otherwise, it might not be safe.  BASE (as an integer)
> +   must be a multiple of the word alignment.  SIZE must be a multiple of
> +   WORDSIZE.  Since WORDSIZE must be a multiple of the word alignment, and
> +   WORDSIZE is a power of two on all supported platforms, this function for
> +   speed merely checks that BASE and SIZE are both multiples of the word
> +   size.  */
> +static inline bool
> +is_aligned (const void *base, size_t size, size_t wordsize)
> +{
> +  return (((uintptr_t) base | size) & (wordsize - 1)) == 0;
> +}
> +
> +static inline void
> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
> +{
> +  typedef uint64_t __attribute__ ((__may_alias__)) u64_alias_t;
> +  do
> +   {
> +     n -= 8;
> +     u64_alias_t t = *(u64_alias_t *)(a + n);
> +     *(u64_alias_t *)(a + n) = *(u64_alias_t *)(b + n);
> +     *(u64_alias_t *)(b + n) = t;
> +   } while (n);
> +}
> +
> +static inline void
> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
> +{
> +  typedef uint32_t __attribute__ ((__may_alias__)) u32_alias_t;
> +  do
> +   {
> +     n -= 4;
> +     u32_alias_t t = *(u32_alias_t *)(a + n);
> +     *(u32_alias_t *)(a + n) = *(u32_alias_t *)(b + n);
> +     *(u32_alias_t *)(b + n) = t;
> +   } while (n);
> +}
> +
> +static inline void
> +swap_bytes (void * restrict a, void * restrict b, size_t n)
> +{
> +  /* Use multiple small memcpys with constant size to enable inlining
> +     on most targets.  */
> +  enum { SWAP_GENERIC_SIZE = 32 };
I just intuitively think 16 makes more sense here, but for no particularly
good reason.
> +  unsigned char tmp[SWAP_GENERIC_SIZE];
> +  while (n > SWAP_GENERIC_SIZE)
> +    {
> +      memcpy (tmp, a, SWAP_GENERIC_SIZE);
> +      a = __mempcpy (a, b, SWAP_GENERIC_SIZE);
> +      b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE);
> +      n -= SWAP_GENERIC_SIZE;
> +    }
> +  while (n > 0)
> +    {
> +      unsigned char t = ((unsigned char *)a)[--n];
> +      ((unsigned char *)a)[n] = ((unsigned char *)b)[n];
> +      ((unsigned char *)b)[n] = t;
> +    }
> +}
> +
> +/* Replace the indirect call with a series of if statements.  It should help
> +   the branch predictor.  */
> +static void
> +do_swap (void * restrict a, void * restrict b, size_t size,
> +        enum swap_type_t swap_func)
> +{
nit: swap_func -> swap_type/swap_method (I think this was previously
a function pointer, and the name still implies as much).
> +  if (swap_func == SWAP_WORDS_64)
> +    swap_words_64 (a, b, size);
> +  else if (swap_func == SWAP_WORDS_32)
> +    swap_words_32 (a, b, size);
> +  else
> +    swap_bytes (a, b, size);
> +}
>
>  /* Discontinue quicksort algorithm when partition gets below this size.
>     This particular magic number was chosen to work best on a Sun 4/260. */
> @@ -96,6 +168,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>      /* Avoid lossage with unsigned arithmetic below.  */
>      return;
>
> +  enum swap_type_t swap_type;
> +  if (is_aligned (pbase, size, 8))
> +    swap_type = SWAP_WORDS_64;
> +  else if (is_aligned (pbase, size, 4))
> +    swap_type = SWAP_WORDS_32;
> +  else
> +    swap_type = SWAP_BYTES;
> +
>    if (total_elems > MAX_THRESH)
>      {
>        char *lo = base_ptr;
> @@ -119,13 +199,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>           char *mid = lo + size * ((hi - lo) / size >> 1);
>
>           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
> -           SWAP (mid, lo, size);
> +           do_swap (mid, lo, size, swap_type);
>           if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
> -           SWAP (mid, hi, size);
> +           do_swap (mid, hi, size, swap_type);
>           else
>             goto jump_over;
>           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
> -           SWAP (mid, lo, size);
> +           do_swap (mid, lo, size, swap_type);
>         jump_over:;
>
>           left_ptr  = lo + size;
> @@ -144,7 +224,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>
>               if (left_ptr < right_ptr)
>                 {
> -                 SWAP (left_ptr, right_ptr, size);
> +                 do_swap (left_ptr, right_ptr, size, swap_type);
>                   if (mid == left_ptr)
>                     mid = right_ptr;
>                   else if (mid == right_ptr)
> @@ -216,7 +296,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>          tmp_ptr = run_ptr;
>
>      if (tmp_ptr != base_ptr)
> -      SWAP (tmp_ptr, base_ptr, size);
> +      do_swap (tmp_ptr, base_ptr, size, swap_type);
>
>      /* Insertion sort, running from left-hand-side up to right-hand-side.  */
>
> --
> 2.34.1
>
  
Adhemerval Zanella Netto Aug. 29, 2023, 1:56 p.m. UTC | #3
On 29/08/23 04:33, Noah Goldstein wrote:
> On Tue, Jul 18, 2023 at 9:16 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> The optimization takes into account that the most common element
>> sizes are either 32 or 64 bits and that inputs are aligned on a word
>> boundary.  This is similar to what msort does.
>>
>> For larger element sizes the swap operation uses memcpy/mempcpy with a
>> small fixed-size temporary buffer (so the compiler might inline the operations).
>>
>> Checked on x86_64-linux-gnu.
>> ---
>>  stdlib/qsort.c | 116 +++++++++++++++++++++++++++++++++++++++++--------
>>  1 file changed, 98 insertions(+), 18 deletions(-)
>>
>> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>> index 728a0ed370..5bcc287c79 100644
>> --- a/stdlib/qsort.c
>> +++ b/stdlib/qsort.c
>> @@ -23,20 +23,92 @@
>>  #include <limits.h>
>>  #include <stdlib.h>
>>  #include <string.h>
>> +#include <stdbool.h>
>>
>> -/* Byte-wise swap two items of size SIZE. */
>> -#define SWAP(a, b, size)                                                     \
>> -  do                                                                         \
>> -    {                                                                        \
>> -      size_t __size = (size);                                                \
>> -      char *__a = (a), *__b = (b);                                           \
>> -      do                                                                     \
>> -       {                                                                     \
>> -         char __tmp = *__a;                                                  \
>> -         *__a++ = *__b;                                                      \
>> -         *__b++ = __tmp;                                                     \
>> -       } while (--__size > 0);                                               \
>> -    } while (0)
>> +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
>> +   alongside the generic one as an optimization.  */
>> +
>> +enum swap_type_t
>> +  {
>> +    SWAP_WORDS_64,
>> +    SWAP_WORDS_32,
>> +    SWAP_BYTES
>> +  };
>> +
>> +/* If this function returns true, elements can be safely copied using word
>> +   loads and stores.  Otherwise, it might not be safe.  BASE (as an integer)
>> +   must be a multiple of the word alignment.  SIZE must be a multiple of
>> +   WORDSIZE.  Since WORDSIZE must be a multiple of the word alignment, and
>> +   WORDSIZE is a power of two on all supported platforms, this function for
>> +   speed merely checks that BASE and SIZE are both multiples of the word
>> +   size.  */
>> +static inline bool
>> +is_aligned (const void *base, size_t size, size_t wordsize)
>> +{
>> +  return (((uintptr_t) base | size) & (wordsize - 1)) == 0;
>> +}
>> +
>> +static inline void
>> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
>> +{
>> +  typedef uint64_t __attribute__ ((__may_alias__)) u64_alias_t;
>> +  do
>> +   {
>> +     n -= 8;
>> +     u64_alias_t t = *(u64_alias_t *)(a + n);
>> +     *(u64_alias_t *)(a + n) = *(u64_alias_t *)(b + n);
>> +     *(u64_alias_t *)(b + n) = t;
>> +   } while (n);
>> +}
>> +
>> +static inline void
>> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
>> +{
>> +  typedef uint32_t __attribute__ ((__may_alias__)) u32_alias_t;
>> +  do
>> +   {
>> +     n -= 4;
>> +     u32_alias_t t = *(u32_alias_t *)(a + n);
>> +     *(u32_alias_t *)(a + n) = *(u32_alias_t *)(b + n);
>> +     *(u32_alias_t *)(b + n) = t;
>> +   } while (n);
>> +}
>> +
>> +static inline void
>> +swap_bytes (void * restrict a, void * restrict b, size_t n)
>> +{
>> +  /* Use multiple small memcpys with constant size to enable inlining
>> +     on most targets.  */
>> +  enum { SWAP_GENERIC_SIZE = 32 };
> I just intuitively think 16 makes more sense here, but for no particularly
> good reason.

I don't have a strong opinion here, and most likely the best value will depend on
the architecture.  I picked a small value so memcpy/__mempcpy could be inlined
(and it should be easy to add an arch-specific hook to change this if that is
really required).
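
One shape such a hook could take (purely illustrative; QSORT_SWAP_CHUNK is a
hypothetical name, the patch simply hard-codes 32, and plain memcpy stands in
for the glibc-internal __mempcpy):

#include <string.h>
#include <stddef.h>

/* Hypothetical arch override: a sysdeps header could define
   QSORT_SWAP_CHUNK before this generic fallback is reached.  */
#ifndef QSORT_SWAP_CHUNK
# define QSORT_SWAP_CHUNK 32
#endif

static void
swap_bytes_chunked (void *a, void *b, size_t n)
{
  unsigned char tmp[QSORT_SWAP_CHUNK];
  unsigned char *pa = a, *pb = b;
  /* Swap full chunks with constant-size copies so they can inline.  */
  while (n >= QSORT_SWAP_CHUNK)
    {
      memcpy (tmp, pa, QSORT_SWAP_CHUNK);
      memcpy (pa, pb, QSORT_SWAP_CHUNK);
      memcpy (pb, tmp, QSORT_SWAP_CHUNK);
      pa += QSORT_SWAP_CHUNK;
      pb += QSORT_SWAP_CHUNK;
      n -= QSORT_SWAP_CHUNK;
    }
  /* Swap the remaining tail byte by byte.  */
  while (n > 0)
    {
      n--;
      unsigned char t = pa[n];
      pa[n] = pb[n];
      pb[n] = t;
    }
}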

>> +  unsigned char tmp[SWAP_GENERIC_SIZE];
>> +  while (n > SWAP_GENERIC_SIZE)
>> +    {
>> +      memcpy (tmp, a, SWAP_GENERIC_SIZE);
>> +      a = __mempcpy (a, b, SWAP_GENERIC_SIZE);
>> +      b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE);
>> +      n -= SWAP_GENERIC_SIZE;
>> +    }
>> +  while (n > 0)
>> +    {
>> +      unsigned char t = ((unsigned char *)a)[--n];
>> +      ((unsigned char *)a)[n] = ((unsigned char *)b)[n];
>> +      ((unsigned char *)b)[n] = t;
>> +    }
>> +}
>> +
>> +/* Replace the indirect call with a series of if statements.  It should help
>> +   the branch predictor.  */
>> +static void
>> +do_swap (void * restrict a, void * restrict b, size_t size,
>> +        enum swap_type_t swap_func)
>> +{
> nit: swap_func -> swap_type/swap_method (I think this was previously
> a function pointer, and the name still implies as much).


Ack.

>> +  if (swap_func == SWAP_WORDS_64)
>> +    swap_words_64 (a, b, size);
>> +  else if (swap_func == SWAP_WORDS_32)
>> +    swap_words_32 (a, b, size);
>> +  else
>> +    swap_bytes (a, b, size);
>> +}
>>
>>  /* Discontinue quicksort algorithm when partition gets below this size.
>>     This particular magic number was chosen to work best on a Sun 4/260. */
>> @@ -96,6 +168,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>      /* Avoid lossage with unsigned arithmetic below.  */
>>      return;
>>
>> +  enum swap_type_t swap_type;
>> +  if (is_aligned (pbase, size, 8))
>> +    swap_type = SWAP_WORDS_64;
>> +  else if (is_aligned (pbase, size, 4))
>> +    swap_type = SWAP_WORDS_32;
>> +  else
>> +    swap_type = SWAP_BYTES;
>> +
>>    if (total_elems > MAX_THRESH)
>>      {
>>        char *lo = base_ptr;
>> @@ -119,13 +199,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>           char *mid = lo + size * ((hi - lo) / size >> 1);
>>
>>           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
>> -           SWAP (mid, lo, size);
>> +           do_swap (mid, lo, size, swap_type);
>>           if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
>> -           SWAP (mid, hi, size);
>> +           do_swap (mid, hi, size, swap_type);
>>           else
>>             goto jump_over;
>>           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
>> -           SWAP (mid, lo, size);
>> +           do_swap (mid, lo, size, swap_type);
>>         jump_over:;
>>
>>           left_ptr  = lo + size;
>> @@ -144,7 +224,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>
>>               if (left_ptr < right_ptr)
>>                 {
>> -                 SWAP (left_ptr, right_ptr, size);
>> +                 do_swap (left_ptr, right_ptr, size, swap_type);
>>                   if (mid == left_ptr)
>>                     mid = right_ptr;
>>                   else if (mid == right_ptr)
>> @@ -216,7 +296,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>          tmp_ptr = run_ptr;
>>
>>      if (tmp_ptr != base_ptr)
>> -      SWAP (tmp_ptr, base_ptr, size);
>> +      do_swap (tmp_ptr, base_ptr, size, swap_type);
>>
>>      /* Insertion sort, running from left-hand-side up to right-hand-side.  */
>>
>> --
>> 2.34.1
>>
  
Florian Weimer Aug. 29, 2023, 4:53 p.m. UTC | #4
* Adhemerval Zanella via Libc-alpha:

> The optimization takes into account that the most common element
> sizes are either 32 or 64 bits and that inputs are aligned on a word
> boundary.  This is similar to what msort does.
>
> For larger element sizes the swap operation uses memcpy/mempcpy with a
> small fixed-size temporary buffer (so the compiler might inline the operations).

Should we add a public memswap function that works on non-overlapping
buffers and swaps them?  It seems this might be useful in general.  Then
architecture maintainers can optimize it as needed.

Thanks,
Florian
  
Noah Goldstein Aug. 29, 2023, 10:52 p.m. UTC | #5
On Tue, Aug 29, 2023 at 11:53 AM Florian Weimer via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> * Adhemerval Zanella via Libc-alpha:
>
> > The optimization takes into account that the most common element
> > sizes are either 32 or 64 bits and that inputs are aligned on a word
> > boundary.  This is similar to what msort does.
> >
> > For larger element sizes the swap operation uses memcpy/mempcpy with a
> > small fixed-size temporary buffer (so the compiler might inline the operations).
>
> Should we add a public memswap function that works on non-overlapping
> buffers and swaps them?  It seems this might be useful in general.  Then
> architecture maintainers can optimize it as needed.
>
+1
> Thanks,
> Florian
>
  
Adhemerval Zanella Netto Aug. 30, 2023, 1:51 p.m. UTC | #6
On 29/08/23 19:52, Noah Goldstein via Libc-alpha wrote:
> On Tue, Aug 29, 2023 at 11:53 AM Florian Weimer via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> * Adhemerval Zanella via Libc-alpha:
>>
>>> The optimization takes into account that the most common element
>>> sizes are either 32 or 64 bits and that inputs are aligned on a word
>>> boundary.  This is similar to what msort does.
>>>
>>> For larger element sizes the swap operation uses memcpy/mempcpy with a
>>> small fixed-size temporary buffer (so the compiler might inline the operations).
>>
>> Should we add a public memswap function that works on non-overlapping
>> buffers and swaps them?  It seems this might be useful in general.  Then
>> architecture maintainers can optimize it as needed.
>>
> +1
>> Thanks,
>> Florian
>>


Fair enough, I think we can start with an internal symbol for qsort.
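
For reference, a minimal sketch of what such an internal symbol could look
like (the name __memswap, its signature, and its placement are assumptions;
none of this was settled in the thread):

#include <stddef.h>

/* Swap N bytes between the non-overlapping buffers P1 and P2.  A real
   glibc version would live behind an internal header and could be
   overridden per architecture.  */
void
__memswap (void *restrict p1, void *restrict p2, size_t n)
{
  unsigned char *a = p1, *b = p2;
  for (size_t i = 0; i < n; i++)
    {
      unsigned char t = a[i];
      a[i] = b[i];
      b[i] = t;
    }
}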
  

Patch

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 728a0ed370..5bcc287c79 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -23,20 +23,92 @@ 
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
 
-/* Byte-wise swap two items of size SIZE. */
-#define SWAP(a, b, size)						      \
-  do									      \
-    {									      \
-      size_t __size = (size);						      \
-      char *__a = (a), *__b = (b);					      \
-      do								      \
-	{								      \
-	  char __tmp = *__a;						      \
-	  *__a++ = *__b;						      \
-	  *__b++ = __tmp;						      \
-	} while (--__size > 0);						      \
-    } while (0)
+/* Swap SIZE bytes between addresses A and B.  These helpers are provided
+   alongside the generic one as an optimization.  */
+
+enum swap_type_t
+  {
+    SWAP_WORDS_64,
+    SWAP_WORDS_32,
+    SWAP_BYTES
+  };
+
+/* If this function returns true, elements can be safely copied using word
+   loads and stores.  Otherwise, it might not be safe.  BASE (as an integer)
+   must be a multiple of the word alignment.  SIZE must be a multiple of
+   WORDSIZE.  Since WORDSIZE must be a multiple of the word alignment, and
+   WORDSIZE is a power of two on all supported platforms, this function for
+   speed merely checks that BASE and SIZE are both multiples of the word
+   size.  */
+static inline bool
+is_aligned (const void *base, size_t size, size_t wordsize)
+{
+  return (((uintptr_t) base | size) & (wordsize - 1)) == 0;
+}
+
+static inline void
+swap_words_64 (void * restrict a, void * restrict b, size_t n)
+{
+  typedef uint64_t __attribute__ ((__may_alias__)) u64_alias_t;
+  do
+   {
+     n -= 8;
+     u64_alias_t t = *(u64_alias_t *)(a + n);
+     *(u64_alias_t *)(a + n) = *(u64_alias_t *)(b + n);
+     *(u64_alias_t *)(b + n) = t;
+   } while (n);
+}
+
+static inline void
+swap_words_32 (void * restrict a, void * restrict b, size_t n)
+{
+  typedef uint32_t __attribute__ ((__may_alias__)) u32_alias_t;
+  do
+   {
+     n -= 4;
+     u32_alias_t t = *(u32_alias_t *)(a + n);
+     *(u32_alias_t *)(a + n) = *(u32_alias_t *)(b + n);
+     *(u32_alias_t *)(b + n) = t;
+   } while (n);
+}
+
+static inline void
+swap_bytes (void * restrict a, void * restrict b, size_t n)
+{
+  /* Use multiple small memcpys with constant size to enable inlining
+     on most targets.  */
+  enum { SWAP_GENERIC_SIZE = 32 };
+  unsigned char tmp[SWAP_GENERIC_SIZE];
+  while (n > SWAP_GENERIC_SIZE)
+    {
+      memcpy (tmp, a, SWAP_GENERIC_SIZE);
+      a = __mempcpy (a, b, SWAP_GENERIC_SIZE);
+      b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE);
+      n -= SWAP_GENERIC_SIZE;
+    }
+  while (n > 0)
+    {
+      unsigned char t = ((unsigned char *)a)[--n];
+      ((unsigned char *)a)[n] = ((unsigned char *)b)[n];
+      ((unsigned char *)b)[n] = t;
+    }
+}
+
+/* Replace the indirect call with a series of if statements.  It should help
+   the branch predictor.  */
+static void
+do_swap (void * restrict a, void * restrict b, size_t size,
+	 enum swap_type_t swap_func)
+{
+  if (swap_func == SWAP_WORDS_64)
+    swap_words_64 (a, b, size);
+  else if (swap_func == SWAP_WORDS_32)
+    swap_words_32 (a, b, size);
+  else
+    swap_bytes (a, b, size);
+}
 
 /* Discontinue quicksort algorithm when partition gets below this size.
    This particular magic number was chosen to work best on a Sun 4/260. */
@@ -96,6 +168,14 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
+  enum swap_type_t swap_type;
+  if (is_aligned (pbase, size, 8))
+    swap_type = SWAP_WORDS_64;
+  else if (is_aligned (pbase, size, 4))
+    swap_type = SWAP_WORDS_32;
+  else
+    swap_type = SWAP_BYTES;
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -119,13 +199,13 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
 	  char *mid = lo + size * ((hi - lo) / size >> 1);
 
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    do_swap (mid, lo, size, swap_type);
 	  if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
-	    SWAP (mid, hi, size);
+	    do_swap (mid, hi, size, swap_type);
 	  else
 	    goto jump_over;
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    do_swap (mid, lo, size, swap_type);
 	jump_over:;
 
 	  left_ptr  = lo + size;
@@ -144,7 +224,7 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
 
 	      if (left_ptr < right_ptr)
 		{
-		  SWAP (left_ptr, right_ptr, size);
+		  do_swap (left_ptr, right_ptr, size, swap_type);
 		  if (mid == left_ptr)
 		    mid = right_ptr;
 		  else if (mid == right_ptr)
@@ -216,7 +296,7 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
         tmp_ptr = run_ptr;
 
     if (tmp_ptr != base_ptr)
-      SWAP (tmp_ptr, base_ptr, size);
+      do_swap (tmp_ptr, base_ptr, size, swap_type);
 
     /* Insertion sort, running from left-hand-side up to right-hand-side.  */
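
As a usage-level illustration (not part of the patch), which swap path
_quicksort selects depends only on the element size and the base alignment;
assuming the rest of the series (which makes qsort sort in place) is applied,
these ordinary calls exercise all three paths:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static int
cmp_u64 (const void *a, const void *b)
{
  uint64_t x = *(const uint64_t *) a, y = *(const uint64_t *) b;
  return (x > y) - (x < y);
}

static int
cmp_u32 (const void *a, const void *b)
{
  uint32_t x = *(const uint32_t *) a, y = *(const uint32_t *) b;
  return (x > y) - (x < y);
}

/* sizeof (struct odd) is 7, not a multiple of 4, so swapping these
   elements takes the byte-wise path.  */
struct odd { char key; char pad[6]; };

static int
cmp_odd (const void *a, const void *b)
{
  return ((const struct odd *) a)->key - ((const struct odd *) b)->key;
}

int
main (void)
{
  uint64_t a[] = { 3, 1, 2 };  /* size 8, 8-byte aligned: SWAP_WORDS_64.  */
  uint32_t b[] = { 3, 1, 2 };  /* size 4: SWAP_WORDS_32.  */
  struct odd c[] = { { 'c' }, { 'a' }, { 'b' } };  /* SWAP_BYTES.  */

  qsort (a, 3, sizeof a[0], cmp_u64);
  qsort (b, 3, sizeof b[0], cmp_u32);
  qsort (c, 3, sizeof c[0], cmp_odd);

  printf ("%llu %u %c\n", (unsigned long long) a[0], (unsigned) b[0],
          c[0].key);
  return 0;
}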