[v4,1/6] stdlib: Optimization qsort{_r} swap implementation (BZ 19305)

Message ID 20230711190722.4028821-2-adhemerval.zanella@linaro.org
State Superseded
Series: Use introsort for qsort

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Testing passed

Commit Message

Adhemerval Zanella Netto July 11, 2023, 7:07 p.m. UTC
  The optimization takes into consideration that the most common element
sizes are either 32 or 64 bits and that inputs are aligned to a word
boundary.  This is similar to the optimization done in lib/sort.c
from Linux.

This patch adds an optimized swap operation to qsort, based on the
previous msort one.  Instead of a byte-wise operation, three variants
are provided:

  1. Using uint32_t loads and stores.
  2. Using uint64_t loads and stores.
  3. Generic one with a temporary buffer and memcpy/mempcpy.

Options 1 and 2 are selected only if the base pointer and element size
are aligned to the required type.

It also fixes BZ#19305 by checking the number of elements against 1 as
well as 0.

Checked on x86_64-linux-gnu.
---
 stdlib/qsort.c | 113 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 95 insertions(+), 18 deletions(-)
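
For context, the BZ#19305 fix amounts to widening the early-return
guard at the top of _quicksort so that single-element arrays also
return immediately; a sketch of the idea (the exact form in the patch
may differ):

  /* Arrays of zero or one element are already sorted; returning early
     also keeps the unsigned arithmetic below away from degenerate
     inputs.  */
  if (total_elems <= 1)
    return;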
  

Comments

Noah Goldstein July 11, 2023, 11:40 p.m. UTC | #1
On Tue, Jul 11, 2023 at 2:08 PM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> The optimization takes into consideration that the most common element
> sizes are either 32 or 64 bits and that inputs are aligned to a word
> boundary.  This is similar to the optimization done in lib/sort.c
> from Linux.
>
> This patch adds an optimized swap operation to qsort, based on the
> previous msort one.  Instead of a byte-wise operation, three variants
> are provided:
>
>   1. Using uint32_t loads and stores.
>   2. Using uint64_t loads and stores.
>   3. Generic one with a temporary buffer and memcpy/mempcpy.
>
> Options 1 and 2 are selected only if the base pointer and element size
> are aligned to the required type.
>
> It also fixes BZ#19305 by checking the number of elements against 1 as
> well as 0.
>
> Checked on x86_64-linux-gnu.
> ---
>  stdlib/qsort.c | 113 +++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 95 insertions(+), 18 deletions(-)
>
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 728a0ed370..8a3331fdb4 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -23,20 +23,89 @@
>  #include <limits.h>
>  #include <stdlib.h>
>  #include <string.h>
> +#include <stdbool.h>
> +#include <libc-pointer-arith.h>
>
> -/* Byte-wise swap two items of size SIZE. */
> -#define SWAP(a, b, size)                                                     \
> -  do                                                                         \
> -    {                                                                        \
> -      size_t __size = (size);                                                \
> -      char *__a = (a), *__b = (b);                                           \
> -      do                                                                     \
> -       {                                                                     \
> -         char __tmp = *__a;                                                  \
> -         *__a++ = *__b;                                                      \
> -         *__b++ = __tmp;                                                     \
> -       } while (--__size > 0);                                               \
> -    } while (0)
> +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
> +   alongside the generic one as an optimization.  */
> +
> +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
> +
> +#define SWAP_WORDS_64 (swap_func_t)0
> +#define SWAP_WORDS_32 (swap_func_t)1
> +#define SWAP_BYTES    (swap_func_t)2
> +

Why bother making this a function pointer? Why not just make it an enum?
> +/* Returns true if elements can be copied using word loads and stores,
> +   i.e. if SIZE and BASE are both multiples of ALIGN.  */
> +__attribute_const__ __always_inline
> +static bool
> +is_aligned (const void *base, size_t size, unsigned char align)
> +{
> +  unsigned char lsbits = (unsigned char) size;
> +
> +  lsbits |= (unsigned char)(uintptr_t) base;
> +  return (lsbits & (align - 1)) == 0;
> +}
> +
> +static void
> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
> +{
> +  do
> +   {
> +     n -= 8;
> +     uint64_t t = *(uint64_t *)(a + n);
> +     *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
> +     *(uint64_t *)(b + n) = t;
> +   } while (n);
> +}
> +
> +static void
> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
> +{
> +  do
> +   {
> +     n -= 4;
> +     uint32_t t = *(uint32_t *)(a + n);
> +     *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
> +     *(uint32_t *)(b + n) = t;
> +   } while (n);
> +}
> +
> +static void
> +swap_bytes (void * restrict a, void * restrict b, size_t n)
> +{
> +  /* Use multiple small memcpys with constant size to enable inlining
> +     on most targets.  */
> +  enum { SWAP_GENERIC_SIZE = 32 };
> +  unsigned char tmp[SWAP_GENERIC_SIZE];
> +  while (n > SWAP_GENERIC_SIZE)
> +    {
> +      __builtin_memcpy (tmp, a, SWAP_GENERIC_SIZE);
> +      a = __mempcpy (a, b, SWAP_GENERIC_SIZE);
> +      b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE);
> +      n -= SWAP_GENERIC_SIZE;
> +    }
> +  while (n > 0)
> +    {
> +      unsigned char t = ((unsigned char *)a)[--n];
> +      ((unsigned char *)a)[n] = ((unsigned char *)b)[n];
> +      ((unsigned char *)b)[n] = t;
> +    }
> +}
> +
> +/* Replace the indirect call with a series of if statements.  It should
> +   help the branch predictor.  */
> +static void
> +do_swap (void * restrict a, void * restrict b, size_t size,
> +        swap_func_t swap_func)
> +{
> +  if (swap_func == SWAP_WORDS_64)
> +    swap_words_64 (a, b, size);
> +  else if (swap_func == SWAP_WORDS_32)
> +    swap_words_32 (a, b, size);
> +  else
> +    swap_bytes (a, b, size);
> +}
>
>  /* Discontinue quicksort algorithm when partition gets below this size.
>     This particular magic number was chosen to work best on a Sun 4/260. */
> @@ -96,6 +165,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>      /* Avoid lossage with unsigned arithmetic below.  */
>      return;
>
> +  swap_func_t swap_func;
> +  if (is_aligned (pbase, size, sizeof (uint64_t)))
> +    swap_func = SWAP_WORDS_64;
> +  else if (is_aligned (pbase, size, sizeof (uint32_t)))
> +    swap_func = SWAP_WORDS_32;
> +  else
> +    swap_func = SWAP_BYTES;
> +
>    if (total_elems > MAX_THRESH)
>      {
>        char *lo = base_ptr;
> @@ -119,13 +196,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>           char *mid = lo + size * ((hi - lo) / size >> 1);
>
>           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
> -           SWAP (mid, lo, size);
> +           do_swap (mid, lo, size, swap_func);
>           if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
> -           SWAP (mid, hi, size);
> +           do_swap (mid, hi, size, swap_func);
>           else
>             goto jump_over;
>           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
> -           SWAP (mid, lo, size);
> +           do_swap (mid, lo, size, swap_func);
>         jump_over:;
>
>           left_ptr  = lo + size;
> @@ -144,7 +221,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>
>               if (left_ptr < right_ptr)
>                 {
> -                 SWAP (left_ptr, right_ptr, size);
> +                 do_swap (left_ptr, right_ptr, size, swap_func);
>                   if (mid == left_ptr)
>                     mid = right_ptr;
>                   else if (mid == right_ptr)
> @@ -216,7 +293,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>          tmp_ptr = run_ptr;
>
>      if (tmp_ptr != base_ptr)
> -      SWAP (tmp_ptr, base_ptr, size);
> +      do_swap (tmp_ptr, base_ptr, size, swap_func);
>
>      /* Insertion sort, running from left-hand-side up to right-hand-side.  */
>
> --
> 2.34.1
>
  
Adhemerval Zanella Netto July 12, 2023, 7:40 p.m. UTC | #2
On 11/07/23 20:40, Noah Goldstein wrote:
> On Tue, Jul 11, 2023 at 2:08 PM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> The optimization takes into consideration that the most common element
>> sizes are either 32 or 64 bits and that inputs are aligned to a word
>> boundary.  This is similar to the optimization done in lib/sort.c
>> from Linux.
>>
>> This patch adds an optimized swap operation to qsort, based on the
>> previous msort one.  Instead of a byte-wise operation, three variants
>> are provided:
>>
>>   1. Using uint32_t loads and stores.
>>   2. Using uint64_t loads and stores.
>>   3. Generic one with a temporary buffer and memcpy/mempcpy.
>>
>> Options 1 and 2 are selected only if the base pointer and element size
>> are aligned to the required type.
>>
>> It also fixes BZ#19305 by checking the number of elements against 1 as
>> well as 0.
>>
>> Checked on x86_64-linux-gnu.
>> ---
>>  stdlib/qsort.c | 113 +++++++++++++++++++++++++++++++++++++++++--------
>>  1 file changed, 95 insertions(+), 18 deletions(-)
>>
>> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>> index 728a0ed370..8a3331fdb4 100644
>> --- a/stdlib/qsort.c
>> +++ b/stdlib/qsort.c
>> @@ -23,20 +23,89 @@
>>  #include <limits.h>
>>  #include <stdlib.h>
>>  #include <string.h>
>> +#include <stdbool.h>
>> +#include <libc-pointer-arith.h>
>>
>> -/* Byte-wise swap two items of size SIZE. */
>> -#define SWAP(a, b, size)                                                     \
>> -  do                                                                         \
>> -    {                                                                        \
>> -      size_t __size = (size);                                                \
>> -      char *__a = (a), *__b = (b);                                           \
>> -      do                                                                     \
>> -       {                                                                     \
>> -         char __tmp = *__a;                                                  \
>> -         *__a++ = *__b;                                                      \
>> -         *__b++ = __tmp;                                                     \
>> -       } while (--__size > 0);                                               \
>> -    } while (0)
>> +/* Swap SIZE bytes between addresses A and B.  These helpers are provided
>> +   alongside the generic one as an optimization.  */
>> +
>> +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
>> +
>> +#define SWAP_WORDS_64 (swap_func_t)0
>> +#define SWAP_WORDS_32 (swap_func_t)1
>> +#define SWAP_BYTES    (swap_func_t)2
>> +
> 
> Why bother making this a function pointer? Why not just make it an enum?

I used the kernel's lib/sort.c as a base, and it supports the caller
passing a custom swap function as a wrapper; but since that does not
make sense for this patch, an enum is the better fit.  I will adjust
it to use one.
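
For reference, a minimal sketch of what the enum-based dispatch could
look like (names are illustrative; the actual v5 may differ):

  /* Hypothetical replacement for the swap_func_t pointer constants.  */
  enum swap_type_t { SWAP_WORDS_64, SWAP_WORDS_32, SWAP_BYTES };

  static void
  do_swap (void * restrict a, void * restrict b, size_t size,
           enum swap_type_t swap_type)
  {
    if (swap_type == SWAP_WORDS_64)
      swap_words_64 (a, b, size);
    else if (swap_type == SWAP_WORDS_32)
      swap_words_32 (a, b, size);
    else
      swap_bytes (a, b, size);
  }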
  
Paul Eggert July 12, 2023, 9:04 p.m. UTC | #3
On 2023-07-11 12:07, Adhemerval Zanella via Libc-alpha wrote:
> +/* Returns true if elements can be copied using word loads and stores,
> +   i.e. if SIZE and BASE are both multiples of ALIGN.  */
> +__attribute_const__ __always_inline
> +static bool
> +is_aligned (const void *base, size_t size, unsigned char align)
> +{
> +  unsigned char lsbits = (unsigned char) size;
> +
> +  lsbits |= (unsigned char)(uintptr_t) base;
> +  return (lsbits & (align - 1)) == 0;
> +}

Doesn't the following source code generate better machine code? It does 
for me, anyway (x86-64). Even if the code were the same, the simplicity 
would be a win.

   static bool
   is_aligned (const void *base, size_t size, size_t align)
   {
     return (((uintptr_t) base | size) & (align - 1)) == 0;
   }


> +  if (is_aligned (pbase, size, sizeof (uint64_t)))
> +    swap_func = SWAP_WORDS_64;

alignof not sizeof, in contexts like these that are talking about 
alignment not size.
  
Adhemerval Zanella Netto July 13, 2023, 11:07 a.m. UTC | #4
On 12/07/23 18:04, Paul Eggert wrote:
> On 2023-07-11 12:07, Adhemerval Zanella via Libc-alpha wrote:
>> +/* Returns true if elements can be copied using word loads and stores,
>> +   i.e. if SIZE and BASE are both multiples of ALIGN.  */
>> +__attribute_const__ __always_inline
>> +static bool
>> +is_aligned (const void *base, size_t size, unsigned char align)
>> +{
>> +  unsigned char lsbits = (unsigned char) size;
>> +
>> +  lsbits |= (unsigned char)(uintptr_t) base;
>> +  return (lsbits & (align - 1)) == 0;
>> +}
> 
> Doesn't the following source code generate better machine code? It does for me, anyway (x86-64). Even if the code were the same, the simplicity would be a win.
> 
>   static bool
>   is_aligned (const void *base, size_t size, size_t align)
>   {
>     return (((uintptr_t) base | size) & (align - 1)) == 0;
>   }
> 

Ok.

> 
>> +  if (is_aligned (pbase, size, sizeof (uint64_t)))
>> +    swap_func = SWAP_WORDS_64;
> 
> alignof not sizeof, in contexts like these that are talking about alignment not size.

Indeed, I will fix it.
  
Alexander Monakov July 13, 2023, 1:13 p.m. UTC | #5
On Thu, 13 Jul 2023, Adhemerval Zanella Netto via Libc-alpha wrote:

> >> +  if (is_aligned (pbase, size, sizeof (uint64_t)))
> >> +    swap_func = SWAP_WORDS_64;
> > 
> > alignof not sizeof, in contexts like these that are talking about alignment not size.
> 
> Indeed, I will fix it.

Are you going to use GNU __alignof, or C11 _Alignof? One is safe. The other
makes the code go *boom* on 32-bit x86 when 'size' is 4 modulo 8, coming
this || close to a sneaky singular out-of-bounds write, saved only by the
fact that gcc doesn't (yet) do high-level transforms in swap_words_64.

(the code is not, in fact, talking solely about alignment there)

Alexander
  
Adhemerval Zanella Netto July 13, 2023, 1:29 p.m. UTC | #6
On 13/07/23 10:13, Alexander Monakov wrote:
> 
> On Thu, 13 Jul 2023, Adhemerval Zanella Netto via Libc-alpha wrote:
> 
>>>> +  if (is_aligned (pbase, size, sizeof (uint64_t)))
>>>> +    swap_func = SWAP_WORDS_64;
>>>
>>> alignof not sizeof, in contexts like these that are talking about alignment not size.
>>
>> Indeed, I will fix it.
> 
> Are you going to use GNU __alignof, or C11 _Alignof? One is safe. The other
> makes the code go *boom* on 32-bit x86 when 'size' is 4 modulo 8, coming
> this || close to a sneaky singular out-of-bounds write, saved only by the
> fact that gcc doesn't (yet) do high-level transforms in swap_words_64.
> 
> (the code is not, in fact, talking solely about alignment there)

Yes, I noticed it fails on i686; I will use explicit values instead
(as Linux lib/sort.c does).
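
To make the hazard concrete: on i686 the C11 _Alignof (uint64_t) is 4
even though the type is 8 bytes wide, so a size that is 4 modulo 8
would pass the check and send swap_words_64 into an underflow (n goes
12 -> 4 -> wraparound past zero).  A sketch of the explicit-value
selection (hypothetical; the actual v5 may differ):

  /* Use explicit byte counts rather than _Alignof: on i686,
     _Alignof (uint64_t) == 4, which would wrongly admit sizes that
     are 4 modulo 8 into the 64-bit path.  */
  if (is_aligned (pbase, size, 8))
    swap_func = SWAP_WORDS_64;
  else if (is_aligned (pbase, size, 4))
    swap_func = SWAP_WORDS_32;
  else
    swap_func = SWAP_BYTES;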
  

Patch

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 728a0ed370..8a3331fdb4 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -23,20 +23,89 @@ 
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
+#include <libc-pointer-arith.h>
 
-/* Byte-wise swap two items of size SIZE. */
-#define SWAP(a, b, size)						      \
-  do									      \
-    {									      \
-      size_t __size = (size);						      \
-      char *__a = (a), *__b = (b);					      \
-      do								      \
-	{								      \
-	  char __tmp = *__a;						      \
-	  *__a++ = *__b;						      \
-	  *__b++ = __tmp;						      \
-	} while (--__size > 0);						      \
-    } while (0)
+/* Swap SIZE bytes between addresses A and B.  These helpers are provided
+   alongside the generic one as an optimization.  */
+
+typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
+
+#define SWAP_WORDS_64 (swap_func_t)0
+#define SWAP_WORDS_32 (swap_func_t)1
+#define SWAP_BYTES    (swap_func_t)2
+
+/* Returns true if elements can be copied using word loads and stores,
+   i.e. if SIZE and BASE are both multiples of ALIGN.  */
+__attribute_const__ __always_inline
+static bool
+is_aligned (const void *base, size_t size, unsigned char align)
+{
+  unsigned char lsbits = (unsigned char) size;
+
+  lsbits |= (unsigned char)(uintptr_t) base;
+  return (lsbits & (align - 1)) == 0;
+}
+
+static void
+swap_words_64 (void * restrict a, void * restrict b, size_t n)
+{
+  do
+   {
+     n -= 8;
+     uint64_t t = *(uint64_t *)(a + n);
+     *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
+     *(uint64_t *)(b + n) = t;
+   } while (n);
+}
+
+static void
+swap_words_32 (void * restrict a, void * restrict b, size_t n)
+{
+  do
+   {
+     n -= 4;
+     uint32_t t = *(uint32_t *)(a + n);
+     *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
+     *(uint32_t *)(b + n) = t;
+   } while (n);
+}
+
+static void
+swap_bytes (void * restrict a, void * restrict b, size_t n)
+{
+  /* Use multiple small memcpys with constant size to enable inlining
+     on most targets.  */
+  enum { SWAP_GENERIC_SIZE = 32 };
+  unsigned char tmp[SWAP_GENERIC_SIZE];
+  while (n > SWAP_GENERIC_SIZE)
+    {
+      __builtin_memcpy (tmp, a, SWAP_GENERIC_SIZE);
+      a = __mempcpy (a, b, SWAP_GENERIC_SIZE);
+      b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE);
+      n -= SWAP_GENERIC_SIZE;
+    }
+  while (n > 0)
+    {
+      unsigned char t = ((unsigned char *)a)[--n];
+      ((unsigned char *)a)[n] = ((unsigned char *)b)[n];
+      ((unsigned char *)b)[n] = t;
+    }
+}
+
+/* Replace the indirect call with a series of if statements.  It should
+   help the branch predictor.  */
+static void
+do_swap (void * restrict a, void * restrict b, size_t size,
+	 swap_func_t swap_func)
+{
+  if (swap_func == SWAP_WORDS_64)
+    swap_words_64 (a, b, size);
+  else if (swap_func == SWAP_WORDS_32)
+    swap_words_32 (a, b, size);
+  else
+    swap_bytes (a, b, size);
+}
 
 /* Discontinue quicksort algorithm when partition gets below this size.
    This particular magic number was chosen to work best on a Sun 4/260. */
@@ -96,6 +165,14 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
+  swap_func_t swap_func;
+  if (is_aligned (pbase, size, sizeof (uint64_t)))
+    swap_func = SWAP_WORDS_64;
+  else if (is_aligned (pbase, size, sizeof (uint32_t)))
+    swap_func = SWAP_WORDS_32;
+  else
+    swap_func = SWAP_BYTES;
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -119,13 +196,13 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
 	  char *mid = lo + size * ((hi - lo) / size >> 1);
 
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    do_swap (mid, lo, size, swap_func);
 	  if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
-	    SWAP (mid, hi, size);
+	    do_swap (mid, hi, size, swap_func);
 	  else
 	    goto jump_over;
 	  if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	    SWAP (mid, lo, size);
+	    do_swap (mid, lo, size, swap_func);
 	jump_over:;
 
 	  left_ptr  = lo + size;
@@ -144,7 +221,7 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
 
 	      if (left_ptr < right_ptr)
 		{
-		  SWAP (left_ptr, right_ptr, size);
+		  do_swap (left_ptr, right_ptr, size, swap_func);
 		  if (mid == left_ptr)
 		    mid = right_ptr;
 		  else if (mid == right_ptr)
@@ -216,7 +293,7 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
         tmp_ptr = run_ptr;
 
     if (tmp_ptr != base_ptr)
-      SWAP (tmp_ptr, base_ptr, size);
+      do_swap (tmp_ptr, base_ptr, size, swap_func);
 
     /* Insertion sort, running from left-hand-side up to right-hand-side.  */
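
As a rough standalone illustration (not part of the patch; the driver
below is hypothetical), here is how the selection logic with explicit
byte counts picks a variant for an 8-byte-aligned base:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Paul Eggert's simplified form of the alignment check.  */
  static bool
  is_aligned (const void *base, size_t size, size_t align)
  {
    return (((uintptr_t) base | size) & (align - 1)) == 0;
  }

  int
  main (void)
  {
    _Alignas (8) unsigned char buf[64];   /* 8-byte aligned base.  */
    size_t sizes[] = { 4, 7, 8, 12, 16 };
    for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
      {
        const char *variant = is_aligned (buf, sizes[i], 8) ? "SWAP_WORDS_64"
                              : is_aligned (buf, sizes[i], 4) ? "SWAP_WORDS_32"
                              : "SWAP_BYTES";
        printf ("size %2zu -> %s\n", sizes[i], variant);
      }
    /* Prints: 8 and 16 -> SWAP_WORDS_64; 4 and 12 -> SWAP_WORDS_32;
       7 -> SWAP_BYTES.  */
  }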