[v7,5/7] stdlib: Implement introsort for qsort (BZ 19305)

Message ID 20230914183704.1386534-6-adhemerval.zanella@linaro.org (mailing list archive)
State Superseded
Headers
Series Use introsort for qsort |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Testing passed

Commit Message

Adhemerval Zanella Netto Sept. 14, 2023, 6:37 p.m. UTC
  This patch makes the quicksort implementation to acts as introsort, to
avoid worse-case performance (and thus making it O(nlog n)).  It switch
to heapsort when the depth level reaches 2*log2(total elements).  The
heapsort is a textbook implementation.

Checked on x86_64-linux-gnu.
---
 stdlib/qsort.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 80 insertions(+), 7 deletions(-)
  

Comments

Noah Goldstein Sept. 15, 2023, 2:42 p.m. UTC | #1
On Thu, Sep 14, 2023 at 1:37 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
> This patch makes the quicksort implementation to acts as introsort, to
> avoid worse-case performance (and thus making it O(nlog n)).  It switch
> to heapsort when the depth level reaches 2*log2(total elements).  The
> heapsort is a textbook implementation.
>
> Checked on x86_64-linux-gnu.
> ---
>  stdlib/qsort.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 80 insertions(+), 7 deletions(-)
>
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 4082f5f9c1..e160932657 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -97,6 +97,7 @@ typedef struct
>    {
>      char *lo;
>      char *hi;
> +    size_t depth;
>    } stack_node;
>
>  /* The stack needs log (total_elements) entries (we could even subtract
> @@ -106,22 +107,83 @@ typedef struct
>  enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
>
>  static inline stack_node *
> -push (stack_node *top, char *lo, char *hi)
> +push (stack_node *top, char *lo, char *hi, size_t depth)
>  {
>    top->lo = lo;
>    top->hi = hi;
> +  top->depth = depth;
>    return ++top;
>  }
>
>  static inline stack_node *
> -pop (stack_node *top, char **lo, char **hi)
> +pop (stack_node *top, char **lo, char **hi, size_t *depth)
>  {
>    --top;
>    *lo = top->lo;
>    *hi = top->hi;
> +  *depth = top->depth;
>    return top;
>  }
>
> +/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
> +   lib/sort.c.  Used on introsort implementation as a fallback routine with
> +   worst-case performance of O(nlog n) and worst-case space complexity of
> +   O(1).  */
> +
> +static inline void
> +siftdown (void *base, size_t size, size_t k, size_t n,
> +         enum swap_type_t swap_type, __compar_d_fn_t cmp, void *arg)
> +{
> +  while (k <= n / 2)
IMO a comment that `n` is an *inclusive* bound would be nice. Otherwise
looks very much so like there is an out-bounds-access. Especially since
we have `j < n` in the inner loop.
> +    {
> +      size_t j = 2 * k;
> +      if (j < n && cmp (base + (j * size), base + ((j + 1) * size), arg) < 0)
> +       j++;
> +
> +      if (cmp (base + (k * size), base + (j * size), arg) >= 0)
> +       break;
> +
> +      do_swap (base + (size * j), base + (k * size), size, swap_type);
> +      k = j;
> +    }
> +}
> +
> +static inline void
> +heapify (void *base, size_t size, size_t n, enum swap_type_t swap_type,
> +        __compar_d_fn_t cmp, void *arg)
> +{
> +  size_t k = n / 2;
> +  while (1)
> +    {
> +      siftdown (base, size, k, n, swap_type, cmp, arg);
> +      if (k-- == 0)
> +       break;
> +    }
> +}
> +
> +static void
> +heapsort_r (void *base, void *end, size_t size, enum swap_type_t swap_type,
> +           __compar_d_fn_t cmp, void *arg)
> +{
> +  const size_t count = ((uintptr_t) end - (uintptr_t) base) / size;
> +
> +  if (count < 2)
> +    return;
> +
> +  size_t n = count - 1;
> +
> +  /* Build the binary heap, largest value at the base[0].  */
> +  heapify (base, size, n, swap_type, cmp, arg);
> +
> +  /* On each iteration base[0:n] is the binary heap, while base[n:count]
> +     is sorted.  */
> +  while (n > 0)
> +    {
> +      do_swap (base, base + (n * size), size, swap_type);
> +      n--;
> +      siftdown (base, size, 0, n, swap_type, cmp, arg);
> +    }
> +}

Might be nice to move `heapsort_r` to header file so it can be tested
explicitly with the existing `qsort` test suite.
>
>  static inline void
>  insertion_sort_qsort_partitions (void *const pbase, size_t total_elems,
> @@ -207,7 +269,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>
>    const size_t max_thresh = MAX_THRESH * size;
>
> -  if (total_elems == 0)
> +  if (total_elems <= 1)
>      /* Avoid lossage with unsigned arithmetic below.  */
>      return;
>
> @@ -219,15 +281,26 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>    else
>      swap_type = SWAP_BYTES;
>
> +  /* Maximum depth before quicksort switches to heapsort.  */
> +  size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1
> +                     - __builtin_clzl (total_elems));
> +
>    if (total_elems > MAX_THRESH)
>      {
>        char *lo = base_ptr;
>        char *hi = &lo[size * (total_elems - 1)];
>        stack_node stack[STACK_SIZE];
> -      stack_node *top = stack + 1;
> +      stack_node *top = push (stack, NULL, NULL, depth);
>
>        while (stack < top)
>          {
> +          if (depth == 0)
> +            {
> +              heapsort_r (lo, hi, size, swap_type, cmp, arg);
> +              top = pop (top, &lo, &hi, &depth);
> +              continue;
> +            }
> +
>            char *left_ptr;
>            char *right_ptr;
>
> @@ -291,7 +364,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>              {
>                if ((size_t) (hi - left_ptr) <= max_thresh)
>                 /* Ignore both small partitions. */
> -                top = pop (top, &lo, &hi);
> +                top = pop (top, &lo, &hi, &depth);
>                else
>                 /* Ignore small left partition. */
>                  lo = left_ptr;
> @@ -302,13 +375,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>            else if ((right_ptr - lo) > (hi - left_ptr))
>              {
>               /* Push larger left partition indices. */
> -              top = push (top, lo, right_ptr);
> +              top = push (top, lo, right_ptr, depth - 1);
>                lo = left_ptr;
>              }
>            else
>              {
>               /* Push larger right partition indices. */
> -              top = push (top, left_ptr, hi);
> +              top = push (top, left_ptr, hi, depth - 1);
>                hi = right_ptr;
>              }
>          }
> --
> 2.34.1
>
  
Adhemerval Zanella Netto Sept. 20, 2023, 1:50 p.m. UTC | #2
On 15/09/23 11:42, Noah Goldstein wrote:
> On Thu, Sep 14, 2023 at 1:37 PM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>> This patch makes the quicksort implementation to acts as introsort, to
>> avoid worse-case performance (and thus making it O(nlog n)).  It switch
>> to heapsort when the depth level reaches 2*log2(total elements).  The
>> heapsort is a textbook implementation.
>>
>> Checked on x86_64-linux-gnu.
>> ---
>>  stdlib/qsort.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 80 insertions(+), 7 deletions(-)
>>
>> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>> index 4082f5f9c1..e160932657 100644
>> --- a/stdlib/qsort.c
>> +++ b/stdlib/qsort.c
>> @@ -97,6 +97,7 @@ typedef struct
>>    {
>>      char *lo;
>>      char *hi;
>> +    size_t depth;
>>    } stack_node;
>>
>>  /* The stack needs log (total_elements) entries (we could even subtract
>> @@ -106,22 +107,83 @@ typedef struct
>>  enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
>>
>>  static inline stack_node *
>> -push (stack_node *top, char *lo, char *hi)
>> +push (stack_node *top, char *lo, char *hi, size_t depth)
>>  {
>>    top->lo = lo;
>>    top->hi = hi;
>> +  top->depth = depth;
>>    return ++top;
>>  }
>>
>>  static inline stack_node *
>> -pop (stack_node *top, char **lo, char **hi)
>> +pop (stack_node *top, char **lo, char **hi, size_t *depth)
>>  {
>>    --top;
>>    *lo = top->lo;
>>    *hi = top->hi;
>> +  *depth = top->depth;
>>    return top;
>>  }
>>
>> +/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
>> +   lib/sort.c.  Used on introsort implementation as a fallback routine with
>> +   worst-case performance of O(nlog n) and worst-case space complexity of
>> +   O(1).  */
>> +
>> +static inline void
>> +siftdown (void *base, size_t size, size_t k, size_t n,
>> +         enum swap_type_t swap_type, __compar_d_fn_t cmp, void *arg)
>> +{
>> +  while (k <= n / 2)
> IMO a comment that `n` is an *inclusive* bound would be nice. Otherwise
> looks very much so like there is an out-bounds-access. Especially since
> we have `j < n` in the inner loop.

Ack.

>> +    {
>> +      size_t j = 2 * k;
>> +      if (j < n && cmp (base + (j * size), base + ((j + 1) * size), arg) < 0)
>> +       j++;
>> +
>> +      if (cmp (base + (k * size), base + (j * size), arg) >= 0)
>> +       break;
>> +
>> +      do_swap (base + (size * j), base + (k * size), size, swap_type);
>> +      k = j;
>> +    }
>> +}
>> +
>> +static inline void
>> +heapify (void *base, size_t size, size_t n, enum swap_type_t swap_type,
>> +        __compar_d_fn_t cmp, void *arg)
>> +{
>> +  size_t k = n / 2;
>> +  while (1)
>> +    {
>> +      siftdown (base, size, k, n, swap_type, cmp, arg);
>> +      if (k-- == 0)
>> +       break;
>> +    }
>> +}
>> +
>> +static void
>> +heapsort_r (void *base, void *end, size_t size, enum swap_type_t swap_type,
>> +           __compar_d_fn_t cmp, void *arg)
>> +{
>> +  const size_t count = ((uintptr_t) end - (uintptr_t) base) / size;
>> +
>> +  if (count < 2)
>> +    return;
>> +
>> +  size_t n = count - 1;
>> +
>> +  /* Build the binary heap, largest value at the base[0].  */
>> +  heapify (base, size, n, swap_type, cmp, arg);
>> +
>> +  /* On each iteration base[0:n] is the binary heap, while base[n:count]
>> +     is sorted.  */
>> +  while (n > 0)
>> +    {
>> +      do_swap (base, base + (n * size), size, swap_type);
>> +      n--;
>> +      siftdown (base, size, 0, n, swap_type, cmp, arg);
>> +    }
>> +}
> 
> Might be nice to move `heapsort_r` to header file so it can be tested
> explicitly with the existing `qsort` test suite.

In fact I tough about that, but I ended up improving tst-qsort3 to always
trigger the heapsort by adding quicksort defeating pattern, and also added
another sorting algorithm so we can check against.  Not sure if we really
need to add that much extra testing for qsort.

>>
>>  static inline void
>>  insertion_sort_qsort_partitions (void *const pbase, size_t total_elems,
>> @@ -207,7 +269,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>
>>    const size_t max_thresh = MAX_THRESH * size;
>>
>> -  if (total_elems == 0)
>> +  if (total_elems <= 1)
>>      /* Avoid lossage with unsigned arithmetic below.  */
>>      return;
>>
>> @@ -219,15 +281,26 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>    else
>>      swap_type = SWAP_BYTES;
>>
>> +  /* Maximum depth before quicksort switches to heapsort.  */
>> +  size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1
>> +                     - __builtin_clzl (total_elems));
>> +
>>    if (total_elems > MAX_THRESH)
>>      {
>>        char *lo = base_ptr;
>>        char *hi = &lo[size * (total_elems - 1)];
>>        stack_node stack[STACK_SIZE];
>> -      stack_node *top = stack + 1;
>> +      stack_node *top = push (stack, NULL, NULL, depth);
>>
>>        while (stack < top)
>>          {
>> +          if (depth == 0)
>> +            {
>> +              heapsort_r (lo, hi, size, swap_type, cmp, arg);
>> +              top = pop (top, &lo, &hi, &depth);
>> +              continue;
>> +            }
>> +
>>            char *left_ptr;
>>            char *right_ptr;
>>
>> @@ -291,7 +364,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>              {
>>                if ((size_t) (hi - left_ptr) <= max_thresh)
>>                 /* Ignore both small partitions. */
>> -                top = pop (top, &lo, &hi);
>> +                top = pop (top, &lo, &hi, &depth);
>>                else
>>                 /* Ignore small left partition. */
>>                  lo = left_ptr;
>> @@ -302,13 +375,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>            else if ((right_ptr - lo) > (hi - left_ptr))
>>              {
>>               /* Push larger left partition indices. */
>> -              top = push (top, lo, right_ptr);
>> +              top = push (top, lo, right_ptr, depth - 1);
>>                lo = left_ptr;
>>              }
>>            else
>>              {
>>               /* Push larger right partition indices. */
>> -              top = push (top, left_ptr, hi);
>> +              top = push (top, left_ptr, hi, depth - 1);
>>                hi = right_ptr;
>>              }
>>          }
>> --
>> 2.34.1
>>
  
Noah Goldstein Sept. 20, 2023, 2:12 p.m. UTC | #3
On Wed, Sep 20, 2023 at 8:50 AM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 15/09/23 11:42, Noah Goldstein wrote:
> > On Thu, Sep 14, 2023 at 1:37 PM Adhemerval Zanella
> > <adhemerval.zanella@linaro.org> wrote:
> >>
> >> This patch makes the quicksort implementation to acts as introsort, to
> >> avoid worse-case performance (and thus making it O(nlog n)).  It switch
> >> to heapsort when the depth level reaches 2*log2(total elements).  The
> >> heapsort is a textbook implementation.
> >>
> >> Checked on x86_64-linux-gnu.
> >> ---
> >>  stdlib/qsort.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++----
> >>  1 file changed, 80 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> >> index 4082f5f9c1..e160932657 100644
> >> --- a/stdlib/qsort.c
> >> +++ b/stdlib/qsort.c
> >> @@ -97,6 +97,7 @@ typedef struct
> >>    {
> >>      char *lo;
> >>      char *hi;
> >> +    size_t depth;
> >>    } stack_node;
> >>
> >>  /* The stack needs log (total_elements) entries (we could even subtract
> >> @@ -106,22 +107,83 @@ typedef struct
> >>  enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
> >>
> >>  static inline stack_node *
> >> -push (stack_node *top, char *lo, char *hi)
> >> +push (stack_node *top, char *lo, char *hi, size_t depth)
> >>  {
> >>    top->lo = lo;
> >>    top->hi = hi;
> >> +  top->depth = depth;
> >>    return ++top;
> >>  }
> >>
> >>  static inline stack_node *
> >> -pop (stack_node *top, char **lo, char **hi)
> >> +pop (stack_node *top, char **lo, char **hi, size_t *depth)
> >>  {
> >>    --top;
> >>    *lo = top->lo;
> >>    *hi = top->hi;
> >> +  *depth = top->depth;
> >>    return top;
> >>  }
> >>
> >> +/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
> >> +   lib/sort.c.  Used on introsort implementation as a fallback routine with
> >> +   worst-case performance of O(nlog n) and worst-case space complexity of
> >> +   O(1).  */
> >> +
> >> +static inline void
> >> +siftdown (void *base, size_t size, size_t k, size_t n,
> >> +         enum swap_type_t swap_type, __compar_d_fn_t cmp, void *arg)
> >> +{
> >> +  while (k <= n / 2)
> > IMO a comment that `n` is an *inclusive* bound would be nice. Otherwise
> > looks very much so like there is an out-bounds-access. Especially since
> > we have `j < n` in the inner loop.
>
> Ack.
>
> >> +    {
> >> +      size_t j = 2 * k;
> >> +      if (j < n && cmp (base + (j * size), base + ((j + 1) * size), arg) < 0)
> >> +       j++;
> >> +
> >> +      if (cmp (base + (k * size), base + (j * size), arg) >= 0)
> >> +       break;
> >> +
> >> +      do_swap (base + (size * j), base + (k * size), size, swap_type);
> >> +      k = j;
> >> +    }
> >> +}
> >> +
> >> +static inline void
> >> +heapify (void *base, size_t size, size_t n, enum swap_type_t swap_type,
> >> +        __compar_d_fn_t cmp, void *arg)
> >> +{
> >> +  size_t k = n / 2;
> >> +  while (1)
> >> +    {
> >> +      siftdown (base, size, k, n, swap_type, cmp, arg);
> >> +      if (k-- == 0)
> >> +       break;
> >> +    }
> >> +}
> >> +
> >> +static void
> >> +heapsort_r (void *base, void *end, size_t size, enum swap_type_t swap_type,
> >> +           __compar_d_fn_t cmp, void *arg)
> >> +{
> >> +  const size_t count = ((uintptr_t) end - (uintptr_t) base) / size;
> >> +
> >> +  if (count < 2)
> >> +    return;
> >> +
> >> +  size_t n = count - 1;
> >> +
> >> +  /* Build the binary heap, largest value at the base[0].  */
> >> +  heapify (base, size, n, swap_type, cmp, arg);
> >> +
> >> +  /* On each iteration base[0:n] is the binary heap, while base[n:count]
> >> +     is sorted.  */
> >> +  while (n > 0)
> >> +    {
> >> +      do_swap (base, base + (n * size), size, swap_type);
> >> +      n--;
> >> +      siftdown (base, size, 0, n, swap_type, cmp, arg);
> >> +    }
> >> +}
> >
> > Might be nice to move `heapsort_r` to header file so it can be tested
> > explicitly with the existing `qsort` test suite.
>
> In fact I tough about that, but I ended up improving tst-qsort3 to always
> trigger the heapsort by adding quicksort defeating pattern, and also added
> another sorting algorithm so we can check against.  Not sure if we really
> need to add that much extra testing for qsort.

Fair enough.
>
> >>
> >>  static inline void
> >>  insertion_sort_qsort_partitions (void *const pbase, size_t total_elems,
> >> @@ -207,7 +269,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
> >>
> >>    const size_t max_thresh = MAX_THRESH * size;
> >>
> >> -  if (total_elems == 0)
> >> +  if (total_elems <= 1)
> >>      /* Avoid lossage with unsigned arithmetic below.  */
> >>      return;
> >>
> >> @@ -219,15 +281,26 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
> >>    else
> >>      swap_type = SWAP_BYTES;
> >>
> >> +  /* Maximum depth before quicksort switches to heapsort.  */
> >> +  size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1
> >> +                     - __builtin_clzl (total_elems));
> >> +
> >>    if (total_elems > MAX_THRESH)
> >>      {
> >>        char *lo = base_ptr;
> >>        char *hi = &lo[size * (total_elems - 1)];
> >>        stack_node stack[STACK_SIZE];
> >> -      stack_node *top = stack + 1;
> >> +      stack_node *top = push (stack, NULL, NULL, depth);
> >>
> >>        while (stack < top)
> >>          {
> >> +          if (depth == 0)
> >> +            {
> >> +              heapsort_r (lo, hi, size, swap_type, cmp, arg);
> >> +              top = pop (top, &lo, &hi, &depth);
> >> +              continue;
> >> +            }
> >> +
> >>            char *left_ptr;
> >>            char *right_ptr;
> >>
> >> @@ -291,7 +364,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
> >>              {
> >>                if ((size_t) (hi - left_ptr) <= max_thresh)
> >>                 /* Ignore both small partitions. */
> >> -                top = pop (top, &lo, &hi);
> >> +                top = pop (top, &lo, &hi, &depth);
> >>                else
> >>                 /* Ignore small left partition. */
> >>                  lo = left_ptr;
> >> @@ -302,13 +375,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
> >>            else if ((right_ptr - lo) > (hi - left_ptr))
> >>              {
> >>               /* Push larger left partition indices. */
> >> -              top = push (top, lo, right_ptr);
> >> +              top = push (top, lo, right_ptr, depth - 1);
> >>                lo = left_ptr;
> >>              }
> >>            else
> >>              {
> >>               /* Push larger right partition indices. */
> >> -              top = push (top, left_ptr, hi);
> >> +              top = push (top, left_ptr, hi, depth - 1);
> >>                hi = right_ptr;
> >>              }
> >>          }
> >> --
> >> 2.34.1
> >>
  

Patch

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 4082f5f9c1..e160932657 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -97,6 +97,7 @@  typedef struct
   {
     char *lo;
     char *hi;
+    size_t depth;
   } stack_node;
 
 /* The stack needs log (total_elements) entries (we could even subtract
@@ -106,22 +107,83 @@  typedef struct
 enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
 
 static inline stack_node *
-push (stack_node *top, char *lo, char *hi)
+push (stack_node *top, char *lo, char *hi, size_t depth)
 {
   top->lo = lo;
   top->hi = hi;
+  top->depth = depth;
   return ++top;
 }
 
 static inline stack_node *
-pop (stack_node *top, char **lo, char **hi)
+pop (stack_node *top, char **lo, char **hi, size_t *depth)
 {
   --top;
   *lo = top->lo;
   *hi = top->hi;
+  *depth = top->depth;
   return top;
 }
 
+/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
+   lib/sort.c.  Used on introsort implementation as a fallback routine with
+   worst-case performance of O(nlog n) and worst-case space complexity of
+   O(1).  */
+
+static inline void
+siftdown (void *base, size_t size, size_t k, size_t n,
+	  enum swap_type_t swap_type, __compar_d_fn_t cmp, void *arg)
+{
+  while (k <= n / 2)
+    {
+      size_t j = 2 * k;
+      if (j < n && cmp (base + (j * size), base + ((j + 1) * size), arg) < 0)
+	j++;
+
+      if (cmp (base + (k * size), base + (j * size), arg) >= 0)
+	break;
+
+      do_swap (base + (size * j), base + (k * size), size, swap_type);
+      k = j;
+    }
+}
+
+static inline void
+heapify (void *base, size_t size, size_t n, enum swap_type_t swap_type,
+	 __compar_d_fn_t cmp, void *arg)
+{
+  size_t k = n / 2;
+  while (1)
+    {
+      siftdown (base, size, k, n, swap_type, cmp, arg);
+      if (k-- == 0)
+	break;
+    }
+}
+
+static void
+heapsort_r (void *base, void *end, size_t size, enum swap_type_t swap_type,
+	    __compar_d_fn_t cmp, void *arg)
+{
+  const size_t count = ((uintptr_t) end - (uintptr_t) base) / size;
+
+  if (count < 2)
+    return;
+
+  size_t n = count - 1;
+
+  /* Build the binary heap, largest value at the base[0].  */
+  heapify (base, size, n, swap_type, cmp, arg);
+
+  /* On each iteration base[0:n] is the binary heap, while base[n:count]
+     is sorted.  */
+  while (n > 0)
+    {
+      do_swap (base, base + (n * size), size, swap_type);
+      n--;
+      siftdown (base, size, 0, n, swap_type, cmp, arg);
+    }
+}
 
 static inline void
 insertion_sort_qsort_partitions (void *const pbase, size_t total_elems,
@@ -207,7 +269,7 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
 
   const size_t max_thresh = MAX_THRESH * size;
 
-  if (total_elems == 0)
+  if (total_elems <= 1)
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
@@ -219,15 +281,26 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
   else
     swap_type = SWAP_BYTES;
 
+  /* Maximum depth before quicksort switches to heapsort.  */
+  size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1
+		      - __builtin_clzl (total_elems));
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
       char *hi = &lo[size * (total_elems - 1)];
       stack_node stack[STACK_SIZE];
-      stack_node *top = stack + 1;
+      stack_node *top = push (stack, NULL, NULL, depth);
 
       while (stack < top)
         {
+          if (depth == 0)
+            {
+              heapsort_r (lo, hi, size, swap_type, cmp, arg);
+              top = pop (top, &lo, &hi, &depth);
+              continue;
+            }
+
           char *left_ptr;
           char *right_ptr;
 
@@ -291,7 +364,7 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
             {
               if ((size_t) (hi - left_ptr) <= max_thresh)
 		/* Ignore both small partitions. */
-                top = pop (top, &lo, &hi);
+                top = pop (top, &lo, &hi, &depth);
               else
 		/* Ignore small left partition. */
                 lo = left_ptr;
@@ -302,13 +375,13 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
           else if ((right_ptr - lo) > (hi - left_ptr))
             {
 	      /* Push larger left partition indices. */
-              top = push (top, lo, right_ptr);
+              top = push (top, lo, right_ptr, depth - 1);
               lo = left_ptr;
             }
           else
             {
 	      /* Push larger right partition indices. */
-              top = push (top, left_ptr, hi);
+              top = push (top, left_ptr, hi, depth - 1);
               hi = right_ptr;
             }
         }