[v3,6/7] stdlib: Implement introsort with qsort
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
Commit Message
This patch adds an introsort implementation to qsort to limit the
worst-case performance of quicksort to O(n log n). The heapsort fallback
used is based on the Linux implementation (commit 22a241ccb2c19962a). As
a side note, the introsort implementation is similar to the one used in
libstdc++ for std::sort.
Checked on x86_64-linux-gnu.
---
stdlib/qsort.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 87 insertions(+), 7 deletions(-)
Comments
On Fri, 3 Sep 2021, Adhemerval Zanella via Libc-alpha wrote:
> @@ -235,6 +305,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
> else
> swap_func = SWAP_BYTES;
>
> + /* Maximum depth before quicksort switches to heapsort. */
> + size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
> +
This doesn't look right: the expression in parentheses is usually negative;
the conversion to size_t wraps around, and you make 'depth' a huge positive value.
The first term probably should have been 'CHAR_BIT * sizeof(size_t)'.
Does this mean that heapsort fallback was not exercised in testing?
Also, is it certain at that point that total_elems is nonzero? __builtin_clzl
is undefined for a zero argument.
Alexander
On 04/09/2021 06:17, Alexander Monakov wrote:
> On Fri, 3 Sep 2021, Adhemerval Zanella via Libc-alpha wrote:
>
>> @@ -235,6 +305,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>> else
>> swap_func = SWAP_BYTES;
>>
>> + /* Maximum depth before quicksort switches to heapsort. */
>> + size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
>> +
>
> This doesn't look right, the expression in parenthesis is usually negative,
> conversion to size_t wraps around and you make 'depth' a huge positive value.
> The first term probably should have been 'CHAR_BIT * sizeof(size_t)'.
Indeed, it is not right; it should be:
size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1
- __builtin_clzl (total_elems));
(I think I did something wrong on some rebase, since I had the above
originally).
>
> Does this mean that heapsort fallback was not exercised in testing?
The current tests do not trigger the quadratic behavior of quicksort;
that's why I have added tst-qsort3. The heapsort_r fallback is called
for both the 'SORTED' and 'BITONIC' array distributions (which should be
expected, since they tend to be pattern-defeating for quicksort
implementations).
>
> Also, is it certain at that point that total_elems is nonzero? __builtin_clzl
> is undefined for a zero argument.
Yes, the patch also takes care of that:
@@ -222,7 +292,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
const size_t max_thresh = MAX_THRESH * size;
- if (total_elems == 0)
+ if (total_elems <= 1)
/* Avoid lossage with unsigned arithmetic below. */
return;
On 2021-09-03, Adhemerval Zanella via Libc-alpha wrote:
>This patch adds a introsort implementation on qsort to avoid worse-case
>performance of quicksort to O(nlog n). The heapsort fallback used is a
>heapsort based on Linux implementation (commit 22a241ccb2c19962a). As a
>side note the introsort implementation is similar the one used on
>libstdc++ for std::sort.
>
>Checked on x86_64-linux-gnu.
>---
> stdlib/qsort.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 87 insertions(+), 7 deletions(-)
>
>diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>index 5df640362d..8368576aae 100644
>--- a/stdlib/qsort.c
>+++ b/stdlib/qsort.c
>@@ -113,6 +113,7 @@ typedef struct
> {
> char *lo;
> char *hi;
>+ size_t depth;
> } stack_node;
>
> /* The stack needs log (total_elements) entries (we could even subtract
>@@ -122,23 +123,92 @@ typedef struct
> enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
>
> static inline stack_node *
>-push (stack_node *top, char *lo, char *hi)
>+push (stack_node *top, char *lo, char *hi, size_t depth)
> {
> top->lo = lo;
> top->hi = hi;
>+ top->depth = depth;
> return ++top;
> }
>
> static inline stack_node *
>-pop (stack_node *top, char **lo, char **hi)
>+pop (stack_node *top, char **lo, char **hi, size_t *depth)
> {
> --top;
> *lo = top->lo;
> *hi = top->hi;
>+ *depth = top->depth;
> return top;
> }
>
>
>+/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
>+ lib/sort.c. Used on introsort implementation as a fallback routine with
>+ worst-case performance of O(nlog n) and worst-case space complexity of
>+ O(1). */
>+
>+static inline size_t
>+parent (size_t i, unsigned int lsbit, size_t size)
>+{
>+ i -= size;
>+ i -= size & -(i & lsbit);
The line "i -= size & -(i & lsbit);" may need a comment explaining that it is logically
"if (i & lsbit) i -= size;",
done so that i/2 is a multiple of size.
>+ return i / 2;
>+}
>+
>+static void
>+heapsort_r (void *base, void *end, size_t size, swap_func_t swap_func,
>+ __compar_d_fn_t cmp, void *arg)
>+{
>+ size_t num = ((uintptr_t) end - (uintptr_t) base) / size;
>+ size_t n = num * size, a = (num/2) * size;
>+ /* Used to find parent */
>+ const unsigned int lsbit = size & -size;
>+
>+ /* num < 2 || size == 0. */
>+ if (a == 0)
>+ return;
>+
>+ for (;;)
>+ {
>+ size_t b, c, d;
>+
>+ if (a != 0)
>+ /* Building heap: sift down --a */
>+ a -= size;
>+ else if (n -= size)
>+ /* Sorting: Extract root to --n */
>+ do_swap (base, base + n, size, swap_func);
>+ else
>+ break;
>+
>+ /* Sift element at "a" down into heap. This is the "bottom-up" variant,
>+ which significantly reduces calls to cmp_func(): we find the sift-down
>+ path all the way to the leaves (one compare per level), then backtrack
>+ to find where to insert the target element.
>+
>+ Because elements tend to sift down close to the leaves, this uses fewer
>+ compares than doing two per level on the way down. (A bit more than
>+ half as many on average, 3/4 worst-case.). */
>+ for (b = a; c = 2 * b + size, (d = c + size) < n;)
>+ b = cmp (base + c, base + d, arg) >= 0 ? c : d;
>+ if (d == n)
>+ /* Special case last leaf with no sibling. */
>+ b = c;
>+
>+ /* Now backtrack from "b" to the correct location for "a". */
>+ while (b != a && cmp (base + a, base + b, arg) >= 0)
>+ b = parent (b, lsbit, size);
>+ /* Where "a" belongs. */
>+ c = b;
>+ while (b != a)
>+ {
>+ /* Shift it into place. */
>+ b = parent (b, lsbit, size);
>+ do_swap (base + b, base + c, size, swap_func);
>+ }
Here a tab should be used for indentation.
>+ }
>+}
>+
> /* Order size using quicksort. This implementation incorporates
> four optimizations discussed in Sedgewick:
>
>@@ -223,7 +293,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>
> const size_t max_thresh = MAX_THRESH * size;
>
>- if (total_elems == 0)
>+ if (total_elems <= 1)
> /* Avoid lossage with unsigned arithmetic below. */
> return;
>
>@@ -235,6 +305,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
> else
> swap_func = SWAP_BYTES;
>
>+ /* Maximum depth before quicksort switches to heapsort. */
>+ size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
With this fixed, the logic LGTM.
> if (total_elems > MAX_THRESH)
> {
> char *lo = base_ptr;
>@@ -242,10 +315,17 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
> stack_node stack[STACK_SIZE];
> stack_node *top = stack;
>
>- top = push (top, NULL, NULL);
>+ top = push (top, NULL, NULL, depth);
>
> while (stack < top)
> {
>+ if (depth == 0)
>+ {
>+ heapsort_r (lo, hi, size, swap_func, cmp, arg);
>+ top = pop (top, &lo, &hi, &depth);
>+ continue;
>+ }
>+
> char *left_ptr;
> char *right_ptr;
>
>@@ -309,7 +389,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
> {
> if ((size_t) (hi - left_ptr) <= max_thresh)
> /* Ignore both small partitions. */
>- top = pop (top, &lo, &hi);
>+ top = pop (top, &lo, &hi, &depth);
> else
> /* Ignore small left partition. */
> lo = left_ptr;
>@@ -320,13 +400,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
> else if ((right_ptr - lo) > (hi - left_ptr))
> {
> /* Push larger left partition indices. */
>- top = push (top, lo, right_ptr);
>+ top = push (top, lo, right_ptr, depth - 1);
> lo = left_ptr;
> }
> else
> {
> /* Push larger right partition indices. */
>- top = push (top, left_ptr, hi);
>+ top = push (top, left_ptr, hi, depth - 1);
> hi = right_ptr;
> }
> }
>--
>2.30.2
>
On Fri, Sep 3, 2021 at 1:16 PM Adhemerval Zanella via Libc-alpha <
libc-alpha@sourceware.org> wrote:
> This patch adds a introsort implementation on qsort to avoid worse-case
> performance of quicksort to O(nlog n). The heapsort fallback used is a
> heapsort based on Linux implementation (commit 22a241ccb2c19962a). As a
> side note the introsort implementation is similar the one used on
> libstdc++ for std::sort.
>
> Checked on x86_64-linux-gnu.
> ---
> stdlib/qsort.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 87 insertions(+), 7 deletions(-)
>
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 5df640362d..8368576aae 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -113,6 +113,7 @@ typedef struct
> {
> char *lo;
> char *hi;
> + size_t depth;
>
Why do a depth tracker per stack_node as opposed to one for
the entire call?
> } stack_node;
>
> /* The stack needs log (total_elements) entries (we could even subtract
> @@ -122,23 +123,92 @@ typedef struct
> enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
>
> static inline stack_node *
> -push (stack_node *top, char *lo, char *hi)
> +push (stack_node *top, char *lo, char *hi, size_t depth)
> {
> top->lo = lo;
> top->hi = hi;
> + top->depth = depth;
> return ++top;
> }
>
> static inline stack_node *
> -pop (stack_node *top, char **lo, char **hi)
> +pop (stack_node *top, char **lo, char **hi, size_t *depth)
> {
> --top;
> *lo = top->lo;
> *hi = top->hi;
> + *depth = top->depth;
> return top;
> }
>
>
> +/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
> + lib/sort.c. Used on introsort implementation as a fallback routine
> with
> + worst-case performance of O(nlog n) and worst-case space complexity of
> + O(1). */
> +
> +static inline size_t
> +parent (size_t i, unsigned int lsbit, size_t size)
> +{
> + i -= size;
> + i -= size & -(i & lsbit);
> + return i / 2;
> +}
> +
> +static void
> +heapsort_r (void *base, void *end, size_t size, swap_func_t swap_func,
> + __compar_d_fn_t cmp, void *arg)
> +{
> + size_t num = ((uintptr_t) end - (uintptr_t) base) / size;
> + size_t n = num * size, a = (num/2) * size;
> + /* Used to find parent */
> + const unsigned int lsbit = size & -size;
> +
> + /* num < 2 || size == 0. */
> + if (a == 0)
> + return;
> +
> + for (;;)
> + {
> + size_t b, c, d;
> +
> + if (a != 0)
> + /* Building heap: sift down --a */
> + a -= size;
> + else if (n -= size)
> + /* Sorting: Extract root to --n */
> + do_swap (base, base + n, size, swap_func);
> + else
> + break;
> +
> + /* Sift element at "a" down into heap. This is the "bottom-up"
> variant,
> + which significantly reduces calls to cmp_func(): we find the
> sift-down
> + path all the way to the leaves (one compare per level), then
> backtrack
> + to find where to insert the target element.
> +
> + Because elements tend to sift down close to the leaves, this uses
> fewer
> + compares than doing two per level on the way down. (A bit more
> than
> + half as many on average, 3/4 worst-case.). */
> + for (b = a; c = 2 * b + size, (d = c + size) < n;)
> + b = cmp (base + c, base + d, arg) >= 0 ? c : d;
> + if (d == n)
> + /* Special case last leaf with no sibling. */
> + b = c;
> +
> + /* Now backtrack from "b" to the correct location for "a". */
> + while (b != a && cmp (base + a, base + b, arg) >= 0)
> + b = parent (b, lsbit, size);
> + /* Where "a" belongs. */
> + c = b;
> + while (b != a)
> + {
> + /* Shift it into place. */
> + b = parent (b, lsbit, size);
> + do_swap (base + b, base + c, size, swap_func);
> + }
> + }
> +}
> +
> /* Order size using quicksort. This implementation incorporates
> four optimizations discussed in Sedgewick:
>
> @@ -223,7 +293,7 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
>
> const size_t max_thresh = MAX_THRESH * size;
>
> - if (total_elems == 0)
> + if (total_elems <= 1)
> /* Avoid lossage with unsigned arithmetic below. */
> return;
>
> @@ -235,6 +305,9 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
> else
> swap_func = SWAP_BYTES;
>
> + /* Maximum depth before quicksort switches to heapsort. */
> + size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
> +
> if (total_elems > MAX_THRESH)
> {
> char *lo = base_ptr;
> @@ -242,10 +315,17 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
> stack_node stack[STACK_SIZE];
> stack_node *top = stack;
>
> - top = push (top, NULL, NULL);
> + top = push (top, NULL, NULL, depth);
>
> while (stack < top)
> {
> + if (depth == 0)
> + {
+ heapsort_r (lo, hi, size, swap_func, cmp, arg);
> + top = pop (top, &lo, &hi, &depth);
> + continue;
> + }
> +
> char *left_ptr;
> char *right_ptr;
>
> @@ -309,7 +389,7 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
> {
> if ((size_t) (hi - left_ptr) <= max_thresh)
> /* Ignore both small partitions. */
> - top = pop (top, &lo, &hi);
> + top = pop (top, &lo, &hi, &depth);
> else
> /* Ignore small left partition. */
> lo = left_ptr;
> @@ -320,13 +400,13 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
> else if ((right_ptr - lo) > (hi - left_ptr))
>
Since we now have a depth counter, is it faster to still
always select the bigger region for the stack or
to remove this branch and just choose a direction?
We should be able to bound the size of the stack structure
with depth.
> {
> /* Push larger left partition indices. */
> - top = push (top, lo, right_ptr);
> + top = push (top, lo, right_ptr, depth - 1);
> lo = left_ptr;
> }
> else
> {
> /* Push larger right partition indices. */
> - top = push (top, left_ptr, hi);
> + top = push (top, left_ptr, hi, depth - 1);
> hi = right_ptr;
> }
> }
> --
> 2.30.2
>
>
@@ -113,6 +113,7 @@ typedef struct
{
char *lo;
char *hi;
+ size_t depth;
} stack_node;
/* The stack needs log (total_elements) entries (we could even subtract
@@ -122,23 +123,92 @@ typedef struct
enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
static inline stack_node *
-push (stack_node *top, char *lo, char *hi)
+push (stack_node *top, char *lo, char *hi, size_t depth)
{
top->lo = lo;
top->hi = hi;
+ top->depth = depth;
return ++top;
}
static inline stack_node *
-pop (stack_node *top, char **lo, char **hi)
+pop (stack_node *top, char **lo, char **hi, size_t *depth)
{
--top;
*lo = top->lo;
*hi = top->hi;
+ *depth = top->depth;
return top;
}
+/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
+ lib/sort.c. Used on introsort implementation as a fallback routine with
+ worst-case performance of O(nlog n) and worst-case space complexity of
+ O(1). */
+
+static inline size_t
+parent (size_t i, unsigned int lsbit, size_t size)
+{
+ i -= size;
+ i -= size & -(i & lsbit);
+ return i / 2;
+}
+
+static void
+heapsort_r (void *base, void *end, size_t size, swap_func_t swap_func,
+ __compar_d_fn_t cmp, void *arg)
+{
+ size_t num = ((uintptr_t) end - (uintptr_t) base) / size;
+ size_t n = num * size, a = (num/2) * size;
+ /* Used to find parent */
+ const unsigned int lsbit = size & -size;
+
+ /* num < 2 || size == 0. */
+ if (a == 0)
+ return;
+
+ for (;;)
+ {
+ size_t b, c, d;
+
+ if (a != 0)
+ /* Building heap: sift down --a */
+ a -= size;
+ else if (n -= size)
+ /* Sorting: Extract root to --n */
+ do_swap (base, base + n, size, swap_func);
+ else
+ break;
+
+ /* Sift element at "a" down into heap. This is the "bottom-up" variant,
+ which significantly reduces calls to cmp_func(): we find the sift-down
+ path all the way to the leaves (one compare per level), then backtrack
+ to find where to insert the target element.
+
+ Because elements tend to sift down close to the leaves, this uses fewer
+ compares than doing two per level on the way down. (A bit more than
+ half as many on average, 3/4 worst-case.). */
+ for (b = a; c = 2 * b + size, (d = c + size) < n;)
+ b = cmp (base + c, base + d, arg) >= 0 ? c : d;
+ if (d == n)
+ /* Special case last leaf with no sibling. */
+ b = c;
+
+ /* Now backtrack from "b" to the correct location for "a". */
+ while (b != a && cmp (base + a, base + b, arg) >= 0)
+ b = parent (b, lsbit, size);
+ /* Where "a" belongs. */
+ c = b;
+ while (b != a)
+ {
+ /* Shift it into place. */
+ b = parent (b, lsbit, size);
+ do_swap (base + b, base + c, size, swap_func);
+ }
+ }
+}
+
/* Order size using quicksort. This implementation incorporates
four optimizations discussed in Sedgewick:
@@ -223,7 +293,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
const size_t max_thresh = MAX_THRESH * size;
- if (total_elems == 0)
+ if (total_elems <= 1)
/* Avoid lossage with unsigned arithmetic below. */
return;
@@ -235,6 +305,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
else
swap_func = SWAP_BYTES;
+ /* Maximum depth before quicksort switches to heapsort. */
+ size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
+
if (total_elems > MAX_THRESH)
{
char *lo = base_ptr;
@@ -242,10 +315,17 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
stack_node stack[STACK_SIZE];
stack_node *top = stack;
- top = push (top, NULL, NULL);
+ top = push (top, NULL, NULL, depth);
while (stack < top)
{
+ if (depth == 0)
+ {
+ heapsort_r (lo, hi, size, swap_func, cmp, arg);
+ top = pop (top, &lo, &hi, &depth);
+ continue;
+ }
+
char *left_ptr;
char *right_ptr;
@@ -309,7 +389,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
{
if ((size_t) (hi - left_ptr) <= max_thresh)
/* Ignore both small partitions. */
- top = pop (top, &lo, &hi);
+ top = pop (top, &lo, &hi, &depth);
else
/* Ignore small left partition. */
lo = left_ptr;
@@ -320,13 +400,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
else if ((right_ptr - lo) > (hi - left_ptr))
{
/* Push larger left partition indices. */
- top = push (top, lo, right_ptr);
+ top = push (top, lo, right_ptr, depth - 1);
lo = left_ptr;
}
else
{
/* Push larger right partition indices. */
- top = push (top, left_ptr, hi);
+ top = push (top, left_ptr, hi, depth - 1);
hi = right_ptr;
}
}