[v3,6/7] stdlib: Implement introsort with qsort

Message ID 20210903171144.952737-7-adhemerval.zanella@linaro.org
State Superseded
Headers
Series Use introsort for qsort |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Adhemerval Zanella Sept. 3, 2021, 5:11 p.m. UTC
  This patch adds an introsort implementation to qsort to limit the
worst-case performance of quicksort to O(n log n).  The heapsort fallback
used is based on the Linux implementation (commit 22a241ccb2c19962a).  As
a side note, the introsort implementation is similar to the one used in
libstdc++ for std::sort.

Checked on x86_64-linux-gnu.
---
 stdlib/qsort.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 87 insertions(+), 7 deletions(-)
  

Comments

Alexander Monakov Sept. 4, 2021, 9:17 a.m. UTC | #1
On Fri, 3 Sep 2021, Adhemerval Zanella via Libc-alpha wrote:

> @@ -235,6 +305,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>    else
>      swap_func = SWAP_BYTES;
>  
> +  /* Maximum depth before quicksort switches to heapsort.  */
> +  size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
> +

This doesn't look right, the expression in parenthesis is usually negative,
conversion to size_t wraps around and you make 'depth' a huge positive value.
The first term probably should have been 'CHAR_BIT * sizeof(size_t)'.

Does this mean that heapsort fallback was not exercised in testing?

Also, is it certain at that point that total_elems is nonzero? __builtin_clzl
is undefined for a zero argument.

Alexander
  
Adhemerval Zanella Sept. 6, 2021, 6:43 p.m. UTC | #2
On 04/09/2021 06:17, Alexander Monakov wrote:
> On Fri, 3 Sep 2021, Adhemerval Zanella via Libc-alpha wrote:
> 
>> @@ -235,6 +305,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>>    else
>>      swap_func = SWAP_BYTES;
>>  
>> +  /* Maximum depth before quicksort switches to heapsort.  */
>> +  size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
>> +
> 
> This doesn't look right, the expression in parenthesis is usually negative,
> conversion to size_t wraps around and you make 'depth' a huge positive value.
> The first term probably should have been 'CHAR_BIT * sizeof(size_t)'.

Indeed it is not; it should be:

  size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1
                      - __builtin_clzl (total_elems));

(I think I did something wrong on some rebase, since I had the above
originally).

> 
> Does this mean that heapsort fallback was not exercised in testing?

The current tests do not trigger the quadratic behavior of quicksort;
that's why I have added tst-qsort3.  heapsort_r is called for
both the 'SORTED' and 'BITONIC' array distributions (which is
expected, since they tend to be pattern-defeating for quicksort
implementations).

> 
> Also, is it certain at that point that total_elems is nonzero? __builtin_clzl
> is undefined for a zero argument.

Yes, the patch also takes care of that:

@@ -222,7 +292,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,

   const size_t max_thresh = MAX_THRESH * size;

-  if (total_elems == 0)
+  if (total_elems <= 1)
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
  
Fangrui Song Sept. 6, 2021, 8:23 p.m. UTC | #3
On 2021-09-03, Adhemerval Zanella via Libc-alpha wrote:
>This patch adds a introsort implementation on qsort to avoid worse-case
>performance of quicksort to O(nlog n).  The heapsort fallback used is a
>heapsort based on Linux implementation (commit 22a241ccb2c19962a).  As a
>side note the introsort implementation is similar the one used on
>libstdc++ for std::sort.
>
>Checked on x86_64-linux-gnu.
>---
> stdlib/qsort.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 87 insertions(+), 7 deletions(-)
>
>diff --git a/stdlib/qsort.c b/stdlib/qsort.c
>index 5df640362d..8368576aae 100644
>--- a/stdlib/qsort.c
>+++ b/stdlib/qsort.c
>@@ -113,6 +113,7 @@ typedef struct
>   {
>     char *lo;
>     char *hi;
>+    size_t depth;
>   } stack_node;
>
> /* The stack needs log (total_elements) entries (we could even subtract
>@@ -122,23 +123,92 @@ typedef struct
> enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
>
> static inline stack_node *
>-push (stack_node *top, char *lo, char *hi)
>+push (stack_node *top, char *lo, char *hi, size_t depth)
> {
>   top->lo = lo;
>   top->hi = hi;
>+  top->depth = depth;
>   return ++top;
> }
>
> static inline stack_node *
>-pop (stack_node *top, char **lo, char **hi)
>+pop (stack_node *top, char **lo, char **hi, size_t *depth)
> {
>   --top;
>   *lo = top->lo;
>   *hi = top->hi;
>+  *depth = top->depth;
>   return top;
> }
>
>
>+/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
>+   lib/sort.c.  Used on introsort implementation as a fallback routine with
>+   worst-case performance of O(nlog n) and worst-case space complexity of
>+   O(1).  */
>+
>+static inline size_t
>+parent (size_t i, unsigned int lsbit, size_t size)
>+{
>+  i -= size;
>+  i -= size & -(i & lsbit);

i -= size & -(i & lsbit); may need a comment that it is logically 

if (i & lsbit) i -= size;

to make i/2 a multiple of size.


>+  return i / 2;
>+}
>+
>+static void
>+heapsort_r (void *base, void *end, size_t size, swap_func_t swap_func,
>+	    __compar_d_fn_t cmp, void *arg)
>+{
>+  size_t num = ((uintptr_t) end - (uintptr_t) base) / size;
>+  size_t n = num * size, a = (num/2) * size;
>+  /* Used to find parent  */
>+  const unsigned int lsbit = size & -size;
>+
>+  /* num < 2 || size == 0.  */
>+  if (a == 0)
>+    return;
>+
>+  for (;;)
>+    {
>+      size_t b, c, d;
>+
>+      if (a != 0)
>+	/* Building heap: sift down --a */
>+	a -= size;
>+      else if (n -= size)
>+	/* Sorting: Extract root to --n */
>+	do_swap (base, base + n, size, swap_func);
>+      else
>+	break;
>+
>+      /* Sift element at "a" down into heap.  This is the "bottom-up" variant,
>+	 which significantly reduces calls to cmp_func(): we find the sift-down
>+	 path all the way to the leaves (one compare per level), then backtrack
>+	 to find where to insert the target element.
>+
>+	 Because elements tend to sift down close to the leaves, this uses fewer
>+	 compares than doing two per level on the way down.  (A bit more than
>+	 half as many on average, 3/4 worst-case.).  */
>+      for (b = a; c = 2 * b + size, (d = c + size) < n;)
>+	b = cmp (base + c, base + d, arg) >= 0 ? c : d;
>+      if (d == n)
>+	/* Special case last leaf with no sibling.  */
>+	b = c;
>+
>+      /* Now backtrack from "b" to the correct location for "a".  */
>+      while (b != a && cmp (base + a, base + b, arg) >= 0)
>+	b = parent (b, lsbit, size);
>+      /* Where "a" belongs.  */
>+      c = b;
>+      while (b != a)
>+	{
>+	  /* Shift it into place.  */
>+	  b = parent (b, lsbit, size);
>+          do_swap (base + b, base + c, size, swap_func);
>+        }

here a tab should be used.

>+    }
>+}
>+
> /* Order size using quicksort.  This implementation incorporates
>    four optimizations discussed in Sedgewick:
>
>@@ -223,7 +293,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>
>   const size_t max_thresh = MAX_THRESH * size;
>
>-  if (total_elems == 0)
>+  if (total_elems <= 1)
>     /* Avoid lossage with unsigned arithmetic below.  */
>     return;
>
>@@ -235,6 +305,9 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>   else
>     swap_func = SWAP_BYTES;
>
>+  /* Maximum depth before quicksort switches to heapsort.  */
>+  size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));

With this fixed, the logic LGTM.

>   if (total_elems > MAX_THRESH)
>     {
>       char *lo = base_ptr;
>@@ -242,10 +315,17 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>       stack_node stack[STACK_SIZE];
>       stack_node *top = stack;
>
>-      top = push (top, NULL, NULL);
>+      top = push (top, NULL, NULL, depth);
>
>       while (stack < top)
>         {
>+	  if (depth == 0)
>+	    {
>+	      heapsort_r (lo, hi, size, swap_func, cmp, arg);
>+              top = pop (top, &lo, &hi, &depth);
>+	      continue;
>+	    }
>+
>           char *left_ptr;
>           char *right_ptr;
>
>@@ -309,7 +389,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>             {
>               if ((size_t) (hi - left_ptr) <= max_thresh)
> 		/* Ignore both small partitions. */
>-		top = pop (top, &lo, &hi);
>+		top = pop (top, &lo, &hi, &depth);
>               else
> 		/* Ignore small left partition. */
>                 lo = left_ptr;
>@@ -320,13 +400,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
>           else if ((right_ptr - lo) > (hi - left_ptr))
>             {
> 	      /* Push larger left partition indices. */
>-	      top = push (top, lo, right_ptr);
>+	      top = push (top, lo, right_ptr, depth - 1);
>               lo = left_ptr;
>             }
>           else
>             {
> 	      /* Push larger right partition indices. */
>-	      top = push (top, left_ptr, hi);
>+	      top = push (top, left_ptr, hi, depth - 1);
>               hi = right_ptr;
>             }
>         }
>-- 
>2.30.2
>
  
Noah Goldstein Oct. 13, 2021, 3:53 a.m. UTC | #4
On Fri, Sep 3, 2021 at 1:16 PM Adhemerval Zanella via Libc-alpha <
libc-alpha@sourceware.org> wrote:

> This patch adds a introsort implementation on qsort to avoid worse-case
> performance of quicksort to O(nlog n).  The heapsort fallback used is a
> heapsort based on Linux implementation (commit 22a241ccb2c19962a).  As a
> side note the introsort implementation is similar the one used on
> libstdc++ for std::sort.
>
> Checked on x86_64-linux-gnu.
> ---
>  stdlib/qsort.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 87 insertions(+), 7 deletions(-)
>
> diff --git a/stdlib/qsort.c b/stdlib/qsort.c
> index 5df640362d..8368576aae 100644
> --- a/stdlib/qsort.c
> +++ b/stdlib/qsort.c
> @@ -113,6 +113,7 @@ typedef struct
>    {
>      char *lo;
>      char *hi;
> +    size_t depth;
>

Why do a depth tracker per stack_node as opposed to one for
the entire call?


>    } stack_node;
>
>  /* The stack needs log (total_elements) entries (we could even subtract
> @@ -122,23 +123,92 @@ typedef struct
>  enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
>
>  static inline stack_node *
> -push (stack_node *top, char *lo, char *hi)
> +push (stack_node *top, char *lo, char *hi, size_t depth)
>  {
>    top->lo = lo;
>    top->hi = hi;
> +  top->depth = depth;
>    return ++top;
>  }
>
>  static inline stack_node *
> -pop (stack_node *top, char **lo, char **hi)
> +pop (stack_node *top, char **lo, char **hi, size_t *depth)
>  {
>    --top;
>    *lo = top->lo;
>    *hi = top->hi;
> +  *depth = top->depth;
>    return top;
>  }
>
>
> +/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
> +   lib/sort.c.  Used on introsort implementation as a fallback routine
> with
> +   worst-case performance of O(nlog n) and worst-case space complexity of
> +   O(1).  */
> +
> +static inline size_t
> +parent (size_t i, unsigned int lsbit, size_t size)
> +{
> +  i -= size;
> +  i -= size & -(i & lsbit);
> +  return i / 2;
> +}
> +
> +static void
> +heapsort_r (void *base, void *end, size_t size, swap_func_t swap_func,
> +           __compar_d_fn_t cmp, void *arg)
> +{
> +  size_t num = ((uintptr_t) end - (uintptr_t) base) / size;
> +  size_t n = num * size, a = (num/2) * size;
> +  /* Used to find parent  */
> +  const unsigned int lsbit = size & -size;
> +
> +  /* num < 2 || size == 0.  */
> +  if (a == 0)
> +    return;
> +
> +  for (;;)
> +    {
> +      size_t b, c, d;
> +
> +      if (a != 0)
> +       /* Building heap: sift down --a */
> +       a -= size;
> +      else if (n -= size)
> +       /* Sorting: Extract root to --n */
> +       do_swap (base, base + n, size, swap_func);
> +      else
> +       break;
> +
> +      /* Sift element at "a" down into heap.  This is the "bottom-up"
> variant,
> +        which significantly reduces calls to cmp_func(): we find the
> sift-down
> +        path all the way to the leaves (one compare per level), then
> backtrack
> +        to find where to insert the target element.
> +
> +        Because elements tend to sift down close to the leaves, this uses
> fewer
> +        compares than doing two per level on the way down.  (A bit more
> than
> +        half as many on average, 3/4 worst-case.).  */
> +      for (b = a; c = 2 * b + size, (d = c + size) < n;)
> +       b = cmp (base + c, base + d, arg) >= 0 ? c : d;
> +      if (d == n)
> +       /* Special case last leaf with no sibling.  */
> +       b = c;
> +
> +      /* Now backtrack from "b" to the correct location for "a".  */
> +      while (b != a && cmp (base + a, base + b, arg) >= 0)
> +       b = parent (b, lsbit, size);
> +      /* Where "a" belongs.  */
> +      c = b;
> +      while (b != a)
> +       {
> +         /* Shift it into place.  */
> +         b = parent (b, lsbit, size);
> +          do_swap (base + b, base + c, size, swap_func);
> +        }
> +    }
> +}
> +
>  /* Order size using quicksort.  This implementation incorporates
>     four optimizations discussed in Sedgewick:
>
> @@ -223,7 +293,7 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
>
>    const size_t max_thresh = MAX_THRESH * size;
>
> -  if (total_elems == 0)
> +  if (total_elems <= 1)
>      /* Avoid lossage with unsigned arithmetic below.  */
>      return;
>
> @@ -235,6 +305,9 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
>    else
>      swap_func = SWAP_BYTES;
>
> +  /* Maximum depth before quicksort switches to heapsort.  */
> +  size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
> +
>    if (total_elems > MAX_THRESH)
>      {
>        char *lo = base_ptr;
> @@ -242,10 +315,17 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
>        stack_node stack[STACK_SIZE];
>        stack_node *top = stack;
>
> -      top = push (top, NULL, NULL);
> +      top = push (top, NULL, NULL, depth);
>
>        while (stack < top)
>          {
> +         if (depth == 0)
> +           {

+             heapsort_r (lo, hi, size, swap_func, cmp, arg);
> +              top = pop (top, &lo, &hi, &depth);
> +             continue;
> +           }
> +
>            char *left_ptr;
>            char *right_ptr;
>
> @@ -309,7 +389,7 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
>              {
>                if ((size_t) (hi - left_ptr) <= max_thresh)
>                 /* Ignore both small partitions. */
> -               top = pop (top, &lo, &hi);
> +               top = pop (top, &lo, &hi, &depth);
>                else
>                 /* Ignore small left partition. */
>                  lo = left_ptr;
> @@ -320,13 +400,13 @@ _quicksort (void *const pbase, size_t total_elems,
> size_t size,
>            else if ((right_ptr - lo) > (hi - left_ptr))
>

Since we now have a depth counter, is it faster to still
always select the bigger region for the stack or
to remove this branch and just choose a direction?

We should be able to bound the size of the stack structure
with depth.


>              {
>               /* Push larger left partition indices. */
> -             top = push (top, lo, right_ptr);
> +             top = push (top, lo, right_ptr, depth - 1);
>                lo = left_ptr;
>              }
>            else
>              {
>               /* Push larger right partition indices. */
> -             top = push (top, left_ptr, hi);
> +             top = push (top, left_ptr, hi, depth - 1);
>                hi = right_ptr;
>              }
>          }
> --
> 2.30.2
>
>
  

Patch

diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 5df640362d..8368576aae 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -113,6 +113,7 @@  typedef struct
   {
     char *lo;
     char *hi;
+    size_t depth;
   } stack_node;
 
 /* The stack needs log (total_elements) entries (we could even subtract
@@ -122,23 +123,92 @@  typedef struct
 enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
 
 static inline stack_node *
-push (stack_node *top, char *lo, char *hi)
+push (stack_node *top, char *lo, char *hi, size_t depth)
 {
   top->lo = lo;
   top->hi = hi;
+  top->depth = depth;
   return ++top;
 }
 
 static inline stack_node *
-pop (stack_node *top, char **lo, char **hi)
+pop (stack_node *top, char **lo, char **hi, size_t *depth)
 {
   --top;
   *lo = top->lo;
   *hi = top->hi;
+  *depth = top->depth;
   return top;
 }
 
 
+/* A fast, small, non-recursive O(nlog n) heapsort, adapted from Linux
+   lib/sort.c.  Used on introsort implementation as a fallback routine with
+   worst-case performance of O(nlog n) and worst-case space complexity of
+   O(1).  */
+
+static inline size_t
+parent (size_t i, unsigned int lsbit, size_t size)
+{
+  i -= size;
+  i -= size & -(i & lsbit);
+  return i / 2;
+}
+
+static void
+heapsort_r (void *base, void *end, size_t size, swap_func_t swap_func,
+	    __compar_d_fn_t cmp, void *arg)
+{
+  size_t num = ((uintptr_t) end - (uintptr_t) base) / size;
+  size_t n = num * size, a = (num/2) * size;
+  /* Used to find parent  */
+  const unsigned int lsbit = size & -size;
+
+  /* num < 2 || size == 0.  */
+  if (a == 0)
+    return;
+
+  for (;;)
+    {
+      size_t b, c, d;
+
+      if (a != 0)
+	/* Building heap: sift down --a */
+	a -= size;
+      else if (n -= size)
+	/* Sorting: Extract root to --n */
+	do_swap (base, base + n, size, swap_func);
+      else
+	break;
+
+      /* Sift element at "a" down into heap.  This is the "bottom-up" variant,
+	 which significantly reduces calls to cmp_func(): we find the sift-down
+	 path all the way to the leaves (one compare per level), then backtrack
+	 to find where to insert the target element.
+
+	 Because elements tend to sift down close to the leaves, this uses fewer
+	 compares than doing two per level on the way down.  (A bit more than
+	 half as many on average, 3/4 worst-case.).  */
+      for (b = a; c = 2 * b + size, (d = c + size) < n;)
+	b = cmp (base + c, base + d, arg) >= 0 ? c : d;
+      if (d == n)
+	/* Special case last leaf with no sibling.  */
+	b = c;
+
+      /* Now backtrack from "b" to the correct location for "a".  */
+      while (b != a && cmp (base + a, base + b, arg) >= 0)
+	b = parent (b, lsbit, size);
+      /* Where "a" belongs.  */
+      c = b;
+      while (b != a)
+	{
+	  /* Shift it into place.  */
+	  b = parent (b, lsbit, size);
+          do_swap (base + b, base + c, size, swap_func);
+        }
+    }
+}
+
 /* Order size using quicksort.  This implementation incorporates
    four optimizations discussed in Sedgewick:
 
@@ -223,7 +293,7 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
 
   const size_t max_thresh = MAX_THRESH * size;
 
-  if (total_elems == 0)
+  if (total_elems <= 1)
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
@@ -235,6 +305,9 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
   else
     swap_func = SWAP_BYTES;
 
+  /* Maximum depth before quicksort switches to heapsort.  */
+  size_t depth = 2 * (CHAR_BIT - 1 - __builtin_clzl (total_elems));
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -242,10 +315,17 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
       stack_node stack[STACK_SIZE];
       stack_node *top = stack;
 
-      top = push (top, NULL, NULL);
+      top = push (top, NULL, NULL, depth);
 
       while (stack < top)
         {
+	  if (depth == 0)
+	    {
+	      heapsort_r (lo, hi, size, swap_func, cmp, arg);
+              top = pop (top, &lo, &hi, &depth);
+	      continue;
+	    }
+
           char *left_ptr;
           char *right_ptr;
 
@@ -309,7 +389,7 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
             {
               if ((size_t) (hi - left_ptr) <= max_thresh)
 		/* Ignore both small partitions. */
-		top = pop (top, &lo, &hi);
+		top = pop (top, &lo, &hi, &depth);
               else
 		/* Ignore small left partition. */
                 lo = left_ptr;
@@ -320,13 +400,13 @@  _quicksort (void *const pbase, size_t total_elems, size_t size,
           else if ((right_ptr - lo) > (hi - left_ptr))
             {
 	      /* Push larger left partition indices. */
-	      top = push (top, lo, right_ptr);
+	      top = push (top, lo, right_ptr, depth - 1);
               lo = left_ptr;
             }
           else
             {
 	      /* Push larger right partition indices. */
-	      top = push (top, left_ptr, hi);
+	      top = push (top, left_ptr, hi, depth - 1);
               hi = right_ptr;
             }
         }