[RFC] Optimize bsearch() implementation for performance

Message ID 20240901015456.1782446-1-visitorckw@gmail.com
State New
Headers
Series [RFC] Optimize bsearch() implementation for performance |

Checks

Context Check Description
redhat-pt-bot/TryBot-32bit success Build for i686

Commit Message

Kuan-Wei Chiu Sept. 1, 2024, 1:54 a.m. UTC
  Optimize the bsearch() function to improve binary search performance.
This modification reduces the execution time by approximately 8% on an
x86 machine with a 100,000-element array under 1e8 searches. The
optimization slightly increases the code size by 8 bytes.

code size:
* old:
   text    data     bss     dec     hex filename
    250       0       0     250      fa ./stdlib/bsearch.o
* new:
   text    data     bss     dec     hex filename
    258       0       0     258     102 ./stdlib/bsearch.o

benchmark:
Old bsearch elapsed time: 4536833
New bsearch elapsed time: 4137556
Improve efficiency by 8 %

Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
---
Since I only have x86 machines available for testing, I am not entirely
certain whether this patch will bring performance improvements across
different architectures. Therefore, I am sending this as an RFC patch
first.

 bits/stdlib-bsearch.h | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)
  

Comments

Kuan-Wei Chiu Sept. 1, 2024, 1:57 a.m. UTC | #1
On Sun, Sep 01, 2024 at 09:54:56AM +0800, Kuan-Wei Chiu wrote:
> Optimize the bsearch() function to improve binary search performance.
> This modification reduces the execution time by approximately 8% on an
> x86 machine with a 100,000-element array under 1e8 searches. The
> optimization slightly increases the code size by 8 bytes.
> 
> code size:
> * old:
>    text    data     bss     dec     hex filename
>     250       0       0     250      fa ./stdlib/bsearch.o
> * new:
>    text    data     bss     dec     hex filename
>     258       0       0     258     102 ./stdlib/bsearch.o
> 
> benchmark:
> Old bsearch elapsed time: 4536833
> New bsearch elapsed time: 4137556
> Improve efficiency by 8 %
> 
> Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
> ---
> Since I only have x86 machines available for testing, I am not entirely
> certain whether this patch will bring performance improvements across
> different architectures. Therefore, I am sending this as an RFC patch
> first.
> 
>  bits/stdlib-bsearch.h | 20 +++++++++-----------
>  1 file changed, 9 insertions(+), 11 deletions(-)
> 
FWIW, here is the code I used for testing:

/* tst-bsearch1.c */

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define TEST_OLD

int arr[100000];
#define narr (sizeof (arr) / sizeof (arr[0]))

static int
comp (const void *p1, const void *p2)
{
  int x1 = *(int *) p1;
  int x2 = *(int *) p2;

  if (x1 < x2)
    return -1;
  if (x1 > x2)
    return 1;
  return 0;
}

__attribute__((noinline)) void *
newbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
	 __compar_fn_t __compar)
{
  const void *__p;
  int __comparison;

  while (__nmemb)
    {
      __p = (const void *) (((const char *) __base) + ((__nmemb >> 1) * __size));
      __comparison = (*__compar) (__key, __p);
      if (__comparison == 0)
	{
#if __GNUC_PREREQ(4, 6)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wcast-qual"
#endif
	  return (void *) __p;
#if __GNUC_PREREQ(4, 6)
# pragma GCC diagnostic pop
#endif
	}
      if (__comparison > 0)
	{
	  __base = ((const char *) __p) + __size;
	  --__nmemb;
	}
      __nmemb >>= 1;
    }

  return NULL;
}

__attribute__((noinline)) void *
oldbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
	 __compar_fn_t __compar)
{
  size_t __l, __u, __idx;
  const void *__p;
  int __comparison;

  __l = 0;
  __u = __nmemb;
  while (__l < __u)
    {
      __idx = (__l + __u) / 2;
      __p = (const void *) (((const char *) __base) + (__idx * __size));
      __comparison = (*__compar) (__key, __p);
      if (__comparison < 0)
	__u = __idx;
      else if (__comparison > 0)
	__l = __idx + 1;
      else
	{
#if __GNUC_PREREQ(4, 6)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wcast-qual"
#endif
	  return (void *) __p;
#if __GNUC_PREREQ(4, 6)
# pragma GCC diagnostic pop
#endif
	}
    }

  return NULL;
}

static int
do_test (void)
{
  volatile size_t i;
  volatile int *res;
  volatile clock_t begin, end;
  int key;
  long long int old_time, new_time;

  for (i = 0; i < narr; ++i)
    arr[i] = i;

  asm volatile ("" : : : "memory");
  begin = clock ();
  asm volatile ("" : : : "memory");
  for (i = 0; i < 1e8; ++i)
    {
      key = i % narr;
      res = (int *) oldbsearch (&key, arr, narr, sizeof (arr[0]), comp);
      if (res != arr + key)
	{
	  printf ("entry %zd got wrong answer\n", i);
	  return 1;
	}
    }
  asm volatile ("" : : : "memory");
  end = clock ();
  asm volatile ("" : : : "memory");
  old_time = end - begin;
  printf ("Old bsearch elapsed time: %lld\n", old_time);

  asm volatile ("" : : : "memory");
  begin = clock ();
  asm volatile ("" : : : "memory");
  for (i = 0; i < 1e8; ++i)
    {
      key = i % narr;
      res = (int *) newbsearch (&key, arr, narr, sizeof (arr[0]), comp);
      if (res != arr + key)
	{
	  printf ("entry %zd got wrong answer\n", i);
	  return 1;
	}
    }
  asm volatile ("" : : : "memory");
  end = clock ();
  asm volatile ("" : : : "memory");
  new_time = end - begin;
  printf ("New bsearch elapsed time: %lld\n", new_time);

  printf ("Improve efficiency by %lld %%\n", (old_time - new_time) * 100 / old_time);

  return 0;
}

#define TEST_FUNCTION do_test ()
#include "../test-skeleton.c"
  
Noah Goldstein Sept. 1, 2024, 11:42 p.m. UTC | #2
On Sat, Aug 31, 2024 at 6:57 PM Kuan-Wei Chiu <visitorckw@gmail.com> wrote:
>
> On Sun, Sep 01, 2024 at 09:54:56AM +0800, Kuan-Wei Chiu wrote:
> > Optimize the bsearch() function to improve binary search performance.
> > This modification reduces the execution time by approximately 8% on an
> > x86 machine with a 100,000-element array under 1e8 searches. The
> > optimization slightly increases the code size by 8 bytes.
> >
> > code size:
> > * old:
> >    text    data     bss     dec     hex filename
> >     250       0       0     250      fa ./stdlib/bsearch.o
> > * new:
> >    text    data     bss     dec     hex filename
> >     258       0       0     258     102 ./stdlib/bsearch.o
> >
> > benchmark:
> > Old bsearch elapsed time: 4536833
> > New bsearch elapsed time: 4137556
> > Improve efficiency by 8 %
> >
> > Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
> > ---
> > Since I only have x86 machines available for testing, I am not entirely
> > certain whether this patch will bring performance improvements across
> > different architectures. Therefore, I am sending this as an RFC patch
> > first.
> >
> >  bits/stdlib-bsearch.h | 20 +++++++++-----------
> >  1 file changed, 9 insertions(+), 11 deletions(-)
> >
> FWIW, here is the code I used for testing:
>
> /* tst-bsearch1.c */
You could add this as a benchtest in this series :)

If you are interested in doing see other benchmark files
`benchmarks/bench-*.c` for example on how we doing
timing/output (TIMING_* macros and json respectively).

>
> #include <stdio.h>
> #include <stdlib.h>
> #include <time.h>
>
> #define TEST_OLD
>
> int arr[100000];
> #define narr (sizeof (arr) / sizeof (arr[0]))
>
> static int
> comp (const void *p1, const void *p2)
> {
>   int x1 = *(int *) p1;
>   int x2 = *(int *) p2;
>
>   if (x1 < x2)
>     return -1;
>   if (x1 > x2)
>     return 1;
>   return 0;
> }
>
> __attribute__((noinline)) void *
> newbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
>          __compar_fn_t __compar)
> {
>   const void *__p;
>   int __comparison;
>
>   while (__nmemb)
>     {
>       __p = (const void *) (((const char *) __base) + ((__nmemb >> 1) * __size));
>       __comparison = (*__compar) (__key, __p);
>       if (__comparison == 0)
>         {
> #if __GNUC_PREREQ(4, 6)
> # pragma GCC diagnostic push
> # pragma GCC diagnostic ignored "-Wcast-qual"
> #endif
>           return (void *) __p;
> #if __GNUC_PREREQ(4, 6)
> # pragma GCC diagnostic pop
> #endif
>         }
>       if (__comparison > 0)
>         {
>           __base = ((const char *) __p) + __size;
>           --__nmemb;
>         }
>       __nmemb >>= 1;
>     }
>
>   return NULL;
> }
>
> __attribute__((noinline)) void *
> oldbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
>          __compar_fn_t __compar)
> {
>   size_t __l, __u, __idx;
>   const void *__p;
>   int __comparison;
>
>   __l = 0;
>   __u = __nmemb;
>   while (__l < __u)
>     {
>       __idx = (__l + __u) / 2;
>       __p = (const void *) (((const char *) __base) + (__idx * __size));
>       __comparison = (*__compar) (__key, __p);
>       if (__comparison < 0)
>         __u = __idx;
>       else if (__comparison > 0)
>         __l = __idx + 1;
>       else
>         {
> #if __GNUC_PREREQ(4, 6)
> # pragma GCC diagnostic push
> # pragma GCC diagnostic ignored "-Wcast-qual"
> #endif
>           return (void *) __p;
> #if __GNUC_PREREQ(4, 6)
> # pragma GCC diagnostic pop
> #endif
>         }
>     }
>
>   return NULL;
> }
>
> static int
> do_test (void)
> {
>   volatile size_t i;
>   volatile int *res;
>   volatile clock_t begin, end;
>   int key;
>   long long int old_time, new_time;
>
>   for (i = 0; i < narr; ++i)
>     arr[i] = i;
>
>   asm volatile ("" : : : "memory");
>   begin = clock ();
>   asm volatile ("" : : : "memory");
>   for (i = 0; i < 1e8; ++i)
>     {
>       key = i % narr;
>       res = (int *) oldbsearch (&key, arr, narr, sizeof (arr[0]), comp);
>       if (res != arr + key)
>         {
>           printf ("entry %zd got wrong answer\n", i);
>           return 1;
>         }
>     }
>   asm volatile ("" : : : "memory");
>   end = clock ();
>   asm volatile ("" : : : "memory");
>   old_time = end - begin;
>   printf ("Old bsearch elapsed time: %lld\n", old_time);
>
>   asm volatile ("" : : : "memory");
>   begin = clock ();
>   asm volatile ("" : : : "memory");
>   for (i = 0; i < 1e8; ++i)
>     {
>       key = i % narr;
>       res = (int *) newbsearch (&key, arr, narr, sizeof (arr[0]), comp);
>       if (res != arr + key)
>         {
>           printf ("entry %zd got wrong answer\n", i);
>           return 1;
>         }
>     }
>   asm volatile ("" : : : "memory");
>   end = clock ();
>   asm volatile ("" : : : "memory");
>   new_time = end - begin;
>   printf ("New bsearch elapsed time: %lld\n", new_time);
>
>   printf ("Improve efficiency by %lld %%\n", (old_time - new_time) * 100 / old_time);
>
>   return 0;
> }
>
> #define TEST_FUNCTION do_test ()
> #include "../test-skeleton.c"
>
  
Kuan-Wei Chiu Sept. 2, 2024, 1:09 p.m. UTC | #3
On Sun, Sep 01, 2024 at 04:42:17PM -0700, Noah Goldstein wrote:
> On Sat, Aug 31, 2024 at 6:57 PM Kuan-Wei Chiu <visitorckw@gmail.com> wrote:
> >
> > On Sun, Sep 01, 2024 at 09:54:56AM +0800, Kuan-Wei Chiu wrote:
> > > Optimize the bsearch() function to improve binary search performance.
> > > This modification reduces the execution time by approximately 8% on an
> > > x86 machine with a 100,000-element array under 1e8 searches. The
> > > optimization slightly increases the code size by 8 bytes.
> > >
> > > code size:
> > > * old:
> > >    text    data     bss     dec     hex filename
> > >     250       0       0     250      fa ./stdlib/bsearch.o
> > > * new:
> > >    text    data     bss     dec     hex filename
> > >     258       0       0     258     102 ./stdlib/bsearch.o
> > >
> > > benchmark:
> > > Old bsearch elapsed time: 4536833
> > > New bsearch elapsed time: 4137556
> > > Improve efficiency by 8 %
> > >
> > > Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
> > > ---
> > > Since I only have x86 machines available for testing, I am not entirely
> > > certain whether this patch will bring performance improvements across
> > > different architectures. Therefore, I am sending this as an RFC patch
> > > first.
> > >
> > >  bits/stdlib-bsearch.h | 20 +++++++++-----------
> > >  1 file changed, 9 insertions(+), 11 deletions(-)
> > >
> > FWIW, here is the code I used for testing:
> >
> > /* tst-bsearch1.c */
> You could add this as a benchtest in this series :)
> 
> If you are interested in doing see other benchmark files
> `benchmarks/bench-*.c` for example on how we doing
> timing/output (TIMING_* macros and json respectively).
>
Thanks for your guidance! :)
I'll give it a try.

Regards,
Kuan-Wei

> >
> > #include <stdio.h>
> > #include <stdlib.h>
> > #include <time.h>
> >
> > #define TEST_OLD
> >
> > int arr[100000];
> > #define narr (sizeof (arr) / sizeof (arr[0]))
> >
> > static int
> > comp (const void *p1, const void *p2)
> > {
> >   int x1 = *(int *) p1;
> >   int x2 = *(int *) p2;
> >
> >   if (x1 < x2)
> >     return -1;
> >   if (x1 > x2)
> >     return 1;
> >   return 0;
> > }
> >
> > __attribute__((noinline)) void *
> > newbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
> >          __compar_fn_t __compar)
> > {
> >   const void *__p;
> >   int __comparison;
> >
> >   while (__nmemb)
> >     {
> >       __p = (const void *) (((const char *) __base) + ((__nmemb >> 1) * __size));
> >       __comparison = (*__compar) (__key, __p);
> >       if (__comparison == 0)
> >         {
> > #if __GNUC_PREREQ(4, 6)
> > # pragma GCC diagnostic push
> > # pragma GCC diagnostic ignored "-Wcast-qual"
> > #endif
> >           return (void *) __p;
> > #if __GNUC_PREREQ(4, 6)
> > # pragma GCC diagnostic pop
> > #endif
> >         }
> >       if (__comparison > 0)
> >         {
> >           __base = ((const char *) __p) + __size;
> >           --__nmemb;
> >         }
> >       __nmemb >>= 1;
> >     }
> >
> >   return NULL;
> > }
> >
> > __attribute__((noinline)) void *
> > oldbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
> >          __compar_fn_t __compar)
> > {
> >   size_t __l, __u, __idx;
> >   const void *__p;
> >   int __comparison;
> >
> >   __l = 0;
> >   __u = __nmemb;
> >   while (__l < __u)
> >     {
> >       __idx = (__l + __u) / 2;
> >       __p = (const void *) (((const char *) __base) + (__idx * __size));
> >       __comparison = (*__compar) (__key, __p);
> >       if (__comparison < 0)
> >         __u = __idx;
> >       else if (__comparison > 0)
> >         __l = __idx + 1;
> >       else
> >         {
> > #if __GNUC_PREREQ(4, 6)
> > # pragma GCC diagnostic push
> > # pragma GCC diagnostic ignored "-Wcast-qual"
> > #endif
> >           return (void *) __p;
> > #if __GNUC_PREREQ(4, 6)
> > # pragma GCC diagnostic pop
> > #endif
> >         }
> >     }
> >
> >   return NULL;
> > }
> >
> > static int
> > do_test (void)
> > {
> >   volatile size_t i;
> >   volatile int *res;
> >   volatile clock_t begin, end;
> >   int key;
> >   long long int old_time, new_time;
> >
> >   for (i = 0; i < narr; ++i)
> >     arr[i] = i;
> >
> >   asm volatile ("" : : : "memory");
> >   begin = clock ();
> >   asm volatile ("" : : : "memory");
> >   for (i = 0; i < 1e8; ++i)
> >     {
> >       key = i % narr;
> >       res = (int *) oldbsearch (&key, arr, narr, sizeof (arr[0]), comp);
> >       if (res != arr + key)
> >         {
> >           printf ("entry %zd got wrong answer\n", i);
> >           return 1;
> >         }
> >     }
> >   asm volatile ("" : : : "memory");
> >   end = clock ();
> >   asm volatile ("" : : : "memory");
> >   old_time = end - begin;
> >   printf ("Old bsearch elapsed time: %lld\n", old_time);
> >
> >   asm volatile ("" : : : "memory");
> >   begin = clock ();
> >   asm volatile ("" : : : "memory");
> >   for (i = 0; i < 1e8; ++i)
> >     {
> >       key = i % narr;
> >       res = (int *) newbsearch (&key, arr, narr, sizeof (arr[0]), comp);
> >       if (res != arr + key)
> >         {
> >           printf ("entry %zd got wrong answer\n", i);
> >           return 1;
> >         }
> >     }
> >   asm volatile ("" : : : "memory");
> >   end = clock ();
> >   asm volatile ("" : : : "memory");
> >   new_time = end - begin;
> >   printf ("New bsearch elapsed time: %lld\n", new_time);
> >
> >   printf ("Improve efficiency by %lld %%\n", (old_time - new_time) * 100 / old_time);
> >
> >   return 0;
> > }
> >
> > #define TEST_FUNCTION do_test ()
> > #include "../test-skeleton.c"
> >
  

Patch

diff --git a/bits/stdlib-bsearch.h b/bits/stdlib-bsearch.h
index 540d718952..57f060b504 100644
--- a/bits/stdlib-bsearch.h
+++ b/bits/stdlib-bsearch.h
@@ -20,22 +20,14 @@  __extern_inline void *
 bsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
 	 __compar_fn_t __compar)
 {
-  size_t __l, __u, __idx;
   const void *__p;
   int __comparison;
 
-  __l = 0;
-  __u = __nmemb;
-  while (__l < __u)
+  while (__nmemb)
     {
-      __idx = (__l + __u) / 2;
-      __p = (const void *) (((const char *) __base) + (__idx * __size));
+      __p = (const void *) (((const char *) __base) + ((__nmemb >> 1) * __size));
       __comparison = (*__compar) (__key, __p);
-      if (__comparison < 0)
-	__u = __idx;
-      else if (__comparison > 0)
-	__l = __idx + 1;
-      else
+      if (__comparison == 0)
 	{
 #if __GNUC_PREREQ(4, 6)
 # pragma GCC diagnostic push
@@ -46,6 +38,12 @@  bsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
 # pragma GCC diagnostic pop
 #endif
 	}
+      if (__comparison > 0)
+	{
+	  __base = ((const char *) __p) + __size;
+	  --__nmemb;
+	}
+      __nmemb >>= 1;
     }
 
   return NULL;