[RFC] Optimize bsearch() implementation for performance
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-32bit |
success
|
Build for i686
|
Commit Message
Optimize the bsearch() function to improve binary search performance.
This modification reduces the execution time by approximately 8% on an
x86 machine with a 100,000-element array under 1e8 searches. The
optimization slightly increases the code size by 8 bytes.
code size:
* old:
text data bss dec hex filename
250 0 0 250 fa ./stdlib/bsearch.o
* new:
text data bss dec hex filename
258 0 0 258 102 ./stdlib/bsearch.o
benchmark:
Old bsearch elapsed time: 4536833
New bsearch elapsed time: 4137556
Improve efficiency by 8 %
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
---
Since I only have x86 machines available for testing, I am not entirely
certain whether this patch will bring performance improvements across
different architectures. Therefore, I am sending this as an RFC patch
first.
bits/stdlib-bsearch.h | 20 +++++++++-----------
1 file changed, 9 insertions(+), 11 deletions(-)
Comments
On Sun, Sep 01, 2024 at 09:54:56AM +0800, Kuan-Wei Chiu wrote:
> Optimize the bsearch() function to improve binary search performance.
> This modification reduces the execution time by approximately 8% on an
> x86 machine with a 100,000-element array under 1e8 searches. The
> optimization slightly increases the code size by 8 bytes.
>
> code size:
> * old:
> text data bss dec hex filename
> 250 0 0 250 fa ./stdlib/bsearch.o
> * new:
> text data bss dec hex filename
> 258 0 0 258 102 ./stdlib/bsearch.o
>
> benchmark:
> Old bsearch elapsed time: 4536833
> New bsearch elapsed time: 4137556
> Improve efficiency by 8 %
>
> Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
> ---
> Since I only have x86 machines available for testing, I am not entirely
> certain whether this patch will bring performance improvements across
> different architectures. Therefore, I am sending this as an RFC patch
> first.
>
> bits/stdlib-bsearch.h | 20 +++++++++-----------
> 1 file changed, 9 insertions(+), 11 deletions(-)
>
FWIW, here is the code I used for testing:
/* tst-bsearch1.c */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define TEST_OLD
int arr[100000];
#define narr (sizeof (arr) / sizeof (arr[0]))
static int
comp (const void *p1, const void *p2)
{
int x1 = *(int *) p1;
int x2 = *(int *) p2;
if (x1 < x2)
return -1;
if (x1 > x2)
return 1;
return 0;
}
__attribute__((noinline)) void *
newbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
__compar_fn_t __compar)
{
const void *__p;
int __comparison;
while (__nmemb)
{
__p = (const void *) (((const char *) __base) + ((__nmemb >> 1) * __size));
__comparison = (*__compar) (__key, __p);
if (__comparison == 0)
{
#if __GNUC_PREREQ(4, 6)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wcast-qual"
#endif
return (void *) __p;
#if __GNUC_PREREQ(4, 6)
# pragma GCC diagnostic pop
#endif
}
if (__comparison > 0)
{
__base = ((const char *) __p) + __size;
--__nmemb;
}
__nmemb >>= 1;
}
return NULL;
}
__attribute__((noinline)) void *
oldbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
__compar_fn_t __compar)
{
size_t __l, __u, __idx;
const void *__p;
int __comparison;
__l = 0;
__u = __nmemb;
while (__l < __u)
{
__idx = (__l + __u) / 2;
__p = (const void *) (((const char *) __base) + (__idx * __size));
__comparison = (*__compar) (__key, __p);
if (__comparison < 0)
__u = __idx;
else if (__comparison > 0)
__l = __idx + 1;
else
{
#if __GNUC_PREREQ(4, 6)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wcast-qual"
#endif
return (void *) __p;
#if __GNUC_PREREQ(4, 6)
# pragma GCC diagnostic pop
#endif
}
}
return NULL;
}
static int
do_test (void)
{
volatile size_t i;
volatile int *res;
volatile clock_t begin, end;
int key;
long long int old_time, new_time;
for (i = 0; i < narr; ++i)
arr[i] = i;
asm volatile ("" : : : "memory");
begin = clock ();
asm volatile ("" : : : "memory");
for (i = 0; i < 1e8; ++i)
{
key = i % narr;
res = (int *) oldbsearch (&key, arr, narr, sizeof (arr[0]), comp);
if (res != arr + key)
{
printf ("entry %zd got wrong answer\n", i);
return 1;
}
}
asm volatile ("" : : : "memory");
end = clock ();
asm volatile ("" : : : "memory");
old_time = end - begin;
printf ("Old bsearch elapsed time: %lld\n", old_time);
asm volatile ("" : : : "memory");
begin = clock ();
asm volatile ("" : : : "memory");
for (i = 0; i < 1e8; ++i)
{
key = i % narr;
res = (int *) newbsearch (&key, arr, narr, sizeof (arr[0]), comp);
if (res != arr + key)
{
printf ("entry %zd got wrong answer\n", i);
return 1;
}
}
asm volatile ("" : : : "memory");
end = clock ();
asm volatile ("" : : : "memory");
new_time = end - begin;
printf ("New bsearch elapsed time: %lld\n", new_time);
printf ("Improve efficiency by %lld %%\n", (old_time - new_time) * 100 / old_time);
return 0;
}
#define TEST_FUNCTION do_test ()
#include "../test-skeleton.c"
On Sat, Aug 31, 2024 at 6:57 PM Kuan-Wei Chiu <visitorckw@gmail.com> wrote:
>
> On Sun, Sep 01, 2024 at 09:54:56AM +0800, Kuan-Wei Chiu wrote:
> > Optimize the bsearch() function to improve binary search performance.
> > This modification reduces the execution time by approximately 8% on an
> > x86 machine with a 100,000-element array under 1e8 searches. The
> > optimization slightly increases the code size by 8 bytes.
> >
> > code size:
> > * old:
> > text data bss dec hex filename
> > 250 0 0 250 fa ./stdlib/bsearch.o
> > * new:
> > text data bss dec hex filename
> > 258 0 0 258 102 ./stdlib/bsearch.o
> >
> > benchmark:
> > Old bsearch elapsed time: 4536833
> > New bsearch elapsed time: 4137556
> > Improve efficiency by 8 %
> >
> > Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
> > ---
> > Since I only have x86 machines available for testing, I am not entirely
> > certain whether this patch will bring performance improvements across
> > different architectures. Therefore, I am sending this as an RFC patch
> > first.
> >
> > bits/stdlib-bsearch.h | 20 +++++++++-----------
> > 1 file changed, 9 insertions(+), 11 deletions(-)
> >
> FWIW, here is the code I used for testing:
>
> /* tst-bsearch1.c */
You could add this as a benchtest in this series :)
If you are interested in doing see other benchmark files
`benchmarks/bench-*.c` for example on how we doing
timing/output (TIMING_* macros and json respectively).
>
> #include <stdio.h>
> #include <stdlib.h>
> #include <time.h>
>
> #define TEST_OLD
>
> int arr[100000];
> #define narr (sizeof (arr) / sizeof (arr[0]))
>
> static int
> comp (const void *p1, const void *p2)
> {
> int x1 = *(int *) p1;
> int x2 = *(int *) p2;
>
> if (x1 < x2)
> return -1;
> if (x1 > x2)
> return 1;
> return 0;
> }
>
> __attribute__((noinline)) void *
> newbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
> __compar_fn_t __compar)
> {
> const void *__p;
> int __comparison;
>
> while (__nmemb)
> {
> __p = (const void *) (((const char *) __base) + ((__nmemb >> 1) * __size));
> __comparison = (*__compar) (__key, __p);
> if (__comparison == 0)
> {
> #if __GNUC_PREREQ(4, 6)
> # pragma GCC diagnostic push
> # pragma GCC diagnostic ignored "-Wcast-qual"
> #endif
> return (void *) __p;
> #if __GNUC_PREREQ(4, 6)
> # pragma GCC diagnostic pop
> #endif
> }
> if (__comparison > 0)
> {
> __base = ((const char *) __p) + __size;
> --__nmemb;
> }
> __nmemb >>= 1;
> }
>
> return NULL;
> }
>
> __attribute__((noinline)) void *
> oldbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
> __compar_fn_t __compar)
> {
> size_t __l, __u, __idx;
> const void *__p;
> int __comparison;
>
> __l = 0;
> __u = __nmemb;
> while (__l < __u)
> {
> __idx = (__l + __u) / 2;
> __p = (const void *) (((const char *) __base) + (__idx * __size));
> __comparison = (*__compar) (__key, __p);
> if (__comparison < 0)
> __u = __idx;
> else if (__comparison > 0)
> __l = __idx + 1;
> else
> {
> #if __GNUC_PREREQ(4, 6)
> # pragma GCC diagnostic push
> # pragma GCC diagnostic ignored "-Wcast-qual"
> #endif
> return (void *) __p;
> #if __GNUC_PREREQ(4, 6)
> # pragma GCC diagnostic pop
> #endif
> }
> }
>
> return NULL;
> }
>
> static int
> do_test (void)
> {
> volatile size_t i;
> volatile int *res;
> volatile clock_t begin, end;
> int key;
> long long int old_time, new_time;
>
> for (i = 0; i < narr; ++i)
> arr[i] = i;
>
> asm volatile ("" : : : "memory");
> begin = clock ();
> asm volatile ("" : : : "memory");
> for (i = 0; i < 1e8; ++i)
> {
> key = i % narr;
> res = (int *) oldbsearch (&key, arr, narr, sizeof (arr[0]), comp);
> if (res != arr + key)
> {
> printf ("entry %zd got wrong answer\n", i);
> return 1;
> }
> }
> asm volatile ("" : : : "memory");
> end = clock ();
> asm volatile ("" : : : "memory");
> old_time = end - begin;
> printf ("Old bsearch elapsed time: %lld\n", old_time);
>
> asm volatile ("" : : : "memory");
> begin = clock ();
> asm volatile ("" : : : "memory");
> for (i = 0; i < 1e8; ++i)
> {
> key = i % narr;
> res = (int *) newbsearch (&key, arr, narr, sizeof (arr[0]), comp);
> if (res != arr + key)
> {
> printf ("entry %zd got wrong answer\n", i);
> return 1;
> }
> }
> asm volatile ("" : : : "memory");
> end = clock ();
> asm volatile ("" : : : "memory");
> new_time = end - begin;
> printf ("New bsearch elapsed time: %lld\n", new_time);
>
> printf ("Improve efficiency by %lld %%\n", (old_time - new_time) * 100 / old_time);
>
> return 0;
> }
>
> #define TEST_FUNCTION do_test ()
> #include "../test-skeleton.c"
>
On Sun, Sep 01, 2024 at 04:42:17PM -0700, Noah Goldstein wrote:
> On Sat, Aug 31, 2024 at 6:57 PM Kuan-Wei Chiu <visitorckw@gmail.com> wrote:
> >
> > On Sun, Sep 01, 2024 at 09:54:56AM +0800, Kuan-Wei Chiu wrote:
> > > Optimize the bsearch() function to improve binary search performance.
> > > This modification reduces the execution time by approximately 8% on an
> > > x86 machine with a 100,000-element array under 1e8 searches. The
> > > optimization slightly increases the code size by 8 bytes.
> > >
> > > code size:
> > > * old:
> > > text data bss dec hex filename
> > > 250 0 0 250 fa ./stdlib/bsearch.o
> > > * new:
> > > text data bss dec hex filename
> > > 258 0 0 258 102 ./stdlib/bsearch.o
> > >
> > > benchmark:
> > > Old bsearch elapsed time: 4536833
> > > New bsearch elapsed time: 4137556
> > > Improve efficiency by 8 %
> > >
> > > Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
> > > ---
> > > Since I only have x86 machines available for testing, I am not entirely
> > > certain whether this patch will bring performance improvements across
> > > different architectures. Therefore, I am sending this as an RFC patch
> > > first.
> > >
> > > bits/stdlib-bsearch.h | 20 +++++++++-----------
> > > 1 file changed, 9 insertions(+), 11 deletions(-)
> > >
> > FWIW, here is the code I used for testing:
> >
> > /* tst-bsearch1.c */
> You could add this as a benchtest in this series :)
>
> If you are interested in doing see other benchmark files
> `benchmarks/bench-*.c` for example on how we doing
> timing/output (TIMING_* macros and json respectively).
>
Thanks for your guidance! :)
I'll give it a try.
Regards,
Kuan-Wei
> >
> > #include <stdio.h>
> > #include <stdlib.h>
> > #include <time.h>
> >
> > #define TEST_OLD
> >
> > int arr[100000];
> > #define narr (sizeof (arr) / sizeof (arr[0]))
> >
> > static int
> > comp (const void *p1, const void *p2)
> > {
> > int x1 = *(int *) p1;
> > int x2 = *(int *) p2;
> >
> > if (x1 < x2)
> > return -1;
> > if (x1 > x2)
> > return 1;
> > return 0;
> > }
> >
> > __attribute__((noinline)) void *
> > newbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
> > __compar_fn_t __compar)
> > {
> > const void *__p;
> > int __comparison;
> >
> > while (__nmemb)
> > {
> > __p = (const void *) (((const char *) __base) + ((__nmemb >> 1) * __size));
> > __comparison = (*__compar) (__key, __p);
> > if (__comparison == 0)
> > {
> > #if __GNUC_PREREQ(4, 6)
> > # pragma GCC diagnostic push
> > # pragma GCC diagnostic ignored "-Wcast-qual"
> > #endif
> > return (void *) __p;
> > #if __GNUC_PREREQ(4, 6)
> > # pragma GCC diagnostic pop
> > #endif
> > }
> > if (__comparison > 0)
> > {
> > __base = ((const char *) __p) + __size;
> > --__nmemb;
> > }
> > __nmemb >>= 1;
> > }
> >
> > return NULL;
> > }
> >
> > __attribute__((noinline)) void *
> > oldbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
> > __compar_fn_t __compar)
> > {
> > size_t __l, __u, __idx;
> > const void *__p;
> > int __comparison;
> >
> > __l = 0;
> > __u = __nmemb;
> > while (__l < __u)
> > {
> > __idx = (__l + __u) / 2;
> > __p = (const void *) (((const char *) __base) + (__idx * __size));
> > __comparison = (*__compar) (__key, __p);
> > if (__comparison < 0)
> > __u = __idx;
> > else if (__comparison > 0)
> > __l = __idx + 1;
> > else
> > {
> > #if __GNUC_PREREQ(4, 6)
> > # pragma GCC diagnostic push
> > # pragma GCC diagnostic ignored "-Wcast-qual"
> > #endif
> > return (void *) __p;
> > #if __GNUC_PREREQ(4, 6)
> > # pragma GCC diagnostic pop
> > #endif
> > }
> > }
> >
> > return NULL;
> > }
> >
> > static int
> > do_test (void)
> > {
> > volatile size_t i;
> > volatile int *res;
> > volatile clock_t begin, end;
> > int key;
> > long long int old_time, new_time;
> >
> > for (i = 0; i < narr; ++i)
> > arr[i] = i;
> >
> > asm volatile ("" : : : "memory");
> > begin = clock ();
> > asm volatile ("" : : : "memory");
> > for (i = 0; i < 1e8; ++i)
> > {
> > key = i % narr;
> > res = (int *) oldbsearch (&key, arr, narr, sizeof (arr[0]), comp);
> > if (res != arr + key)
> > {
> > printf ("entry %zd got wrong answer\n", i);
> > return 1;
> > }
> > }
> > asm volatile ("" : : : "memory");
> > end = clock ();
> > asm volatile ("" : : : "memory");
> > old_time = end - begin;
> > printf ("Old bsearch elapsed time: %lld\n", old_time);
> >
> > asm volatile ("" : : : "memory");
> > begin = clock ();
> > asm volatile ("" : : : "memory");
> > for (i = 0; i < 1e8; ++i)
> > {
> > key = i % narr;
> > res = (int *) newbsearch (&key, arr, narr, sizeof (arr[0]), comp);
> > if (res != arr + key)
> > {
> > printf ("entry %zd got wrong answer\n", i);
> > return 1;
> > }
> > }
> > asm volatile ("" : : : "memory");
> > end = clock ();
> > asm volatile ("" : : : "memory");
> > new_time = end - begin;
> > printf ("New bsearch elapsed time: %lld\n", new_time);
> >
> > printf ("Improve efficiency by %lld %%\n", (old_time - new_time) * 100 / old_time);
> >
> > return 0;
> > }
> >
> > #define TEST_FUNCTION do_test ()
> > #include "../test-skeleton.c"
> >
@@ -20,22 +20,14 @@ __extern_inline void *
bsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
__compar_fn_t __compar)
{
- size_t __l, __u, __idx;
const void *__p;
int __comparison;
- __l = 0;
- __u = __nmemb;
- while (__l < __u)
+ while (__nmemb)
{
- __idx = (__l + __u) / 2;
- __p = (const void *) (((const char *) __base) + (__idx * __size));
+ __p = (const void *) (((const char *) __base) + ((__nmemb >> 1) * __size));
__comparison = (*__compar) (__key, __p);
- if (__comparison < 0)
- __u = __idx;
- else if (__comparison > 0)
- __l = __idx + 1;
- else
+ if (__comparison == 0)
{
#if __GNUC_PREREQ(4, 6)
# pragma GCC diagnostic push
@@ -46,6 +38,12 @@ bsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
# pragma GCC diagnostic pop
#endif
}
+ if (__comparison > 0)
+ {
+ __base = ((const char *) __p) + __size;
+ --__nmemb;
+ }
+ __nmemb >>= 1;
}
return NULL;