Patchwork Improve string benchtests

login
register
mail settings
Submitter Adhemerval Zanella Netto
Date Feb. 4, 2019, 7:35 p.m.
Message ID <49967cf5-a89a-fa17-5c94-556c92705bef@linaro.org>
Download mbox | patch
Permalink /patch/31300/
State New
Headers show

Comments

Adhemerval Zanella Netto - Feb. 4, 2019, 7:35 p.m.
On 27/12/2018 16:26, Wilco Dijkstra wrote:
> Replace slow byte-oriented tests in several string benchmarks with the
> generic implementations from the string/ directory so the comparisons
> are more realistic and useful.
> 
> OK to commit?
> 
> ChangeLog:
> 2018-12-27  Wilco Dijkstra  <wdijkstr@arm.com>
> 
>         * benchtests/bench-stpcpy.c (SIMPLE_STPCPY): Remove function.
>         (generic_stpcpy): New function.
>         * benchtests/bench-stpncpy.c (SIMPLE_STPNCPY): Remove function.
>         (generic_stpncpy): New function.
>         * benchtests/bench-strcat.c (SIMPLE_STRCAT): Remove function.
>         (generic_strcat): New function.
>         * benchtests/bench-strcpy.c (SIMPLE_STRCPY): Remove function.
>         (generic_strcpy): New function.
>         * benchtests/bench-strncat.c (SIMPLE_STRNCAT): Remove function.
>         (STUPID_STRNCAT): Remove function.
>         (generic_strncat): New function.
>         * benchtests/bench-strncpy.c (SIMPLE_STRNCPY): Remove function.
>         (STUPID_STRNCPY): Remove function.
>         (generic_strncpy): New function.
>         * benchtests/bench-strnlen.c (SIMPLE_STRNLEN): Remove function.
>         (generic_strnlen): New function.
>         (memchr_strnlen): New function.
> 
> --
> diff --git a/benchtests/bench-stpcpy.c b/benchtests/bench-stpcpy.c
> index 77ffb0f5c22b8d8ba83d46e903a23c90d8c983c1..7982c1886bb0af680b00757d8383b2ef84503533 100644
> --- a/benchtests/bench-stpcpy.c
> +++ b/benchtests/bench-stpcpy.c
> @@ -24,22 +24,15 @@
>  # define TEST_NAME "wcpcpy"
>  #endif /* WIDE */
>  #include "bench-string.h"
> -#ifndef WIDE
> -# define SIMPLE_STPCPY simple_stpcpy
> -#else
> -# define SIMPLE_STPCPY simple_wcpcpy
> -#endif /* WIDE */
> -
> -CHAR *SIMPLE_STPCPY (CHAR *, const CHAR *);
> -
> -IMPL (SIMPLE_STPCPY, 0)
> -IMPL (STPCPY, 1)
>  
>  CHAR *
> -SIMPLE_STPCPY (CHAR *dst, const CHAR *src)
> +generic_stpcpy (CHAR *dst, const CHAR *src)
>  {
> -  while ((*dst++ = *src++) != '\0');
> -  return dst - 1;
> +  size_t len = STRLEN (src);
> +  return (CHAR*)MEMCPY (dst, src, len + 1) + len;
>  }

Space after cast.  As a side note, this won't evaluate wstpcpy as-is, since it
will use an optimized version and it will result in a wrong generic name for
wcscpy.

What about:

---
---

And as a following cleanup for wcpcpy we can use the similar code for strcpy
adjusting for wide-chars:

wchar_t *
__wcpcpy (wchar_t *dest, const wchar_t *src)
{
  size_t len = wcslen (src) * sizeof (wchar_t);
  return memcpy (dest, src, len + sizeof (wchar_t)) + len;
}

And this allows remove the arch-specific m68k implementation as well.

>  
> +IMPL (STPCPY, 1)
> +IMPL (generic_stpcpy, 0)
> +
>  #include "bench-strcpy.c"
> diff --git a/benchtests/bench-stpncpy.c b/benchtests/bench-stpncpy.c
> index 40c82cf716e6d7bc8fcb8d5a390da1ee8fee3245..331f5e67b7fc054bd678dbd264a24e43f940c0ed 100644
> --- a/benchtests/bench-stpncpy.c
> +++ b/benchtests/bench-stpncpy.c
> @@ -24,47 +24,19 @@
>  # define TEST_NAME "wcpncpy"
>  #endif /* WIDE */
>  #include "bench-string.h"
> -#ifndef WIDE
> -# define SIMPLE_STPNCPY simple_stpncpy
> -# define STUPID_STPNCPY stupid_stpncpy
> -#else
> -# define SIMPLE_STPNCPY simple_wcpncpy
> -# define STUPID_STPNCPY stupid_wcpncpy
> -#endif /* WIDE */
> -
> -CHAR *SIMPLE_STPNCPY (CHAR *, const CHAR *, size_t);
> -CHAR *STUPID_STPNCPY (CHAR *, const CHAR *, size_t);
> -
> -IMPL (STUPID_STPNCPY, 0)
> -IMPL (SIMPLE_STPNCPY, 0)
> -IMPL (STPNCPY, 1)
> -
> -CHAR *
> -SIMPLE_STPNCPY (CHAR *dst, const CHAR *src, size_t n)
> -{
> -  while (n--)
> -    if ((*dst++ = *src++) == '\0')
> -      {
> -	size_t i;
> -
> -	for (i = 0; i < n; ++i)
> -	  dst[i] = '\0';
> -	return dst - 1;
> -      }
> -  return dst;
> -}
>  
>  CHAR *
> -STUPID_STPNCPY (CHAR *dst, const CHAR *src, size_t n)
> +generic_stpncpy (CHAR *dst, const CHAR *src, size_t n)
>  {
>    size_t nc = STRNLEN (src, n);
> -  size_t i;
> -
> -  for (i = 0; i < nc; ++i)
> -    dst[i] = src[i];
> -  for (; i < n; ++i)
> -    dst[i] = '\0';
> -  return dst + nc;
> +  MEMCPY (dst, src, nc);
> +  dst += nc;
> +  if (nc == n)
> +    return dst;
> +  return MEMSET (dst, 0, n - nc);
>  }
>  
> +IMPL (STPNCPY, 1)
> +IMPL (generic_stpncpy, 0)
> +
>  #include "bench-strncpy.c"

Same as before for wcpncpy: instead of reimplement the generic implementation
on benchtests we can just include them. And it also leads to an possible
optimization on generic implementation for wcpncpy.

> diff --git a/benchtests/bench-strcat.c b/benchtests/bench-strcat.c
> index 6b3b084ae19a931cb2bda07794380b5745ccfc86..d3e96f4ec6331bea0027016eb6af4a276d0e6714 100644
> --- a/benchtests/bench-strcat.c
> +++ b/benchtests/bench-strcat.c
> @@ -28,31 +28,25 @@
>  
>  #ifndef WIDE
>  # define sfmt "s"
> -# define SIMPLE_STRCAT simple_strcat
>  # define SMALL_CHAR 127
>  #else
>  # define sfmt "ls"
> -# define SIMPLE_STRCAT simple_wcscat
>  # define SMALL_CHAR 1273
>  #endif /* WIDE */
>  
>  
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
> -CHAR *SIMPLE_STRCAT (CHAR *, const CHAR *);
> -
> -IMPL (SIMPLE_STRCAT, 0)
> -IMPL (STRCAT, 1)
>  
>  CHAR *
> -SIMPLE_STRCAT (CHAR *dst, const CHAR *src)
> +generic_strcat (CHAR *dst, const CHAR *src)
>  {
> -  CHAR *ret = dst;
> -  while (*dst++ != '\0');
> -  --dst;
> -  while ((*dst++ = *src++) != '\0');
> -  return ret;
> +  STRCPY (dst + STRLEN (dst), src);
> +  return dst;
>  }
>  
> +IMPL (STRCAT, 1)
> +IMPL (generic_strcat, 0)
> +
>  static void
>  do_one_test (impl_t *impl, CHAR *dst, const CHAR *src)
>  {

Same as before for wcscat.

> diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
> index e5fd27ffba9ca6dd871e658d1cffc22f28047e49..426ee8f79ad8400b5ceddad5d6cdfc3523f49f16 100644
> --- a/benchtests/bench-strcpy.c
> +++ b/benchtests/bench-strcpy.c
> @@ -34,25 +34,17 @@
>  # else
>  #  define TEST_NAME "wcscpy"
>  # endif
> -# include "bench-string.h"
> -# ifndef WIDE
> -#  define SIMPLE_STRCPY simple_strcpy
> -# else
> -#  define SIMPLE_STRCPY simple_wcscpy
> -# endif
> -
> -CHAR *SIMPLE_STRCPY (CHAR *, const CHAR *);
> -
> -IMPL (SIMPLE_STRCPY, 0)
> -IMPL (STRCPY, 1)
> +#include "bench-string.h"
>  
>  CHAR *
> -SIMPLE_STRCPY (CHAR *dst, const CHAR *src)
> +generic_strcpy (CHAR *dst, const CHAR *src)
>  {
> -  CHAR *ret = dst;
> -  while ((*dst++ = *src++) != '\0');
> -  return ret;
> +  return MEMCPY (dst, src, STRLEN (src) + 1);
>  }
> +
> +IMPL (STRCPY, 1)
> +IMPL (generic_strcpy, 0)
> +
>  #endif
>  
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *);

Same as before for wcscpy.

> diff --git a/benchtests/bench-strncat.c b/benchtests/bench-strncat.c
> index 7e0112273ba0727ae29407e38724c39159ce1c93..da8318e9a24e50ca491b65e092cc5152b3c47715 100644
> --- a/benchtests/bench-strncat.c
> +++ b/benchtests/bench-strncat.c
> @@ -27,35 +27,26 @@
>  #define BIG_CHAR MAX_CHAR
>  
>  #ifndef WIDE
> -# define SIMPLE_STRNCAT simple_strncat
> -# define STUPID_STRNCAT stupid_strncat
>  # define SMALL_CHAR 127
>  #else
> -# define SIMPLE_STRNCAT simple_wcsncat
> -# define STUPID_STRNCAT stupid_wcsncat
>  # define SMALL_CHAR 1273
>  #endif /* WIDE */
>  
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);
> -CHAR *STUPID_STRNCAT (CHAR *, const CHAR *, size_t);
> -CHAR *SIMPLE_STRNCAT (CHAR *, const CHAR *, size_t);
> -
> -IMPL (STUPID_STRNCAT, 0)
> -IMPL (STRNCAT, 2)
>  
>  CHAR *
> -STUPID_STRNCAT (CHAR *dst, const CHAR *src, size_t n)
> +generic_strncat (CHAR *dst, const CHAR *src, size_t n)
>  {
> -  CHAR *ret = dst;
> -  while (*dst++ != '\0');
> -  --dst;
> -  while (n--)
> -    if ((*dst++ = *src++) == '\0')
> -      return ret;
> -  *dst = '\0';
> -  return ret;
> +  CHAR *end = dst + STRLEN (dst);
> +  n = STRNLEN (src, n);
> +  end[n] = 0;
> +  MEMCPY (end, src, n);
> +  return dst;
>  }
>  
> +IMPL (STRNCAT, 2)
> +IMPL (generic_strncat, 0)
> +
>  static void
>  do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, size_t n)
>  {

Same as before for wcsncat.

> diff --git a/benchtests/bench-strncpy.c b/benchtests/bench-strncpy.c
> index 79953759bfbd7201b9e3a846b59666375cb76348..e32a1519cd95af5456277e765813eb78211a3a2a 100644
> --- a/benchtests/bench-strncpy.c
> +++ b/benchtests/bench-strncpy.c
> @@ -33,47 +33,19 @@
>  #  define TEST_NAME "wcsncpy"
>  # endif /* WIDE */
>  # include "bench-string.h"
> -# ifndef WIDE
> -#  define SIMPLE_STRNCPY simple_strncpy
> -#  define STUPID_STRNCPY stupid_strncpy
> -# else
> -#  define SIMPLE_STRNCPY simple_wcsncpy
> -#  define STUPID_STRNCPY stupid_wcsncpy
> -# endif /* WIDE */
> -
> -CHAR *SIMPLE_STRNCPY (CHAR *, const CHAR *, size_t);
> -CHAR *STUPID_STRNCPY (CHAR *, const CHAR *, size_t);
> -
> -IMPL (STUPID_STRNCPY, 0)
> -IMPL (SIMPLE_STRNCPY, 0)
> -IMPL (STRNCPY, 1)
>  
>  CHAR *
> -SIMPLE_STRNCPY (CHAR *dst, const CHAR *src, size_t n)
> +generic_strncpy (CHAR *dst, const CHAR *src, size_t n)
>  {
> -  CHAR *ret = dst;
> -  while (n--)
> -    if ((*dst++ = *src++) == '\0')
> -      {
> -	while (n--)
> -	  *dst++ = '\0';
> -	return ret;
> -      }
> -  return ret;
> +  size_t nc = STRNLEN (src, n);
> +  if (nc != n)
> +    MEMSET (dst + nc, 0, n - nc);
> +  return MEMCPY (dst, src, nc);
>  }
>  
> -CHAR *
> -STUPID_STRNCPY (CHAR *dst, const CHAR *src, size_t n)
> -{
> -  size_t nc = STRNLEN (src, n);
> -  size_t i;
> +IMPL (STRNCPY, 1)
> +IMPL (generic_strncpy, 0)
>  
> -  for (i = 0; i < nc; ++i)
> -    dst[i] = src[i];
> -  for (; i < n; ++i)
> -    dst[i] = '\0';
> -  return dst;
> -}
>  #endif /* !STRNCPY_RESULT */
>  
>  typedef CHAR *(*proto_t) (CHAR *, const CHAR *, size_t);

Same as before for wcsncpy.

> diff --git a/benchtests/bench-strnlen.c b/benchtests/bench-strnlen.c
> index 8a7641669a36603e3943b5ca3b9cb0f648dd18f0..8a0fabfcf6c4223dceab3a8eb581851c6ab6be5d 100644
> --- a/benchtests/bench-strnlen.c
> +++ b/benchtests/bench-strnlen.c
> @@ -28,27 +28,24 @@
>  
>  #ifndef WIDE
>  # define MIDDLE_CHAR 127
> -# define SIMPLE_STRNLEN simple_strnlen
>  #else
>  # define MIDDLE_CHAR 1121
> -# define SIMPLE_STRNLEN simple_wcsnlen
>  #endif /* WIDE */
>  
>  typedef size_t (*proto_t) (const CHAR *, size_t);
> -size_t SIMPLE_STRNLEN (const CHAR *, size_t);
> -
> -IMPL (SIMPLE_STRNLEN, 0)
> -IMPL (STRNLEN, 1)
> +size_t generic_strnlen (const CHAR *, size_t);
>  
>  size_t
> -SIMPLE_STRNLEN (const CHAR *s, size_t maxlen)
> +memchr_strnlen (const CHAR *s, size_t maxlen)
>  {
> -  size_t i;
> -
> -  for (i = 0; i < maxlen && s[i]; ++i);
> -  return i;
> +  const CHAR *s1 = MEMCHR (s, 0, maxlen);
> +  return (s1 == NULL) ? maxlen : s1 - s;
>  }

You will need to adjust the name for WIDE, maybe memchr_wcsnlen.

>  
> +IMPL (STRNLEN, 1)
> +IMPL (memchr_strnlen, 0)
> +IMPL (generic_strnlen, 0)
> +
>  static void
>  do_one_test (impl_t *impl, const CHAR *s, size_t maxlen, size_t exp_len)
>  {
> @@ -146,3 +143,13 @@ test_main (void)
>  }
>  
>  #include <support/test-driver.c>
> +
> +#define libc_hidden_def(X)
> +#ifndef WIDE
> +# undef STRNLEN
> +# define STRNLEN generic_strnlen
> +# include <string/strnlen.c>
> +#else
> +# define WCSNLEN generic_strnlen
> +# include <wcsmbs/wcsnlen.c>
> +#endif
>
Wilco Dijkstra - Feb. 5, 2019, 3:17 p.m.
Hi Adhemerval,

> Space after cast.  As a side note, this won't evaluate wstpcpy as-is, since it
> will use an optimized version and it will result in a wrong generic name for
> wcscpy.

I'll have a look at that. Maybe we could create GENERIC_STPCPY from
concatenation of the STPCPY define. I want to avoid huge amounts of defines
leading to completely incomprehensible magic.

> And as a following cleanup for wcpcpy we can use the similar code for strcpy
> adjusting for wide-chars:

Sure, there are lots of generic string functions which aren't optimized yet. However
that's a different patch... This patch simply makes it easy to find those cases.

> Same as before for wcpncpy: instead of reimplement the generic implementation
> on benchtests we can just include them. And it also leads to an possible
> optimization on generic implementation for wcpncpy.

The point is to enable useful comparisons of string implementations. If we include
the generic implementation then we just compare the generic implementation with
itself in many cases. And that isn't useful. If I change a generic implementation I
want to see the difference that makes in the benchmark comparison rather than
showing no difference.

Maybe the name generic_xxx is confusing? It's meant to be the baseline,
something which you should beat in all cases with the actual implementation.

Cheers,
Wilco
Adhemerval Zanella Netto - Feb. 5, 2019, 7:33 p.m.
On 05/02/2019 13:17, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>> Space after cast.  As a side note, this won't evaluate wstpcpy as-is, since it
>> will use an optimized version and it will result in a wrong generic name for
>> wcscpy.
> 
> I'll have a look at that. Maybe we could create GENERIC_STPCPY from
> concatenation of the STPCPY define. I want to avoid huge amounts of defines
> leading to completely incomprehensible magic.
> 
>> And as a following cleanup for wcpcpy we can use the similar code for strcpy
>> adjusting for wide-chars:
> 
> Sure, there are lots of generic string functions which aren't optimized yet. However
> that's a different patch... This patch simply makes it easy to find those cases.

The idea was not make is a pre-requisite, but rather an idea for a different
patch indeed.

> 
>> Same as before for wcpncpy: instead of reimplement the generic implementation
>> on benchtests we can just include them. And it also leads to an possible
>> optimization on generic implementation for wcpncpy.
> 
> The point is to enable useful comparisons of string implementations. If we include
> the generic implementation then we just compare the generic implementation with
> itself in many cases. And that isn't useful. If I change a generic implementation I
> want to see the difference that makes in the benchmark comparison rather than
> showing no difference.

My understanding is we have the generic implementation as the baseline
where arch-specific optimization might be applied and the idea of the 
comparison is to check against it.  I see no point in using a different 
implementation on benchtests, it should compare against exactly what 
glibc is currently providing.

If you want to check if the your changes improves the generic, you can
compare against multiples glibc builds.

> 
> Maybe the name generic_xxx is confusing? It's meant to be the baseline,
> something which you should beat in all cases with the actual implementation.

My understanding is the baseline should be the generic implementation which
is selected if the architecture does not provide an optimized one.

> 
> Cheers,
> Wilco
>   
>
Wilco Dijkstra - Feb. 6, 2019, 2:01 p.m.
Hi Adhemerval,

>>> Same as before for wcpncpy: instead of reimplement the generic implementation
>>> on benchtests we can just include them. And it also leads to an possible
>>> optimization on generic implementation for wcpncpy.
>> 
>> The point is to enable useful comparisons of string implementations. If we include
>> the generic implementation then we just compare the generic implementation with
>> itself in many cases. And that isn't useful. If I change a generic implementation I
>> want to see the difference that makes in the benchmark comparison rather than
>> showing no difference.
>
> My understanding is we have the generic implementation as the baseline
> where arch-specific optimization might be applied and the idea of the 
> comparison is to check against it.  I see no point in using a different 
> implementation on benchtests, it should compare against exactly what 
> glibc is currently providing.

I have to disagree, we cannot do an exact comparison unless build the generic
string functions as part of GLIBC and call them via the PLT. Including source
files with lots of #define magic is never going to be equivalent.

The goal here is not an accurate comparison with generic string functions but
to enable a realistic comparison with an efficient baseline - the existing byte
oriented implementations provide a baseline but are too slow to be useful.

> If you want to check if the your changes improves the generic, you can
> compare against multiples glibc builds.

That doesn't work so well given it takes a long time to rebuild GLIBC and 
benchmarks. For all benchmarking I do, I always create a direct comparison of
old vs new in a single run so it shows the differences and can be run repeatedly
to confirm. The string bench is setup to do this already, so why remove this
useful feature?
 
>> Maybe the name generic_xxx is confusing? It's meant to be the baseline,
>> something which you should beat in all cases with the actual implementation.
>
> My understanding is the baseline should be the generic implementation which
> is selected if the architecture does not provide an optimized one.

That means you never compare the generic implementation against a baseline.
Given that is what we do today, I don't see why we should stop doing that.

Cheers,
Wilco
Adhemerval Zanella Netto - Feb. 6, 2019, 2:53 p.m.
On 06/02/2019 12:01, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>>>> Same as before for wcpncpy: instead of reimplement the generic implementation
>>>> on benchtests we can just include them. And it also leads to an possible
>>>> optimization on generic implementation for wcpncpy.
>>>
>>> The point is to enable useful comparisons of string implementations. If we include
>>> the generic implementation then we just compare the generic implementation with
>>> itself in many cases. And that isn't useful. If I change a generic implementation I
>>> want to see the difference that makes in the benchmark comparison rather than
>>> showing no difference.
>>
>> My understanding is we have the generic implementation as the baseline
>> where arch-specific optimization might be applied and the idea of the 
>> comparison is to check against it.  I see no point in using a different 
>> implementation on benchtests, it should compare against exactly what 
>> glibc is currently providing.
> 
> I have to disagree, we cannot do an exact comparison unless build the generic
> string functions as part of GLIBC and call them via the PLT. Including source
> files with lots of #define magic is never going to be equivalent.
> 
> The goal here is not an accurate comparison with generic string functions but
> to enable a realistic comparison with an efficient baseline - the existing byte
> oriented implementations provide a baseline but are too slow to be useful.

The idea is not to be equivalent, since benchtests already adds the exported
libc symbol which will be called through PLT.  I do agree with you that the
byte-oriented baseline is somewhat useless now that most architecture implements
efficient word or vectorized common symbol, and the idea is also to provide
some more efficient generic string implementation (I have a long-standing patchset
'Improve generic string routines' to address this).

So my point is to which exactly should we compare on benchtests? Current we have:

  1. Byte-oriented 'simple' implementation which, as we agree, should not be
     used as a baseline.

  2. Some named 'stupid' which are usually composed implementation that might
     in fact be a faster implementation than some 'clever' ones.

  3. Compiler builtins, which also does not represent meaningful data for
     libc optimization (it will either be inline, call libc implementation, or
     mix both strategies).

  4. The libc implementations themselves, possible including all ifunc variations.

So which really give us meaningful data for future optimization? Should we keep
add multiple implementation as baselines to compare with?

What about an architecture that uses as baseline an arch-specific implementation,
which might use non optimal strategy that a future generic implementation might
use? We have examples for both string and math code on different architectures
where the generic implementation ended up performing better than the arch-specific
implementation.

Another example is your recent bench-strlen improvement (5289f1f56b7) which
added the memchr_strlen.  The generic implementation of strlen uses a similar
strategy of memchr, with the difference it does not need to materialize the
magic constant and add some loop unrolling for tail comparison. At first
it should be faster than memchr_strlen, however if the architecture has 
an optimized memchr implementation (which is a hotspot and it is usually a
target for arch-specific optimization), the memchr_strlen should be indeed
faster (and have a lower i-cache footprint). 

My point is using memchr_strlen as the *generic* implementation and also use
it as the *baseline* for performance comparison shows to the developer that
optimizing memchr would be a net gain in general than providing multiple
different optimization for multiple symbols that can be built by memchr
calls.

So I still think we should define better which exactly we need to compare
in benchtests and use the generic implementation, which will be used as
default for new ports, as the default basline. The file inclusion is just to 
avoid code duplication, I don't have a strong opinion whether to include or 
just copy-paste the code on benchtests. 

> 
>> If you want to check if the your changes improves the generic, you can
>> compare against multiples glibc builds.
> 
> That doesn't work so well given it takes a long time to rebuild GLIBC and 
> benchmarks. For all benchmarking I do, I always create a direct comparison of
> old vs new in a single run so it shows the differences and can be run repeatedly
> to confirm. The string bench is setup to do this already, so why remove this
> useful feature?
>  
>>> Maybe the name generic_xxx is confusing? It's meant to be the baseline,
>>> something which you should beat in all cases with the actual implementation.
>>
>> My understanding is the baseline should be the generic implementation which
>> is selected if the architecture does not provide an optimized one.
> 
> That means you never compare the generic implementation against a baseline.
> Given that is what we do today, I don't see why we should stop doing that.
> 
> Cheers,
> Wilco
>     
>

Patch

diff --git a/benchtests/bench-stpcpy.c b/benchtests/bench-stpcpy.c
index 74ea54f..b73eba6 100644
--- a/benchtests/bench-stpcpy.c
+++ b/benchtests/bench-stpcpy.c
@@ -24,22 +24,19 @@ 
 # define TEST_NAME "wcpcpy"
 #endif /* WIDE */
 #include "bench-string.h"
-#ifndef WIDE
-# define SIMPLE_STPCPY simple_stpcpy
-#else
-# define SIMPLE_STPCPY simple_wcpcpy
-#endif /* WIDE */
 
-CHAR *SIMPLE_STPCPY (CHAR *, const CHAR *);
-
-IMPL (SIMPLE_STPCPY, 0)
 IMPL (STPCPY, 1)
 
-CHAR *
-SIMPLE_STPCPY (CHAR *dst, const CHAR *src)
-{
-  while ((*dst++ = *src++) != '\0');
-  return dst - 1;
-}
+#undef STPCPY
+#ifndef WIDE
+# define GENERIC_STPCPY generic_stpcpy
+# define STPCPY         GENERIC_STPCPY
+# include <string/stpcpy.c>
+#else
+# define GENERIC_STPCPY generic_wcpcpy
+# define WCPCPY         GENERIC_STPCPY
+# include <wcsmbs/wcpcpy.c>
+#endif
+IMPL (GENERIC_STPCPY, 0)
 
 #include "bench-strcpy.c"
diff --git a/string/stpcpy.c b/string/stpcpy.c
index 2e65988..728e64f 100644
--- a/string/stpcpy.c
+++ b/string/stpcpy.c
@@ -25,17 +25,20 @@ 
 #undef __stpcpy
 #undef stpcpy
 
-#ifndef STPCPY
-# define STPCPY __stpcpy
+#ifdef STPCPY
+# define __stpcpy STPCPY
 #endif
 
 /* Copy SRC to DEST, returning the address of the terminating '\0' in DEST.  */
 char *
-STPCPY (char *dest, const char *src)
+__stpcpy (char *dest, const char *src)
 {
   size_t len = strlen (src);
   return memcpy (dest, src, len + 1) + len;
 }
+
+#ifndef STPCPY
 weak_alias (__stpcpy, stpcpy)
 libc_hidden_def (__stpcpy)
 libc_hidden_builtin_def (stpcpy)
+#endif