powerpc: strstr optimization

Message ID 1429254584-124997-1-git-send-email-raji@linux.vnet.ibm.com
State Superseded
Headers

Commit Message

Rajalakshmi S April 17, 2015, 7:09 a.m. UTC
  This patch optimizes strstr function for power >= 7 systems.  Performance
gain is obtained using doubleword aligned memory access and usage of cmpb
instruction for quicker comparison.  The average improvement of this
optimization is ~40%.

2015-04-17  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/multiarch/Makefile: Add strstr().
	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c: Likewise.
	* sysdeps/powerpc/powerpc64/power7/strstr.S: New File.
	* sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S: New File.
	* sysdeps/powerpc/powerpc64/multiarch/strstr-ppc64.c: New File.
	* sysdeps/powerpc/powerpc64/multiarch/strstr.c: New File.
---
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   2 +-
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   8 +
 .../powerpc/powerpc64/multiarch/strstr-power7.S    |  44 ++
 sysdeps/powerpc/powerpc64/multiarch/strstr-ppc64.c |  29 ++
 sysdeps/powerpc/powerpc64/multiarch/strstr.c       |  34 ++
 sysdeps/powerpc/powerpc64/power7/strstr.S          | 500 +++++++++++++++++++++
 6 files changed, 616 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strstr-ppc64.c
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strstr.c
 create mode 100644 sysdeps/powerpc/powerpc64/power7/strstr.S
  

Comments

Ondrej Bilka May 1, 2015, 12:05 p.m. UTC | #1
On Fri, Apr 17, 2015 at 02:09:44AM -0500, Rajalakshmi Srinivasaraghavan wrote:
> This patch optimizes strstr function for power >= 7 systems.  Performance
> gain is obtained using doubleword aligned memory access and usage of cmpb
> instruction for quicker comparison.  The average improvement of this
> optimization is ~40%.

As I did same with x64 I will add improvements, I didn't look at patch
in detail.

First problem looks there is  quadratic worst case. That is solved by
buy-or-rent technique. You count number of comparisons and if that
exceeds C*N where N is number of haystack bytes read so far and C
suitable constant then you switch to slower generic algorithm.

Main improvement in x64 case was checking for pair of characters. A pair
of characters is less likely than single character even vectorized
branch would be rarely triggered saving misprediction cost that
dominates performance.

That improved performance about 25 times.


For checking pair only with arithmetic you look on zero bytes of
following expression

(LOAD (haystack + n) ^ BROADCAST(needle[0])) | (LOAD (haystack + n + 1) ^ BROADCAST(needle[1]))

Then you need to do technical improvements.

One is unrolling to test 64 bytes at once, you could keep that to 32/16
if that improves performance. Hot portion is header, loop is rarely used
so you should check first 64 bytes without going to main loop.

Next is that you need to load vector twice, once for first character and
then for second character. You need to have loads corresponding to
second character aligned. For first character you could use unaligned
loads if they are fast or get these by bit fiddling from second
character loads. This way you don't have to worry about crossing page
boundaries.

Here is the template that I used to generate the x64 version; you could use
it with modifications. The main problem is mapping suitable intrinsics or
replacing them with corresponding powerpc instructions.


#include <emmintrin.h>
#include <stdint.h>
#include <stdlib.h>
#include "../header.h"

#ifdef AS_SSSE3
# define PREVCHAR(s, va, vb) CONCAT (va, vb, 15)
#else
# define PREVCHAR(s, va, vb) LOADU (s)
#endif

/* Clear the lowest set bit of X.  Used to advance the match mask past
   the candidate position that was just examined.  */
static inline tp_mask
forget_first_bit (tp_mask x)
{
  /* Kernighan's trick: X - 1 flips the lowest set bit and everything
     below it; ANDing with X keeps only the higher bits.  */
  tp_mask below_lowest_cleared = x - 1;
  return x & below_lowest_cleared;
}

/* Compare X against the needle tail Y.  Returns 0 when every byte of Y
   (up to its terminating NUL) matches X, 1 on the first mismatch.
   On mismatch, the number of byte comparisons performed is added to
   *BUY — the cost counter for the buy-or-rent guard against quadratic
   behavior.  */
static inline int
strcmp2 (char *x, char *y, long *buy)
{
  int i = 0;
  while (1)
    {
      /* Needle exhausted: full match.  */
      if (!y[i])
	return 0;

      if (x[i] != y[i])
	{
	  /* Fix: the original wrote `buy += i', which advanced the
	     local pointer instead of accumulating into the counter,
	     so the quadratic guard never accrued any cost.  */
	  *buy += i;
	  return 1;
	}
      i++;
    }
}
char *strchr (char *s, int n);

char *strstr_string (char *a, char *b);

/* Template strstr: find needle N in haystack S by vector-scanning for
   positions matching the first two needle characters (or the
   terminating NUL), then verifying candidates with strcmp2.
   NOTE(review): tp_vector/tp_mask and the BROADCAST/LOAD/LOADU/EQ/OR/
   MIN/XOR/get_mask/first_bit/ALIGN_DOWN/PREVCHAR macros come from
   ../header.h, which is not visible here — comments on their semantics
   are inferred and should be confirmed against that header.  */
unsigned char *
strstr_new (unsigned char *s, unsigned char *n)
{
  long i;
  unsigned char *s_or, *s_al;
  /* Empty needle matches at the start of the haystack.  */
  if (__builtin_expect (!n[0], 0))
    return s;

  /* Single-character needle: delegate to strchr.  */
  if (!n[1])
    return strchr (s, n[0]);

  /* v0/v1/vz: first needle char, second needle char, and zero,
     broadcast across a whole vector.  */
  tp_vector v0 = BROADCAST (n[0]), v1 = BROADCAST (n[1]), vz = BROADCAST (0);
  tp_vector vs0, vs1, vs2, vs3, vsb;	/* NOTE(review): vsb is unused.  */
  long buy;	/* comparison budget for the buy-or-rent guard */
  long unused;	/* sink for strcmp2's counter in the header phase */
  /* Fast path: the first 65 bytes of S lie within one 4K page, so
     unaligned loads of s .. s+64 cannot fault.  */
  if ((((size_t) s) & 4095) <= 4096 - 65)
    {
      vs0 = LOADU (s);
      /* Mark bytes equal to n[0] whose successor equals n[1], or equal
	 to NUL ("or test three chars": char0, char1, terminator).  */
      vs0 = OR (MIN (EQ (vs0, v0), EQ (v1, LOADU (s + 1))), EQ (vs0, vz));
      tp_mask m = get_mask (vs0);    //| (get_mask(vs1) << 16) | (get_mask(vs2) <<32) | (get_mask(vs3)<<48);

      /* Walk candidate positions in the first 16 bytes.  */
      while (m)
	{
	  unsigned char *r = s + first_bit (m);
	  /* Hit the terminating NUL before a match.  */
	  if (!*r)
	    return NULL;

	  /* First two chars already matched; verify the rest.  */
	  if (!strcmp2 (r + 2, n + 2, &unused))
	    return r;

	  m = forget_first_bit (m);
	}

      /* Same scan for bytes 16..63 of the header.  */
      vs1 = LOADU (s + 16);
      vs2 = LOADU (s + 32);
      vs3 = LOADU (s + 48);
      /*or test three chars. */
      vs1 = OR (MIN (EQ (vs1, v0), EQ (v1, LOADU (s + 16 + 1))), EQ (vs1, vz));
      vs2 = OR (MIN (EQ (vs2, v0), EQ (v1, LOADU (s + 32 + 1))), EQ (vs2, vz));
      vs3 = OR (MIN (EQ (vs3, v0), EQ (v1, LOADU (s + 48 + 1))), EQ (vs3, vz));
      m = (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48);

      while (m)
	{
	  unsigned char *r = s + first_bit (m);
	  if (!*r)
	    return NULL;

	  if (!strcmp2 (r + 2, n + 2, &unused))
	    return r;

	  m = forget_first_bit (m);
	}
    }
  else
    {
      /* S is near a page end: use aligned loads from the enclosing
	 64-byte block so no load crosses the page boundary.  Here the
	 aligned load is matched against n[1] and the (unaligned,
	 backward-safe) predecessor against n[0].  */
      s_al = ALIGN_DOWN (s, 64);

      vs0 = LOAD (s_al);
      vs1 = LOAD (s_al + 16);
      vs2 = LOAD (s_al + 32);
      vs3 = LOAD (s_al + 48);
      /*or test three chars. */
      vs0 = OR (MIN (EQ (vs0, v1), EQ (v0, LOADU (s_al - 1))), EQ (vs0, vz));
      vs1 = OR (MIN (EQ (vs1, v1), EQ (v0, LOADU (s_al + 16 - 1))), EQ (vs1, vz));
      vs2 = OR (MIN (EQ (vs2, v1), EQ (v0, LOADU (s_al + 32 - 1))), EQ (vs2, vz));
      vs3 = OR (MIN (EQ (vs3, v1), EQ (v0, LOADU (s_al + 48 - 1))), EQ (vs3, vz));
      tp_mask m = get_mask (vs0) | (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48);
      /* Discard bits for bytes before S.  */
      m = m >> (s - s_al);
      while (m)
	{
	  unsigned char *r = s + first_bit (m);
	  if (!*r)
	    return NULL;

	  /* NOTE(review): in this branch the mask flags the SECOND
	     needle character, yet the code verifies r + 2 / returns r
	     as in the unaligned branch — confirm the offsets are
	     intended (the main loop below verifies r + 1 and returns
	     r - 1 for the same mask shape).  */
	  if (r != s && !strcmp2 (r + 2, n + 2, &unused))
	    return r;

	  m = forget_first_bit (m);
	}
    }

  /* Main loop: start with a 512-comparison credit for buy-or-rent,
     remember the loop origin, and advance to the next 64-byte block.  */
  buy = -512;
  s_or = s;
  s = ALIGN_DOWN (s + 64, 64);
#ifdef AS_SSSE3
  /* Seed vs3 so PREVCHAR can splice the byte preceding the block.  */
  vs3 = LOAD (s - 16);
#endif
  while (1)
    {
      vs0 = LOAD (s);
      vs1 = LOAD (s + 16);
      vs2 = LOAD (s + 32);
      vs3 = LOAD (s + 48);
      /*or test three chars. */
      /* ret: minimum byte over the whole 64-byte block — zero iff the
	 block contains a NUL.  */
      tp_vector ret = vs0;
      ret = MIN (ret, vs1);
      ret = MIN (ret, vs2);
      ret = MIN (ret, vs3);
      /* ret2: zero bytes exactly where (prev byte == n[0]) AND
	 (this byte == n[1]) — the pair test done with XOR/OR/MIN
	 arithmetic only.  PREVCHAR shifts in the previous vector's
	 last byte (SSSE3) or reloads unaligned.  */
      tp_vector ret2 = OR (XOR (PREVCHAR (s - 1, vs3, vs0), v0), XOR (vs0, v1));
      ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 16, vs0, vs1), v0), XOR (vs1, v1)));
      ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 32, vs1, vs2), v0), XOR (vs2, v1)));
      ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 48, vs2, vs3), v0), XOR (vs3, v1)));
      ret = MIN (ret, ret2);
      /* Rare branch: block contains a NUL or a first-pair candidate;
	 recompute exact candidate positions and verify each.  */
      if (get_mask (EQ (ret, vz)))
	{
	  /* S is already 64-aligned here, so s_al == s.  */
	  s_al = ALIGN_DOWN (s, 64);

	  vs0 = LOAD (s_al);
	  vs1 = LOAD (s_al + 16);
	  vs2 = LOAD (s_al + 32);
	  vs3 = LOAD (s_al + 48);
	  /*or test three chars. */
	  vs0 = OR (MIN (EQ (vs0, v1), EQ (v0, LOADU (s_al - 1))), EQ (vs0, vz)); /* todo last three nondestructive */
	  vs1 = OR (MIN (EQ (vs1, v1), EQ (v0, LOADU (s_al + 16 - 1))), EQ (vs1, vz));
	  vs2 = OR (MIN (EQ (vs2, v1), EQ (v0, LOADU (s_al + 32 - 1))), EQ (vs2, vz));
	  vs3 = OR (MIN (EQ (vs3, v1), EQ (v0, LOADU (s_al + 48 - 1))), EQ (vs3, vz));
	  tp_mask m = get_mask (vs0) | (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48);
	  while (m)
	    {
	      /* R points at the SECOND pair character; the match (if
		 any) starts at r - 1.  */
	      unsigned char *r = s + first_bit (m);
	      if (!*r)
		return NULL;

	      if (!strcmp2 (r + 1, n + 2, &buy))
		return r - 1;

	      /* Buy-or-rent: comparison cost exceeded bytes scanned;
		 fall back to the linear-worst-case algorithm.
		 NOTE(review): as posted, strcmp2 never updates *buy
		 (pointer-arithmetic bug there), so this guard cannot
		 fire.  */
	      if (buy > s - s_or)
		return strstr_string (s, n);

	      m = forget_first_bit (m);
	    }
	}
      s += 64;
    }
}
  
Matt Turner May 3, 2015, 11:51 p.m. UTC | #2
On Fri, Apr 17, 2015 at 12:09 AM, Rajalakshmi Srinivasaraghavan
<raji@linux.vnet.ibm.com> wrote:
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S
> new file mode 100644
> index 0000000..68b93b0
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S
> @@ -0,0 +1,44 @@
> +/* Optimized strstr implementation for POWER7.
> +   Copyright (C) 2015-2016 Free Software Foundation, Inc.

Help me understand the 2016 copyright?
  
Rajalakshmi S May 4, 2015, 5:25 a.m. UTC | #3
On 05/04/2015 05:21 AM, Matt Turner wrote:
> On Fri, Apr 17, 2015 at 12:09 AM, Rajalakshmi Srinivasaraghavan
> <raji@linux.vnet.ibm.com> wrote:
>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S
>> new file mode 100644
>> index 0000000..68b93b0
>> --- /dev/null
>> +++ b/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S
>> @@ -0,0 +1,44 @@
>> +/* Optimized strstr implementation for POWER7.
>> +   Copyright (C) 2015-2016 Free Software Foundation, Inc.
> Help me understand the 2016 copyright?
This must be just 2015. I will correct it.
>
  
Rajalakshmi S May 4, 2015, 5:33 a.m. UTC | #4
On 05/01/2015 05:35 PM, Ondřej Bílka wrote:
> On Fri, Apr 17, 2015 at 02:09:44AM -0500, Rajalakshmi Srinivasaraghavan wrote:
>> This patch optimizes strstr function for power >= 7 systems.  Performance
>> gain is obtained using doubleword aligned memory access and usage of cmpb
>> instruction for quicker comparison.  The average improvement of this
>> optimization is ~40%.
> As I did same with x64 I will add improvements, I didn't look at patch
> in detail.
>
> First problem looks there is  quadratic worst case. That is solved by
> buy-or-rent technique. You count number of comparisons and if that
> exceeds C*N where N is number of haystack bytes read so far and C
> suitable constant then you switch to slower generic algorithm.
>
> Main improvement in x64 case was checking for pair of characters. A pair
> of characters is less likely than single character even vectorized
> branch would be rarely triggered saving misprediction cost that
> dominates performance.
>
Though I did not understand your program completely, I agree with 
checking pair of characters
can improve the performance.

In my optimization there are totally three branches.
case 1# if needle len >= 8 (common case)
case 2# if needle < 8 (common case)
case 3# For page boundary cross haystack. (rare case)

For case 1, I always compare 8 characters at a time using cmpb instruction.
-> perform basic strlen, strnlen, checks. Move haystack based on strchr 
result.
-> Load first 8 bytes(taking care of aligned load)from haystack and 
needle and compare them.
If they are same proceed to next 8 bytes of haystack and needle.
Else load next doubleword from haystack.Form a doubleword with last 
seven characters from first load and first character from
second load and compare again with needle. Proceed in a similar way.
This gives good improvement.

For case2, always compare 'n'  characters at a time, 'n' is needle length.
-> Algorithm is same as case 1, however before doing cmpb,create input 
mask based on needle len.

For case 1 and case 2, input is always loaded as double word.

For case 3, its byte by byte comparison.
  
Ondrej Bilka May 5, 2015, 9:14 p.m. UTC | #5
On Mon, May 04, 2015 at 11:03:21AM +0530, Rajalakshmi Srinivasaraghavan wrote:
> 
> 
> On 05/01/2015 05:35 PM, Ondřej Bílka wrote:
> >On Fri, Apr 17, 2015 at 02:09:44AM -0500, Rajalakshmi Srinivasaraghavan wrote:
> >>This patch optimizes strstr function for power >= 7 systems.  Performance
> >>gain is obtained using doubleword aligned memory access and usage of cmpb
> >>instruction for quicker comparison.  The average improvement of this
> >>optimization is ~40%.
> >As I did same with x64 I will add improvements, I didn't look at patch
> >in detail.
> >
> >First problem looks there is  quadratic worst case. That is solved by
> >buy-or-rent technique. You count number of comparisons and if that
> >exceeds C*N where N is number of haystack bytes read so far and C
> >suitable constant then you switch to slower generic algorithm.
> >
> >Main improvement in x64 case was checking for pair of characters. A pair
> >of characters is less likely than single character even vectorized
> >branch would be rarely triggered saving misprediction cost that
> >dominates performance.
> >
> Though I did not understand your program completely, I agree with
> checking pair of characters
> can improve the performance.
> 
It does. AFAIR for practical strings on x64 its only around 3 times slower than
theoretical lower bound. 

A lower bound is strlen call as you must find terminating null when
needle isn't present.

I hope that main idea was clear and you will
use it. Code I send is complex as it contains ten different speedups
that improve performance.


> In my optimization there are totally three branches.
> case 1# if needle len >= 8 (common case)
> case 2# if needle < 8 (common case)
> case 3# For page boundary cross haystack. (rare case)
>
Checking if its case 1 or 2 is relatively costy as in practice haystacks
tend to be short. Most apps that use strstr have 90% call haystack less than 64 bytes long.

You don't need to know needle size if you know that it doesn't contain
first pair. Also its rare that it contains first pair of characters twice.

> For case 1, I always compare 8 characters at a time using cmpb instruction.
> -> perform basic strlen, strnlen, checks. Move haystack based on
> strchr result.
> -> Load first 8 bytes(taking care of aligned load)from haystack and
> needle and compare them.
> If they are same proceed to next 8 bytes of haystack and needle.
> Else load next doubleword from haystack.Form a doubleword with last
> seven characters from first load and first character from
> second load and compare again with needle. Proceed in a similar way.
> This gives good improvement.
> 
Thats looks worse than naive algorithm on practical strings. Most
instructions you do are useless as you have mismatch at first character.

Following function should beat it.

char *strstr (char *s, char *n)
{
  if (n[0] == 0)
    return s;
  while (s = strchr (s, n[0]))
    {
      long i;
      for (i=1; n[i] && n[i] == s[i]; i++);
      if (n[i] == 0)
        return s;
      s++;
    }
}

Or use musl strstr idea. While implementation is terrible as it in
practice unnecessary spends 90% time on preprocessing their loop is
relatively good for portable implementation.

> For case2, always compare 'n'  characters at a time, 'n' is needle length.
> -> Algorithm is same as case 1, however before doing cmpb,create
> input mask based on needle len.
> 
> For case 1 and case 2, input is always loaded as double word.
> 
> For case 3, its byte by byte comparison.
>
  
Rajalakshmi S May 6, 2015, 1:28 p.m. UTC | #6
On 05/06/2015 02:44 AM, Ondřej Bílka wrote:
>
> It does. AFAIR for practical strings on x64 its only around 3 times slower than
> theoretical lower bound.
>
> A lower bound is strlen call as you must find terminating null when
> needle isn't present.
>
> I hope that main idea was clear and you will
> use it. Code I send is complex as it contains ten different speedups
> that improve performance.
>
>
>> In my optimization there are totally three branches.
>> case 1# if needle len >= 8 (common case)
>> case 2# if needle < 8 (common case)
>> case 3# For page boundary cross haystack. (rare case)
>>
> Checking if its case 1 or 2 is relatively costy as in practice haystacks
> tend to be short. Most apps that use strstr have 90% call haystack less than 64 bytes long.
>
> You don't need to know needle size if you know that it doesn't contain
> first pair. Also its rare that it contains first pair of characters twice.
strlen of needle is needed in all the cases so as to make sure haystack 
is longer than needle
as part of initial check before proceeding to search. Once this check is 
passed, strchr is
called to find the  position of first character.

>
>> For case 1, I always compare 8 characters at a time using cmpb instruction.
>> -> perform basic strlen, strnlen, checks. Move haystack based on
>> strchr result.
>> -> Load first 8 bytes(taking care of aligned load)from haystack and
>> needle and compare them.
>> If they are same proceed to next 8 bytes of haystack and needle.
>> Else load next doubleword from haystack.Form a doubleword with last
>> seven characters from first load and first character from
>> second load and compare again with needle. Proceed in a similar way.
>> This gives good improvement.
>>
> Thats looks worse than naive algorithm on practical strings. Most
> instructions you do are useless as you have mismatch at first character.
>
> Following function should beat it.
>
> char *strstr (char *s, char *n)
> {
>    if (n[0] == 0)
>      return s;
>    while (s = strchr (s, n[0]))
>      {
>        long i;
>        for (i=1; n[i] && n[i] == s[i]; i++);
>        if (n[i] == 0)
>          return s;
>        s++;
>      }
> }
Calling strchr in a loop is not showing  improvement when compared to 
the proposed patch.
>
> Or use musl strstr idea. While implementation is terrible as it in
> practice unnecessary spends 90% time on preprocessing their loop is
> relatively good for portable implementation.
>
>
  
Steven Munroe May 11, 2015, 6:10 p.m. UTC | #7
On Wed, 2015-05-06 at 18:58 +0530, Rajalakshmi Srinivasaraghavan wrote:
> 
> On 05/06/2015 02:44 AM, Ondřej Bílka wrote:
> >
> > It does. AFAIR for practical strings on x64 its only around 3 times slower than
> > theoretical lower bound.
> >
> > A lower bound is strlen call as you must find terminating null when
> > needle isn't present.
> >
> > I hope that main idea was clear and you will
> > use it. Code I send is complex as it contains ten different speedups
> > that improve performance.
> >
> >
> >> In my optimization there are totally three branches.
> >> case 1# if needle len >= 8 (common case)
> >> case 2# if needle < 8 (common case)
> >> case 3# For page boundary cross haystack. (rare case)
> >>
> > Checking if its case 1 or 2 is relatively costy as in practice haystacks
> > tend to be short. Most apps that use strstr have 90% call haystack less than 64 bytes long.
> >
> > You don't need to know needle size if you know that it doesn't contain
> > first pair. Also its rare that it contains first pair of characters twice.
> strlen of needle is needed in all the cases so as to make sure haystack 
> is longer than needle
> as part of initial check before proceeding to search. Once this check is 
> passed, strchr is
> called to find the  position of first character.
> 
I think we should move on with (commit) Raji's patch as is. 

While we appreciate you suggestions Ondrej, not all optimization apply
equally to all platforms. 

> >
> >> For case 1, I always compare 8 characters at a time using cmpb instruction.
> >> -> perform basic strlen, strnlen, checks. Move haystack based on
> >> strchr result.
> >> -> Load first 8 bytes(taking care of aligned load)from haystack and
> >> needle and compare them.
> >> If they are same proceed to next 8 bytes of haystack and needle.
> >> Else load next doubleword from haystack.Form a doubleword with last
> >> seven characters from first load and first character from
> >> second load and compare again with needle. Proceed in a similar way.
> >> This gives good improvement.
> >>
> > Thats looks worse than naive algorithm on practical strings. Most
> > instructions you do are useless as you have mismatch at first character.
> >
> > Following function should beat it.
> >
> > char *strstr (char *s, char *n)
> > {
> >    if (n[0] == 0)
> >      return s;
> >    while (s = strchr (s, n[0]))
> >      {
> >        long i;
> >        for (i=1; n[i] && n[i] == s[i]; i++);
> >        if (n[i] == 0)
> >          return s;
> >        s++;
> >      }
> > }
> Calling strchr in a loop is not showing  improvement when compared to 
> the proposed patch.
> >
> > Or use musl strstr idea. While implementation is terrible as it in
> > practice unnecessary spends 90% time on preprocessing their loop is
> > relatively good for portable implementation.
> >
> >
>
  
Joseph Myers May 11, 2015, 8:19 p.m. UTC | #8
On Mon, 11 May 2015, Steven Munroe wrote:

> I think we should move on with (commit) Raji's patch as is. 

Does the patch avoid quadratic worst case?  Quadratic performance in 
strstr is an unambiguous regression, allowing for denial-of-service, that 
should not be introduced on any platform (cf bug 12100).  Indeed, gnulib 
has a configure test for quadratic strstr, so if you introduce quadratic 
worst case then the effect will be that all applications using gnulib 
automatically configure themselves to use gnulib's strstr and not glibc's 
at all.
  
Steven Munroe May 11, 2015, 9:03 p.m. UTC | #9
On Mon, 2015-05-11 at 20:19 +0000, Joseph Myers wrote:
> On Mon, 11 May 2015, Steven Munroe wrote:
> 
> > I think we should move on with (commit) Raji's patch as is. 
> 
> Does the patch avoid quadratic worst case?  Quadratic performance in 
> strstr is an unambiguous regression, allowing for denial-of-service, that 
> should not be introduced on any platform (cf bug 12100).  Indeed, gnulib 
> has a configure test for quadratic strstr, so if you introduce quadratic 
> worst case then the effect will be that all applications using gnulib 
> automatically configure themselves to use gnulib's strstr and not glibc's 
> at all.
> 

There was an assertion that there "might be" a quadratic worst case. No
proof was offered that there was such.

Raji do you have the data for this?
  
Ondrej Bilka May 12, 2015, 8:49 a.m. UTC | #10
On Wed, May 06, 2015 at 06:58:35PM +0530, Rajalakshmi Srinivasaraghavan wrote:
> 
> 
> On 05/06/2015 02:44 AM, Ondřej Bílka wrote:
> >
> >It does. AFAIR for practical strings on x64 its only around 3 times slower than
> >theoretical lower bound.
> >
> >A lower bound is strlen call as you must find terminating null when
> >needle isn't present.
> >
> >I hope that main idea was clear and you will
> >use it. Code I send is complex as it contains ten different speedups
> >that improve performance.
> >
> >
> >>In my optimization there are totally three branches.
> >>case 1# if needle len >= 8 (common case)
> >>case 2# if needle < 8 (common case)
> >>case 3# For page boundary cross haystack. (rare case)
> >>
> >Checking if its case 1 or 2 is relatively costy as in practice haystacks
> >tend to be short. Most apps that use strstr have 90% call haystack less than 64 bytes long.
> >
> >You don't need to know needle size if you know that it doesn't contain
> >first pair. Also its rare that it contains first pair of characters twice.
> strlen of needle is needed in all the cases so as to make sure
> haystack is longer than needle
> as part of initial check before proceeding to search. Once this
> check is passed, strchr is
> called to find the  position of first character.
>
No, strlen is entirely unnecessary. When haystack doesn't contain first
triple of characters (and if it contains match is relatively likely)
then question if needle or haystack are larger is irrelevant.
 
> >
> >>For case 1, I always compare 8 characters at a time using cmpb instruction.
> >>-> perform basic strlen, strnlen, checks. Move haystack based on
> >>strchr result.
> >>-> Load first 8 bytes(taking care of aligned load)from haystack and
> >>needle and compare them.
> >>If they are same proceed to next 8 bytes of haystack and needle.
> >>Else load next doubleword from haystack.Form a doubleword with last
> >>seven characters from first load and first character from
> >>second load and compare again with needle. Proceed in a similar way.
> >>This gives good improvement.
> >>
> >Thats looks worse than naive algorithm on practical strings. Most
> >instructions you do are useless as you have mismatch at first character.
> >
> >Following function should beat it.
> >
> >char *strstr (char *s, char *n)
> >{
> >   if (n[0] == 0)
> >     return s;
> >   while (s = strchr (s, n[0]))
> >     {
> >       long i;
> >       for (i=1; n[i] && n[i] == s[i]; i++);
> >       if (n[i] == 0)
> >         return s;
> >       s++;
> >     }
> >}
> Calling strchr in a loop is not showing  improvement when compared
> to the proposed patch.
> >
Evidence?
  
Ondrej Bilka May 12, 2015, 9:08 a.m. UTC | #11
On Mon, May 11, 2015 at 01:10:48PM -0500, Steven Munroe wrote:
> On Wed, 2015-05-06 at 18:58 +0530, Rajalakshmi Srinivasaraghavan wrote:
> > 
> > On 05/06/2015 02:44 AM, Ondřej Bílka wrote:
> > >
> > > It does. AFAIR for practical strings on x64 its only around 3 times slower than
> > > theoretical lower bound.
> > >
> > > A lower bound is strlen call as you must find terminating null when
> > > needle isn't present.
> > >
> > > I hope that main idea was clear and you will
> > > use it. Code I send is complex as it contains ten different speedups
> > > that improve performance.
> > >
> > >
> > >> In my optimization there are totally three branches.
> > >> case 1# if needle len >= 8 (common case)
> > >> case 2# if needle < 8 (common case)
> > >> case 3# For page boundary cross haystack. (rare case)
> > >>
> > > Checking if its case 1 or 2 is relatively costy as in practice haystacks
> > > tend to be short. Most apps that use strstr have 90% call haystack less than 64 bytes long.
> > >
> > > You don't need to know needle size if you know that it doesn't contain
> > > first pair. Also its rare that it contains first pair of characters twice.
> > strlen of needle is needed in all the cases so as to make sure haystack 
> > is longer than needle
> > as part of initial check before proceeding to search. Once this check is 
> > passed, strchr is
> > called to find the  position of first character.
> > 
> I think we should move on with (commit) Raji's patch as is. 
> 
> While we appreciate you suggestions Ondrej, not all optimization apply
> equally to all platforms. 

No as you failed a basic item of contributor checklist. All performance
related patches need a benchmark to show its performance improvement and
not regression. A algorithm overview mentioned there is garbage. It
would likely cause performance regression in average case but I don't
have a powerpc machine to run a test and present numbers.
  
Carlos O'Donell May 21, 2015, 3:06 p.m. UTC | #12
On 05/11/2015 04:19 PM, Joseph Myers wrote:
> On Mon, 11 May 2015, Steven Munroe wrote:
> 
>> I think we should move on with (commit) Raji's patch as is. 
> 
> Does the patch avoid quadratic worst case?  Quadratic performance in 
> strstr is an unambiguous regression, allowing for denial-of-service, that 
> should not be introduced on any platform (cf bug 12100).  Indeed, gnulib 
> has a configure test for quadratic strstr, so if you introduce quadratic 
> worst case then the effect will be that all applications using gnulib 
> automatically configure themselves to use gnulib's strstr and not glibc's 
> at all.
> 

We should add gnulib's test to the microbenchmarks.

c.
  
Carlos O'Donell May 21, 2015, 3:08 p.m. UTC | #13
On 05/12/2015 04:49 AM, Ondřej Bílka wrote:
>>> Following function should beat it.
>>>
>>> char *strstr (char *s, char *n)
>>> {
>>>   if (n[0] == 0)
>>>     return s;
>>>   while (s = strchr (s, n[0]))
>>>     {
>>>       long i;
>>>       for (i=1; n[i] && n[i] == s[i]; i++);
>>>       if (n[i] == 0)
>>>         return s;
>>>       s++;
>>>     }
>>> }
>> Calling strchr in a loop is not showing  improvement when compared
>> to the proposed patch.
>>>
> Evidence?
> 

I agree with Ondrej here. What tests are you doing to show there is
no performance improvement? Can we see those tests? Can you contribute
them to the microbenchmark so other Power vendors can use them against
their particular hardware to validate performance?

Cheers,
Carlos.
  
Steven Munroe May 21, 2015, 3:27 p.m. UTC | #14
On Thu, 2015-05-21 at 11:08 -0400, Carlos O'Donell wrote:
> On 05/12/2015 04:49 AM, Ondřej Bílka wrote:
> >>> Following function should beat it.
> >>>
> >>> char *strstr (char *s, char *n)
> >>> {
> >>>   if (n[0] == 0)
> >>>     return s;
> >>>   while (s = strchr (s, n[0]))
> >>>     {
> >>>       long i;
> >>>       for (i=1; n[i] && n[i] == s[i]; i++);
> >>>       if (n[i] == 0)
> >>>         return s;
> >>>       s++;
> >>>     }
> >>> }
> >> Calling strchr in a loop is not showing  improvement when compared
> >> to the proposed patch.
> >>>
> > Evidence?
> > 
> 
> I agree with Ondrej here. What tests are you doing to show there is
> no performance improvement? Can we see those tests? Can you contribute
> them to the microbenchmark so other Power vendors can use them against
> their particular hardware to validate performance?
> 

I thought this was addressed with
https://www.sourceware.org/ml/libc-alpha/2015-05/msg00276.html

Which includes results from benchtest/bench-strstr.c.

Why do we need another micro-benchmark for this?
  
Carlos O'Donell May 21, 2015, 3:58 p.m. UTC | #15
On 05/21/2015 11:27 AM, Steven Munroe wrote:
> On Thu, 2015-05-21 at 11:08 -0400, Carlos O'Donell wrote:
>> On 05/12/2015 04:49 AM, Ondřej Bílka wrote:
>>>>> Following function should beat it.
>>>>>
>>>>> char *strstr (char *s, char *n)
>>>>> {
>>>>>   if (n[0] == 0)
>>>>>     return s;
>>>>>   while (s = strchr (s, n[0]))
>>>>>     {
>>>>>       long i;
>>>>>       for (i=1; n[i] && n[i] == s[i]; i++);
>>>>>       if (n[i] == 0)
>>>>>         return s;
>>>>>       s++;
>>>>>     }
>>>>> }
>>>> Calling strchr in a loop is not showing  improvement when compared
>>>> to the proposed patch.
>>>>>
>>> Evidence?
>>>
>>
>> I agree with Ondrej here. What tests are you doing to show there is
>> no performance improvement? Can we see those tests? Can you contribute
>> them to the microbenchmark so other Power vendors can use them against
>> their particular hardware to validate performance?
>>
> 
> I thought this was addressed with
> https://www.sourceware.org/ml/libc-alpha/2015-05/msg00276.html
> 
> Which includes results from benchtest/bench-strstr.c.
> 
> Why do we need another micro-benchmark for this?

You don't. The above addresses my complaints. I must have missed
the other post upthread.

Cheers,
Carlos.
  

Patch

diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 17265bd..3b0e3a0 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -19,7 +19,7 @@  sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   strcmp-power8 strcmp-power7 strcmp-ppc64 \
 		   strcat-power8 strcat-power7 strcat-ppc64 \
 		   memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
-		   strncpy-power8
+		   strncpy-power8 strstr-power7 strstr-ppc64
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index f5fdea5..9d808e0 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -322,5 +322,13 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strcat, 1,
 			     __strcat_ppc))
 
+  /* Support sysdeps/powerpc/powerpc64/multiarch/strstr.c.  */
+  IFUNC_IMPL (i, name, strstr,
+	      IFUNC_IMPL_ADD (array, i, strstr,
+			      hwcap & PPC_FEATURE_HAS_VSX,
+			      __strstr_power7)
+	      IFUNC_IMPL_ADD (array, i, strstr, 1,
+			      __strstr_ppc))
+
   return i;
 }
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S
new file mode 100644
index 0000000..68b93b0
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S
@@ -0,0 +1,44 @@ 
+/* Optimized strstr implementation for POWER7.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strstr_power7)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strstr_power7):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strstr_power7)
+
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strstr_power7)					\
+  END_2(__strstr_power7)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#define STRLEN __strlen_power7
+#define STRNLEN __strnlen_power7
+#define STRCHR __strchr_power7
+
+#include <sysdeps/powerpc/powerpc64/power7/strstr.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strstr-ppc64.c
new file mode 100644
index 0000000..2ca62c4
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strstr-ppc64.c
@@ -0,0 +1,29 @@ 
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+#define STRSTR __strstr_ppc
+#if IS_IN (libc) && defined(SHARED)
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+  __hidden_ver1(__strstr_ppc, __GI_strstr, __strstr_ppc);
+#endif
+
+extern __typeof (strstr) __strstr_ppc attribute_hidden;
+
+#include <string/strstr.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr.c b/sysdeps/powerpc/powerpc64/multiarch/strstr.c
new file mode 100644
index 0000000..7efc4b0
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strstr.c
@@ -0,0 +1,34 @@ 
+/* Multiple versions of strstr. PowerPC64 version.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for definition in libc.  */
+#if IS_IN (libc)
+# include <string.h>
+# include <shlib-compat.h>
+# include "init-arch.h"
+
+extern __typeof (strstr) __strstr_ppc attribute_hidden;
+extern __typeof (strstr) __strstr_power7 attribute_hidden;
+
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+   ifunc symbol properly.  */
+libc_ifunc (strstr,
+            (hwcap & PPC_FEATURE_HAS_VSX)
+            ? __strstr_power7
+            : __strstr_ppc);
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strstr.S b/sysdeps/powerpc/powerpc64/power7/strstr.S
new file mode 100644
index 0000000..e895d36
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strstr.S
@@ -0,0 +1,500 @@ 
+/* Optimized strstr implementation for PowerPC64/POWER7.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* char * [r3] strstr (char *s [r3], char * pat[r4])  */
+
+/*
+ * The performance gain is obtained using aligned memory access, load doubleword
+ * and usage of cmpb instruction for quicker comparison.
+ */
+
+#ifndef STRLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+   GLIBC symbol (created by libc_hidden_builtin_def).  */
+# ifdef SHARED
+#  define STRLEN   __GI_strlen
+# else
+#  define STRLEN   strlen
+# endif
+#endif
+
+#ifndef STRNLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+   GLIBC symbol (created by libc_hidden_builtin_def).  */
+# ifdef SHARED
+#  define STRNLEN   __GI_strnlen
+# else
+#  define STRNLEN   strnlen
+# endif
+#endif
+
+#ifndef STRCHR
+# ifdef SHARED
+#  define STRCHR   __GI_strchr
+# else
+#  define STRCHR   strchr
+# endif
+#endif
+
+#define	FRAMESIZE	(FRAME_MIN_SIZE+32)
+	.machine  power7
+EALIGN (strstr, 4, 0)
+	CALL_MCOUNT 2
+	mflr	r0			/* load link register LR to r0 */
+	std	r20, -8(r1)		/* save caller's register, r20 */
+	std	r19, -16(r1)		/* save caller's register, r19 */
+	std	r18, -24(r1)		/* save caller's register, r18 */
+	std	r0, 16(r1)		/* store the link register  */
+	stdu	r1, -FRAMESIZE(r1)	/* create the stack frame  */
+
+	dcbt	0, r3
+	dcbt	0, r4
+
+	and	r0, r3, r4
+	cmpdi	cr7, r0, 0
+	beq	cr7, L(retnull)
+
+	mr	r18, r3
+	mr	r19, r4
+	mr	r3, r4
+	bl	STRLEN
+	nop
+
+	cmpdi	cr7, r3, 0	/* if search string is empty */
+	beq	cr7, L(ret_r3)
+	mr	r20, r3
+	mr	r4, r3
+	mr	r3, r18
+	bl	STRNLEN
+	nop
+
+	cmpd	cr7, r3, r20 	/* if len(r3) < len(r4) */
+	blt	cr7, L(retnull)
+	mr	r3, r18
+	lbz	r4, 0(r19)
+	bl	STRCHR
+	nop
+
+	mr	r11, r3
+	/* if first char of search str is not present */
+	cmpdi	cr7, r3, 0
+	ble	cr7, L(end)
+
+	rldicl	r8, r3, 0, 52	/* page cross check */
+	cmpldi	cr7, r8, 4096-16
+	bgt	cr7, L(bytebybyte)
+
+	rldicl	r8, r19, 0, 52
+	cmpldi	cr7, r8, 4096-16
+	bgt	cr7, L(bytebybyte)
+
+	/* if len(r4) < 8 handle in a different way */
+	/* Shift position based on null and use cmpb */
+	cmpdi	cr7, r20, 8
+	blt	cr7, L(lessthan8)
+
+	/*  len(r4) >= 8 reaches here */
+	mr	r8, r3		/* save r3 for future use */
+L(begin):
+	mr	r3, r8
+	mr	r4, r19		/* restore r4 */
+	li	r0, 0
+
+L(begin2):
+	rlwinm	r10, r19, 3, 26, 28	/* calculate padding in bits */
+	clrrdi	r4, r4, 3	/* Make r4 aligned to 8 */
+	ld	r6, 0(r4)
+	addi	r4, r4, 8
+	cmpdi	cr7, r10, 0	/* check if it's already aligned */
+	beq	cr7, L(begin1)
+#ifdef __LITTLE_ENDIAN__
+	srd	r6, r6, r10	/* Discard unwanted bits */
+#else
+	sld	r6, r6, r10
+#endif
+	ld	r9, 0(r4)
+	subfic	r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+	sld	r9, r9, r10	/*  Discard unwanted bits */
+#else
+	srd	r9, r9, r10
+#endif
+	or	r6, r6, r9	/* Form complete search str */
+L(begin1):
+	rlwinm	r10, r3, 3, 26, 28
+	clrrdi	r3, r3, 3
+	ld	r5, 0(r3)
+	cmpb	r9, r0, r6	/* check if input has null */
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(return3)
+	cmpb	r9, r0, r5	/* check if input has null */
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r9, r10
+#else
+	sld	r9, r9, r10
+#endif
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(return2)
+
+	cmpdi	cr7, r10, 0
+	beq	cr7, L(loop2)
+	ldu	r7, 8(r3)
+	mr	r12, r10
+	addi	r12, r12, -8
+	subfic	r11, r12, 64
+	srdi	r9, r11, 3
+	addi	r9, r9, -1
+	mtctr	r9
+	b	L(nextbyte1)
+
+L(loop2):
+	li	r9, 8
+	li	r12, -8		/* shift values	*/
+	li	r11, 72		/* shift values */
+	ldu	r7, 8(r3) 	/* load next dw */
+	mtctr	r9
+L(nextbyte1):
+	addi	r12, r12, 8	/* shift one byte and compare */
+	addi	r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r5, r12	/* rotate based on mask */
+	sld	r10, r7, r11
+#else
+	sld	r9, r5, r12
+	srd	r10, r7, r11
+#endif
+	/* Form single dw from few bytes on first load and second load */
+	or	r10, r9, r10
+	/* Check for null in the formed dw */
+	cmpb	r9, r0, r10
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(retnull)
+	/* cmpb search str and input str */
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	beq	cr7, L(match)
+	bdnz	L(nextbyte1)
+	/* Check if input has null before next load */
+	cmpb	r9, r0, r7
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(retnull)
+	mr	r5, r7
+	/* Proceed to next load */
+	b	L(loop2)
+
+	.align	4
+L(match):
+	/* There is a match of 8 bytes, check next bytes */
+	cmpdi	cr7, r20, 8
+	beq	cr7, L(return)
+	/* Update next starting point r8 */
+	srdi	r9, r11, 3
+	subf	r9, r9, r3
+	mr	r8, r9
+
+L(secondmatch):
+	mr	r5, r7
+	rlwinm	r10, r19, 3, 26, 28	/* calculate padding in bits */
+	ld	r6, 0(r4)
+	addi	r4, r4, 8
+	cmpdi	cr7, r10, 0	/* check if it's already aligned */
+	beq	cr7, L(proceed3)
+#ifdef __LITTLE_ENDIAN__
+	srd	r6, r6, r10	/* Discard unwanted bits */
+	cmpb	r9, r0, r6
+	sld	r9, r9, r10
+#else
+	sld	r6, r6, r10
+	cmpb	r9, r0, r6
+	srd	r9, r9, r10
+#endif
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(proceed3)
+	ld	r9, 0(r4)
+	subfic	r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+	sld	r9, r9, r10	/*  Discard unwanted bits */
+#else
+	srd	r9, r9, r10
+#endif
+	or	r6, r6, r9	/* Form complete search str */
+
+L(proceed3):
+	li	r7, 0
+	addi	r3, r3, 8
+	cmpb	r9, r0, r5
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(proceed4)
+	ld	r7, 0(r3)
+L(proceed4):
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r5, r12
+	sld	r10, r7, r11
+#else
+	sld	r9, r5, r12
+	srd	r10, r7, r11
+#endif
+	/* Form single dw with few bytes from first and second load */
+	or	r10, r9, r10
+	cmpb	r9, r0, r6
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(return4)
+	/* Check for null in the formed dw */
+	cmpb	r9, r0, r10
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(retnull)
+	/* If the next 8 bytes don't match, start search again */
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	bne	cr7, L(reset)
+	/* If the next 8 bytes match, load and compare next 8 */
+	b	L(secondmatch)
+
+	.align	4
+L(reset):
+	/* start the search again */
+	addi	r8, r8, 1
+	b	L(begin)
+
+	.align	4
+L(loadnext):
+	/* Update r3 and proceed */
+	addi	r3, r3, 8
+	b	L(begin2)
+
+	.align	4
+L(return2):
+	mr	r3, r8
+	b	L(end)
+
+	.align	4
+L(return3):
+	/* Count leading zeros and compare partial dw */
+#ifdef __LITTLE_ENDIAN__
+	addi	r7, r9, -1
+	andc	r7, r7, r9
+	popcntd	r7, r7
+	subfic	r7, r7, 64
+	sld	r10, r5, r7
+	sld	r6, r6, r7
+#else
+	cntlzd	r7, r9
+	subfic	r7, r7, 64
+	srd	r10, r5, r7
+	srd	r6, r6, r7
+#endif
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	addi	r8, r8, 1
+	/* start search again if there is no match */
+	bne	cr7, L(begin)
+	/* if the words match, update return values */
+	subfic	r7, r7, 64
+	srdi	r7, r7, 3
+	add	r3, r3, r7
+	subf	r3, r20, r3
+	b	L(end)
+
+	.align	4
+L(return4):
+	/* Count leading zeros and compare partial dw */
+#ifdef __LITTLE_ENDIAN__
+	addi	r7, r9, -1
+	andc	r7, r7, r9
+	popcntd	r7, r7
+	subfic	r7, r7, 64
+	sld	r10, r10, r7
+	sld	r6, r6, r7
+#else
+	cntlzd	r7, r9
+	subfic	r7, r7, 64
+	srd	r10, r10, r7
+	srd	r6, r6, r7
+#endif
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	addi	r8, r8, 1
+	bne	cr7, L(begin)
+	subfic	r7, r7, 64
+	srdi	r11, r11, 3
+	subf	r3, r11, r3
+	srdi	r7, r7, 3
+	add	r3, r3, r7
+	subf	r3, r20, r3
+	b	L(end)
+
+	/* handle less than 8 search string */
+	.align	4
+L(lessthan8):
+	mr	r4, r19
+	li	r0, 0
+
+	rlwinm	r10, r4, 3, 26, 28	/* calculate padding in bits */
+	srdi	r8, r10, 3	/* padding in bytes */
+	clrrdi	r4, r4, 3	/* Make r4 aligned to 8 */
+	ld	r6, 0(r4)
+	cmpdi	cr7, r10, 0	/* check if it's already aligned */
+	beq	cr7, L(proceed2)
+#ifdef __LITTLE_ENDIAN__
+	srd	r6, r6, r10	/* Discard unwanted bits */
+#else
+	sld	r6, r6, r10
+#endif
+	subfic	r8, r8, 8
+	cmpd	cr7, r8, r20	/* Next load needed? */
+	bge	cr7, L(proceed2)
+	ld	r7, 8(r4)
+	subfic	r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+	sld	r7, r7, r10	/*  Discard unwanted bits */
+#else
+	srd	r7, r7, r10
+#endif
+	or	r6, r6, r7	/* Form complete search str */
+L(proceed2):
+	rlwinm	r10, r3, 3, 26, 28
+	clrrdi	r3, r3, 3	/* Make r3 aligned */
+	ld	r5, 0(r3)
+	sldi	r8, r20, 3
+	subfic	r8, r8, 64
+#ifdef __LITTLE_ENDIAN__
+	sld	r6, r6, r8
+	cmpb	r9, r0, r5
+	srd	r9, r9, r10
+#else
+	srd	r6, r6, r8
+	cmpb	r9, r0, r5
+	sld	r9, r9, r10
+#endif
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(noload)
+	cmpdi	cr7, r10, 0
+	beq	cr7, L(continue)
+	ldu	r7, 8(r3)
+	mr	r12, r10
+	addi	r12, r12, -8
+	subfic	r11, r12, 64
+	srdi	r9, r11, 3
+	addi	r9, r9, -1
+	mtctr	r9
+	b	L(nextbyte)
+L(continue):
+	ldu	r7, 8(r3)
+L(continue1):
+	li	r9, 8
+	li	r12, -8		/* shift values	*/
+	li	r11, 72		/* shift values */
+	mtctr	r9
+L(nextbyte):
+	addi	r12, r12, 8	/* mask for rotation */
+	addi	r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r5, r12
+	sld	r10, r7, r11
+	or	r10, r9, r10
+	sld	r10, r10, r8
+	cmpb	r9, r0, r10
+	srd	r9, r9, r8
+#else
+	sld	r9, r5, r12
+	srd	r10, r7, r11
+	or	r10, r9, r10
+	srd	r10, r10, r8
+	cmpb	r9, r0, r10
+	sld	r9, r9, r8
+#endif
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(retnull)
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	beq	cr7, L(return)
+	bdnz	L(nextbyte)
+	mr	r5, r7
+	/* Check null in loaded dw */
+	cmpb	r9, r0, r7
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(noload)
+	b	L(continue)
+
+	.align	4
+L(noload):
+	/* Reached null in r3, so skip next load */
+	addi	r3, r3, 8
+	li 	r7, 0
+	b	L(continue1)
+
+	.align	4
+L(return):
+	/* Update return values */
+	srdi	r9, r11, 3
+	subf	r3, r9, r3
+	b	L(end)
+
+	/* handling byte by byte */
+	.align	4
+L(bytebybyte):
+	mr	r8, r3
+	addi	r8, r8, -1
+L(loop1):
+	addi	r8, r8, 1
+	mr	r3, r8
+	mr	r4, r19
+	lbz	r6, 0(r4)
+	cmpdi	cr7, r6, 0
+	beq	cr7, L(updater3)
+L(loop):
+	lbz	r5, 0(r3)
+	cmpdi	cr7, r5, 0
+	beq	cr7, L(retnull)
+	cmpld	cr7, r6, r5
+	bne	cr7, L(loop1)
+	addi	r3, r3, 1
+	addi	r4, r4, 1
+	lbz	r6, 0(r4)
+	cmpdi	cr7, r6, 0
+	beq	cr7, L(updater3)
+	b	L(loop)
+
+	/* handling return values */
+	.align	4
+L(updater3):
+	subf	r3, r20, r3	/* reduce len of r4 from r3 */
+	b	L(end)
+
+	.align	4
+L(ret_r3):
+	mr	r3, r18		/* return r3 */
+	b	L(end)
+
+	.align	4
+L(retnull):
+	li	r3, 0		/* return NULL */
+
+	.align	4
+L(end):
+	addi	r1, r1, FRAMESIZE	/* restore stack pointer  */
+	ld	r0, 16(r1)	/* read the saved link register  */
+	ld	r18, -24(r1)	/* restore caller's save register, r18  */
+	ld	r19, -16(r1)	/* restore caller's save register, r19  */
+	ld	r20, -8(r1)	/* restore caller's save register, r20  */
+	mtlr	r0		/* restore the link register  */
+	blr
+END (strstr)
+libc_hidden_builtin_def (strstr)