On Fri, Apr 17, 2015 at 02:09:44AM -0500, Rajalakshmi Srinivasaraghavan wrote:
> This patch optimizes strstr function for power >= 7 systems. Performance
> gain is obtained using doubleword aligned memory access and usage of cmpb
> instruction for quicker comparison. The average improvement of this
> optimization is ~40%.
As I did same with x64 I will add improvements, I didn't look at patch
in detail.
First problem looks there is quadratic worst case. That is solved by
buy-or-rent technique. You count number of comparisons and if that
exceeds C*N where N is number of haystack bytes read so far and C
suitable constant then you switch to slower generic algorithm.
Main improvement in x64 case was checking for pair of characters. A pair
of characters is less likely than single character even vectorized
branch would be rarely triggered saving misprediction cost that
dominates performance.
That improved performance about 25 times.
For checking pair only with arithmetic you look on zero bytes of
following expression
(LOAD (haystack + n) ^ BROADCAST(needle[0])) | (LOAD (haystack + n + 1) ^ BROADCAST(needle[1]))
Then you need to do technical improvements.
One is unrolling to test 64 bytes at once, you could keep that to 32/16
if that improves performance. Hot portion is header, loop is rarely used
so you should check first 64 bytes without going to main loop.
Next is that you need to load vector twice, once for first character and
then for second character. You need to have loads corresponding to
second character aligned. For first character you could use unaligned
loads if they are fast or get these by bit fiddling from second
character loads. This way you don't have to worry about crossing page
boundaries.
Here is template that I used to generate x64 version you could use that
with modifications. Main problem is mapping suitable instrincts or
replacing these with corresponding powerpc instructions.
#include <emmintrin.h>
#include <stdint.h>
#include <stdlib.h>
#include "../header.h"
#ifdef AS_SSSE3
# define PREVCHAR(s, va, vb) CONCAT (va, vb, 15)
#else
# define PREVCHAR(s, va, vb) LOADU (s)
#endif
static inline tp_mask
forget_first_bit (tp_mask x)
{
return x & (x - 1);
}
static inline int
strcmp2 (char *x, char *y, long *buy)
{
int i = 0;
while (1)
{
if (!y[i])
return 0;
if (x[i] != y[i])
{
buy += i; return 1;
}
i++;
}
}
char *strchr (char *s, int n);
char *strstr_string (char *a, char *b);
unsigned char *
strstr_new (unsigned char *s, unsigned char *n)
{
long i;
unsigned char *s_or, *s_al;
if (__builtin_expect (!n[0], 0))
return s;
if (!n[1])
return strchr (s, n[0]);
tp_vector v0 = BROADCAST (n[0]), v1 = BROADCAST (n[1]), vz = BROADCAST (0);
tp_vector vs0, vs1, vs2, vs3, vsb;
long buy;
long unused;
if ((((size_t) s) & 4095) <= 4096 - 65)
{
vs0 = LOADU (s);
/*or test three chars. */
vs0 = OR (MIN (EQ (vs0, v0), EQ (v1, LOADU (s + 1))), EQ (vs0, vz));
tp_mask m = get_mask (vs0); //| (get_mask(vs1) << 16) | (get_mask(vs2) <<32) | (get_mask(vs3)<<48);
while (m)
{
unsigned char *r = s + first_bit (m);
if (!*r)
return NULL;
if (!strcmp2 (r + 2, n + 2, &unused))
return r;
m = forget_first_bit (m);
}
vs1 = LOADU (s + 16);
vs2 = LOADU (s + 32);
vs3 = LOADU (s + 48);
/*or test three chars. */
vs1 = OR (MIN (EQ (vs1, v0), EQ (v1, LOADU (s + 16 + 1))), EQ (vs1, vz));
vs2 = OR (MIN (EQ (vs2, v0), EQ (v1, LOADU (s + 32 + 1))), EQ (vs2, vz));
vs3 = OR (MIN (EQ (vs3, v0), EQ (v1, LOADU (s + 48 + 1))), EQ (vs3, vz));
m = (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48);
while (m)
{
unsigned char *r = s + first_bit (m);
if (!*r)
return NULL;
if (!strcmp2 (r + 2, n + 2, &unused))
return r;
m = forget_first_bit (m);
}
}
else
{
s_al = ALIGN_DOWN (s, 64);
vs0 = LOAD (s_al);
vs1 = LOAD (s_al + 16);
vs2 = LOAD (s_al + 32);
vs3 = LOAD (s_al + 48);
/*or test three chars. */
vs0 = OR (MIN (EQ (vs0, v1), EQ (v0, LOADU (s_al - 1))), EQ (vs0, vz));
vs1 = OR (MIN (EQ (vs1, v1), EQ (v0, LOADU (s_al + 16 - 1))), EQ (vs1, vz));
vs2 = OR (MIN (EQ (vs2, v1), EQ (v0, LOADU (s_al + 32 - 1))), EQ (vs2, vz));
vs3 = OR (MIN (EQ (vs3, v1), EQ (v0, LOADU (s_al + 48 - 1))), EQ (vs3, vz));
tp_mask m = get_mask (vs0) | (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48);
m = m >> (s - s_al);
while (m)
{
unsigned char *r = s + first_bit (m);
if (!*r)
return NULL;
if (r != s && !strcmp2 (r + 2, n + 2, &unused))
return r;
m = forget_first_bit (m);
}
}
buy = -512;
s_or = s;
s = ALIGN_DOWN (s + 64, 64);
#ifdef AS_SSSE3
vs3 = LOAD (s - 16);
#endif
while (1)
{
vs0 = LOAD (s);
vs1 = LOAD (s + 16);
vs2 = LOAD (s + 32);
vs3 = LOAD (s + 48);
/*or test three chars. */
tp_vector ret = vs0;
ret = MIN (ret, vs1);
ret = MIN (ret, vs2);
ret = MIN (ret, vs3);
tp_vector ret2 = OR (XOR (PREVCHAR (s - 1, vs3, vs0), v0), XOR (vs0, v1));
ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 16, vs0, vs1), v0), XOR (vs1, v1)));
ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 32, vs1, vs2), v0), XOR (vs2, v1)));
ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 48, vs2, vs3), v0), XOR (vs3, v1)));
ret = MIN (ret, ret2);
if (get_mask (EQ (ret, vz)))
{
s_al = ALIGN_DOWN (s, 64);
vs0 = LOAD (s_al);
vs1 = LOAD (s_al + 16);
vs2 = LOAD (s_al + 32);
vs3 = LOAD (s_al + 48);
/*or test three chars. */
vs0 = OR (MIN (EQ (vs0, v1), EQ (v0, LOADU (s_al - 1))), EQ (vs0, vz)); /* todo last three nondestructive */
vs1 = OR (MIN (EQ (vs1, v1), EQ (v0, LOADU (s_al + 16 - 1))), EQ (vs1, vz));
vs2 = OR (MIN (EQ (vs2, v1), EQ (v0, LOADU (s_al + 32 - 1))), EQ (vs2, vz));
vs3 = OR (MIN (EQ (vs3, v1), EQ (v0, LOADU (s_al + 48 - 1))), EQ (vs3, vz));
tp_mask m = get_mask (vs0) | (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48);
while (m)
{
unsigned char *r = s + first_bit (m);
if (!*r)
return NULL;
if (!strcmp2 (r + 1, n + 2, &buy))
return r - 1;
if (buy > s - s_or)
return strstr_string (s, n);
m = forget_first_bit (m);
}
}
s += 64;
}
}
@@ -19,7 +19,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
strcmp-power8 strcmp-power7 strcmp-ppc64 \
strcat-power8 strcat-power7 strcat-ppc64 \
memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
- strncpy-power8
+ strncpy-power8 strstr-power7 strstr-ppc64
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
@@ -322,5 +322,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcat, 1,
__strcat_ppc))
+ /* Support sysdeps/powerpc/powerpc64/multiarch/strstr.c. */
+ IFUNC_IMPL (i, name, strstr,
+ IFUNC_IMPL_ADD (array, i, strstr,
+ hwcap & PPC_FEATURE_HAS_VSX,
+ __strstr_power7)
+ IFUNC_IMPL_ADD (array, i, strstr, 1,
+ __strstr_ppc))
+
return i;
}
new file mode 100644
@@ -0,0 +1,44 @@
+/* Optimized strstr implementation for POWER7.
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__strstr_power7) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__strstr_power7): \
+ cfi_startproc; \
+ LOCALENTRY(__strstr_power7)
+
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strstr_power7) \
+ END_2(__strstr_power7)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#define STRLEN __strlen_power7
+#define STRNLEN __strnlen_power7
+#define STRCHR __strchr_power7
+
+#include <sysdeps/powerpc/powerpc64/power7/strstr.S>
new file mode 100644
@@ -0,0 +1,29 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+
+#define STRSTR __strstr_ppc
+#if IS_IN (libc) && defined(SHARED)
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+ __hidden_ver1(__strstr_ppc, __GI_strstr, __strstr_ppc);
+#endif
+
+extern __typeof (strstr) __strstr_ppc attribute_hidden;
+
+#include <string/strstr.c>
new file mode 100644
@@ -0,0 +1,34 @@
+/* Multiple versions of strstr. PowerPC64 version.
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for definition in libc. */
+#if IS_IN (libc)
+# include <string.h>
+# include <shlib-compat.h>
+# include "init-arch.h"
+
+extern __typeof (strstr) __strstr_ppc attribute_hidden;
+extern __typeof (strstr) __strstr_power7 attribute_hidden;
+
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+ ifunc symbol properly. */
+libc_ifunc (strstr,
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strstr_power7
+ : __strstr_ppc);
+#endif
new file mode 100644
@@ -0,0 +1,500 @@
+/* Optimized strstr implementation for PowerPC64/POWER7.
+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* char * [r3] strstr (char *s [r3], char * pat[r4]) */
+
+/*
+ * The performance gain is obtained using aligned memory access, load doubleword
+ * and usage of cmpb instruction for quicker comparison.
+ */
+
+#ifndef STRLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRLEN __GI_strlen
+# else
+# define STRLEN strlen
+# endif
+#endif
+
+#ifndef STRNLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRNLEN __GI_strnlen
+# else
+# define STRNLEN strnlen
+# endif
+#endif
+
+#ifndef STRCHR
+# ifdef SHARED
+# define STRCHR __GI_strchr
+# else
+# define STRCHR strchr
+# endif
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+32)
+ .machine power7
+EALIGN (strstr, 4, 0)
+ CALL_MCOUNT 2
+ mflr r0 /* load link register LR to r0 */
+ std r20, -8(r1) /* save callers register , r20 */
+ std r19, -16(r1) /* save callers register , r19 */
+ std r18, -24(r1) /* save callers register , r18 */
+ std r0, 16(r1) /* store the link register */
+ stdu r1, -FRAMESIZE(r1) /* create the stack frame */
+
+ dcbt 0, r3
+ dcbt 0, r4
+
+ and r0, r3, r4
+ cmpdi cr7, r0, 0
+ beq cr7, L(retnull)
+
+ mr r18, r3
+ mr r19, r4
+ mr r3, r4
+ bl STRLEN
+ nop
+
+ cmpdi cr7, r3, 0 /* if search str is null */
+ beq cr7, L(ret_r3)
+ mr r20, r3
+ mr r4, r3
+ mr r3, r18
+ bl STRNLEN
+ nop
+
+ cmpd cr7, r3, r20 /* if len(r3) < len(r4) */
+ blt cr7, L(retnull)
+ mr r3, r18
+ lbz r4, 0(r19)
+ bl STRCHR
+ nop
+
+ mr r11, r3
+ /* if first char of search str is not present */
+ cmpdi cr7, r3, 0
+ ble cr7, L(end)
+
+ rldicl r8, r3, 0, 52 /* page cross check */
+ cmpldi cr7, r8, 4096-16
+ bgt cr7, L(bytebybyte)
+
+ rldicl r8, r19, 0, 52
+ cmpldi cr7, r8, 4096-16
+ bgt cr7, L(bytebybyte)
+
+ /* if len(r4) < 8 handle in a different way */
+ /* Shift position based on null and use cmpb */
+ cmpdi cr7, r20, 8
+ blt cr7, L(lessthan8)
+
+ /* len(r4) >= 8 reaches here */
+ mr r8, r3 /* save r3 for future use */
+L(begin):
+ mr r3, r8
+ mr r4, r19 /* restore r4 */
+ li r0, 0
+
+L(begin2):
+ rlwinm r10, r19, 3, 26, 28 /* calculate padding in bits */
+ clrrdi r4, r4, 3 /* Make r4 aligned to 8 */
+ ld r6, 0(r4)
+ addi r4, r4, 8
+ cmpdi cr7, r10, 0 /* check if its already aligned? */
+ beq cr7, L(begin1)
+#ifdef __LITTLE_ENDIAN__
+ srd r6, r6, r10 /* Discard unwanted bits */
+#else
+ sld r6, r6, r10
+#endif
+ ld r9, 0(r4)
+ subfic r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r9, r9, r10 /* Discard unwanted bits */
+#else
+ srd r9, r9, r10
+#endif
+ or r6, r6, r9 /* Form complete search str */
+L(begin1):
+ rlwinm r10, r3, 3, 26, 28
+ clrrdi r3, r3, 3
+ ld r5, 0(r3)
+ cmpb r9, r0, r6 /* check if input has null */
+ cmpdi cr7, r9, 0
+ bne cr7, L(return3)
+ cmpb r9, r0, r5 /* check if input has null */
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r9, r10
+#else
+ sld r9, r9, r10
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(return2)
+
+ cmpdi cr7, r10, 0
+ beq cr7, L(loop2)
+ ldu r7, 8(r3)
+ mr r12, r10
+ addi r12, r12, -8
+ subfic r11, r12, 64
+ srdi r9, r11, 3
+ addi r9, r9, -1
+ mtctr r9
+ b L(nextbyte1)
+
+L(loop2):
+ li r9, 8
+ li r12, -8 /* shift values */
+ li r11, 72 /* shift values */
+ ldu r7, 8(r3) /* load next dw */
+ mtctr r9
+L(nextbyte1):
+ addi r12, r12, 8 /* shift one byte and compare */
+ addi r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r5, r12 /* rotate based on mask */
+ sld r10, r7, r11
+#else
+ sld r9, r5, r12
+ srd r10, r7, r11
+#endif
+ /* Form single dw from few bytes on first load and second load */
+ or r10, r9, r10
+ /* Check for null in the formed dw */
+ cmpb r9, r0, r10
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+ /* cmpb search str and input str */
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ beq cr7, L(match)
+ bdnz L(nextbyte1)
+ /* Check if input has null before next load */
+ cmpb r9, r0, r7
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+ mr r5, r7
+ /* Proceed to next load */
+ b L(loop2)
+
+ .align 4
+L(match):
+ /* There is a match of 8 bytes, check next bytes */
+ cmpdi cr7, r20, 8
+ beq cr7, L(return)
+ /* Update next starting point r8 */
+ srdi r9, r11, 3
+ subf r9, r9, r3
+ mr r8, r9
+
+L(secondmatch):
+ mr r5, r7
+ rlwinm r10, r19, 3, 26, 28 /* calculate padding in bits */
+ ld r6, 0(r4)
+ addi r4, r4, 8
+ cmpdi cr7, r10, 0 /* check if its already aligned? */
+ beq cr7, L(proceed3)
+#ifdef __LITTLE_ENDIAN__
+ srd r6, r6, r10 /* Discard unwanted bits */
+ cmpb r9, r0, r6
+ sld r9, r9, r10
+#else
+ sld r6, r6, r10
+ cmpb r9, r0, r6
+ srd r9, r9, r10
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(proceed3)
+ ld r9, 0(r4)
+ subfic r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r9, r9, r10 /* Discard unwanted bits */
+#else
+ srd r9, r9, r10
+#endif
+ or r6, r6, r9 /* Form complete search str */
+
+L(proceed3):
+ li r7, 0
+ addi r3, r3, 8
+ cmpb r9, r0, r5
+ cmpdi cr7, r9, 0
+ bne cr7, L(proceed4)
+ ld r7, 0(r3)
+L(proceed4):
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r5, r12
+ sld r10, r7, r11
+#else
+ sld r9, r5, r12
+ srd r10, r7, r11
+#endif
+ /* Form single dw with few bytes from first and second load */
+ or r10, r9, r10
+ cmpb r9, r0, r6
+ cmpdi cr7, r9, 0
+ bne cr7, L(return4)
+ /* Check for null in the formed dw */
+ cmpb r9, r0, r10
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+ /* If the next 8 bytes dont match, start search again */
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ bne cr7, L(reset)
+ /* If the next 8 bytes match, load and compare next 8 */
+ b L(secondmatch)
+
+ .align 4
+L(reset):
+ /* start the search again */
+ addi r8, r8, 1
+ b L(begin)
+
+ .align 4
+L(loadnext):
+ /* Update r3 and proceed */
+ addi r3, r3, 8
+ b L(begin2)
+
+ .align 4
+L(return2):
+ mr r3, r8
+ b L(end)
+
+ .align 4
+L(return3):
+ /* Count leading zeros and compare partial dw*/
+#ifdef __LITTLE_ENDIAN__
+ addi r7, r9, -1
+ andc r7, r7, r9
+ popcntd r7, r7
+ subfic r7, r7, 64
+ sld r10, r5, r7
+ sld r6, r6, r7
+#else
+ cntlzd r7, r9
+ subfic r7, r7, 64
+ srd r10, r5, r7
+ srd r6, r6, r7
+#endif
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ addi r8, r8, 1
+ /* start search again if there is no match */
+ bne cr7, L(begin)
+ /* if the words match, update return values */
+ subfic r7, r7, 64
+ srdi r7, r7, 3
+ add r3, r3, r7
+ subf r3, r20, r3
+ b L(end)
+
+ .align 4
+L(return4):
+ /* Count leading zeros and compare partial dw*/
+#ifdef __LITTLE_ENDIAN__
+ addi r7, r9, -1
+ andc r7, r7, r9
+ popcntd r7, r7
+ subfic r7, r7, 64
+ sld r10, r10, r7
+ sld r6, r6, r7
+#else
+ cntlzd r7, r9
+ subfic r7, r7, 64
+ srd r10, r10, r7
+ srd r6, r6, r7
+#endif
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ addi r8, r8, 1
+ bne cr7, L(begin)
+ subfic r7, r7, 64
+ srdi r11, r11, 3
+ subf r3, r11, r3
+ srdi r7, r7, 3
+ add r3, r3, r7
+ subf r3, r20, r3
+ b L(end)
+
+ /* handle less than 8 search string */
+ .align 4
+L(lessthan8):
+ mr r4, r19
+ li r0, 0
+
+ rlwinm r10, r4, 3, 26, 28 /* calculate padding in bits */
+ srdi r8, r10, 3 /* padding in bytes */
+ clrrdi r4, r4, 3 /* Make r4 aligned to 8 */
+ ld r6, 0(r4)
+ cmpdi cr7, r10, 0 /* check if its already aligned? */
+ beq cr7, L(proceed2)
+#ifdef __LITTLE_ENDIAN__
+ srd r6, r6, r10 /* Discard unwanted bits */
+#else
+ sld r6, r6, r10
+#endif
+ subfic r8, r8, 8
+ cmpd cr7, r8, r20 /* Next load needed? */
+ bge cr7, L(proceed2)
+ ld r7, 8(r4)
+ subfic r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r7, r7, r10 /* Discard unwanted bits */
+#else
+ srd r7, r7, r10
+#endif
+ or r6, r6, r7 /* Form complete search str */
+L(proceed2):
+ rlwinm r10, r3, 3, 26, 28
+ clrrdi r3, r3, 3 /* Make r3 aligned */
+ ld r5, 0(r3)
+ sldi r8, r20, 3
+ subfic r8, r8, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r6, r6, r8
+ cmpb r9, r0, r5
+ srd r9, r9, r10
+#else
+ srd r6, r6, r8
+ cmpb r9, r0, r5
+ sld r9, r9, r10
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(noload)
+ cmpdi cr7, r10, 0
+ beq cr7, L(continue)
+ ldu r7, 8(r3)
+ mr r12, r10
+ addi r12, r12, -8
+ subfic r11, r12, 64
+ srdi r9, r11, 3
+ addi r9, r9, -1
+ mtctr r9
+ b L(nextbyte)
+L(continue):
+ ldu r7, 8(r3)
+L(continue1):
+ li r9, 8
+ li r12, -8 /* shift values */
+ li r11, 72 /* shift values */
+ mtctr r9
+L(nextbyte):
+ addi r12, r12, 8 /* mask for rotation */
+ addi r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r5, r12
+ sld r10, r7, r11
+ or r10, r9, r10
+ sld r10, r10, r8
+ cmpb r9, r0, r10
+ srd r9, r9, r8
+#else
+ sld r9, r5, r12
+ srd r10, r7, r11
+ or r10, r9, r10
+ srd r10, r10, r8
+ cmpb r9, r0, r10
+ sld r9, r9, r8
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ beq cr7, L(return)
+ bdnz L(nextbyte)
+ mr r5, r7
+ /* Check null in loaded dw */
+ cmpb r9, r0, r7
+ cmpdi cr7, r9, 0
+ bne cr7, L(noload)
+ b L(continue)
+
+ .align 4
+L(noload):
+ /* Reached null in r3, so skip next load */
+ addi r3, r3, 8
+ li r7, 0
+ b L(continue1)
+
+ .align 4
+L(return):
+ /* Update return values */
+ srdi r9, r11, 3
+ subf r3, r9, r3
+ b L(end)
+
+ /* handling byte by byte */
+ .align 4
+L(bytebybyte):
+ mr r8, r3
+ addi r8, r8, -1
+L(loop1):
+ addi r8, r8, 1
+ mr r3, r8
+ mr r4, r19
+ lbz r6, 0(r4)
+ cmpdi cr7, r6, 0
+ beq cr7, L(updater3)
+L(loop):
+ lbz r5, 0(r3)
+ cmpdi cr7, r5, 0
+ beq cr7, L(retnull)
+ cmpld cr7, r6, r5
+ bne cr7, L(loop1)
+ addi r3, r3, 1
+ addi r4, r4, 1
+ lbz r6, 0(r4)
+ cmpdi cr7, r6, 0
+ beq cr7, L(updater3)
+ b L(loop)
+
+ /* handling return values */
+ .align 4
+L(updater3):
+ subf r3, r20, r3 /* reduce len of r4 from r3 */
+ b L(end)
+
+ .align 4
+L(ret_r3):
+ mr r3, r18 /* return r3 */
+ b L(end)
+
+ .align 4
+L(retnull):
+ li r3, 0 /* return NULL */
+
+ .align 4
+L(end):
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
+ ld r0, 16(r1) /* read the saved link register */
+ ld r18, -24(r1) /* restore callers save register, r18 */
+ ld r19, -16(r1) /* restore callers save register, r19 */
+ ld r20, -8(r1) /* restore callers save register, r20 */
+ mtlr r0 /* branch to link register */
+ blr
+END (strstr)
+libc_hidden_builtin_def (strstr)