diff mbox

[AArch64] Add optimized strchrnul

Message ID 539AD11E.50507@arm.com
State Committed
Headers show

Commit Message

Richard Earnshaw June 13, 2014, 10:23 a.m. UTC
Here is an optimized implementation of __strchrnul.  The simplification
that we don't have to track precisely why the loop terminates (match or
end-of-string) means we have to do less work in both setup and the core
inner loop.  That means this should never be slower than strchr.

As with strchr, the use of LD1 means we do not need different versions
for big-/little-endian.

<date>  Richard Earnshaw  <rearnsha@arm.com>

	* sysdeps/aarch64/strchrnul.S: New file.

OK?

Comments

Ondrej Bilka June 13, 2014, 12:12 p.m. UTC | #1
On Fri, Jun 13, 2014 at 11:23:26AM +0100, Richard Earnshaw wrote:
> Here is an optimized implementation of __strchrnul.  The simplification
> that we don't have to track precisely why the loop terminates (match or
> end-of-string) means we have to do less work in both setup and the core
> inner loop.  That means this should never be slower than strchr.
> 
> As with strchr, the use of LD1 means we do not need different versions
> for big-/little-endian.
> 
> <date>  Richard Earnshaw  <rearnsha@arm.com>
> 
> 	* sysdeps/aarch64/strchrnul.S: New file.
> 
> OK?

A few comments: the hot path in strchrnul is the first 64 bytes, so you
should focus on those.

First, get a profiler here. It is a simple program that records the
sizes of strchr calls and then replays them. It is a good first
approximation of real performance.

http://kam.mff.cuni.cz/~ondra/dryrun_strchrnul.tar.bz2

After you collect calls from the programs that interest you, try to
compare implementations on them. Comparing the old and new
implementations is the minimum, but I have several questions.

First, what is the latency of unaligned loads? One performance problem on
x64 was small strings that cross a 64-byte boundary. It turned out to be
faster to first check that we do not cross a page and then do an unaligned
comparison on 64 bytes. That needs to be checked here.

A second trick is to first check for page crossing, then align to 16 bytes
and do a 32-byte compare, so that you always compare at least 16 valid
bytes in the header.



> +
> +ENTRY (__strchrnul)
> +	/* Magic constant 0x40100401 to allow us to identify which lane
> +	   matches the termination condition.  */
> +	mov	wtmp2, #0x0401
> +	movk	wtmp2, #0x4010, lsl #16
> +	dup	vrepchr.16b, chrin
> +	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
> +	dup	vrepmask.4s, wtmp2
> +	ands	tmp1, srcin, #31
> +	b.eq	L(loop)
> +
Omitting this branch would improve performance if the code below still
works on an aligned hunk. In the loop there is a risk of branch
misprediction, which is slower than staying in the header. The condition
itself could also cause a branch misprediction, but when I looked at the
data, 32-byte aligned strings are rare.

> +	/* Input string is not 32-byte aligned.  Rather than forcing
> +	   the padding bytes to a safe value, we calculate the syndrome
> +	   for all the bytes, but then mask off those bits of the
> +	   syndrome that are related to the padding.  */
> +	ld1	{vdata1.16b, vdata2.16b}, [src], #32
> +	neg	tmp1, tmp1
> +	cmeq	vhas_nul1.16b, vdata1.16b, #0
> +	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
> +	cmeq	vhas_nul2.16b, vdata2.16b, #0
> +	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
> +	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
> +	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
> +	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
> +	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
> +	lsl	tmp1, tmp1, #1
> +	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
> +	mov	tmp3, #~0
> +	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
> +	lsr	tmp1, tmp3, tmp1
> +
> +	mov	tmp3, vend1.2d[0]
> +	bic	tmp1, tmp3, tmp1	// Mask padding bits.
> +	cbnz	tmp1, L(tail)
> +
> +L(loop):
> +	ld1	{vdata1.16b, vdata2.16b}, [src], #32
> +	cmeq	vhas_nul1.16b, vdata1.16b, #0
> +	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
> +	cmeq	vhas_nul2.16b, vdata2.16b, #0
> +	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
> +	/* Use a fast check for the termination condition.  */
> +	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
> +	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
> +	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
> +	addp	vend1.2d, vend1.2d, vend1.2d
> +	mov	tmp1, vend1.2d[0]
> +	cbz	tmp1, L(loop)
> +
> +	/* Termination condition found.  Now need to establish exactly why
> +	   we terminated.  */
> +	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
> +	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
> +	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
> +	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
> +
> +	mov	tmp1, vend1.2d[0]
> +L(tail):
> +	/* Count the trailing zeros, by bit reversing...  */
> +	rbit	tmp1, tmp1
> +	/* Re-bias source.  */
> +	sub	src, src, #32
> +	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
> +	/* tmp1 is twice the offset into the fragment.  */
> +	add	result, src, tmp1, lsr #1
> +	ret
> +
> +END(__strchrnul)
> +weak_alias (__strchrnul, strchrnul)
Richard Earnshaw June 13, 2014, 3:20 p.m. UTC | #2
On 13/06/14 13:12, Ondřej Bílka wrote:
> On Fri, Jun 13, 2014 at 11:23:26AM +0100, Richard Earnshaw wrote:
>> Here is an optimized implementation of __strchrnul.  The simplification
>> that we don't have to track precisely why the loop terminates (match or
>> end-of-string) means we have to do less work in both setup and the core
>> inner loop.  That means this should never be slower than strchr.
>>
>> As with strchr, the use of LD1 means we do not need different versions
>> for big-/little-endian.
>>
>> <date>  Richard Earnshaw  <rearnsha@arm.com>
>>
>> 	* sysdeps/aarch64/strchrnul.S: New file.
>>
>> OK?
> 
> Few comments, a hot path in strchrnul are first 64 bytes so you should
> focus on these.
> 
> First get a profiler here. This is a simple program that collects a
> sizes of strchr calls and then runs these again. It is good first
> approximation of real performance.
> 
> http://kam.mff.cuni.cz/~ondra/dryrun_strchrnul.tar.bz2
> 
> After you collect calls from programs that interest you try to compare
> these. A old/new implementation is minimum, but I have several
> questions.
> 
> First what is latency of unaligned loads? One performance problem on x64
> were small strings that cross 64byte boundary. It turned out that it is
> faster first check if we do not cross page and then do unaligned comparison
> on 64 bytes. That needs to be checked.
> 
> Second trick is first check page crossing, then align to 16 bytes and do
> 32byte compare so you always compare at least 16 valid bytes in header.
> 

Thanks for the hints and the link.  I'll try to look into this more next
week.

On the question of latency for unaligned loads, the answer is that
there's no single answer; ARM just defines the architecture and then
implementations are derived from it with different trade-offs in
micro-architecture (I might be able to work out what our own
implementations do, but not implementations by architecture licencees).
  Furthermore, answers to questions such as cache line length and even
page size are similarly vague -- I can probably assume pages will not be
less than 4k, but there's no guarantee that they aren't bigger; a test
to ask the kernel what the page size is would undoubtedly cost more time
than we'd save.  Similarly cache lines might be 64 bytes long, but
there's no architectural guarantee that they aren't shorter.

> 
> 
>> +
>> +ENTRY (__strchrnul)
>> +	/* Magic constant 0x40100401 to allow us to identify which lane
>> +	   matches the termination condition.  */
>> +	mov	wtmp2, #0x0401
>> +	movk	wtmp2, #0x4010, lsl #16
>> +	dup	vrepchr.16b, chrin
>> +	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
>> +	dup	vrepmask.4s, wtmp2
>> +	ands	tmp1, srcin, #31
>> +	b.eq	L(loop)
>> +
> Omitting this branch would improve performance if code below still works
> on aligned hunk. In loop there is risk of branch misprediction which is
> slower than staying at head. Condition could also cause branch
> misprediction but when I looked to data a 32-byte aligned strings are
> rare.
> 

Hmm, interesting observation.  I agree that even with strings regularly
aligned to 8 bytes, they often will not be 32-byte aligned (1 in 4
chance), and decreasing to 1 in 32 when the input alignment is fully
random.  The trade-off here is that for the first non-aligned iteration
we have to do more work in the Neon unit (calculating an accurate
syndrome so that we can correctly mask it).  Once we hit the main loop
we can trade a faster check for a bit more work in the epilogue.

On the other hand, removing that branch gives me more freedom to pull
the initial load forward and otherwise re-organise the code in addition
to the benefit of eliminating a branch that won't in practice be very
predictable -- it should be possible to share the entry sequence -- at
most it means forcing the mask to enable all lanes in the vector and
that's just a conditional select instruction.

R.

>> +	/* Input string is not 32-byte aligned.  Rather than forcing
>> +	   the padding bytes to a safe value, we calculate the syndrome
>> +	   for all the bytes, but then mask off those bits of the
>> +	   syndrome that are related to the padding.  */
>> +	ld1	{vdata1.16b, vdata2.16b}, [src], #32
>> +	neg	tmp1, tmp1
>> +	cmeq	vhas_nul1.16b, vdata1.16b, #0
>> +	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
>> +	cmeq	vhas_nul2.16b, vdata2.16b, #0
>> +	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
>> +	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
>> +	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
>> +	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
>> +	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
>> +	lsl	tmp1, tmp1, #1
>> +	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
>> +	mov	tmp3, #~0
>> +	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
>> +	lsr	tmp1, tmp3, tmp1
>> +
>> +	mov	tmp3, vend1.2d[0]
>> +	bic	tmp1, tmp3, tmp1	// Mask padding bits.
>> +	cbnz	tmp1, L(tail)
>> +
>> +L(loop):
>> +	ld1	{vdata1.16b, vdata2.16b}, [src], #32
>> +	cmeq	vhas_nul1.16b, vdata1.16b, #0
>> +	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
>> +	cmeq	vhas_nul2.16b, vdata2.16b, #0
>> +	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
>> +	/* Use a fast check for the termination condition.  */
>> +	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
>> +	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
>> +	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
>> +	addp	vend1.2d, vend1.2d, vend1.2d
>> +	mov	tmp1, vend1.2d[0]
>> +	cbz	tmp1, L(loop)
>> +
>> +	/* Termination condition found.  Now need to establish exactly why
>> +	   we terminated.  */
>> +	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
>> +	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
>> +	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
>> +	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
>> +
>> +	mov	tmp1, vend1.2d[0]
>> +L(tail):
>> +	/* Count the trailing zeros, by bit reversing...  */
>> +	rbit	tmp1, tmp1
>> +	/* Re-bias source.  */
>> +	sub	src, src, #32
>> +	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
>> +	/* tmp1 is twice the offset into the fragment.  */
>> +	add	result, src, tmp1, lsr #1
>> +	ret
>> +
>> +END(__strchrnul)
>> +weak_alias (__strchrnul, strchrnul)
> 
> 
>
Ondrej Bilka June 25, 2014, 1:28 a.m. UTC | #3
On Fri, Jun 13, 2014 at 04:20:38PM +0100, Richard Earnshaw wrote:
> On 13/06/14 13:12, Ondřej Bílka wrote:
> > On Fri, Jun 13, 2014 at 11:23:26AM +0100, Richard Earnshaw wrote:
> >> Here is an optimized implementation of __strchrnul.  The simplification
> >> that we don't have to track precisely why the loop terminates (match or
> >> end-of-string) means we have to do less work in both setup and the core
> >> inner loop.  That means this should never be slower than strchr.
> >>
> >> As with strchr, the use of LD1 means we do not need different versions
> >> for big-/little-endian.
> >>
> >> <date>  Richard Earnshaw  <rearnsha@arm.com>
> >>
> >> 	* sysdeps/aarch64/strchrnul.S: New file.
> >>
> >> OK?
> > 
> > Few comments, a hot path in strchrnul are first 64 bytes so you should
> > focus on these.
> > 
> > First get a profiler here. This is a simple program that collects a
> > sizes of strchr calls and then runs these again. It is good first
> > approximation of real performance.
> > 
> > http://kam.mff.cuni.cz/~ondra/dryrun_strchrnul.tar.bz2
> >
> > After you collect calls from programs that interest you try to compare
> > these. A old/new implementation is minimum, but I have several
> > questions.
> > 
> > First what is latency of unaligned loads? One performance problem on x64
> > were small strings that cross 64byte boundary. It turned out that it is
> > faster first check if we do not cross page and then do unaligned comparison
> > on 64 bytes. That needs to be checked.
> > 
> > Second trick is first check page crossing, then align to 16 bytes and do
> > 32byte compare so you always compare at least 16 valid bytes in header.
> > 
> 
> Thanks for the hints and the link.  I'll try to look into this more next
> week.
> 
> On the question of latency for unaligned loads, the answer is that
> there's no single answer; ARM just defines the architecture and then
> implementations are derived from it with different trade-offs in
> micro-architecture (I might be able to work out what our own
> implementations do, but not implementations by architecture licencees).
>   Furthermore, answers to questions such as cache line length and even
> page size are similarly vague -- I can probably assume pages will not be
> less than 4k, but there's no guarantee that they aren't bigger; a test
> to ask the kernel what the page size is would undoubtedly cost more time
> than we'd save.  Similarly cache lines might be 64 bytes long, but
> there's no architectural guarantee that they aren't shorter.
> 
> 
Of course it is faster to hardcode a 4096-byte page size and 64-byte cache
lines, as different values do not make that much difference.

The best answer would be to have several implementations and then, for
each processor, run a benchmark that says which function should be
used. That has two problems: first, we need to cache the benchmark
results and check whether the CPU has changed; second, it is hard to
write a good benchmark.

A start is to have variants that you could try.
Marcus Shawcroft Nov. 5, 2014, 2:02 p.m. UTC | #4
On 13 June 2014 11:23, Richard Earnshaw <rearnsha@arm.com> wrote:
> Here is an optimized implementation of __strchrnul.  The simplification
> that we don't have to track precisely why the loop terminates (match or
> end-of-string) means we have to do less work in both setup and the core
> inner loop.  That means this should never be slower than strchr.
>
> As with strchr, the use of LD1 means we do not need different versions
> for big-/little-endian.
>
> <date>  Richard Earnshaw  <rearnsha@arm.com>
>
>         * sysdeps/aarch64/strchrnul.S: New file.

Committed. + NEWS entry. /Marcus
Kyle McMartin Dec. 3, 2014, 5:45 p.m. UTC | #5
On Wed, Nov 05, 2014 at 02:02:22PM +0000, Marcus Shawcroft wrote:
> On 13 June 2014 11:23, Richard Earnshaw <rearnsha@arm.com> wrote:
> > Here is an optimized implementation of __strchrnul.  The simplification
> > that we don't have to track precisely why the loop terminates (match or
> > end-of-string) means we have to do less work in both setup and the core
> > inner loop.  That means this should never be slower than strchr.
> >
> > As with strchr, the use of LD1 means we do not need different versions
> > for big-/little-endian.
> >
> > <date>  Richard Earnshaw  <rearnsha@arm.com>
> >
> >         * sysdeps/aarch64/strchrnul.S: New file.
> 
> Committed. + NEWS entry. /Marcus

Hrm, I've bisected this commit as the cause of localedef segfaulting as
part of install-locales... it ends up crashing weirdly inside of
vsprintf's internals. Have either of you seen anything similar?

regards, Kyle
Andrew Pinski Dec. 9, 2014, 10:38 p.m. UTC | #6
On Wed, Dec 3, 2014 at 9:45 AM, Kyle McMartin <kmcmarti@redhat.com> wrote:
> On Wed, Nov 05, 2014 at 02:02:22PM +0000, Marcus Shawcroft wrote:
>> On 13 June 2014 11:23, Richard Earnshaw <rearnsha@arm.com> wrote:
>> > Here is an optimized implementation of __strchrnul.  The simplification
>> > that we don't have to track precisely why the loop terminates (match or
>> > end-of-string) means we have to do less work in both setup and the core
>> > inner loop.  That means this should never be slower than strchr.
>> >
>> > As with strchr, the use of LD1 means we do not need different versions
>> > for big-/little-endian.
>> >
>> > <date>  Richard Earnshaw  <rearnsha@arm.com>
>> >
>> >         * sysdeps/aarch64/strchrnul.S: New file.
>>
>> Committed. + NEWS entry. /Marcus
>
> Hrm, I've bisected this commit as the cause of localedef segfaulting as
> part of install-locales... it ends up crashing weirdly inside of
> vsprintf's internals. Have either of you seen anything similar?

I just ran into this with an ILP32 compiled glibc (with thunderX
tuning and some extra patches) where printf was crashing.  The problem
is v15 is being used but it is a callee saved register.  Here is the
documentation from the ABI:
Registers v8-v15 must be preserved by a callee across subroutine
calls; the remaining registers (v0-v7, v16-v31)
do not need to be preserved (or should be preserved by the caller).

Thanks,
Andrew Pinski

>
> regards, Kyle
Kyle McMartin Dec. 9, 2014, 11:46 p.m. UTC | #7
On Tue, Dec 09, 2014 at 02:38:11PM -0800, Andrew Pinski wrote:
> > Hrm, I've bisected this commit as the cause of localedef segfaulting as
> > part of install-locales... it ends up crashing weirdly inside of
> > vsprintf's internals. Have either of you seen anything similar?
> 
> I just ran into this with an ILP32 compiled glibc (with thunderX
> tuning and some extra patches) where printf was crashing.  The problem
> is v15 is being used but it is a callee saved register.  Here is the
> documentation from the ABI:
> Registers v8-v15 must be preserved by a callee across subroutine
> calls; the remaining registers (v0-v7, v16-v31)
> do not need to be preserved (or should be preserved by the caller).
> 

Aha, I'd debugged to the same point today after comparing logs between
the C and assembler versions of strchrnul. I swapped vrepmask for v7 and
things chugged along as expected.

--Kyle

> Thanks,
> Andrew Pinski
>
Richard Earnshaw Dec. 10, 2014, 9:38 a.m. UTC | #8
On 09/12/14 22:38, Andrew Pinski wrote:
> On Wed, Dec 3, 2014 at 9:45 AM, Kyle McMartin <kmcmarti@redhat.com> wrote:
>> On Wed, Nov 05, 2014 at 02:02:22PM +0000, Marcus Shawcroft wrote:
>>> On 13 June 2014 11:23, Richard Earnshaw <rearnsha@arm.com> wrote:
>>>> Here is an optimized implementation of __strchrnul.  The simplification
>>>> that we don't have to track precisely why the loop terminates (match or
>>>> end-of-string) means we have to do less work in both setup and the core
>>>> inner loop.  That means this should never be slower than strchr.
>>>>
>>>> As with strchr, the use of LD1 means we do not need different versions
>>>> for big-/little-endian.
>>>>
>>>> <date>  Richard Earnshaw  <rearnsha@arm.com>
>>>>
>>>>         * sysdeps/aarch64/strchrnul.S: New file.
>>>
>>> Committed. + NEWS entry. /Marcus
>>
>> Hrm, I've bisected this commit as the cause of localedef segfaulting as
>> part of install-locales... it ends up crashing weirdly inside of
>> vsprintf's internals. Have either of you seen anything similar?
> 
> I just ran into this with an ILP32 compiled glibc (with thunderX
> tuning and some extra patches) where printf was crashing.  The problem
> is v15 is being used but it is a callee saved register.  Here is the
> documentation from the ABI:
> Registers v8-v15 must be preserved by a callee across subroutine
> calls; the remaining registers (v0-v7, v16-v31)
> do not need to be preserved (or should be preserved by the caller).
> 
> Thanks,
> Andrew Pinski
> 
>>
>> regards, Kyle
> 

Well spotted!  I'll push a fix round the various implementations of this
ASAP.

R.
diff mbox

Patch

diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
new file mode 100644
index 0000000..b98c2e9
--- /dev/null
+++ b/sysdeps/aarch64/strchrnul.S
@@ -0,0 +1,130 @@ 
+/* strchrnul - find a character or nul in a string
+
+   Copyright (C) 2014 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+
+#define result		x0
+
+/* Locals and temporaries.  Vector temporaries must avoid v8-v15, which
+   AAPCS64 requires a callee to preserve; nothing is saved here.  */
+#define src		x2
+#define tmp1		x3
+#define wtmp2		w4
+#define tmp3		x5
+
+#define vrepchr		v0
+#define vdata1		v1
+#define vdata2		v2
+#define vhas_nul1	v3
+#define vhas_nul2	v4
+#define vhas_chr1	v5
+#define vhas_chr2	v6
+#define vrepmask	v7
+#define vend1		v16
+
+/* Core algorithm.
+
+   For each 32-byte hunk we calculate a 64-bit syndrome value, with
+   two bits per byte (LSB is always in bits 0 and 1, for both big
+   and little-endian systems).  For each tuple, bit 0 is set iff
+   the relevant byte matched the requested character or nul.  Since the
+   bits in the syndrome reflect exactly the order in which things occur
+   in the original string a count_trailing_zeros() operation will
+   identify exactly which byte is causing the termination.  */
+
+ENTRY (__strchrnul)
+	/* Magic constant 0x40100401 gives each byte lane in a group of four
+	   its own even bit, so the pairwise adds below pack without overlap.  */
+	mov	wtmp2, #0x0401
+	movk	wtmp2, #0x4010, lsl #16
+	dup	vrepchr.16b, chrin
+	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
+	dup	vrepmask.4s, wtmp2
+	ands	tmp1, srcin, #31	/* tmp1 = offset of srcin in its hunk.  */
+	b.eq	L(loop)			/* Aligned: no padding to mask off.  */
+
+	/* Input string is not 32-byte aligned.  Rather than forcing
+	   the padding bytes to a safe value, we calculate the syndrome
+	   for all the bytes, but then mask off those bits of the
+	   syndrome that are related to the padding.  */
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	neg	tmp1, tmp1		/* Negated offset, used as a shift count.  */
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
+	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	lsl	tmp1, tmp1, #1		/* Two syndrome bits per byte.  */
+	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
+	mov	tmp3, #~0
+	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
+	lsr	tmp1, tmp3, tmp1	/* Ones in the low 2 * offset bits.  */
+
+	mov	tmp3, vend1.d[0]	/* 64-bit syndrome for this hunk.  */
+	bic	tmp1, tmp3, tmp1	// Mask padding bits.
+	cbnz	tmp1, L(tail)
+
+L(loop):
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	/* Use a fast check for the termination condition.  */
+	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
+	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
+	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
+	addp	vend1.2d, vend1.2d, vend1.2d
+	mov	tmp1, vend1.d[0]	/* Nonzero iff some byte matched.  */
+	cbz	tmp1, L(loop)
+
+	/* Termination condition found.  Now build the precise syndrome to
+	   establish exactly where in the hunk the match or nul occurred.  */
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
+	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
+
+	mov	tmp1, vend1.d[0]
+L(tail):
+	/* Count the trailing zeros, by bit reversing...  */
+	rbit	tmp1, tmp1
+	/* Re-bias source: the load post-incremented src past this hunk.  */
+	sub	src, src, #32
+	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
+	/* tmp1 is twice the offset into the fragment.  */
+	add	result, src, tmp1, lsr #1
+	ret
+
+END(__strchrnul)
+weak_alias (__strchrnul, strchrnul)