ARM: Add optimized ARMv7 strcmp implementation

Message ID 1398256338-24784-1-git-send-email-will.newton@linaro.org
State Committed

Commit Message

Will Newton April 23, 2014, 12:32 p.m. UTC
  Add an optimized implementation of strcmp for ARMv7-A cores. This
implementation is significantly faster than the current generic C
implementation, particularly for strings of 16 bytes and longer.

The code was written by ARM, who have agreed to assign the copyright
to the FSF for integration into glibc.

ChangeLog:

2014-04-23  Will Newton  <will.newton@linaro.org>

	* sysdeps/arm/armv7/strcmp.S: New file.
---
 sysdeps/arm/armv7/strcmp.S | 483 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 483 insertions(+)
 create mode 100644 sysdeps/arm/armv7/strcmp.S
  

Comments

Richard Earnshaw April 23, 2014, 3:20 p.m. UTC | #1
On 23/04/14 13:32, Will Newton wrote:
> Add an optimized implementation of strcmp for ARMv7-A cores. This
> implementation is significantly faster than the current generic C
> implementation, particularly for strings of 16 bytes and longer.
> 
> The code was written by ARM, who have agreed to assign the copyright
> to the FSF for integration into glibc.
> 

I confirm that ARM is happy to contribute this code to GLIBC under its
FSF assignment agreement.

Will, can you add "Contributed by ARM Ltd." to the banner part please?

R.

> ChangeLog:
> 
> 2014-04-23  Will Newton  <will.newton@linaro.org>
> 
>         * sysdeps/arm/armv7/strcmp.S: New file.
> ---
>  sysdeps/arm/armv7/strcmp.S | 483 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 483 insertions(+)
>  create mode 100644 sysdeps/arm/armv7/strcmp.S
> 
  
Richard Henderson April 23, 2014, 4:13 p.m. UTC | #2
On 04/23/2014 05:32 AM, Will Newton wrote:
> +	clz	tmp1, \synd
> +	lsl	r1, \d2, tmp1
> +	.if \restore_r6
> +	ldrd	r6, r7, [sp, #8]
> +	.endif
> +	cfi_restore (r6)
> +	cfi_restore (r7)
> +	lsl	\d1, \d1, tmp1
> +	cfi_remember_state
> +	lsr	result, \d1, #24
> +	ldrd	r4, r5, [sp], #16
> +	cfi_restore (r4)
> +	cfi_restore (r5)
> +	sub	result, result, r1, lsr #24

You can minimize the size of the unwind info (avoiding DW_CFA_advance opcodes)
by delaying restores until the previous storage may be clobbered, or until some
other adjustment must be made.  The storage is clobbered, obviously, by the
post-increment of SP deallocating the storage.

> +#else
> +	/* To use the big-endian trick we'd have to reverse all three words.
> +	   that's slower than this approach.  */
> +	rev	\synd, \synd
> +	clz	tmp1, \synd
> +	bic	tmp1, tmp1, #7
> +	lsr	r1, \d2, tmp1
> +	cfi_remember_state
> +	.if \restore_r6
> +	ldrd	r6, r7, [sp, #8]
> +	.endif
> +	cfi_restore (r6)
> +	cfi_restore (r7)

Also, it would appear that you're remembering two different states between the
big and little endian versions of this code.  The LE version remembers before
restoring r6/r7; the BE version remembers afterward.

I suspect you want

	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset(0)
	cfi_restore (r4)
	cfi_restore (r5)
	cfi_restore (r6)
	cfi_restore (r7)

which has a single advance within this code sequence, and r6/r7 remembered as
saved, and actually describes the stack adjustment.

> +	strd	r4, r5, [sp, #-16]!
> +	cfi_def_cfa_offset (16)
> +	cfi_offset (r4, -16)
> +	cfi_offset (r5, -12)
> +	orr	tmp1, src1, src2
> +	strd	r6, r7, [sp, #8]
> +	cfi_offset (r6, -8)
> +	cfi_offset (r7, -4)

FWIW, I find using cfi_rel_offset much easier to reason with, since you get to
match up numbers in the cfi data with numbers in the assembly.  E.g.

	cfi_rel_offset(r6, 8)
	cfi_rel_offset(r7, 12)

but perhaps not that helpful in this case.

> +.Lmisaligned_exit:
> +	cfi_remember_state
> +	mov	result, tmp1
> +	ldr	r4, [sp], #16
> +	cfi_restore (r4)
> +	bx	lr

Even though only r4 was modified and reloaded, r5-r7 are still described as
saved.  After popping all of the storage, that's not correct -- you need to
restore all 4 registers.  Again, you're not describing the stack adjustment.
Again, no point in doing the remember 2 insns before the restores.
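
For instance, a minimal sketch of that exit along the lines suggested
above (assuming the same 16-byte frame, and keeping cfi_remember_state so
the later cfi_restore_state at .Lsrc1_aligned still refers to the
saved-registers state):

.Lmisaligned_exit:
	mov	result, tmp1
	ldr	r4, [sp], #16		/* Pop the whole 16-byte frame.  */
	cfi_remember_state		/* State with r4-r7 still saved.  */
	cfi_def_cfa_offset (0)		/* Describe the SP adjustment.  */
	cfi_restore (r4)
	cfi_restore (r5)
	cfi_restore (r6)
	cfi_restore (r7)
	bx	lr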

Alternately, since this unwind info isn't for correctness of exception
handling, you could simply drop all this restore business and let the unwind
info be incorrect for the 1-2 insns before the return.


r~
  
Roland McGrath April 23, 2014, 4:31 p.m. UTC | #3
> Alternately, since this unwind info isn't for correctness of exception
> handling, you could simply drop all this restore business and let the unwind
> info be incorrect for the 1-2 insns before the return.

Please let's keep all CFI exactly correct.
  
Joseph Myers April 24, 2014, 2:42 p.m. UTC | #4
On Wed, 23 Apr 2014, Will Newton wrote:

> Add an optimized implementation of strcmp for ARMv7-A cores. This
> implementation is significantly faster than the current generic C
> implementation, particularly for strings of 16 bytes and longer.

Could you please describe the testing (at least the string/ tests should 
have been run for both endiannesses) and be more specific about 
"significantly faster"?  Benchmark results, on specific identified 
hardware, using glibc's benchtests are desirable for such string function 
tests, in addition to any other benchmarks you may have used if glibc's 
benchtests aren't useful for the cases of interest.

> +/* Build Options:
> +   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
> +   byte in the string.  If comparing completely random strings
> +   the pre-check will save time, since there is a very high
> +   probability of a mismatch in the first character: we save
> +   significant overhead if this is the common case.  However,
> +   if strings are likely to be identical (eg because we're
> +   verifying a hit in a hash table), then this check is largely
> +   redundant.  */
> +
> +#define STRCMP_NO_PRECHECK	0

In general I think it's clearer to have such macros in the positive sense, 
i.e. STRCMP_PRECHECK (value 1) instead of STRCMP_NO_PRECHECK (value 0).
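
For instance (a sketch of the idea only, not a tested change; the
"#if STRCMP_NO_PRECHECK == 1" block in the misaligned path would likewise
become "#if STRCMP_PRECHECK == 0"):

#define STRCMP_PRECHECK	1

ENTRY(strcmp)
#if STRCMP_PRECHECK == 1
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif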
  
Joseph Myers April 24, 2014, 2:46 p.m. UTC | #5
On Wed, 23 Apr 2014, Richard Earnshaw wrote:

> On 23/04/14 13:32, Will Newton wrote:
> > Add an optimized implementation of strcmp for ARMv7-A cores. This
> > implementation is significantly faster than the current generic C
> > implementation, particularly for strings of 16 bytes and longer.
> > 
> > The code was written by ARM, who have agreed to assign the copyright
> > to the FSF for integration into glibc.
> > 
> 
> I confirm that ARM is happy to contribute this code to GLIBC under its
> FSF assignment agreement.
> 
> Will, can you add "Contributed by ARM Ltd." to the banner part please?

glibc no longer uses "Contributed by" lines in source files.  (They can go 
in the NEWS file entries, which are appropriate for such contributions of 
optimized functions.)
  
Will Newton April 25, 2014, 10:15 a.m. UTC | #6
On 23 April 2014 17:13, Richard Henderson <rth@twiddle.net> wrote:

Thanks for the review, particularly the attention to CFI, which I
always find easy to get wrong.

> On 04/23/2014 05:32 AM, Will Newton wrote:
>> +     clz     tmp1, \synd
>> +     lsl     r1, \d2, tmp1
>> +     .if \restore_r6
>> +     ldrd    r6, r7, [sp, #8]
>> +     .endif
>> +     cfi_restore (r6)
>> +     cfi_restore (r7)
>> +     lsl     \d1, \d1, tmp1
>> +     cfi_remember_state
>> +     lsr     result, \d1, #24
>> +     ldrd    r4, r5, [sp], #16
>> +     cfi_restore (r4)
>> +     cfi_restore (r5)
>> +     sub     result, result, r1, lsr #24
>
> You can minimize the size of the unwind info (avoiding DW_CFA_advance opcodes)
> by delaying restores until the previous storage may be clobbered, or until some
> other adjustment must be made.  The storage is clobbered, obviously, by the
> post-increment of SP deallocating the storage.
>
>> +#else
>> +     /* To use the big-endian trick we'd have to reverse all three words.
>> +        that's slower than this approach.  */
>> +     rev     \synd, \synd
>> +     clz     tmp1, \synd
>> +     bic     tmp1, tmp1, #7
>> +     lsr     r1, \d2, tmp1
>> +     cfi_remember_state
>> +     .if \restore_r6
>> +     ldrd    r6, r7, [sp, #8]
>> +     .endif
>> +     cfi_restore (r6)
>> +     cfi_restore (r7)
>
> Also, it would appear that you're remembering two different states between the
> big and little endian versions of this code.  The LE version remembers before
> restoring r6/r7; the BE version remembers afterward.
>
> I suspect you want
>
>         ldrd    r4, r5, [sp], #16
>         cfi_remember_state
>         cfi_def_cfa_offset(0)
>         cfi_restore (r4)
>         cfi_restore (r5)
>         cfi_restore (r6)
>         cfi_restore (r7)
>
> which has a single advance within this code sequence, and r6/r7 remembered as
> saved, and actually describes the stack adjustment.

Ok, I have attempted to fix this in all cases.

>> +     strd    r4, r5, [sp, #-16]!
>> +     cfi_def_cfa_offset (16)
>> +     cfi_offset (r4, -16)
>> +     cfi_offset (r5, -12)
>> +     orr     tmp1, src1, src2
>> +     strd    r6, r7, [sp, #8]
>> +     cfi_offset (r6, -8)
>> +     cfi_offset (r7, -4)
>
> FWIW, I find using cfi_rel_offset much easier to reason with, since you get to
> match up numbers in the cfi data with numbers in the assembly.  E.g.
>
>         cfi_rel_offset(r6, 8)
>         cfi_rel_offset(r7, 12)
>
> but perhaps not that helpful in this case.

Yes, I tend to find that simpler too, but the code came to me using the
absolute offsets and, as you suggest, in this case there's not much in
it.

>> +.Lmisaligned_exit:
>> +     cfi_remember_state
>> +     mov     result, tmp1
>> +     ldr     r4, [sp], #16
>> +     cfi_restore (r4)
>> +     bx      lr
>
> Even though only r4 was modified and reloaded, r5-r7 are still described as
> saved.  After popping all of the storage, that's not correct -- you need to
> restore all 4 registers.  Again, you're not describing the stack adjustment.
> Again, no point in doing the remember 2 insns before the restores.
>
> Alternately, since this unwind info isn't for correctness of exception
> handling, you could simply drop all this restore business and let the unwind
> info be incorrect for the 1-2 insns before the return.

Hopefully the CFI is all correct now, I will post a v2.
  
Richard Earnshaw April 25, 2014, 11:11 a.m. UTC | #7
On 24/04/14 15:42, Joseph S. Myers wrote:
> On Wed, 23 Apr 2014, Will Newton wrote:
> 
>> Add an optimized implementation of strcmp for ARMv7-A cores. This
>> implementation is significantly faster than the current generic C
>> implementation, particularly for strings of 16 bytes and longer.
> 
> Could you please describe the testing (at least the string/ tests should 
> have been run for both endiannesses) and be more specific about 
> "significantly faster" (benchmark results - on specific identified 
> hardware - using glibc's benchtests are desirable for such string function 
> tests, in addition to any other benchmarks you may have used if glibc's 
> benchtests aren't useful for the cases of interest)?

These are the improvements for a production board with cortex-a15.  For
very short strings there's a fair amount of noise in the measurements,
but the regressions are generally very small, while the improvements can
be significant. Yes, there are cases where the new code is more than
three times faster.

R.


Length	1	alignment	1/1:	 15.63%
Length	1	alignment	1/1:	 1.61%
Length	1	alignment	1/1:	-0.04%
Length	2	alignment	2/2:	 39.45%
Length	2	alignment	2/2:	 51.49%
Length	2	alignment	2/2:	 47.71%
Length	3	alignment	3/3:	-1.46%
Length	3	alignment	3/3:	 47.09%
Length	3	alignment	3/3:	 48.54%
Length	4	alignment	4/4:	-7.89%
Length	4	alignment	4/4:	 0.00%
Length	4	alignment	4/4:	-0.03%
Length	5	alignment	5/5:	-4.09%
Length	5	alignment	5/5:	-4.11%
Length	5	alignment	5/5:	-1.42%
Length	6	alignment	6/6:	 4.17%
Length	6	alignment	6/6:	 2.81%
Length	6	alignment	6/6:	 4.29%
Length	7	alignment	7/7:	 6.93%
Length	7	alignment	7/7:	 4.19%
Length	7	alignment	7/7:	 2.84%
Length	8	alignment	8/8:	 12.86%
Length	8	alignment	8/8:	 15.40%
Length	8	alignment	8/8:	 13.88%
Length	9	alignment	9/9:	 37.50%
Length	9	alignment	9/9:	 59.20%
Length	9	alignment	9/9:	 20.03%
Length	10	alignment	10/10:	 52.00%
Length	10	alignment	10/10:	 11.45%
Length	10	alignment	10/10:	 12.82%
Length	11	alignment	11/11:	 58.13%
Length	11	alignment	11/11:	 67.18%
Length	11	alignment	11/11:	 15.93%
Length	12	alignment	12/12:	 50.61%
Length	12	alignment	12/12:	 13.90%
Length	12	alignment	12/12:	 18.85%
Length	13	alignment	13/13:	 60.85%
Length	13	alignment	13/13:	 13.68%
Length	13	alignment	13/13:	 10.69%
Length	14	alignment	14/14:	 66.23%
Length	14	alignment	14/14:	 16.44%
Length	14	alignment	14/14:	 16.48%
Length	15	alignment	15/15:	 72.64%
Length	15	alignment	15/15:	 72.58%
Length	15	alignment	15/15:	 72.58%
Length	16	alignment	16/16:	 70.66%
Length	16	alignment	16/16:	 84.07%
Length	16	alignment	16/16:	 84.07%
Length	17	alignment	17/17:	 68.39%
Length	17	alignment	17/17:	 80.53%
Length	17	alignment	17/17:	 76.75%
Length	18	alignment	18/18:	 74.99%
Length	18	alignment	18/18:	 83.33%
Length	18	alignment	18/18:	 84.73%
Length	19	alignment	19/19:	 80.86%
Length	19	alignment	19/19:	 82.18%
Length	19	alignment	19/19:	 87.50%
Length	20	alignment	20/20:	 35.61%
Length	20	alignment	20/20:	 83.75%
Length	20	alignment	20/20:	 81.31%
Length	21	alignment	21/21:	 49.39%
Length	21	alignment	21/21:	 69.13%
Length	21	alignment	21/21:	 71.59%
Length	22	alignment	22/22:	 44.69%
Length	22	alignment	22/22:	 63.97%
Length	22	alignment	22/22:	 73.16%
Length	23	alignment	23/23:	 51.82%
Length	23	alignment	23/23:	 79.02%
Length	23	alignment	23/23:	 79.79%
Length	24	alignment	24/24:	 62.08%
Length	24	alignment	24/24:	 87.17%
Length	24	alignment	24/24:	 93.34%
Length	25	alignment	25/25:	 48.28%
Length	25	alignment	25/25:	 68.28%
Length	25	alignment	25/25:	 80.27%
Length	26	alignment	26/26:	 55.85%
Length	26	alignment	26/26:	 76.21%
Length	26	alignment	26/26:	 79.79%
Length	27	alignment	27/27:	 75.34%
Length	27	alignment	27/27:	 85.72%
Length	27	alignment	27/27:	 80.02%
Length	28	alignment	28/28:	 52.72%
Length	28	alignment	28/28:	 69.16%
Length	28	alignment	28/28:	 65.85%
Length	29	alignment	29/29:	 65.49%
Length	29	alignment	29/29:	 60.48%
Length	29	alignment	29/29:	 67.50%
Length	30	alignment	30/30:	 67.02%
Length	30	alignment	30/30:	 71.98%
Length	30	alignment	30/30:	 71.11%
Length	31	alignment	31/31:	 66.29%
Length	31	alignment	31/31:	 71.43%
Length	31	alignment	31/31:	 70.23%
Length	4	alignment	0/0:	 4.48%
Length	4	alignment	0/0:	 14.90%
Length	4	alignment	0/0:	 9.19%
Length	4	alignment	0/0:	 9.38%
Length	4	alignment	0/0:	-14.30%
Length	4	alignment	0/0:	 4.69%
Length	4	alignment	0/1:	-17.09%
Length	4	alignment	1/2:	-10.69%
Length	8	alignment	0/0:	 4.05%
Length	8	alignment	0/0:	 8.70%
Length	8	alignment	0/0:	 14.72%
Length	8	alignment	0/0:	 14.98%
Length	8	alignment	0/0:	 22.73%
Length	8	alignment	0/0:	 9.09%
Length	8	alignment	0/2:	 10.59%
Length	8	alignment	2/3:	 29.27%
Length	16	alignment	0/0:	 44.31%
Length	16	alignment	0/0:	 105.97%
Length	16	alignment	0/0:	 91.31%
Length	16	alignment	0/0:	 75.53%
Length	16	alignment	0/0:	 46.52%
Length	16	alignment	0/0:	 60.55%
Length	16	alignment	0/3:	 1.46%
Length	16	alignment	3/4:	 7.76%
Length	32	alignment	0/0:	 14.62%
Length	32	alignment	0/0:	 100.03%
Length	32	alignment	0/0:	 98.84%
Length	32	alignment	0/0:	 34.27%
Length	32	alignment	0/0:	 24.78%
Length	32	alignment	0/0:	 81.48%
Length	32	alignment	0/4:	 80.47%
Length	32	alignment	4/5:	-2.03%
Length	64	alignment	0/0:	 98.29%
Length	64	alignment	0/0:	 102.75%
Length	64	alignment	0/0:	 140.99%
Length	64	alignment	0/0:	 142.60%
Length	64	alignment	0/0:	 129.61%
Length	64	alignment	0/0:	 125.91%
Length	64	alignment	0/5:	 25.23%
Length	64	alignment	5/6:	 37.97%
Length	128	alignment	0/0:	 43.83%
Length	128	alignment	0/0:	 33.34%
Length	128	alignment	0/0:	 55.93%
Length	128	alignment	0/0:	 91.75%
Length	128	alignment	0/0:	 148.25%
Length	128	alignment	0/0:	 136.22%
Length	128	alignment	0/6:	 24.40%
Length	128	alignment	6/7:	 34.18%
Length	256	alignment	0/0:	 84.87%
Length	256	alignment	0/0:	 104.30%
Length	256	alignment	0/0:	 95.59%
Length	256	alignment	0/0:	 86.65%
Length	256	alignment	0/0:	 109.61%
Length	256	alignment	0/0:	 97.53%
Length	256	alignment	0/7:	 25.79%
Length	256	alignment	7/8:	 32.29%
Length	512	alignment	0/0:	 195.49%
Length	512	alignment	0/0:	 140.54%
Length	512	alignment	0/0:	 148.29%
Length	512	alignment	0/0:	 164.03%
Length	512	alignment	0/0:	 150.61%
Length	512	alignment	0/0:	 174.32%
Length	512	alignment	0/8:	 193.32%
Length	512	alignment	8/9:	 60.42%
Length	1024	alignment	0/0:	 238.38%
Length	1024	alignment	0/0:	 239.21%
Length	1024	alignment	0/0:	 247.45%
Length	1024	alignment	0/0:	 241.00%
Length	1024	alignment	0/0:	 241.49%
Length	1024	alignment	0/0:	 340.83%
Length	1024	alignment	0/9:	 64.46%
Length	1024	alignment	9/10:	 62.71%
Length	16	alignment	1/2:	 37.79%
Length	16	alignment	2/1:	 33.02%
Length	16	alignment	1/2:	 38.88%
Length	16	alignment	2/1:	 34.80%
Length	16	alignment	1/2:	 38.49%
Length	16	alignment	2/1:	 40.42%
Length	32	alignment	2/4:	 25.43%
Length	32	alignment	4/2:	 29.08%
Length	32	alignment	2/4:	 30.27%
Length	32	alignment	4/2:	-4.03%
Length	32	alignment	2/4:	 32.73%
Length	32	alignment	4/2:	-4.69%
Length	64	alignment	3/6:	-4.86%
Length	64	alignment	6/3:	 2.38%
Length	64	alignment	3/6:	-3.60%
Length	64	alignment	6/3:	 6.44%
Length	64	alignment	3/6:	-3.60%
Length	64	alignment	6/3:	 7.55%
Length	128	alignment	4/8:	 71.91%
Length	128	alignment	8/4:	 77.35%
Length	128	alignment	4/8:	 67.13%
Length	128	alignment	8/4:	 58.60%
Length	128	alignment	4/8:	 70.61%
Length	128	alignment	8/4:	 58.14%
Length	256	alignment	5/10:	 40.61%
Length	256	alignment	10/5:	 42.76%
Length	256	alignment	5/10:	 40.78%
Length	256	alignment	10/5:	 42.96%
Length	256	alignment	5/10:	 40.17%
Length	256	alignment	10/5:	 42.74%
Length	512	alignment	6/12:	 44.60%
Length	512	alignment	12/6:	 47.72%
Length	512	alignment	6/12:	 46.02%
Length	512	alignment	12/6:	 48.19%
Length	512	alignment	6/12:	 45.56%
Length	512	alignment	12/6:	 48.07%
Length	1024	alignment	7/14:	 63.23%
Length	1024	alignment	14/7:	 63.54%
Length	1024	alignment	7/14:	 63.07%
Length	1024	alignment	14/7:	 63.54%
Length	1024	alignment	7/14:	 63.18%
Length	1024	alignment	14/7:	 63.83%
  
Matt Turner April 25, 2014, 7:30 p.m. UTC | #8
On Fri, Apr 25, 2014 at 4:11 AM, Richard Earnshaw <rearnsha@arm.com> wrote:
> These are the improvements for a production board with cortex-a15.  For
> very short strings there's a fair amount of noise in the measurements,
> but the regressions are generally very small, while the improvements can
> be significant. Yes, there are cases where the new code is more than
> three times faster.
>
[snip]
>
> Length  1024    alignment       0/0:     238.38%
> Length  1024    alignment       0/0:     239.21%
> Length  1024    alignment       0/0:     247.45%

While for these cases it's clear from the surrounding context what
these numbers mean, I recommend not using percentages to describe
performance improvements. It'd be nice for glibc to adopt this policy
for the sake of consistency and clarity.

If I saw in isolation

> Length  16      alignment       0/0:     105.97%

I wouldn't know whether this was measured relative to the original
performance (and was 5% faster) or as a speed up over the original
performance (and was a bit more than twice as fast).

This could be unambiguously written as

> Length  16      alignment       0/0:     2.0597x
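
(In general, an improvement of p% corresponds to a ratio of 1 + p/100, so
the 105.97% above is the 2.0597x case, and the 238.38% entries for length
1024 come out to roughly 3.38x.)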
  
Richard Earnshaw April 28, 2014, 9 a.m. UTC | #9
On 25/04/14 20:30, Matt Turner wrote:
> On Fri, Apr 25, 2014 at 4:11 AM, Richard Earnshaw <rearnsha@arm.com> wrote:
>> These are the improvements for a production board with cortex-a15.  For
>> very short strings there's a fair amount of noise in the measurements,
>> but the regressions are generally very small, while the improvements can
>> be significant. Yes, there are cases where the new code is more than
>> three times faster.
>>
> [snip]
>>
>> Length  1024    alignment       0/0:     238.38%
>> Length  1024    alignment       0/0:     239.21%
>> Length  1024    alignment       0/0:     247.45%
> 
> While for these cases it's clear from the surrounding context what
> these numbers mean, I recommend not using percentages to describe
> performance improvements. I'd be nice for glibc to adopt this policy
> for sake of consistency and clarity.
> 
> If I saw in isolation
> 
>> Length  16      alignment       0/0:     105.97%
> 
> I wouldn't know whether this was measured relative to the original
> performance (and was 5% faster) or as a speed up over the original
> performance (and was a bit more than twice as fast).
> 

Yes, I pondered this before posting the numbers.  I concluded that the
presence of a few small negative numbers (for the regressions) made the
whole table unambiguous.

> This could be unambiguously written as
> 
>> Length  16      alignment       0/0:     2.0597x
> 

If there's a need to re-submit, I'll do that.

R.
  

Patch

diff --git a/sysdeps/arm/armv7/strcmp.S b/sysdeps/arm/armv7/strcmp.S
new file mode 100644
index 0000000..0f47767
--- /dev/null
+++ b/sysdeps/arm/armv7/strcmp.S
@@ -0,0 +1,483 @@ 
+/* Copyright (C) 2012-2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <arm-features.h>
+#include <sysdep.h>
+
+/* Implementation of strcmp for ARMv7 when DSP instructions are
+   available.  Use ldrd to support wider loads, provided the data
+   is sufficiently aligned.  Use saturating arithmetic to optimize
+   the compares.  */
+
+/* Build Options:
+   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
+   byte in the string.  If comparing completely random strings
+   the pre-check will save time, since there is a very high
+   probability of a mismatch in the first character: we save
+   significant overhead if this is the common case.  However,
+   if strings are likely to be identical (eg because we're
+   verifying a hit in a hash table), then this check is largely
+   redundant.  */
+
+#define STRCMP_NO_PRECHECK	0
+
+	/* This version uses Thumb-2 code.  */
+	.thumb
+	.syntax unified
+
+#ifdef __ARM_BIG_ENDIAN
+#define S2LO lsl
+#define S2LOEQ lsleq
+#define S2HI lsr
+#define MSB 0x000000ff
+#define LSB 0xff000000
+#define BYTE0_OFFSET 24
+#define BYTE1_OFFSET 16
+#define BYTE2_OFFSET 8
+#define BYTE3_OFFSET 0
+#else /* not  __ARM_BIG_ENDIAN */
+#define S2LO lsr
+#define S2LOEQ lsreq
+#define S2HI lsl
+#define BYTE0_OFFSET 0
+#define BYTE1_OFFSET 8
+#define BYTE2_OFFSET 16
+#define BYTE3_OFFSET 24
+#define MSB 0xff000000
+#define LSB 0x000000ff
+#endif /* not  __ARM_BIG_ENDIAN */
+
+/* Parameters and result.  */
+#define src1		r0
+#define src2		r1
+#define result		r0	/* Overlaps src1.  */
+
+/* Internal variables.  */
+#define tmp1		r4
+#define tmp2		r5
+#define const_m1	r12
+
+/* Additional internal variables for 64-bit aligned data.  */
+#define data1a		r2
+#define data1b		r3
+#define data2a		r6
+#define data2b		r7
+#define syndrome_a	tmp1
+#define syndrome_b	tmp2
+
+/* Additional internal variables for 32-bit aligned data.  */
+#define data1		r2
+#define data2		r3
+#define syndrome	tmp2
+
+
+	/* Macro to compute and return the result value for word-aligned
+	   cases.  */
+	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
+#ifdef __ARM_BIG_ENDIAN
+	/* If data1 contains a zero byte, then syndrome will contain a 1 in
+	   bit 7 of that byte.  Otherwise, the highest set bit in the
+	   syndrome will highlight the first different bit.  It is therefore
+	   sufficient to extract the eight bits starting with the syndrome
+	   bit.  */
+	clz	tmp1, \synd
+	lsl	r1, \d2, tmp1
+	.if \restore_r6
+	ldrd	r6, r7, [sp, #8]
+	.endif
+	cfi_restore (r6)
+	cfi_restore (r7)
+	lsl	\d1, \d1, tmp1
+	cfi_remember_state
+	lsr	result, \d1, #24
+	ldrd	r4, r5, [sp], #16
+	cfi_restore (r4)
+	cfi_restore (r5)
+	sub	result, result, r1, lsr #24
+	bx	lr
+#else
+	/* To use the big-endian trick we'd have to reverse all three words.
+	   that's slower than this approach.  */
+	rev	\synd, \synd
+	clz	tmp1, \synd
+	bic	tmp1, tmp1, #7
+	lsr	r1, \d2, tmp1
+	cfi_remember_state
+	.if \restore_r6
+	ldrd	r6, r7, [sp, #8]
+	.endif
+	cfi_restore (r6)
+	cfi_restore (r7)
+	lsr	\d1, \d1, tmp1
+	and	result, \d1, #255
+	and	r1, r1, #255
+	ldrd	r4, r5, [sp], #16
+	cfi_restore (r4)
+	cfi_restore (r5)
+	sub	result, result, r1
+
+	bx	lr
+#endif
+	.endm
+
+	.text
+	.p2align	5
+.Lstrcmp_start_addr:
+#if STRCMP_NO_PRECHECK == 0
+.Lfastpath_exit:
+	sub	r0, r2, r3
+	bx	lr
+	nop
+#endif
+ENTRY(strcmp)
+#if STRCMP_NO_PRECHECK == 0
+	ldrb	r2, [src1]
+	ldrb	r3, [src2]
+	cmp	r2, #1
+	it	cs
+	cmpcs	r2, r3
+	bne	.Lfastpath_exit
+#endif
+	strd	r4, r5, [sp, #-16]!
+	cfi_def_cfa_offset (16)
+	cfi_offset (r4, -16)
+	cfi_offset (r5, -12)
+	orr	tmp1, src1, src2
+	strd	r6, r7, [sp, #8]
+	cfi_offset (r6, -8)
+	cfi_offset (r7, -4)
+	mvn	const_m1, #0
+	lsl	r2, tmp1, #29
+	cbz	r2, .Lloop_aligned8
+
+.Lnot_aligned:
+	eor	tmp1, src1, src2
+	tst	tmp1, #7
+	bne	.Lmisaligned8
+
+	/* Deal with mutual misalignment by aligning downwards and then
+	   masking off the unwanted loaded data to prevent a difference.  */
+	and	tmp1, src1, #7
+	bic	src1, src1, #7
+	and	tmp2, tmp1, #3
+	bic	src2, src2, #7
+	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
+	ldrd	data1a, data1b, [src1], #16
+	tst	tmp1, #4
+	ldrd	data2a, data2b, [src2], #16
+	/* In thumb code we can't use MVN with a register shift, but
+	   we do have ORN.  */
+	S2HI	tmp1, const_m1, tmp2
+	orn	data1a, data1a, tmp1
+	orn	data2a, data2a, tmp1
+	beq	.Lstart_realigned8
+	orn	data1b, data1b, tmp1
+	mov	data1a, const_m1
+	orn	data2b, data2b, tmp1
+	mov	data2a, const_m1
+	b	.Lstart_realigned8
+
+	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
+	   pass.  */
+	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
+	.p2align 2	/* Always word aligned.  */
+.Lloop_aligned8:
+	ldrd	data1a, data1b, [src1], #16
+	ldrd	data2a, data2b, [src2], #16
+.Lstart_realigned8:
+	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
+	eor	syndrome_a, data1a, data2a
+	sel	syndrome_a, syndrome_a, const_m1
+	cbnz	syndrome_a, .Ldiff_in_a
+	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
+	eor	syndrome_b, data1b, data2b
+	sel	syndrome_b, syndrome_b, const_m1
+	cbnz	syndrome_b, .Ldiff_in_b
+
+	ldrd	data1a, data1b, [src1, #-8]
+	ldrd	data2a, data2b, [src2, #-8]
+	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
+	eor	syndrome_a, data1a, data2a
+	sel	syndrome_a, syndrome_a, const_m1
+	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
+	eor	syndrome_b, data1b, data2b
+	sel	syndrome_b, syndrome_b, const_m1
+	/* Can't use CBZ for backwards branch.  */
+	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
+	beq	.Lloop_aligned8
+
+.Ldiff_found:
+	cbnz	syndrome_a, .Ldiff_in_a
+
+.Ldiff_in_b:
+	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
+
+.Ldiff_in_a:
+	cfi_restore_state
+	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
+
+	cfi_restore_state
+.Lmisaligned8:
+	tst	tmp1, #3
+	bne	.Lmisaligned4
+	ands	tmp1, src1, #3
+	bne	.Lmutual_align4
+
+	/* Unrolled by a factor of 2, to reduce the number of post-increment
+	   operations.  */
+.Lloop_aligned4:
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+.Lstart_realigned4:
+	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
+	eor	syndrome, data1, data2
+	sel	syndrome, syndrome, const_m1
+	cbnz	syndrome, .Laligned4_done
+	ldr	data1, [src1, #-4]
+	ldr	data2, [src2, #-4]
+	uadd8	syndrome, data1, const_m1
+	eor	syndrome, data1, data2
+	sel	syndrome, syndrome, const_m1
+	cmp	syndrome, #0
+	beq	.Lloop_aligned4
+
+.Laligned4_done:
+	strcmp_epilogue_aligned syndrome, data1, data2, 0
+
+.Lmutual_align4:
+	cfi_restore_state
+	/* Deal with mutual misalignment by aligning downwards and then
+	   masking off the unwanted loaded data to prevent a difference.  */
+	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
+	bic	src1, src1, #3
+	ldr	data1, [src1], #8
+	bic	src2, src2, #3
+	ldr	data2, [src2], #8
+
+	/* In thumb code we can't use MVN with a register shift, but
+	   we do have ORN.  */
+	S2HI	tmp1, const_m1, tmp1
+	orn	data1, data1, tmp1
+	orn	data2, data2, tmp1
+	b	.Lstart_realigned4
+
+.Lmisaligned4:
+	ands	tmp1, src1, #3
+	beq	.Lsrc1_aligned
+	sub	src2, src2, tmp1
+	bic	src1, src1, #3
+	lsls	tmp1, tmp1, #31
+	ldr	data1, [src1], #4
+	beq	.Laligned_m2
+	bcs	.Laligned_m1
+
+#if STRCMP_NO_PRECHECK == 1
+	ldrb	data2, [src2, #1]
+	uxtb	tmp1, data1, ror #BYTE1_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbz	data2, .Lmisaligned_exit
+
+.Laligned_m2:
+	ldrb	data2, [src2, #2]
+	uxtb	tmp1, data1, ror #BYTE2_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbz	data2, .Lmisaligned_exit
+
+.Laligned_m1:
+	ldrb	data2, [src2, #3]
+	uxtb	tmp1, data1, ror #BYTE3_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	add	src2, src2, #4
+	cbnz	data2, .Lsrc1_aligned
+#else  /* STRCMP_NO_PRECHECK */
+	/* If we've done the pre-check, then we don't need to check the
+	   first byte again here.  */
+	ldrb	data2, [src2, #2]
+	uxtb	tmp1, data1, ror #BYTE2_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbz	data2, .Lmisaligned_exit
+
+.Laligned_m2:
+	ldrb	data2, [src2, #3]
+	uxtb	tmp1, data1, ror #BYTE3_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbnz	data2, .Laligned_m1
+#endif
+
+.Lmisaligned_exit:
+	cfi_remember_state
+	mov	result, tmp1
+	ldr	r4, [sp], #16
+	cfi_restore (r4)
+	bx	lr
+
+#if STRCMP_NO_PRECHECK == 0
+.Laligned_m1:
+	add	src2, src2, #4
+#endif
+.Lsrc1_aligned:
+	cfi_restore_state
+	/* src1 is word aligned, but src2 has no common alignment
+	   with it.  */
+	ldr	data1, [src1], #4
+	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */
+
+	bic	src2, src2, #3
+	ldr	data2, [src2], #4
+	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
+	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */
+
+	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
+.Loverlap3:
+	bic	tmp1, data1, #MSB
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #8
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #24
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap3
+4:
+	S2LO	data2, data2, #8
+	b	.Lstrcmp_tail
+
+5:
+	bics	syndrome, syndrome, #MSB
+	bne	.Lstrcmp_done_equal
+
+	/* We can only get here if the MSB of data1 contains 0, so
+	   fast-path the exit.  */
+	ldrb	result, [src2]
+	cfi_remember_state
+	ldrd	r4, r5, [sp], #16
+	cfi_restore (r4)
+	cfi_restore (r5)
+	/* R6/7 Not used in this sequence.  */
+	cfi_restore (r6)
+	cfi_restore (r7)
+	neg	result, result
+	bx	lr
+
+6:
+	cfi_restore_state
+	S2LO	data1, data1, #24
+	and	data2, data2, #LSB
+	b	.Lstrcmp_tail
+
+	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
+.Loverlap2:
+	and	tmp1, data1, const_m1, S2LO #16
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #16
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #16
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap2
+4:
+	S2LO	data2, data2, #16
+	b	.Lstrcmp_tail
+5:
+	ands	syndrome, syndrome, const_m1, S2LO #16
+	bne	.Lstrcmp_done_equal
+
+	ldrh	data2, [src2]
+	S2LO	data1, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+	lsl	data2, data2, #16
+#endif
+	b	.Lstrcmp_tail
+
+6:
+	S2LO	data1, data1, #16
+	and	data2, data2, const_m1, S2LO #16
+	b	.Lstrcmp_tail
+
+	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
+.Loverlap1:
+	and	tmp1, data1, #LSB
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #24
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #8
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap1
+4:
+	S2LO	data2, data2, #24
+	b	.Lstrcmp_tail
+5:
+	tst	syndrome, #LSB
+	bne	.Lstrcmp_done_equal
+	ldr	data2, [src2]
+6:
+	S2LO	data1, data1, #8
+	bic	data2, data2, #MSB
+	b	.Lstrcmp_tail
+
+.Lstrcmp_done_equal:
+	mov	result, #0
+	cfi_remember_state
+	ldrd	r4, r5, [sp], #16
+	cfi_restore (r4)
+	cfi_restore (r5)
+	/* R6/7 not used in this sequence.  */
+	cfi_restore (r6)
+	cfi_restore (r7)
+	bx	lr
+
+.Lstrcmp_tail:
+	cfi_restore_state
+#ifndef __ARM_BIG_ENDIAN
+	rev	data1, data1
+	rev	data2, data2
+	/* Now everything looks big-endian...  */
+#endif
+	uadd8	tmp1, data1, const_m1
+	eor	tmp1, data1, data2
+	sel	syndrome, tmp1, const_m1
+	clz	tmp1, syndrome
+	lsl	data1, data1, tmp1
+	lsl	data2, data2, tmp1
+	lsr	result, data1, #24
+	ldrd	r4, r5, [sp], #16
+	cfi_restore (r4)
+	cfi_restore (r5)
+	/* R6/7 not used in this sequence.  */
+	cfi_restore (r6)
+	cfi_restore (r7)
+	sub	result, result, data2, lsr #24
+	bx	lr
+END(strcmp)
+libc_hidden_builtin_def (strcmp)