[v3,1/2] aarch64: Sync with ARM-software/optimized-routines
Commit Message
Update AArch64 assembly string routines from:
https://github.com/ARM-software/optimized-routines
commit 0cf84f26b6b8dcad8287fe30a4dcc1fdabd06560
Author: Sebastian Huber <sebastian.huber@embedded-brains.de>
Date: Thu Jul 27 17:14:57 2023 +0200
string: Fix corrupt GNU_PROPERTY_TYPE (5) size
For ELF32 the notes alignment is 4 and not 8.
---
newlib/libc/machine/aarch64/asmdefs.h | 106 ++++++
newlib/libc/machine/aarch64/memchr.S | 73 ++--
newlib/libc/machine/aarch64/memcmp.S | 311 +++++++++--------
newlib/libc/machine/aarch64/memcpy.S | 272 ++++++++-------
newlib/libc/machine/aarch64/memset.S | 194 ++---------
newlib/libc/machine/aarch64/stpcpy.S | 36 +-
newlib/libc/machine/aarch64/strchr.S | 107 ++----
newlib/libc/machine/aarch64/strchrnul.S | 90 ++---
newlib/libc/machine/aarch64/strcmp.S | 282 ++++++++-------
newlib/libc/machine/aarch64/strcpy.S | 437 +++++++-----------------
newlib/libc/machine/aarch64/strlen.S | 319 ++++++++---------
newlib/libc/machine/aarch64/strncmp.S | 323 ++++++++++--------
newlib/libc/machine/aarch64/strnlen.S | 256 +++++---------
newlib/libc/machine/aarch64/strrchr.S | 86 ++---
14 files changed, 1226 insertions(+), 1666 deletions(-)
create mode 100644 newlib/libc/machine/aarch64/asmdefs.h
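For reference, a minimal C-level sketch of the NT_GNU_PROPERTY_TYPE_0 note that
the new GNU_PROPERTY macro emits, assuming the standard encoding from elf.h
(the struct and field names below are illustrative only, not part of the
patch). It shows why the descriptor size is 16 for ELF64 but 12 for ELF32,
which is the size the upstream commit fixes:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative sketch, not from the patch: a C view of the note emitted
       by GNU_PROPERTY.  The ELF64 form is shown; the ELF32 (__ILP32__) form
       simply drops the trailing pad word.  */
    struct aarch64_gnu_property_note {
        uint32_t namesz;    /* 4: "GNU" plus NUL terminator */
        uint32_t descsz;    /* 16 for ELF64, 12 for ELF32 */
        uint32_t type;      /* 5: NT_GNU_PROPERTY_TYPE_0 */
        char     name[4];   /* "GNU\0" */
        uint32_t pr_type;   /* 0xc0000000: FEATURE_1_AND */
        uint32_t pr_datasz; /* 4 */
        uint32_t pr_data;   /* FEATURE_1_BTI | FEATURE_1_PAC */
        uint32_t pr_pad;    /* ELF64 only: pads the property to 8 bytes */
    };

    int main(void)
    {
        /* The descriptor is padded to the note alignment: 8 bytes on ELF64
           (descsz = 16, .p2align 3) but only 4 bytes on ELF32
           (descsz = 12, .p2align 2), matching the two macro variants.  */
        printf("ELF64 descsz = %u, ELF32 descsz = %u\n", 16u, 12u);
        return 0;
    }

With the 8-byte padding applied unconditionally, ELF32 objects carried a
descriptor size that does not match the 4-byte note alignment, presumably
triggering the "corrupt GNU_PROPERTY_TYPE (5) size" diagnostic the upstream
commit title quotes.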
Comments
Hi Sebastian,
My apologies for the delay in replying; organizing the GNU Cauldron took up
a lot of my time over the last few weeks.
This is basically OK, but you're removing an existing license and adding
a new one from Arm; I think you need to copy the new license into
COPYING.NEWLIB - it's not enough just to have an SPDX identifier; the
text of the license must be added somewhere as well.
R.
On 12/09/2023 11:05, Sebastian Huber wrote:
> Update AArch64 assembly string routines from:
>
> https://github.com/ARM-software/optimized-routines
>
> commit 0cf84f26b6b8dcad8287fe30a4dcc1fdabd06560
> Author: Sebastian Huber <sebastian.huber@embedded-brains.de>
> Date: Thu Jul 27 17:14:57 2023 +0200
>
> string: Fix corrupt GNU_PROPERTY_TYPE (5) size
>
> For ELF32 the notes alignment is 4 and not 8.
> ---
> newlib/libc/machine/aarch64/asmdefs.h | 106 ++++++
> newlib/libc/machine/aarch64/memchr.S | 73 ++--
> newlib/libc/machine/aarch64/memcmp.S | 311 +++++++++--------
> newlib/libc/machine/aarch64/memcpy.S | 272 ++++++++-------
> newlib/libc/machine/aarch64/memset.S | 194 ++---------
> newlib/libc/machine/aarch64/stpcpy.S | 36 +-
> newlib/libc/machine/aarch64/strchr.S | 107 ++----
> newlib/libc/machine/aarch64/strchrnul.S | 90 ++---
> newlib/libc/machine/aarch64/strcmp.S | 282 ++++++++-------
> newlib/libc/machine/aarch64/strcpy.S | 437 +++++++-----------------
> newlib/libc/machine/aarch64/strlen.S | 319 ++++++++---------
> newlib/libc/machine/aarch64/strncmp.S | 323 ++++++++++--------
> newlib/libc/machine/aarch64/strnlen.S | 256 +++++---------
> newlib/libc/machine/aarch64/strrchr.S | 86 ++---
> 14 files changed, 1226 insertions(+), 1666 deletions(-)
> create mode 100644 newlib/libc/machine/aarch64/asmdefs.h
>
> diff --git a/newlib/libc/machine/aarch64/asmdefs.h b/newlib/libc/machine/aarch64/asmdefs.h
> new file mode 100644
> index 0000000000..131b95e1fe
> --- /dev/null
> +++ b/newlib/libc/machine/aarch64/asmdefs.h
> @@ -0,0 +1,106 @@
> +/*
> + * Macros for asm code. AArch64 version.
> + *
> + * Copyright (c) 2019-2023, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> +
> +#ifndef _ASMDEFS_H
> +#define _ASMDEFS_H
> +
> +/* Branch Target Identification support. */
> +#define BTI_C hint 34
> +#define BTI_J hint 36
> +/* Return address signing support (pac-ret). */
> +#define PACIASP hint 25; .cfi_window_save
> +#define AUTIASP hint 29; .cfi_window_save
> +
> +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
> +#define FEATURE_1_AND 0xc0000000
> +#define FEATURE_1_BTI 1
> +#define FEATURE_1_PAC 2
> +
> +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
> +#ifdef __ILP32__
> +#define GNU_PROPERTY(type, value) \
> + .section .note.gnu.property, "a"; \
> + .p2align 2; \
> + .word 4; \
> + .word 12; \
> + .word 5; \
> + .asciz "GNU"; \
> + .word type; \
> + .word 4; \
> + .word value; \
> + .text
> +#else
> +#define GNU_PROPERTY(type, value) \
> + .section .note.gnu.property, "a"; \
> + .p2align 3; \
> + .word 4; \
> + .word 16; \
> + .word 5; \
> + .asciz "GNU"; \
> + .word type; \
> + .word 4; \
> + .word value; \
> + .word 0; \
> + .text
> +#endif
> +
> +/* If set then the GNU Property Note section will be added to
> + mark objects to support BTI and PAC-RET. */
> +#ifndef WANT_GNU_PROPERTY
> +#define WANT_GNU_PROPERTY 1
> +#endif
> +
> +#if WANT_GNU_PROPERTY
> +/* Add property note with supported features to all asm files. */
> +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
> +#endif
> +
> +#define ENTRY_ALIGN(name, alignment) \
> + .global name; \
> + .type name,%function; \
> + .align alignment; \
> + name: \
> + .cfi_startproc; \
> + BTI_C;
> +
> +#define ENTRY(name) ENTRY_ALIGN(name, 6)
> +
> +#define ENTRY_ALIAS(name) \
> + .global name; \
> + .type name,%function; \
> + name:
> +
> +#define END(name) \
> + .cfi_endproc; \
> + .size name, .-name;
> +
> +#define L(l) .L ## l
> +
> +#ifdef __ILP32__
> + /* Sanitize padding bits of pointer arguments as per aapcs64 */
> +#define PTR_ARG(n) mov w##n, w##n
> +#else
> +#define PTR_ARG(n)
> +#endif
> +
> +#ifdef __ILP32__
> + /* Sanitize padding bits of size arguments as per aapcs64 */
> +#define SIZE_ARG(n) mov w##n, w##n
> +#else
> +#define SIZE_ARG(n)
> +#endif
> +
> +/* Compiler supports SVE instructions */
> +#ifndef HAVE_SVE
> +# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
> +# define HAVE_SVE 1
> +# else
> +# define HAVE_SVE 0
> +# endif
> +#endif
> +
> +#endif
> diff --git a/newlib/libc/machine/aarch64/memchr.S b/newlib/libc/machine/aarch64/memchr.S
> index 53f5d6bc0e..a0f305e0fc 100644
> --- a/newlib/libc/machine/aarch64/memchr.S
> +++ b/newlib/libc/machine/aarch64/memchr.S
> @@ -1,31 +1,8 @@
> /*
> * memchr - find a character in a memory zone
> *
> - * Copyright (c) 2014, ARM Limited
> - * All rights Reserved.
> - *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions are met:
> - * * Redistributions of source code must retain the above copyright
> - * notice, this list of conditions and the following disclaimer.
> - * * Redistributions in binary form must reproduce the above copyright
> - * notice, this list of conditions and the following disclaimer in the
> - * documentation and/or other materials provided with the distribution.
> - * * Neither the name of the company nor the names of its contributors
> - * may be used to endorse or promote products derived from this
> - * software without specific prior written permission.
> - *
> - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + * Copyright (c) 2014-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> */
>
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> @@ -37,6 +14,8 @@
> * Neon Available.
> */
>
> +#include "asmdefs.h"
> +
> /* Arguments and results. */
> #define srcin x0
> #define chrin w1
> @@ -70,17 +49,11 @@
> * identify exactly which byte has matched.
> */
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -def_fn memchr
> +ENTRY (memchr)
> + PTR_ARG (0)
> + SIZE_ARG (2)
> /* Do not dereference srcin if no bytes to compare. */
> - cbz cntin, .Lzero_length
> + cbz cntin, L(zero_length)
> /*
> * Magic constant 0x40100401 allows us to identify which lane matches
> * the requested byte.
> @@ -93,7 +66,7 @@ def_fn memchr
> dup vrepmask.4s, wtmp2
> ands soff, srcin, #31
> and cntrem, cntin, #31
> - b.eq .Lloop
> + b.eq L(loop)
>
> /*
> * Input string is not 32-byte aligned. We calculate the syndrome
> @@ -110,41 +83,41 @@ def_fn memchr
> and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
> addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
> addp vend.16b, vend.16b, vend.16b /* 128->64 */
> - mov synd, vend.2d[0]
> + mov synd, vend.d[0]
> /* Clear the soff*2 lower bits */
> lsl tmp, soff, #1
> lsr synd, synd, tmp
> lsl synd, synd, tmp
> /* The first block can also be the last */
> - b.ls .Lmasklast
> + b.ls L(masklast)
> /* Have we found something already? */
> - cbnz synd, .Ltail
> + cbnz synd, L(tail)
>
> -.Lloop:
> +L(loop):
> ld1 {vdata1.16b, vdata2.16b}, [src], #32
> subs cntin, cntin, #32
> cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
> cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
> /* If we're out of data we finish regardless of the result */
> - b.ls .Lend
> + b.ls L(end)
> /* Use a fast check for the termination condition */
> orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
> addp vend.2d, vend.2d, vend.2d
> - mov synd, vend.2d[0]
> + mov synd, vend.d[0]
> /* We're not out of data, loop if we haven't found the character */
> - cbz synd, .Lloop
> + cbz synd, L(loop)
>
> -.Lend:
> +L(end):
> /* Termination condition found, let's calculate the syndrome value */
> and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
> and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
> addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
> addp vend.16b, vend.16b, vend.16b /* 128->64 */
> - mov synd, vend.2d[0]
> + mov synd, vend.d[0]
> /* Only do the clear for the last possible block */
> - b.hi .Ltail
> + b.hs L(tail)
>
> -.Lmasklast:
> +L(masklast):
> /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
> add tmp, cntrem, soff
> and tmp, tmp, #31
> @@ -153,7 +126,7 @@ def_fn memchr
> lsl synd, synd, tmp
> lsr synd, synd, tmp
>
> -.Ltail:
> +L(tail):
> /* Count the trailing zeros using bit reversing */
> rbit synd, synd
> /* Compensate the last post-increment */
> @@ -168,9 +141,9 @@ def_fn memchr
> csel result, xzr, result, eq
> ret
>
> -.Lzero_length:
> +L(zero_length):
> mov result, #0
> ret
>
> - .size memchr, . - memchr
> +END (memchr)
> #endif
> diff --git a/newlib/libc/machine/aarch64/memcmp.S b/newlib/libc/machine/aarch64/memcmp.S
> index 605d99365e..18874d3215 100644
> --- a/newlib/libc/machine/aarch64/memcmp.S
> +++ b/newlib/libc/machine/aarch64/memcmp.S
> @@ -1,57 +1,7 @@
> /* memcmp - compare memory
> -
> - Copyright (c) 2018 Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> -/*
> - * Copyright (c) 2017 ARM Ltd
> - * All rights reserved.
> - *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions
> - * are met:
> - * 1. Redistributions of source code must retain the above copyright
> - * notice, this list of conditions and the following disclaimer.
> - * 2. Redistributions in binary form must reproduce the above copyright
> - * notice, this list of conditions and the following disclaimer in the
> - * documentation and/or other materials provided with the distribution.
> - * 3. The name of the company may not be used to endorse or promote
> - * products derived from this software without specific prior written
> - * permission.
> *
> - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
> - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
> - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
> - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
> - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + * Copyright (c) 2013-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> */
>
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> @@ -60,103 +10,79 @@
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64, unaligned accesses.
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> */
>
> -#define L(l) .L ## l
> -
> -/* Parameters and result. */
> -#define src1 x0
> -#define src2 x1
> -#define limit x2
> -#define result w0
> -
> -/* Internal variables. */
> -#define data1 x3
> -#define data1w w3
> -#define data1h x4
> -#define data2 x5
> -#define data2w w5
> -#define data2h x6
> -#define tmp1 x7
> -#define tmp2 x8
> -
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -def_fn memcmp p2align=6
> - subs limit, limit, 8
> - b.lo L(less8)
> -
> - ldr data1, [src1], 8
> - ldr data2, [src2], 8
> - cmp data1, data2
> - b.ne L(return)
> -
> - subs limit, limit, 8
> - b.gt L(more16)
> -
> - ldr data1, [src1, limit]
> - ldr data2, [src2, limit]
> - b L(return)
> -
> -L(more16):
> - ldr data1, [src1], 8
> - ldr data2, [src2], 8
> - cmp data1, data2
> - bne L(return)
> -
> - /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
> - strings. */
> - subs limit, limit, 16
> +#include "asmdefs.h"
> +
> +#define src1 x0
> +#define src2 x1
> +#define limit x2
> +#define result w0
> +
> +#define data1 x3
> +#define data1w w3
> +#define data2 x4
> +#define data2w w4
> +#define data3 x5
> +#define data3w w5
> +#define data4 x6
> +#define data4w w6
> +#define tmp x6
> +#define src1end x7
> +#define src2end x8
> +
> +
> +ENTRY (memcmp)
> + PTR_ARG (0)
> + PTR_ARG (1)
> + SIZE_ARG (2)
> +
> + cmp limit, 16
> + b.lo L(less16)
> + ldp data1, data3, [src1]
> + ldp data2, data4, [src2]
> + ccmp data1, data2, 0, ne
> + ccmp data3, data4, 0, eq
> + b.ne L(return2)
> +
> + add src1end, src1, limit
> + add src2end, src2, limit
> + cmp limit, 32
> b.ls L(last_bytes)
> + cmp limit, 160
> + b.hs L(loop_align)
> + sub limit, limit, 32
>
> - /* We overlap loads between 0-32 bytes at either side of SRC1 when we
> - try to align, so limit it only to strings larger than 128 bytes. */
> - cmp limit, 96
> - b.ls L(loop16)
> -
> - /* Align src1 and adjust src2 with bytes not yet done. */
> - and tmp1, src1, 15
> - add limit, limit, tmp1
> - sub src1, src1, tmp1
> - sub src2, src2, tmp1
> -
> - /* Loop performing 16 bytes per iteration using aligned src1.
> - Limit is pre-decremented by 16 and must be larger than zero.
> - Exit if <= 16 bytes left to do or if the data is not equal. */
> .p2align 4
> -L(loop16):
> - ldp data1, data1h, [src1], 16
> - ldp data2, data2h, [src2], 16
> - subs limit, limit, 16
> - ccmp data1, data2, 0, hi
> - ccmp data1h, data2h, 0, eq
> - b.eq L(loop16)
> -
> +L(loop32):
> + ldp data1, data3, [src1, 16]
> + ldp data2, data4, [src2, 16]
> cmp data1, data2
> - bne L(return)
> - mov data1, data1h
> - mov data2, data2h
> + ccmp data3, data4, 0, eq
> + b.ne L(return2)
> + cmp limit, 16
> + b.ls L(last_bytes)
> +
> + ldp data1, data3, [src1, 32]
> + ldp data2, data4, [src2, 32]
> cmp data1, data2
> - bne L(return)
> + ccmp data3, data4, 0, eq
> + b.ne L(return2)
> + add src1, src1, 32
> + add src2, src2, 32
> +L(last64):
> + subs limit, limit, 32
> + b.hi L(loop32)
>
> /* Compare last 1-16 bytes using unaligned access. */
> L(last_bytes):
> - add src1, src1, limit
> - add src2, src2, limit
> - ldp data1, data1h, [src1]
> - ldp data2, data2h, [src2]
> - cmp data1, data2
> - bne L(return)
> - mov data1, data1h
> - mov data2, data2h
> + ldp data1, data3, [src1end, -16]
> + ldp data2, data4, [src2end, -16]
> +L(return2):
> cmp data1, data2
> + csel data1, data1, data3, ne
> + csel data2, data2, data4, ne
>
> /* Compare data bytes and set return value to 0, -1 or 1. */
> L(return):
> @@ -164,33 +90,106 @@ L(return):
> rev data1, data1
> rev data2, data2
> #endif
> - cmp data1, data2
> -L(ret_eq):
> + cmp data1, data2
> cset result, ne
> cneg result, result, lo
> ret
>
> .p2align 4
> - /* Compare up to 8 bytes. Limit is [-8..-1]. */
> +L(less16):
> + add src1end, src1, limit
> + add src2end, src2, limit
> + tbz limit, 3, L(less8)
> + ldr data1, [src1]
> + ldr data2, [src2]
> + ldr data3, [src1end, -8]
> + ldr data4, [src2end, -8]
> + b L(return2)
> +
> + .p2align 4
> L(less8):
> - adds limit, limit, 4
> - b.lo L(less4)
> - ldr data1w, [src1], 4
> - ldr data2w, [src2], 4
> + tbz limit, 2, L(less4)
> + ldr data1w, [src1]
> + ldr data2w, [src2]
> + ldr data3w, [src1end, -4]
> + ldr data4w, [src2end, -4]
> + b L(return2)
> +
> +L(less4):
> + tbz limit, 1, L(less2)
> + ldrh data1w, [src1]
> + ldrh data2w, [src2]
> cmp data1w, data2w
> b.ne L(return)
> - sub limit, limit, 4
> -L(less4):
> - adds limit, limit, 4
> - beq L(ret_eq)
> -L(byte_loop):
> - ldrb data1w, [src1], 1
> - ldrb data2w, [src2], 1
> - subs limit, limit, 1
> - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
> - b.eq L(byte_loop)
> +L(less2):
> + mov result, 0
> + tbz limit, 0, L(return_zero)
> + ldrb data1w, [src1end, -1]
> + ldrb data2w, [src2end, -1]
> sub result, data1w, data2w
> +L(return_zero):
> + ret
> +
> +L(loop_align):
> + ldp data1, data3, [src1, 16]
> + ldp data2, data4, [src2, 16]
> + cmp data1, data2
> + ccmp data3, data4, 0, eq
> + b.ne L(return2)
> +
> + /* Align src2 and adjust src1, src2 and limit. */
> + and tmp, src2, 15
> + sub tmp, tmp, 16
> + sub src2, src2, tmp
> + add limit, limit, tmp
> + sub src1, src1, tmp
> + sub limit, limit, 64 + 16
> +
> + .p2align 4
> +L(loop64):
> + ldr q0, [src1, 16]
> + ldr q1, [src2, 16]
> + subs limit, limit, 64
> + ldr q2, [src1, 32]
> + ldr q3, [src2, 32]
> + eor v0.16b, v0.16b, v1.16b
> + eor v1.16b, v2.16b, v3.16b
> + ldr q2, [src1, 48]
> + ldr q3, [src2, 48]
> + umaxp v0.16b, v0.16b, v1.16b
> + ldr q4, [src1, 64]!
> + ldr q5, [src2, 64]!
> + eor v1.16b, v2.16b, v3.16b
> + eor v2.16b, v4.16b, v5.16b
> + umaxp v1.16b, v1.16b, v2.16b
> + umaxp v0.16b, v0.16b, v1.16b
> + umaxp v0.16b, v0.16b, v0.16b
> + fmov tmp, d0
> + ccmp tmp, 0, 0, hi
> + b.eq L(loop64)
> +
> + /* If equal, process last 1-64 bytes using scalar loop. */
> + add limit, limit, 64 + 16
> + cbz tmp, L(last64)
> +
> + /* Determine the 8-byte aligned offset of the first difference. */
> +#ifdef __AARCH64EB__
> + rev16 tmp, tmp
> +#endif
> + rev tmp, tmp
> + clz tmp, tmp
> + bic tmp, tmp, 7
> + sub tmp, tmp, 48
> + ldr data1, [src1, tmp]
> + ldr data2, [src2, tmp]
> +#ifndef __AARCH64EB__
> + rev data1, data1
> + rev data2, data2
> +#endif
> + mov result, 1
> + cmp data1, data2
> + cneg result, result, lo
> ret
>
> - .size memcmp, . - memcmp
> +END (memcmp)
> #endif
> diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S
> index 463bad0a18..248e7843a2 100644
> --- a/newlib/libc/machine/aarch64/memcpy.S
> +++ b/newlib/libc/machine/aarch64/memcpy.S
> @@ -1,55 +1,8 @@
> -/* Copyright (c) 2012-2013, Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> /*
> - * Copyright (c) 2015 ARM Ltd
> - * All rights reserved.
> - *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions
> - * are met:
> - * 1. Redistributions of source code must retain the above copyright
> - * notice, this list of conditions and the following disclaimer.
> - * 2. Redistributions in binary form must reproduce the above copyright
> - * notice, this list of conditions and the following disclaimer in the
> - * documentation and/or other materials provided with the distribution.
> - * 3. The name of the company may not be used to endorse or promote
> - * products derived from this software without specific prior written
> - * permission.
> + * memcpy - copy memory area
> *
> - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
> - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
> - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
> - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
> - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + * Copyright (c) 2012-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> */
>
> /* Assumptions:
> @@ -61,6 +14,7 @@
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See memcpy-stub.c */
> #else
> +#include "asmdefs.h"
>
> #define dstin x0
> #define src x1
> @@ -71,122 +25,139 @@
> #define A_l x6
> #define A_lw w6
> #define A_h x7
> -#define A_hw w7
> #define B_l x8
> #define B_lw w8
> #define B_h x9
> #define C_l x10
> +#define C_lw w10
> #define C_h x11
> #define D_l x12
> #define D_h x13
> -#define E_l src
> -#define E_h count
> -#define F_l srcend
> -#define F_h dst
> -#define tmp1 x9
> -
> -#define L(l) .L ## l
> -
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -/* Copies are split into 3 main cases: small copies of up to 16 bytes,
> - medium copies of 17..96 bytes which are fully unrolled. Large copies
> - of more than 96 bytes align the destination and use an unrolled loop
> - processing 64 bytes per iteration.
> - Small and medium copies read all data before writing, allowing any
> - kind of overlap, and memmove tailcalls memcpy for these cases as
> - well as non-overlapping copies.
> +#define E_l x14
> +#define E_h x15
> +#define F_l x16
> +#define F_h x17
> +#define G_l count
> +#define G_h dst
> +#define H_l src
> +#define H_h srcend
> +#define tmp1 x14
> +
> +/* This implementation handles overlaps and supports both memcpy and memmove
> + from a single entry point. It uses unaligned accesses and branchless
> + sequences to keep the code small, simple and improve performance.
> +
> + Copies are split into 3 main cases: small copies of up to 32 bytes, medium
> + copies of up to 128 bytes, and large copies. The overhead of the overlap
> + check is negligible since it is only required for large copies.
> +
> + Large copies use a software pipelined loop processing 64 bytes per iteration.
> + The destination pointer is 16-byte aligned to minimize unaligned accesses.
> + The loop tail is handled by always copying 64 bytes from the end.
> */
>
> -def_fn memcpy p2align=6
> - prfm PLDL1KEEP, [src]
> +ENTRY_ALIAS (memmove)
> +ENTRY (memcpy)
> + PTR_ARG (0)
> + PTR_ARG (1)
> + SIZE_ARG (2)
> add srcend, src, count
> add dstend, dstin, count
> - cmp count, 16
> - b.ls L(copy16)
> - cmp count, 96
> + cmp count, 128
> b.hi L(copy_long)
> + cmp count, 32
> + b.hi L(copy32_128)
>
> - /* Medium copies: 17..96 bytes. */
> - sub tmp1, count, 1
> + /* Small copies: 0..32 bytes. */
> + cmp count, 16
> + b.lo L(copy16)
> ldp A_l, A_h, [src]
> - tbnz tmp1, 6, L(copy96)
> ldp D_l, D_h, [srcend, -16]
> - tbz tmp1, 5, 1f
> - ldp B_l, B_h, [src, 16]
> - ldp C_l, C_h, [srcend, -32]
> - stp B_l, B_h, [dstin, 16]
> - stp C_l, C_h, [dstend, -32]
> -1:
> stp A_l, A_h, [dstin]
> stp D_l, D_h, [dstend, -16]
> ret
>
> - .p2align 4
> - /* Small copies: 0..16 bytes. */
> + /* Copy 8-15 bytes. */
> L(copy16):
> - cmp count, 8
> - b.lo 1f
> + tbz count, 3, L(copy8)
> ldr A_l, [src]
> ldr A_h, [srcend, -8]
> str A_l, [dstin]
> str A_h, [dstend, -8]
> ret
> - .p2align 4
> -1:
> - tbz count, 2, 1f
> +
> + .p2align 3
> + /* Copy 4-7 bytes. */
> +L(copy8):
> + tbz count, 2, L(copy4)
> ldr A_lw, [src]
> - ldr A_hw, [srcend, -4]
> + ldr B_lw, [srcend, -4]
> str A_lw, [dstin]
> - str A_hw, [dstend, -4]
> + str B_lw, [dstend, -4]
> ret
>
> - /* Copy 0..3 bytes. Use a branchless sequence that copies the same
> - byte 3 times if count==1, or the 2nd byte twice if count==2. */
> -1:
> - cbz count, 2f
> + /* Copy 0..3 bytes using a branchless sequence. */
> +L(copy4):
> + cbz count, L(copy0)
> lsr tmp1, count, 1
> ldrb A_lw, [src]
> - ldrb A_hw, [srcend, -1]
> + ldrb C_lw, [srcend, -1]
> ldrb B_lw, [src, tmp1]
> strb A_lw, [dstin]
> strb B_lw, [dstin, tmp1]
> - strb A_hw, [dstend, -1]
> -2: ret
> + strb C_lw, [dstend, -1]
> +L(copy0):
> + ret
>
> .p2align 4
> - /* Copy 64..96 bytes. Copy 64 bytes from the start and
> - 32 bytes from the end. */
> -L(copy96):
> + /* Medium copies: 33..128 bytes. */
> +L(copy32_128):
> + ldp A_l, A_h, [src]
> ldp B_l, B_h, [src, 16]
> - ldp C_l, C_h, [src, 32]
> - ldp D_l, D_h, [src, 48]
> - ldp E_l, E_h, [srcend, -32]
> - ldp F_l, F_h, [srcend, -16]
> + ldp C_l, C_h, [srcend, -32]
> + ldp D_l, D_h, [srcend, -16]
> + cmp count, 64
> + b.hi L(copy128)
> stp A_l, A_h, [dstin]
> stp B_l, B_h, [dstin, 16]
> - stp C_l, C_h, [dstin, 32]
> - stp D_l, D_h, [dstin, 48]
> - stp E_l, E_h, [dstend, -32]
> - stp F_l, F_h, [dstend, -16]
> + stp C_l, C_h, [dstend, -32]
> + stp D_l, D_h, [dstend, -16]
> ret
>
> - /* Align DST to 16 byte alignment so that we don't cross cache line
> - boundaries on both loads and stores. There are at least 96 bytes
> - to copy, so copy 16 bytes unaligned and then align. The loop
> - copies 64 bytes per iteration and prefetches one iteration ahead. */
> + .p2align 4
> + /* Copy 65..128 bytes. */
> +L(copy128):
> + ldp E_l, E_h, [src, 32]
> + ldp F_l, F_h, [src, 48]
> + cmp count, 96
> + b.ls L(copy96)
> + ldp G_l, G_h, [srcend, -64]
> + ldp H_l, H_h, [srcend, -48]
> + stp G_l, G_h, [dstend, -64]
> + stp H_l, H_h, [dstend, -48]
> +L(copy96):
> + stp A_l, A_h, [dstin]
> + stp B_l, B_h, [dstin, 16]
> + stp E_l, E_h, [dstin, 32]
> + stp F_l, F_h, [dstin, 48]
> + stp C_l, C_h, [dstend, -32]
> + stp D_l, D_h, [dstend, -16]
> + ret
>
> .p2align 4
> + /* Copy more than 128 bytes. */
> L(copy_long):
> + /* Use backwards copy if there is an overlap. */
> + sub tmp1, dstin, src
> + cbz tmp1, L(copy0)
> + cmp tmp1, count
> + b.lo L(copy_long_backwards)
> +
> + /* Copy 16 bytes and then align dst to 16-byte alignment. */
> +
> + ldp D_l, D_h, [src]
> and tmp1, dstin, 15
> bic dst, dstin, 15
> - ldp D_l, D_h, [src]
> sub src, src, tmp1
> add count, count, tmp1 /* Count is now 16 too large. */
> ldp A_l, A_h, [src, 16]
> @@ -195,8 +166,9 @@ L(copy_long):
> ldp C_l, C_h, [src, 48]
> ldp D_l, D_h, [src, 64]!
> subs count, count, 128 + 16 /* Test and readjust count. */
> - b.ls 2f
> -1:
> + b.ls L(copy64_from_end)
> +
> +L(loop64):
> stp A_l, A_h, [dst, 16]
> ldp A_l, A_h, [src, 16]
> stp B_l, B_h, [dst, 32]
> @@ -206,12 +178,10 @@ L(copy_long):
> stp D_l, D_h, [dst, 64]!
> ldp D_l, D_h, [src, 64]!
> subs count, count, 64
> - b.hi 1b
> + b.hi L(loop64)
>
> - /* Write the last full set of 64 bytes. The remainder is at most 64
> - bytes, so it is safe to always copy 64 bytes from the end even if
> - there is just 1 byte left. */
> -2:
> + /* Write the last iteration and copy 64 bytes from the end. */
> +L(copy64_from_end):
> ldp E_l, E_h, [srcend, -64]
> stp A_l, A_h, [dst, 16]
> ldp A_l, A_h, [srcend, -48]
> @@ -226,5 +196,51 @@ L(copy_long):
> stp C_l, C_h, [dstend, -16]
> ret
>
> - .size memcpy, . - memcpy
> + .p2align 4
> +
> + /* Large backwards copy for overlapping copies.
> + Copy 16 bytes and then align dst to 16-byte alignment. */
> +L(copy_long_backwards):
> + ldp D_l, D_h, [srcend, -16]
> + and tmp1, dstend, 15
> + sub srcend, srcend, tmp1
> + sub count, count, tmp1
> + ldp A_l, A_h, [srcend, -16]
> + stp D_l, D_h, [dstend, -16]
> + ldp B_l, B_h, [srcend, -32]
> + ldp C_l, C_h, [srcend, -48]
> + ldp D_l, D_h, [srcend, -64]!
> + sub dstend, dstend, tmp1
> + subs count, count, 128
> + b.ls L(copy64_from_start)
> +
> +L(loop64_backwards):
> + stp A_l, A_h, [dstend, -16]
> + ldp A_l, A_h, [srcend, -16]
> + stp B_l, B_h, [dstend, -32]
> + ldp B_l, B_h, [srcend, -32]
> + stp C_l, C_h, [dstend, -48]
> + ldp C_l, C_h, [srcend, -48]
> + stp D_l, D_h, [dstend, -64]!
> + ldp D_l, D_h, [srcend, -64]!
> + subs count, count, 64
> + b.hi L(loop64_backwards)
> +
> + /* Write the last iteration and copy 64 bytes from the start. */
> +L(copy64_from_start):
> + ldp G_l, G_h, [src, 48]
> + stp A_l, A_h, [dstend, -16]
> + ldp A_l, A_h, [src, 32]
> + stp B_l, B_h, [dstend, -32]
> + ldp B_l, B_h, [src, 16]
> + stp C_l, C_h, [dstend, -48]
> + ldp C_l, C_h, [src]
> + stp D_l, D_h, [dstend, -64]
> + stp G_l, G_h, [dstin, 48]
> + stp A_l, A_h, [dstin, 32]
> + stp B_l, B_h, [dstin, 16]
> + stp C_l, C_h, [dstin]
> + ret
> +
> +END (memcpy)
> #endif
> diff --git a/newlib/libc/machine/aarch64/memset.S b/newlib/libc/machine/aarch64/memset.S
> index 103e3f8bb0..ca76439a91 100644
> --- a/newlib/libc/machine/aarch64/memset.S
> +++ b/newlib/libc/machine/aarch64/memset.S
> @@ -1,66 +1,20 @@
> -/* Copyright (c) 2012-2013, Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> /*
> - * Copyright (c) 2015 ARM Ltd
> - * All rights reserved.
> + * memset - fill memory with a constant byte
> *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions
> - * are met:
> - * 1. Redistributions of source code must retain the above copyright
> - * notice, this list of conditions and the following disclaimer.
> - * 2. Redistributions in binary form must reproduce the above copyright
> - * notice, this list of conditions and the following disclaimer in the
> - * documentation and/or other materials provided with the distribution.
> - * 3. The name of the company may not be used to endorse or promote
> - * products derived from this software without specific prior written
> - * permission.
> - *
> - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
> - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
> - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
> - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
> - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + * Copyright (c) 2012-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> */
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64, unaligned accesses
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> *
> */
>
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See memset-stub.c */
> #else
> +#include "asmdefs.h"
>
> #define dstin x0
> #define val x1
> @@ -68,24 +22,11 @@
> #define count x2
> #define dst x3
> #define dstend x4
> -#define tmp1 x5
> -#define tmp1w w5
> -#define tmp2 x6
> -#define tmp2w w6
> -#define zva_len x7
> -#define zva_lenw w7
> -
> -#define L(l) .L ## l
> +#define zva_val x5
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -def_fn memset p2align=6
> +ENTRY (memset)
> + PTR_ARG (0)
> + SIZE_ARG (2)
>
> dup v0.16B, valw
> add dstend, dstin, count
> @@ -101,7 +42,7 @@ def_fn memset p2align=6
> str val, [dstin]
> str val, [dstend, -8]
> ret
> - nop
> + .p2align 4
> 1: tbz count, 2, 2f
> str valw, [dstin]
> str valw, [dstend, -4]
> @@ -131,110 +72,49 @@ L(set96):
> stp q0, q0, [dstend, -32]
> ret
>
> - .p2align 3
> - nop
> + .p2align 4
> L(set_long):
> and valw, valw, 255
> bic dst, dstin, 15
> str q0, [dstin]
> - cmp count, 256
> - ccmp valw, 0, 0, cs
> - b.eq L(try_zva)
> -L(no_zva):
> - sub count, dstend, dst /* Count is 16 too large. */
> - sub dst, dst, 16 /* Dst is biased by -32. */
> - sub count, count, 64 + 16 /* Adjust count and bias for loop. */
> -1: stp q0, q0, [dst, 32]
> - stp q0, q0, [dst, 64]!
> -L(tail64):
> - subs count, count, 64
> - b.hi 1b
> -2: stp q0, q0, [dstend, -64]
> - stp q0, q0, [dstend, -32]
> - ret
> -
> - .p2align 3
> -L(try_zva):
> - mrs tmp1, dczid_el0
> - tbnz tmp1w, 4, L(no_zva)
> - and tmp1w, tmp1w, 15
> - cmp tmp1w, 4 /* ZVA size is 64 bytes. */
> - b.ne L(zva_128)
> -
> - /* Write the first and last 64 byte aligned block using stp rather
> - than using DC ZVA. This is faster on some cores.
> - */
> -L(zva_64):
> + cmp count, 160
> + ccmp valw, 0, 0, hs
> + b.ne L(no_zva)
> +
> +#ifndef SKIP_ZVA_CHECK
> + mrs zva_val, dczid_el0
> + and zva_val, zva_val, 31
> + cmp zva_val, 4 /* ZVA size is 64 bytes. */
> + b.ne L(no_zva)
> +#endif
> str q0, [dst, 16]
> stp q0, q0, [dst, 32]
> bic dst, dst, 63
> - stp q0, q0, [dst, 64]
> - stp q0, q0, [dst, 96]
> - sub count, dstend, dst /* Count is now 128 too large. */
> - sub count, count, 128+64+64 /* Adjust count and bias for loop. */
> - add dst, dst, 128
> - nop
> -1: dc zva, dst
> + sub count, dstend, dst /* Count is now 64 too large. */
> + sub count, count, 128 /* Adjust count and bias for loop. */
> +
> + .p2align 4
> +L(zva_loop):
> add dst, dst, 64
> + dc zva, dst
> subs count, count, 64
> - b.hi 1b
> - stp q0, q0, [dst, 0]
> - stp q0, q0, [dst, 32]
> + b.hi L(zva_loop)
> stp q0, q0, [dstend, -64]
> stp q0, q0, [dstend, -32]
> ret
>
> - .p2align 3
> -L(zva_128):
> - cmp tmp1w, 5 /* ZVA size is 128 bytes. */
> - b.ne L(zva_other)
> -
> - str q0, [dst, 16]
> +L(no_zva):
> + sub count, dstend, dst /* Count is 16 too large. */
> + sub dst, dst, 16 /* Dst is biased by -32. */
> + sub count, count, 64 + 16 /* Adjust count and bias for loop. */
> +L(no_zva_loop):
> stp q0, q0, [dst, 32]
> - stp q0, q0, [dst, 64]
> - stp q0, q0, [dst, 96]
> - bic dst, dst, 127
> - sub count, dstend, dst /* Count is now 128 too large. */
> - sub count, count, 128+128 /* Adjust count and bias for loop. */
> - add dst, dst, 128
> -1: dc zva, dst
> - add dst, dst, 128
> - subs count, count, 128
> - b.hi 1b
> - stp q0, q0, [dstend, -128]
> - stp q0, q0, [dstend, -96]
> + stp q0, q0, [dst, 64]!
> + subs count, count, 64
> + b.hi L(no_zva_loop)
> stp q0, q0, [dstend, -64]
> stp q0, q0, [dstend, -32]
> ret
>
> -L(zva_other):
> - mov tmp2w, 4
> - lsl zva_lenw, tmp2w, tmp1w
> - add tmp1, zva_len, 64 /* Max alignment bytes written. */
> - cmp count, tmp1
> - blo L(no_zva)
> -
> - sub tmp2, zva_len, 1
> - add tmp1, dst, zva_len
> - add dst, dst, 16
> - subs count, tmp1, dst /* Actual alignment bytes to write. */
> - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
> - beq 2f
> -1: stp q0, q0, [dst], 64
> - stp q0, q0, [dst, -32]
> - subs count, count, 64
> - b.hi 1b
> -2: mov dst, tmp1
> - sub count, dstend, tmp1 /* Remaining bytes to write. */
> - subs count, count, zva_len
> - b.lo 4f
> -3: dc zva, dst
> - add dst, dst, zva_len
> - subs count, count, zva_len
> - b.hs 3b
> -4: add count, count, zva_len
> - sub dst, dst, 32 /* Bias dst for tail loop. */
> - b L(tail64)
> -
> - .size memset, . - memset
> +END (memset)
> #endif
> diff --git a/newlib/libc/machine/aarch64/stpcpy.S b/newlib/libc/machine/aarch64/stpcpy.S
> index 696b45889f..155c68d75a 100644
> --- a/newlib/libc/machine/aarch64/stpcpy.S
> +++ b/newlib/libc/machine/aarch64/stpcpy.S
> @@ -1,34 +1,10 @@
> /*
> - stpcpy - copy a string returning pointer to end.
> + * stpcpy - copy a string returning pointer to end.
> + *
> + * Copyright (c) 2020, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
>
> - Copyright (c) 2015 ARM Ltd.
> - All Rights Reserved.
> +#define BUILD_STPCPY 1
>
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the company nor the names of its contributors
> - may be used to endorse or promote products derived from this
> - software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> -/* This is just a wrapper that uses strcpy code with appropriate
> - pre-defines. */
> -
> -#define BUILD_STPCPY
> #include "strcpy.S"
> diff --git a/newlib/libc/machine/aarch64/strchr.S b/newlib/libc/machine/aarch64/strchr.S
> index 2448dbc7d5..500d9aff29 100644
> --- a/newlib/libc/machine/aarch64/strchr.S
> +++ b/newlib/libc/machine/aarch64/strchr.S
> @@ -1,32 +1,9 @@
> /*
> - strchr - find a character in a string
> -
> - Copyright (c) 2014, ARM Limited
> - All rights Reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the company nor the names of its contributors
> - may be used to endorse or promote products derived from this
> - software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> + * strchr - find a character in a string
> + *
> + * Copyright (c) 2014-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strchr-stub.c */
> #else
> @@ -37,6 +14,8 @@
> * Neon Available.
> */
>
> +#include "asmdefs.h"
> +
> /* Arguments and results. */
> #define srcin x0
> #define chrin w1
> @@ -74,26 +53,19 @@
>
> /* Locals and temporaries. */
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -def_fn strchr
> - /* Magic constant 0x40100401 to allow us to identify which lane
> - matches the requested byte. Magic constant 0x80200802 used
> - similarly for NUL termination. */
> - mov wtmp2, #0x0401
> - movk wtmp2, #0x4010, lsl #16
> +ENTRY (strchr)
> + PTR_ARG (0)
> + /* Magic constant 0xc0300c03 to allow us to identify which lane
> + matches the requested byte. Even bits are set if the character
> + matches, odd bits if either the char is NUL or matches. */
> + mov wtmp2, 0x0c03
> + movk wtmp2, 0xc030, lsl 16
> dup vrepchr.16b, chrin
> bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
> dup vrepmask_c.4s, wtmp2
> ands tmp1, srcin, #31
> add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
> - b.eq .Lloop
> + b.eq L(loop)
>
> /* Input string is not 32-byte aligned. Rather than forcing
> the padding bytes to a safe value, we calculate the syndrome
> @@ -105,49 +77,42 @@ def_fn strchr
> cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
> cmeq vhas_nul2.16b, vdata2.16b, #0
> cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
> - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
> - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
> - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
> - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
> - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
> - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
> + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
> + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
> + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
> + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
> lsl tmp1, tmp1, #1
> addp vend1.16b, vend1.16b, vend2.16b // 256->128
> mov tmp3, #~0
> addp vend1.16b, vend1.16b, vend2.16b // 128->64
> lsr tmp1, tmp3, tmp1
>
> - mov tmp3, vend1.2d[0]
> + mov tmp3, vend1.d[0]
> bic tmp1, tmp3, tmp1 // Mask padding bits.
> - cbnz tmp1, .Ltail
> + cbnz tmp1, L(tail)
>
> -.Lloop:
> + .p2align 4
> +L(loop):
> ld1 {vdata1.16b, vdata2.16b}, [src], #32
> - cmeq vhas_nul1.16b, vdata1.16b, #0
> cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
> - cmeq vhas_nul2.16b, vdata2.16b, #0
> cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
> - /* Use a fast check for the termination condition. */
> - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
> - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
> - orr vend1.16b, vend1.16b, vend2.16b
> - addp vend1.2d, vend1.2d, vend1.2d
> - mov tmp1, vend1.2d[0]
> - cbz tmp1, .Lloop
> + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
> + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
> + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
> + umaxp vend1.16b, vend1.16b, vend1.16b
> + mov tmp1, vend1.d[0]
> + cbz tmp1, L(loop)
>
> /* Termination condition found. Now need to establish exactly why
> we terminated. */
> - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
> - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
> - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
> - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
> - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
> - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
> + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
> + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
> + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
> + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
> addp vend1.16b, vend1.16b, vend2.16b // 256->128
> addp vend1.16b, vend1.16b, vend2.16b // 128->64
> -
> - mov tmp1, vend1.2d[0]
> -.Ltail:
> + mov tmp1, vend1.d[0]
> +L(tail):
> /* Count the trailing zeros, by bit reversing... */
> rbit tmp1, tmp1
> /* Re-bias source. */
> @@ -160,5 +125,5 @@ def_fn strchr
> csel result, result, xzr, eq
> ret
>
> - .size strchr, . - strchr
> +END (strchr)
> #endif
> diff --git a/newlib/libc/machine/aarch64/strchrnul.S b/newlib/libc/machine/aarch64/strchrnul.S
> index a0ac13b7f4..ceaf4dca17 100644
> --- a/newlib/libc/machine/aarch64/strchrnul.S
> +++ b/newlib/libc/machine/aarch64/strchrnul.S
> @@ -1,32 +1,9 @@
> /*
> - strchrnul - find a character or nul in a string
> -
> - Copyright (c) 2014, ARM Limited
> - All rights Reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the company nor the names of its contributors
> - may be used to endorse or promote products derived from this
> - software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> + * strchrnul - find a character or nul in a string
> + *
> + * Copyright (c) 2014-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strchrnul-stub.c */
> #else
> @@ -37,6 +14,8 @@
> * Neon Available.
> */
>
> +#include "asmdefs.h"
> +
> /* Arguments and results. */
> #define srcin x0
> #define chrin w1
> @@ -70,15 +49,8 @@
>
> /* Locals and temporaries. */
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -def_fn strchrnul
> +ENTRY (strchrnul)
> + PTR_ARG (0)
> /* Magic constant 0x40100401 to allow us to identify which lane
> matches the termination condition. */
> mov wtmp2, #0x0401
> @@ -87,7 +59,7 @@ def_fn strchrnul
> bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
> dup vrepmask.4s, wtmp2
> ands tmp1, srcin, #31
> - b.eq .Lloop
> + b.eq L(loop)
>
> /* Input string is not 32-byte aligned. Rather than forcing
> the padding bytes to a safe value, we calculate the syndrome
> @@ -95,47 +67,43 @@ def_fn strchrnul
> syndrome that are related to the padding. */
> ld1 {vdata1.16b, vdata2.16b}, [src], #32
> neg tmp1, tmp1
> - cmeq vhas_nul1.16b, vdata1.16b, #0
> cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
> - cmeq vhas_nul2.16b, vdata2.16b, #0
> cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
> - orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
> - orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
> - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
> - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
> + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
> + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
> + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
> + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
> lsl tmp1, tmp1, #1
> addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
> mov tmp3, #~0
> addp vend1.16b, vend1.16b, vend1.16b // 128->64
> lsr tmp1, tmp3, tmp1
>
> - mov tmp3, vend1.2d[0]
> + mov tmp3, vend1.d[0]
> bic tmp1, tmp3, tmp1 // Mask padding bits.
> - cbnz tmp1, .Ltail
> + cbnz tmp1, L(tail)
>
> -.Lloop:
> + .p2align 4
> +L(loop):
> ld1 {vdata1.16b, vdata2.16b}, [src], #32
> - cmeq vhas_nul1.16b, vdata1.16b, #0
> cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
> - cmeq vhas_nul2.16b, vdata2.16b, #0
> cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
> - /* Use a fast check for the termination condition. */
> - orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
> - orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
> - orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
> - addp vend1.2d, vend1.2d, vend1.2d
> - mov tmp1, vend1.2d[0]
> - cbz tmp1, .Lloop
> + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
> + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
> + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
> + umaxp vend1.16b, vend1.16b, vend1.16b
> + mov tmp1, vend1.d[0]
> + cbz tmp1, L(loop)
>
> /* Termination condition found. Now need to establish exactly why
> we terminated. */
> - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
> - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
> + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
> + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
> addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
> addp vend1.16b, vend1.16b, vend1.16b // 128->64
>
> - mov tmp1, vend1.2d[0]
> -.Ltail:
> + mov tmp1, vend1.d[0]
> +L(tail):
> /* Count the trailing zeros, by bit reversing... */
> rbit tmp1, tmp1
> /* Re-bias source. */
> @@ -145,5 +113,5 @@ def_fn strchrnul
> add result, src, tmp1, lsr #1
> ret
>
> - .size strchrnul, . - strchrnul
> +END (strchrnul)
> #endif
> diff --git a/newlib/libc/machine/aarch64/strcmp.S b/newlib/libc/machine/aarch64/strcmp.S
> index e2bef2d49d..691a1760ee 100644
> --- a/newlib/libc/machine/aarch64/strcmp.S
> +++ b/newlib/libc/machine/aarch64/strcmp.S
> @@ -1,202 +1,192 @@
> -/* Copyright (c) 2012-2018, Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> -/* Assumptions:
> +/*
> + * strcmp - compare two strings
> *
> - * ARMv8-a, AArch64
> + * Copyright (c) 2012-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> */
>
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strcmp-stub.c */
> #else
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64.
> + * MTE compatible.
> + */
>
> -#define L(label) .L ## label
> +#include "asmdefs.h"
>
> #define REP8_01 0x0101010101010101
> #define REP8_7f 0x7f7f7f7f7f7f7f7f
> -#define REP8_80 0x8080808080808080
>
> -/* Parameters and result. */
> #define src1 x0
> #define src2 x1
> #define result x0
>
> -/* Internal variables. */
> #define data1 x2
> #define data1w w2
> #define data2 x3
> #define data2w w3
> #define has_nul x4
> #define diff x5
> +#define off1 x5
> #define syndrome x6
> -#define tmp1 x7
> -#define tmp2 x8
> -#define tmp3 x9
> -#define zeroones x10
> -#define pos x11
> -
> - /* Start of performance-critical section -- one 64B cache line. */
> -def_fn strcmp p2align=6
> - eor tmp1, src1, src2
> - mov zeroones, #REP8_01
> - tst tmp1, #7
> +#define tmp x6
> +#define data3 x7
> +#define zeroones x8
> +#define shift x9
> +#define off2 x10
> +
> +/* On big-endian early bytes are at MSB and on little-endian LSB.
> + LS_FW means shifting towards early bytes. */
> +#ifdef __AARCH64EB__
> +# define LS_FW lsl
> +#else
> +# define LS_FW lsr
> +#endif
> +
> +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> + can be done in parallel across the entire word.
> + Since carry propagation makes 0x1 bytes before a NUL byte appear
> + NUL too in big-endian, byte-reverse the data before the NUL check. */
> +
> +
> +ENTRY (strcmp)
> + PTR_ARG (0)
> + PTR_ARG (1)
> + sub off2, src2, src1
> + mov zeroones, REP8_01
> + and tmp, src1, 7
> + tst off2, 7
> b.ne L(misaligned8)
> - ands tmp1, src1, #7
> - b.ne L(mutual_align)
> - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> - can be done in parallel across the entire word. */
> + cbnz tmp, L(mutual_align)
> +
> + .p2align 4
> +
> L(loop_aligned):
> - ldr data1, [src1], #8
> - ldr data2, [src2], #8
> + ldr data2, [src1, off2]
> + ldr data1, [src1], 8
> L(start_realigned):
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - eor diff, data1, data2 /* Non-zero if differences found. */
> - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
> +#ifdef __AARCH64EB__
> + rev tmp, data1
> + sub has_nul, tmp, zeroones
> + orr tmp, tmp, REP8_7f
> +#else
> + sub has_nul, data1, zeroones
> + orr tmp, data1, REP8_7f
> +#endif
> + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
> + ccmp data1, data2, 0, eq
> + b.eq L(loop_aligned)
> +#ifdef __AARCH64EB__
> + rev has_nul, has_nul
> +#endif
> + eor diff, data1, data2
> orr syndrome, diff, has_nul
> - cbz syndrome, L(loop_aligned)
> - /* End of performance-critical section -- one 64B cache line. */
> -
> L(end):
> -#ifndef __AARCH64EB__
> +#ifndef __AARCH64EB__
> rev syndrome, syndrome
> rev data1, data1
> - /* The MS-non-zero bit of the syndrome marks either the first bit
> - that is different, or the top bit of the first zero byte.
> - Shifting left now will bring the critical information into the
> - top bits. */
> - clz pos, syndrome
> rev data2, data2
> - lsl data1, data1, pos
> - lsl data2, data2, pos
> - /* But we need to zero-extend (char is unsigned) the value and then
> - perform a signed 32-bit subtraction. */
> - lsr data1, data1, #56
> - sub result, data1, data2, lsr #56
> - ret
> -#else
> - /* For big-endian we cannot use the trick with the syndrome value
> - as carry-propagation can corrupt the upper bits if the trailing
> - bytes in the string contain 0x01. */
> - /* However, if there is no NUL byte in the dword, we can generate
> - the result directly. We can't just subtract the bytes as the
> - MSB might be significant. */
> - cbnz has_nul, 1f
> - cmp data1, data2
> - cset result, ne
> - cneg result, result, lo
> - ret
> -1:
> - /* Re-compute the NUL-byte detection, using a byte-reversed value. */
> - rev tmp3, data1
> - sub tmp1, tmp3, zeroones
> - orr tmp2, tmp3, #REP8_7f
> - bic has_nul, tmp1, tmp2
> - rev has_nul, has_nul
> - orr syndrome, diff, has_nul
> - clz pos, syndrome
> - /* The MS-non-zero bit of the syndrome marks either the first bit
> - that is different, or the top bit of the first zero byte.
> +#endif
> + clz shift, syndrome
> + /* The most-significant-non-zero bit of the syndrome marks either the
> + first bit that is different, or the top bit of the first zero byte.
> Shifting left now will bring the critical information into the
> top bits. */
> - lsl data1, data1, pos
> - lsl data2, data2, pos
> + lsl data1, data1, shift
> + lsl data2, data2, shift
> /* But we need to zero-extend (char is unsigned) the value and then
> perform a signed 32-bit subtraction. */
> - lsr data1, data1, #56
> - sub result, data1, data2, lsr #56
> + lsr data1, data1, 56
> + sub result, data1, data2, lsr 56
> ret
> -#endif
> +
> + .p2align 4
>
> L(mutual_align):
> /* Sources are mutually aligned, but are not currently at an
> alignment boundary. Round down the addresses and then mask off
> - the bytes that preceed the start point. */
> - bic src1, src1, #7
> - bic src2, src2, #7
> - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
> - ldr data1, [src1], #8
> - neg tmp1, tmp1 /* Bits to alignment -64. */
> - ldr data2, [src2], #8
> - mov tmp2, #~0
> -#ifdef __AARCH64EB__
> - /* Big-endian. Early bytes are at MSB. */
> - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> -#else
> - /* Little-endian. Early bytes are at LSB. */
> - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> -#endif
> - orr data1, data1, tmp2
> - orr data2, data2, tmp2
> + the bytes that precede the start point. */
> + bic src1, src1, 7
> + ldr data2, [src1, off2]
> + ldr data1, [src1], 8
> + neg shift, src2, lsl 3 /* Bits to alignment -64. */
> + mov tmp, -1
> + LS_FW tmp, tmp, shift
> + orr data1, data1, tmp
> + orr data2, data2, tmp
> b L(start_realigned)
>
> L(misaligned8):
> /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
> - checking to make sure that we don't access beyond page boundary in
> - SRC2. */
> - tst src1, #7
> - b.eq L(loop_misaligned)
> + checking to make sure that we don't access beyond the end of SRC2. */
> + cbz tmp, L(src1_aligned)
> L(do_misaligned):
> - ldrb data1w, [src1], #1
> - ldrb data2w, [src2], #1
> - cmp data1w, #1
> - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
> + ldrb data1w, [src1], 1
> + ldrb data2w, [src2], 1
> + cmp data1w, 0
> + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
> b.ne L(done)
> - tst src1, #7
> + tst src1, 7
> b.ne L(do_misaligned)
>
> -L(loop_misaligned):
> - /* Test if we are within the last dword of the end of a 4K page. If
> - yes then jump back to the misaligned loop to copy a byte at a time. */
> - and tmp1, src2, #0xff8
> - eor tmp1, tmp1, #0xff8
> - cbz tmp1, L(do_misaligned)
> - ldr data1, [src1], #8
> - ldr data2, [src2], #8
> -
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - eor diff, data1, data2 /* Non-zero if differences found. */
> - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
> +L(src1_aligned):
> + neg shift, src2, lsl 3
> + bic src2, src2, 7
> + ldr data3, [src2], 8
> +#ifdef __AARCH64EB__
> + rev data3, data3
> +#endif
> + lsr tmp, zeroones, shift
> + orr data3, data3, tmp
> + sub has_nul, data3, zeroones
> + orr tmp, data3, REP8_7f
> + bics has_nul, has_nul, tmp
> + b.ne L(tail)
> +
> + sub off1, src2, src1
> +
> + .p2align 4
> +
> +L(loop_unaligned):
> + ldr data3, [src1, off1]
> + ldr data2, [src1, off2]
> +#ifdef __AARCH64EB__
> + rev data3, data3
> +#endif
> + sub has_nul, data3, zeroones
> + orr tmp, data3, REP8_7f
> + ldr data1, [src1], 8
> + bics has_nul, has_nul, tmp
> + ccmp data1, data2, 0, eq
> + b.eq L(loop_unaligned)
> +
> + lsl tmp, has_nul, shift
> +#ifdef __AARCH64EB__
> + rev tmp, tmp
> +#endif
> + eor diff, data1, data2
> + orr syndrome, diff, tmp
> + cbnz syndrome, L(end)
> +L(tail):
> + ldr data1, [src1]
> + neg shift, shift
> + lsr data2, data3, shift
> + lsr has_nul, has_nul, shift
> +#ifdef __AARCH64EB__
> + rev data2, data2
> + rev has_nul, has_nul
> +#endif
> + eor diff, data1, data2
> orr syndrome, diff, has_nul
> - cbz syndrome, L(loop_misaligned)
> b L(end)
>
> L(done):
> sub result, data1, data2
> ret
> - .size strcmp, .-strcmp
>
> +END (strcmp)
> #endif
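
[Note: the zero-byte test used throughout strcmp.S above is the word-at-a-time identity from its comment block: (X - 1) & ~(X | 0x7f) has the top bit set in exactly the bytes of X that are zero. A minimal C sketch of the same check:

  #include <stdint.h>

  #define REP8_01 0x0101010101010101ULL
  #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

  /* Non-zero iff at least one of the eight bytes of x is zero.  */
  static inline uint64_t
  has_nul_byte (uint64_t x)
  {
    return (x - REP8_01) & ~(x | REP8_7f);
  }

For each byte, ~(b | 0x7f) keeps only the top bit and only when b < 0x80, while b - 1 sets that top bit when b == 0, so the AND marks the zero bytes; a borrow out of a zero byte can falsely mark the following byte, which only matters after the first NUL -- that is the carry-propagation caveat the comments mention and why the big-endian path byte-reverses the data first.]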
> diff --git a/newlib/libc/machine/aarch64/strcpy.S b/newlib/libc/machine/aarch64/strcpy.S
> index e5405f2535..57c46f3908 100644
> --- a/newlib/libc/machine/aarch64/strcpy.S
> +++ b/newlib/libc/machine/aarch64/strcpy.S
> @@ -1,341 +1,160 @@
> /*
> - strcpy/stpcpy - copy a string returning pointer to start/end.
> -
> - Copyright (c) 2013, 2014, 2015 ARM Ltd.
> - All Rights Reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the company nor the names of its contributors
> - may be used to endorse or promote products derived from this
> - software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> + * strcpy/stpcpy - copy a string returning pointer to start/end.
> + *
> + * Copyright (c) 2020-2023, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strchr-stub.c */
> #else
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
> + * ARMv8-a, AArch64, Advanced SIMD.
> + * MTE compatible.
> */
>
> -/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
> +#include "asmdefs.h"
>
> - To test the page crossing code path more thoroughly, compile with
> - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
> - entry path. This option is not intended for production use. */
> -
> -/* Arguments and results. */
> #define dstin x0
> #define srcin x1
> +#define result x0
>
> -/* Locals and temporaries. */
> #define src x2
> #define dst x3
> -#define data1 x4
> -#define data1w w4
> -#define data2 x5
> -#define data2w w5
> -#define has_nul1 x6
> -#define has_nul2 x7
> -#define tmp1 x8
> -#define tmp2 x9
> -#define tmp3 x10
> -#define tmp4 x11
> -#define zeroones x12
> -#define data1a x13
> -#define data2a x14
> -#define pos x15
> -#define len x16
> -#define to_align x17
> +#define len x4
> +#define synd x4
> +#define tmp x5
> +#define shift x5
> +#define data1 x6
> +#define dataw1 w6
> +#define data2 x7
> +#define dataw2 w7
> +
> +#define dataq q0
> +#define vdata v0
> +#define vhas_nul v1
> +#define vend v2
> +#define dend d2
> +#define dataq2 q1
>
> #ifdef BUILD_STPCPY
> -#define STRCPY stpcpy
> +# define STRCPY stpcpy
> +# define IFSTPCPY(X,...) X,__VA_ARGS__
> #else
> -#define STRCPY strcpy
> +# define STRCPY strcpy
> +# define IFSTPCPY(X,...)
> #endif
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> - can be done in parallel across the entire word. */
> -
> -#define REP8_01 0x0101010101010101
> -#define REP8_7f 0x7f7f7f7f7f7f7f7f
> -#define REP8_80 0x8080808080808080
> -
> - /* AArch64 systems have a minimum page size of 4k. We can do a quick
> - page size check for crossing this boundary on entry and if we
> - do not, then we can short-circuit much of the entry code. We
> - expect early page-crossing strings to be rare (probability of
> - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
> - predictable, even with random strings.
> -
> - We don't bother checking for larger page sizes, the cost of setting
> - up the correct page size is just not worth the extra gain from
> - a small reduction in the cases taking the slow path. Note that
> - we only care about whether the first fetch, which may be
> - misaligned, crosses a page boundary - after that we move to aligned
> - fetches for the remainder of the string. */
> -
> -#ifdef STRCPY_TEST_PAGE_CROSS
> - /* Make everything that isn't Qword aligned look like a page cross. */
> -#define MIN_PAGE_P2 4
> -#else
> -#define MIN_PAGE_P2 12
> -#endif
> -
> -#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
> -
> -def_fn STRCPY p2align=6
> - /* For moderately short strings, the fastest way to do the copy is to
> - calculate the length of the string in the same way as strlen, then
> - essentially do a memcpy of the result. This avoids the need for
> - multiple byte copies and further means that by the time we
> - reach the bulk copy loop we know we can always use DWord
> - accesses. We expect strcpy to rarely be called repeatedly
> - with the same source string, so branch prediction is likely to
> - always be difficult - we mitigate against this by preferring
> - conditional select operations over branches whenever this is
> - feasible. */
> - and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
> - mov zeroones, #REP8_01
> - and to_align, srcin, #15
> - cmp tmp2, #(MIN_PAGE_SIZE - 16)
> - neg tmp1, to_align
> - /* The first fetch will straddle a (possible) page boundary iff
> - srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
> - aligned string will never fail the page align check, so will
> - always take the fast path. */
> - b.gt .Lpage_cross
> -
> -.Lpage_cross_ok:
> - ldp data1, data2, [srcin]
> -#ifdef __AARCH64EB__
> - /* Because we expect the end to be found within 16 characters
> - (profiling shows this is the most common case), it's worth
> - swapping the bytes now to save having to recalculate the
> - termination syndrome later. We preserve data1 and data2
> - so that we can re-use the values later on. */
> - rev tmp2, data1
> - sub tmp1, tmp2, zeroones
> - orr tmp2, tmp2, #REP8_7f
> - bics has_nul1, tmp1, tmp2
> - b.ne .Lfp_le8
> - rev tmp4, data2
> - sub tmp3, tmp4, zeroones
> - orr tmp4, tmp4, #REP8_7f
> -#else
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - bics has_nul1, tmp1, tmp2
> - b.ne .Lfp_le8
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, #REP8_7f
> +/*
> + Core algorithm:
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
> +
> +ENTRY (STRCPY)
> + PTR_ARG (0)
> + PTR_ARG (1)
> + bic src, srcin, 15
> + ld1 {vdata.16b}, [src]
> + cmeq vhas_nul.16b, vdata.16b, 0
> + lsl shift, srcin, 2
> + shrn vend.8b, vhas_nul.8h, 4
> + fmov synd, dend
> + lsr synd, synd, shift
> + cbnz synd, L(tail)
> +
> + ldr dataq, [src, 16]!
> + cmeq vhas_nul.16b, vdata.16b, 0
> + shrn vend.8b, vhas_nul.8h, 4
> + fmov synd, dend
> + cbz synd, L(start_loop)
> +
> +#ifndef __AARCH64EB__
> + rbit synd, synd
> #endif
> - bics has_nul2, tmp3, tmp4
> - b.eq .Lbulk_entry
> + sub tmp, src, srcin
> + clz len, synd
> + add len, tmp, len, lsr 2
> + tbz len, 4, L(less16)
> + sub tmp, len, 15
> + ldr dataq, [srcin]
> + ldr dataq2, [srcin, tmp]
> + str dataq, [dstin]
> + str dataq2, [dstin, tmp]
> + IFSTPCPY (add result, dstin, len)
> + ret
>
> - /* The string is short (<=16 bytes). We don't know exactly how
> - short though, yet. Work out the exact length so that we can
> - quickly select the optimal copy strategy. */
> -.Lfp_gt8:
> - rev has_nul2, has_nul2
> - clz pos, has_nul2
> - mov tmp2, #56
> - add dst, dstin, pos, lsr #3 /* Bits to bytes. */
> - sub pos, tmp2, pos
> -#ifdef __AARCH64EB__
> - lsr data2, data2, pos
> -#else
> - lsl data2, data2, pos
> -#endif
> - str data2, [dst, #1]
> +L(tail):
> + rbit synd, synd
> + clz len, synd
> + lsr len, len, 2
> +L(less16):
> + tbz len, 3, L(less8)
> + sub tmp, len, 7
> + ldr data1, [srcin]
> + ldr data2, [srcin, tmp]
> str data1, [dstin]
> -#ifdef BUILD_STPCPY
> - add dstin, dst, #8
> -#endif
> + str data2, [dstin, tmp]
> + IFSTPCPY (add result, dstin, len)
> ret
>
> -.Lfp_le8:
> - rev has_nul1, has_nul1
> - clz pos, has_nul1
> - add dst, dstin, pos, lsr #3 /* Bits to bytes. */
> - subs tmp2, pos, #24 /* Pos in bits. */
> - b.lt .Lfp_lt4
> -#ifdef __AARCH64EB__
> - mov tmp2, #56
> - sub pos, tmp2, pos
> - lsr data2, data1, pos
> - lsr data1, data1, #32
> -#else
> - lsr data2, data1, tmp2
> -#endif
> - /* 4->7 bytes to copy. */
> - str data2w, [dst, #-3]
> - str data1w, [dstin]
> -#ifdef BUILD_STPCPY
> - mov dstin, dst
> -#endif
> - ret
> -.Lfp_lt4:
> - cbz pos, .Lfp_lt2
> - /* 2->3 bytes to copy. */
> -#ifdef __AARCH64EB__
> - lsr data1, data1, #48
> -#endif
> - strh data1w, [dstin]
> - /* Fall-through, one byte (max) to go. */
> -.Lfp_lt2:
> - /* Null-terminated string. Last character must be zero! */
> - strb wzr, [dst]
> -#ifdef BUILD_STPCPY
> - mov dstin, dst
> -#endif
> + .p2align 4
> +L(less8):
> + subs tmp, len, 3
> + b.lo L(less4)
> + ldr dataw1, [srcin]
> + ldr dataw2, [srcin, tmp]
> + str dataw1, [dstin]
> + str dataw2, [dstin, tmp]
> + IFSTPCPY (add result, dstin, len)
> ret
>
> - .p2align 6
> - /* Aligning here ensures that the entry code and main loop all lies
> - within one 64-byte cache line. */
> -.Lbulk_entry:
> - sub to_align, to_align, #16
> - stp data1, data2, [dstin]
> - sub src, srcin, to_align
> - sub dst, dstin, to_align
> - b .Lentry_no_page_cross
> -
> - /* The inner loop deals with two Dwords at a time. This has a
> - slightly higher start-up cost, but we should win quite quickly,
> - especially on cores with a high number of issue slots per
> - cycle, as we get much better parallelism out of the operations. */
> -.Lmain_loop:
> - stp data1, data2, [dst], #16
> -.Lentry_no_page_cross:
> - ldp data1, data2, [src], #16
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, #REP8_7f
> - bic has_nul1, tmp1, tmp2
> - bics has_nul2, tmp3, tmp4
> - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
> - b.eq .Lmain_loop
> -
> - /* Since we know we are copying at least 16 bytes, the fastest way
> - to deal with the tail is to determine the location of the
> - trailing NUL, then (re)copy the 16 bytes leading up to that. */
> - cmp has_nul1, #0
> -#ifdef __AARCH64EB__
> - /* For big-endian, carry propagation (if the final byte in the
> - string is 0x01) means we cannot use has_nul directly. The
> - easiest way to get the correct byte is to byte-swap the data
> - and calculate the syndrome a second time. */
> - csel data1, data1, data2, ne
> - rev data1, data1
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - bic has_nul1, tmp1, tmp2
> -#else
> - csel has_nul1, has_nul1, has_nul2, ne
> -#endif
> - rev has_nul1, has_nul1
> - clz pos, has_nul1
> - add tmp1, pos, #72
> - add pos, pos, #8
> - csel pos, pos, tmp1, ne
> - add src, src, pos, lsr #3
> - add dst, dst, pos, lsr #3
> - ldp data1, data2, [src, #-32]
> - stp data1, data2, [dst, #-16]
> -#ifdef BUILD_STPCPY
> - sub dstin, dst, #1
> -#endif
> +L(less4):
> + cbz len, L(zerobyte)
> + ldrh dataw1, [srcin]
> + strh dataw1, [dstin]
> +L(zerobyte):
> + strb wzr, [dstin, len]
> + IFSTPCPY (add result, dstin, len)
> ret
>
> -.Lpage_cross:
> - bic src, srcin, #15
> - /* Start by loading two words at [srcin & ~15], then forcing the
> - bytes that precede srcin to 0xff. This means they never look
> - like termination bytes. */
> - ldp data1, data2, [src]
> - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
> - tst to_align, #7
> - csetm tmp2, ne
> -#ifdef __AARCH64EB__
> - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> -#else
> - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> + .p2align 4
> +L(start_loop):
> + sub tmp, srcin, dstin
> + ldr dataq2, [srcin]
> + sub dst, src, tmp
> + str dataq2, [dstin]
> +L(loop):
> + str dataq, [dst], 32
> + ldr dataq, [src, 16]
> + cmeq vhas_nul.16b, vdata.16b, 0
> + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
> + fmov synd, dend
> + cbnz synd, L(loopend)
> + str dataq, [dst, -16]
> + ldr dataq, [src, 32]!
> + cmeq vhas_nul.16b, vdata.16b, 0
> + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
> + fmov synd, dend
> + cbz synd, L(loop)
> + add dst, dst, 16
> +L(loopend):
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> + fmov synd, dend
> + sub dst, dst, 31
> +#ifndef __AARCH64EB__
> + rbit synd, synd
> #endif
> - orr data1, data1, tmp2
> - orr data2a, data2, tmp2
> - cmp to_align, #8
> - csinv data1, data1, xzr, lt
> - csel data2, data2, data2a, lt
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, #REP8_7f
> - bic has_nul1, tmp1, tmp2
> - bics has_nul2, tmp3, tmp4
> - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
> - b.eq .Lpage_cross_ok
> - /* We now need to make data1 and data2 look like they've been
> - loaded directly from srcin. Do a rotate on the 128-bit value. */
> - lsl tmp1, to_align, #3 /* Bytes->bits. */
> - neg tmp2, to_align, lsl #3
> -#ifdef __AARCH64EB__
> - lsl data1a, data1, tmp1
> - lsr tmp4, data2, tmp2
> - lsl data2, data2, tmp1
> - orr tmp4, tmp4, data1a
> - cmp to_align, #8
> - csel data1, tmp4, data2, lt
> - rev tmp2, data1
> - rev tmp4, data2
> - sub tmp1, tmp2, zeroones
> - orr tmp2, tmp2, #REP8_7f
> - sub tmp3, tmp4, zeroones
> - orr tmp4, tmp4, #REP8_7f
> -#else
> - lsr data1a, data1, tmp1
> - lsl tmp4, data2, tmp2
> - lsr data2, data2, tmp1
> - orr tmp4, tmp4, data1a
> - cmp to_align, #8
> - csel data1, tmp4, data2, lt
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, #REP8_7f
> -#endif
> - bic has_nul1, tmp1, tmp2
> - cbnz has_nul1, .Lfp_le8
> - bic has_nul2, tmp3, tmp4
> - b .Lfp_gt8
> + clz len, synd
> + lsr len, len, 2
> + add dst, dst, len
> + ldr dataq, [dst, tmp]
> + str dataq, [dst]
> + IFSTPCPY (add result, dst, 15)
> + ret
>
> - .size STRCPY, . - STRCPY
> +END (STRCPY)
> #endif
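
[Note: the "nibble mask" the strcpy.S comment describes can be modelled with arm_neon.h intrinsics (illustration only, little-endian view; the committed code is the assembly above): cmeq turns each byte of the chunk into 0x00/0xff, and a shift-right-by-4-and-narrow squeezes that into 4 bits per byte, so a trailing-zero count divided by 4 gives the byte index of the first NUL.

  #include <arm_neon.h>
  #include <stdint.h>

  /* 64-bit mask with 4 bits set per NUL byte of the 16-byte chunk.  */
  static inline uint64_t
  nul_nibble_mask (uint8x16_t chunk)
  {
    uint8x16_t eq = vceqq_u8 (chunk, vdupq_n_u8 (0));
    uint8x8_t nib = vshrn_n_u16 (vreinterpretq_u16_u8 (eq), 4); /* 128->64 */
    return vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
  }

  /* Byte index of the first NUL; mask must be non-zero.  */
  static inline unsigned
  first_nul_index (uint64_t mask)
  {
    return (unsigned) __builtin_ctzll (mask) >> 2;
  }

]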
> diff --git a/newlib/libc/machine/aarch64/strlen.S b/newlib/libc/machine/aarch64/strlen.S
> index 872d136ef4..68a6f357cf 100644
> --- a/newlib/libc/machine/aarch64/strlen.S
> +++ b/newlib/libc/machine/aarch64/strlen.S
> @@ -1,115 +1,92 @@
> -/* Copyright (c) 2013-2015, Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> +/*
> + * strlen - calculate the length of a string.
> + *
> + * Copyright (c) 2020-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strlen-stub.c */
> #else
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> + * Not MTE compatible.
> */
>
> -/* To test the page crossing code path more thoroughly, compile with
> - -DTEST_PAGE_CROSS - this will force all calls through the slower
> - entry path. This option is not intended for production use. */
> -
> -/* Arguments and results. */
> -#define srcin x0
> -#define len x0
> -
> -/* Locals and temporaries. */
> -#define src x1
> -#define data1 x2
> -#define data2 x3
> -#define has_nul1 x4
> -#define has_nul2 x5
> -#define tmp1 x4
> -#define tmp2 x5
> -#define tmp3 x6
> -#define tmp4 x7
> -#define zeroones x8
> -
> -#define L(l) .L ## l
> -
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> - can be done in parallel across the entire word. A faster check
> - (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
> - false hits for characters 129..255. */
> +#include "asmdefs.h"
> +
> +#define srcin x0
> +#define len x0
> +
> +#define src x1
> +#define data1 x2
> +#define data2 x3
> +#define has_nul1 x4
> +#define has_nul2 x5
> +#define tmp1 x4
> +#define tmp2 x5
> +#define tmp3 x6
> +#define tmp4 x7
> +#define zeroones x8
> +
> +#define maskv v0
> +#define maskd d0
> +#define dataq1 q1
> +#define dataq2 q2
> +#define datav1 v1
> +#define datav2 v2
> +#define tmp x2
> +#define tmpw w2
> +#define synd x3
> +#define syndw w3
> +#define shift x4
> +
> +/* For the first 32 bytes, NUL detection works on the principle that
> + (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
> + byte is zero, and can be done in parallel across the entire word. */
>
> #define REP8_01 0x0101010101010101
> #define REP8_7f 0x7f7f7f7f7f7f7f7f
> -#define REP8_80 0x8080808080808080
> +
> +/* To test the page crossing code path more thoroughly, compile with
> + -DTEST_PAGE_CROSS - this will force all calls through the slower
> + entry path. This option is not intended for production use. */
>
> #ifdef TEST_PAGE_CROSS
> -# define MIN_PAGE_SIZE 15
> +# define MIN_PAGE_SIZE 32
> #else
> # define MIN_PAGE_SIZE 4096
> #endif
>
> - /* Since strings are short on average, we check the first 16 bytes
> - of the string for a NUL character. In order to do an unaligned ldp
> - safely we have to do a page cross check first. If there is a NUL
> - byte we calculate the length from the 2 8-byte words using
> - conditional select to reduce branch mispredictions (it is unlikely
> - strlen will be repeatedly called on strings with the same length).
> -
> - If the string is longer than 16 bytes, we align src so don't need
> - further page cross checks, and process 32 bytes per iteration
> - using the fast NUL check. If we encounter non-ASCII characters,
> - fallback to a second loop using the full NUL check.
> -
> - If the page cross check fails, we read 16 bytes from an aligned
> - address, remove any characters before the string, and continue
> - in the main loop using aligned loads. Since strings crossing a
> - page in the first 16 bytes are rare (probability of
> - 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> -
> - AArch64 systems have a minimum page size of 4k. We don't bother
> - checking for larger page sizes - the cost of setting up the correct
> - page size is just not worth the extra gain from a small reduction in
> - the cases taking the slow path. Note that we only care about
> - whether the first fetch, which may be misaligned, crosses a page
> - boundary. */
> -
> -def_fn strlen p2align=6
> +/* Core algorithm:
> +
> + Since strings are short on average, we check the first 32 bytes of the
> + string for a NUL character without aligning the string. In order to use
> + unaligned loads safely we must do a page cross check first.
> +
> + If there is a NUL byte we calculate the length from the 2 8-byte words
> + using conditional select to reduce branch mispredictions (it is unlikely
> + strlen will be repeatedly called on strings with the same length).
> +
> + If the string is longer than 32 bytes, align src so we don't need further
> + page cross checks, and process 32 bytes per iteration using a fast SIMD
> + loop.
> +
> + If the page cross check fails, we read 32 bytes from an aligned address,
> + and ignore any characters before the string. If it contains a NUL
> + character, return the length, if not, continue in the main loop. */
> +
> +ENTRY (strlen)
> + PTR_ARG (0)
> and tmp1, srcin, MIN_PAGE_SIZE - 1
> - mov zeroones, REP8_01
> - cmp tmp1, MIN_PAGE_SIZE - 16
> - b.gt L(page_cross)
> + cmp tmp1, MIN_PAGE_SIZE - 32
> + b.hi L(page_cross)
> +
> + /* Look for a NUL byte in the first 16 bytes. */
> ldp data1, data2, [srcin]
> + mov zeroones, REP8_01
> +
> #ifdef __AARCH64EB__
> /* For big-endian, carry propagation (if the final byte in the
> string is 0x01) means we cannot use has_nul1/2 directly.
> @@ -125,114 +102,96 @@ def_fn strlen p2align=6
> bics has_nul1, tmp1, tmp2
> bic has_nul2, tmp3, tmp4
> ccmp has_nul2, 0, 0, eq
> - beq L(main_loop_entry)
> + b.eq L(bytes16_31)
>
> - /* Enter with C = has_nul1 == 0. */
> + /* Find the exact offset of the first NUL byte in the first 16 bytes
> + from the string start. Enter with C = has_nul1 == 0. */
> csel has_nul1, has_nul1, has_nul2, cc
> mov len, 8
> rev has_nul1, has_nul1
> - clz tmp1, has_nul1
> csel len, xzr, len, cc
> + clz tmp1, has_nul1
> add len, len, tmp1, lsr 3
> ret
>
> - /* The inner loop processes 32 bytes per iteration and uses the fast
> - NUL check. If we encounter non-ASCII characters, use a second
> - loop with the accurate NUL check. */
> - .p2align 4
> -L(main_loop_entry):
> - bic src, srcin, 15
> - sub src, src, 16
> -L(main_loop):
> - ldp data1, data2, [src, 32]!
> -.Lpage_cross_entry:
> - sub tmp1, data1, zeroones
> - sub tmp3, data2, zeroones
> - orr tmp2, tmp1, tmp3
> - tst tmp2, zeroones, lsl 7
> - bne 1f
> - ldp data1, data2, [src, 16]
> + /* Look for a NUL byte at offset 16..31 in the string. */
> +L(bytes16_31):
> + ldp data1, data2, [srcin, 16]
> +#ifdef __AARCH64EB__
> + rev data1, data1
> + rev data2, data2
> +#endif
> sub tmp1, data1, zeroones
> - sub tmp3, data2, zeroones
> - orr tmp2, tmp1, tmp3
> - tst tmp2, zeroones, lsl 7
> - beq L(main_loop)
> - add src, src, 16
> -1:
> - /* The fast check failed, so do the slower, accurate NUL check. */
> orr tmp2, data1, REP8_7f
> + sub tmp3, data2, zeroones
> orr tmp4, data2, REP8_7f
> bics has_nul1, tmp1, tmp2
> bic has_nul2, tmp3, tmp4
> ccmp has_nul2, 0, 0, eq
> - beq L(nonascii_loop)
> + b.eq L(loop_entry)
>
> - /* Enter with C = has_nul1 == 0. */
> -L(tail):
> -#ifdef __AARCH64EB__
> - /* For big-endian, carry propagation (if the final byte in the
> - string is 0x01) means we cannot use has_nul1/2 directly. The
> - easiest way to get the correct byte is to byte-swap the data
> - and calculate the syndrome a second time. */
> - csel data1, data1, data2, cc
> - rev data1, data1
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, REP8_7f
> - bic has_nul1, tmp1, tmp2
> -#else
> + /* Find the exact offset of the first NUL byte at offset 16..31 from
> + the string start. Enter with C = has_nul1 == 0. */
> csel has_nul1, has_nul1, has_nul2, cc
> -#endif
> - sub len, src, srcin
> + mov len, 24
> rev has_nul1, has_nul1
> - add tmp2, len, 8
> + mov tmp3, 16
> clz tmp1, has_nul1
> - csel len, len, tmp2, cc
> + csel len, tmp3, len, cc
> add len, len, tmp1, lsr 3
> ret
>
> -L(nonascii_loop):
> - ldp data1, data2, [src, 16]!
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, REP8_7f
> - bics has_nul1, tmp1, tmp2
> - bic has_nul2, tmp3, tmp4
> - ccmp has_nul2, 0, 0, eq
> - bne L(tail)
> - ldp data1, data2, [src, 16]!
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, REP8_7f
> - bics has_nul1, tmp1, tmp2
> - bic has_nul2, tmp3, tmp4
> - ccmp has_nul2, 0, 0, eq
> - beq L(nonascii_loop)
> - b L(tail)
> + nop
> +L(loop_entry):
> + bic src, srcin, 31
> +
> + .p2align 5
> +L(loop):
> + ldp dataq1, dataq2, [src, 32]!
> + uminp maskv.16b, datav1.16b, datav2.16b
> + uminp maskv.16b, maskv.16b, maskv.16b
> + cmeq maskv.8b, maskv.8b, 0
> + fmov synd, maskd
> + cbz synd, L(loop)
> +
> + /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
> + cmeq maskv.16b, datav1.16b, 0
> + sub len, src, srcin
> + cbnz syndw, 1f
> + cmeq maskv.16b, datav2.16b, 0
> + add len, len, 16
> +1:
> + /* Generate a bitmask and compute correct byte offset. */
> + shrn maskv.8b, maskv.8h, 4
> + fmov synd, maskd
> +#ifndef __AARCH64EB__
> + rbit synd, synd
> +#endif
> + clz tmp, synd
> + add len, len, tmp, lsr 2
> + ret
>
> - /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> - srcin to 0x7f, so we ignore any NUL bytes before the string.
> - Then continue in the aligned loop. */
> L(page_cross):
> - bic src, srcin, 15
> - ldp data1, data2, [src]
> - lsl tmp1, srcin, 3
> - mov tmp4, -1
> -#ifdef __AARCH64EB__
> - /* Big-endian. Early bytes are at MSB. */
> - lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
> -#else
> - /* Little-endian. Early bytes are at LSB. */
> - lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
> -#endif
> - orr tmp1, tmp1, REP8_80
> - orn data1, data1, tmp1
> - orn tmp2, data2, tmp1
> - tst srcin, 8
> - csel data1, data1, tmp4, eq
> - csel data2, data2, tmp2, eq
> - b L(page_cross_entry)
> -
> - .size strlen, . - strlen
> + bic src, srcin, 31
> + mov tmpw, 0x0c03
> + movk tmpw, 0xc030, lsl 16
> + ld1 {datav1.16b, datav2.16b}, [src]
> + dup maskv.4s, tmpw
> + cmeq datav1.16b, datav1.16b, 0
> + cmeq datav2.16b, datav2.16b, 0
> + and datav1.16b, datav1.16b, maskv.16b
> + and datav2.16b, datav2.16b, maskv.16b
> + addp maskv.16b, datav1.16b, datav2.16b
> + addp maskv.16b, maskv.16b, maskv.16b
> + fmov synd, maskd
> + lsl shift, srcin, 1
> + lsr synd, synd, shift
> + cbz synd, L(loop)
> +
> + rbit synd, synd
> + clz len, synd
> + lsr len, len, 1
> + ret
> +
> +END (strlen)
> #endif
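
[Note: the entry check in strlen.S above ("and tmp1, srcin, MIN_PAGE_SIZE - 1; cmp tmp1, MIN_PAGE_SIZE - 32; b.hi L(page_cross)") simply asks whether an unaligned 32-byte read starting at srcin could spill into the next page. A C sketch, assuming the 4 KiB minimum page size the comments rely on:

  #include <stdbool.h>
  #include <stdint.h>

  #define MIN_PAGE_SIZE 4096

  /* True when bytes [srcin, srcin + 31] may straddle a page boundary,
     i.e. when the unaligned 32-byte first fetch is not provably safe.  */
  static inline bool
  may_cross_page (uintptr_t srcin)
  {
    return (srcin & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 32;
  }

]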
> diff --git a/newlib/libc/machine/aarch64/strncmp.S b/newlib/libc/machine/aarch64/strncmp.S
> index ffdabc2607..373695503d 100644
> --- a/newlib/libc/machine/aarch64/strncmp.S
> +++ b/newlib/libc/machine/aarch64/strncmp.S
> @@ -1,49 +1,23 @@
> -/* Copyright (c) 2013, 2018, Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> +/*
> + * strncmp - compare two strings
> + *
> + * Copyright (c) 2013-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strcmp-stub.c */
> #else
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64
> + * ARMv8-a, AArch64.
> + * MTE compatible.
> */
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> +#include "asmdefs.h"
>
> #define REP8_01 0x0101010101010101
> #define REP8_7f 0x7f7f7f7f7f7f7f7f
> -#define REP8_80 0x8080808080808080
>
> /* Parameters and result. */
> #define src1 x0
> @@ -64,86 +38,91 @@
> #define tmp3 x10
> #define zeroones x11
> #define pos x12
> -#define limit_wd x13
> -#define mask x14
> -#define endloop x15
> +#define mask x13
> +#define endloop x14
> #define count mask
> +#define offset pos
> +#define neg_offset x15
> +
> +/* Define endian dependent shift operations.
> + On big-endian early bytes are at MSB and on little-endian LSB.
> + LS_FW means shifting towards early bytes.
> + LS_BK means shifting towards later bytes.
> + */
> +#ifdef __AARCH64EB__
> +#define LS_FW lsl
> +#define LS_BK lsr
> +#else
> +#define LS_FW lsr
> +#define LS_BK lsl
> +#endif
>
> - .text
> - .p2align 6
> - .rep 7
> - nop /* Pad so that the loop below fits a cache line. */
> - .endr
> -def_fn strncmp
> - cbz limit, .Lret0
> +ENTRY (strncmp)
> + PTR_ARG (0)
> + PTR_ARG (1)
> + SIZE_ARG (2)
> + cbz limit, L(ret0)
> eor tmp1, src1, src2
> mov zeroones, #REP8_01
> tst tmp1, #7
> and count, src1, #7
> - b.ne .Lmisaligned8
> - cbnz count, .Lmutual_align
> - /* Calculate the number of full and partial words -1. */
> - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
> - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
> + b.ne L(misaligned8)
> + cbnz count, L(mutual_align)
>
> /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> can be done in parallel across the entire word. */
> - /* Start of performance-critical section -- one 64B cache line. */
> -.Lloop_aligned:
> + .p2align 4
> +L(loop_aligned):
> ldr data1, [src1], #8
> ldr data2, [src2], #8
> -.Lstart_realigned:
> - subs limit_wd, limit_wd, #1
> +L(start_realigned):
> + subs limit, limit, #8
> sub tmp1, data1, zeroones
> orr tmp2, data1, #REP8_7f
> eor diff, data1, data2 /* Non-zero if differences found. */
> - csinv endloop, diff, xzr, pl /* Last Dword or differences. */
> + csinv endloop, diff, xzr, hi /* Last Dword or differences. */
> bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
> ccmp endloop, #0, #0, eq
> - b.eq .Lloop_aligned
> - /* End of performance-critical section -- one 64B cache line. */
> + b.eq L(loop_aligned)
> + /* End of main loop */
>
> - /* Not reached the limit, must have found the end or a diff. */
> - tbz limit_wd, #63, .Lnot_limit
> -
> - /* Limit % 8 == 0 => all bytes significant. */
> - ands limit, limit, #7
> - b.eq .Lnot_limit
> -
> - lsl limit, limit, #3 /* Bits -> bytes. */
> - mov mask, #~0
> -#ifdef __AARCH64EB__
> - lsr mask, mask, limit
> -#else
> - lsl mask, mask, limit
> -#endif
> - bic data1, data1, mask
> - bic data2, data2, mask
> -
> - /* Make sure that the NUL byte is marked in the syndrome. */
> - orr has_nul, has_nul, mask
> -
> -.Lnot_limit:
> +L(full_check):
> +#ifndef __AARCH64EB__
> orr syndrome, diff, has_nul
> -
> -#ifndef __AARCH64EB__
> + add limit, limit, 8 /* Rewind limit to before last subs. */
> +L(syndrome_check):
> + /* Limit was reached. Check if the NUL byte or the difference
> + is before the limit. */
> rev syndrome, syndrome
> rev data1, data1
> - /* The MS-non-zero bit of the syndrome marks either the first bit
> - that is different, or the top bit of the first zero byte.
> - Shifting left now will bring the critical information into the
> - top bits. */
> clz pos, syndrome
> rev data2, data2
> lsl data1, data1, pos
> + cmp limit, pos, lsr #3
> lsl data2, data2, pos
> /* But we need to zero-extend (char is unsigned) the value and then
> perform a signed 32-bit subtraction. */
> lsr data1, data1, #56
> sub result, data1, data2, lsr #56
> + csel result, result, xzr, hi
> ret
> #else
> + /* Not reached the limit, must have found the end or a diff. */
> + tbz limit, #63, L(not_limit)
> + add tmp1, limit, 8
> + cbz limit, L(not_limit)
> +
> + lsl limit, tmp1, #3 /* Bits -> bytes. */
> + mov mask, #~0
> + lsr mask, mask, limit
> + bic data1, data1, mask
> + bic data2, data2, mask
> +
> + /* Make sure that the NUL byte is marked in the syndrome. */
> + orr has_nul, has_nul, mask
> +
> +L(not_limit):
> /* For big-endian we cannot use the trick with the syndrome value
> as carry-propagation can corrupt the upper bits if the trailing
> bytes in the string contain 0x01. */
> @@ -164,10 +143,11 @@ def_fn strncmp
> rev has_nul, has_nul
> orr syndrome, diff, has_nul
> clz pos, syndrome
> - /* The MS-non-zero bit of the syndrome marks either the first bit
> - that is different, or the top bit of the first zero byte.
> + /* The most-significant-non-zero bit of the syndrome marks either the
> + first bit that is different, or the top bit of the first zero byte.
> Shifting left now will bring the critical information into the
> top bits. */
> +L(end_quick):
> lsl data1, data1, pos
> lsl data2, data2, pos
> /* But we need to zero-extend (char is unsigned) the value and then
> @@ -177,7 +157,7 @@ def_fn strncmp
> ret
> #endif
>
> -.Lmutual_align:
> +L(mutual_align):
> /* Sources are mutually aligned, but are not currently at an
> alignment boundary. Round down the addresses and then mask off
> the bytes that precede the start point.
> @@ -189,102 +169,143 @@ def_fn strncmp
> neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
> ldr data2, [src2], #8
> mov tmp2, #~0
> - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
> -#ifdef __AARCH64EB__
> - /* Big-endian. Early bytes are at MSB. */
> - lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
> -#else
> - /* Little-endian. Early bytes are at LSB. */
> - lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
> -#endif
> - and tmp3, limit_wd, #7
> - lsr limit_wd, limit_wd, #3
> - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
> - add limit, limit, count
> - add tmp3, tmp3, count
> + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
> + /* Adjust the limit and ensure it doesn't overflow. */
> + adds limit, limit, count
> + csinv limit, limit, xzr, lo
> orr data1, data1, tmp2
> orr data2, data2, tmp2
> - add limit_wd, limit_wd, tmp3, lsr #3
> - b .Lstart_realigned
> + b L(start_realigned)
>
> - .p2align 6
> + .p2align 4
> /* Don't bother with dwords for up to 16 bytes. */
> -.Lmisaligned8:
> +L(misaligned8):
> cmp limit, #16
> - b.hs .Ltry_misaligned_words
> + b.hs L(try_misaligned_words)
>
> -.Lbyte_loop:
> +L(byte_loop):
> /* Perhaps we can do better than this. */
> ldrb data1w, [src1], #1
> ldrb data2w, [src2], #1
> subs limit, limit, #1
> ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
> ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
> - b.eq .Lbyte_loop
> -.Ldone:
> + b.eq L(byte_loop)
> +L(done):
> sub result, data1, data2
> ret
> /* Align the SRC1 to a dword by doing a bytewise compare and then do
> the dword loop. */
> -.Ltry_misaligned_words:
> - lsr limit_wd, limit, #3
> - cbz count, .Ldo_misaligned
> +L(try_misaligned_words):
> + cbz count, L(src1_aligned)
>
> neg count, count
> and count, count, #7
> sub limit, limit, count
> - lsr limit_wd, limit, #3
>
> -.Lpage_end_loop:
> +L(page_end_loop):
> ldrb data1w, [src1], #1
> ldrb data2w, [src2], #1
> cmp data1w, #1
> ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
> - b.ne .Ldone
> + b.ne L(done)
> subs count, count, #1
> - b.hi .Lpage_end_loop
> + b.hi L(page_end_loop)
> +
> + /* The following diagram explains the comparison of misaligned strings.
> + The bytes are shown in natural order. For little-endian, it is
> + reversed in the registers. The "x" bytes are before the string.
> + The "|" separates data that is loaded at one time.
> + src1 | a a a a a a a a | b b b c c c c c | . . .
> + src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
>
> -.Ldo_misaligned:
> - /* Prepare ourselves for the next page crossing. Unlike the aligned
> - loop, we fetch 1 less dword because we risk crossing bounds on
> - SRC2. */
> - mov count, #8
> - subs limit_wd, limit_wd, #1
> - b.lo .Ldone_loop
> -.Lloop_misaligned:
> - and tmp2, src2, #0xff8
> - eor tmp2, tmp2, #0xff8
> - cbz tmp2, .Lpage_end_loop
> + After shifting in each step, the data looks like this:
> + STEP_A STEP_B STEP_C
> + data1 a a a a a a a a b b b c c c c c b b b c c c c c
> + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
>
> + The bytes with "0" are eliminated from the syndrome via mask.
> +
> + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
> + time from SRC2. The comparison happens in 3 steps. After each step
> + the loop can exit, or read from SRC1 or SRC2. */
> +L(src1_aligned):
> + /* Calculate offset from 8 byte alignment to string start in bits. No
> + need to mask offset since shifts are ignoring upper bits. */
> + lsl offset, src2, #3
> + bic src2, src2, #0xf
> + mov mask, -1
> + neg neg_offset, offset
> ldr data1, [src1], #8
> - ldr data2, [src2], #8
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - eor diff, data1, data2 /* Non-zero if differences found. */
> - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
> - ccmp diff, #0, #0, eq
> - b.ne .Lnot_limit
> - subs limit_wd, limit_wd, #1
> - b.pl .Lloop_misaligned
> + ldp tmp1, tmp2, [src2], #16
> + LS_BK mask, mask, neg_offset
> + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
> + /* Skip the first compare if data in tmp1 is irrelevant. */
> + tbnz offset, 6, L(misaligned_mid_loop)
>
> -.Ldone_loop:
> - /* We found a difference or a NULL before the limit was reached. */
> - and limit, limit, #7
> - cbz limit, .Lnot_limit
> - /* Read the last word. */
> - sub src1, src1, 8
> - sub src2, src2, 8
> - ldr data1, [src1, limit]
> - ldr data2, [src2, limit]
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> +L(loop_misaligned):
> + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
> + LS_FW data2, tmp1, offset
> + LS_BK tmp1, tmp2, neg_offset
> + subs limit, limit, #8
> + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
> + sub has_nul, data1, zeroones
> eor diff, data1, data2 /* Non-zero if differences found. */
> - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
> - ccmp diff, #0, #0, eq
> - b.ne .Lnot_limit
> + orr tmp3, data1, #REP8_7f
> + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
> + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
> + orr tmp3, endloop, has_nul
> + cbnz tmp3, L(full_check)
> +
> + ldr data1, [src1], #8
> +L(misaligned_mid_loop):
> + /* STEP_B: Compare first part of data1 to second part of tmp2. */
> + LS_FW data2, tmp2, offset
> +#ifdef __AARCH64EB__
> + /* For big-endian we do a byte reverse to avoid carry-propagation
> + problem described above. This way we can reuse the has_nul in the
> + next step and also use syndrome value trick at the end. */
> + rev tmp3, data1
> + #define data1_fixed tmp3
> +#else
> + #define data1_fixed data1
> +#endif
> + sub has_nul, data1_fixed, zeroones
> + orr tmp3, data1_fixed, #REP8_7f
> + eor diff, data2, data1 /* Non-zero if differences found. */
> + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
> +#ifdef __AARCH64EB__
> + rev has_nul, has_nul
> +#endif
> + cmp limit, neg_offset, lsr #3
> + orr syndrome, diff, has_nul
> + bic syndrome, syndrome, mask /* Ignore later bytes. */
> + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
> + cbnz tmp3, L(syndrome_check)
> +
> + /* STEP_C: Compare second part of data1 to first part of tmp1. */
> + ldp tmp1, tmp2, [src2], #16
> + cmp limit, #8
> + LS_BK data2, tmp1, neg_offset
> + eor diff, data2, data1 /* Non-zero if differences found. */
> + orr syndrome, diff, has_nul
> + and syndrome, syndrome, mask /* Ignore earlier bytes. */
> + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
> + cbnz tmp3, L(syndrome_check)
> +
> + ldr data1, [src1], #8
> + sub limit, limit, #8
> + b L(loop_misaligned)
> +
> +#ifdef __AARCH64EB__
> +L(syndrome_check):
> + clz pos, syndrome
> + cmp pos, limit, lsl #3
> + b.lo L(end_quick)
> +#endif
>
> -.Lret0:
> +L(ret0):
> mov result, #0
> ret
> - .size strncmp, . - strncmp
> +END(strncmp)
> #endif
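
[Note: the STEP_A/STEP_B/STEP_C diagram in strncmp.S above boils down to reconstructing 8 bytes of SRC2 from two aligned dwords with a pair of opposite shifts (LS_FW/LS_BK). A little-endian C model of that combine -- a sketch, not the asm -- where "offset" is the byte misalignment scaled to bits (a multiple of 8 in 8..56, as in the misaligned path):

  #include <stdint.h>

  /* tmp1/tmp2 are two consecutive aligned dwords loaded from SRC2;
     return the 8 bytes that line up with the current SRC1 dword.  */
  static inline uint64_t
  combine_src2 (uint64_t tmp1, uint64_t tmp2, unsigned offset)
  {
    return (tmp1 >> offset) | (tmp2 << (64 - offset));
  }

]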
> diff --git a/newlib/libc/machine/aarch64/strnlen.S b/newlib/libc/machine/aarch64/strnlen.S
> index c255c3f7c6..091002e0b0 100644
> --- a/newlib/libc/machine/aarch64/strnlen.S
> +++ b/newlib/libc/machine/aarch64/strnlen.S
> @@ -1,187 +1,105 @@
> -/* strnlen - calculate the length of a string with limit.
> -
> - Copyright (c) 2013, Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> +/*
> + * strnlen - calculate the length of a string with limit.
> + *
> + * Copyright (c) 2020-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strlen-stub.c */
> #else
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64
> + * ARMv8-a, AArch64, Advanced SIMD.
> + * MTE compatible.
> */
>
> -/* Arguments and results. */
> +#include "asmdefs.h"
> +
> #define srcin x0
> -#define len x0
> -#define limit x1
> +#define cntin x1
> +#define result x0
>
> -/* Locals and temporaries. */
> #define src x2
> -#define data1 x3
> -#define data2 x4
> -#define data2a x5
> -#define has_nul1 x6
> -#define has_nul2 x7
> -#define tmp1 x8
> -#define tmp2 x9
> -#define tmp3 x10
> -#define tmp4 x11
> -#define zeroones x12
> -#define pos x13
> -#define limit_wd x14
> -
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -#define REP8_01 0x0101010101010101
> -#define REP8_7f 0x7f7f7f7f7f7f7f7f
> -#define REP8_80 0x8080808080808080
> -
> - .text
> - .p2align 6
> -.Lstart:
> - /* Pre-pad to ensure critical loop begins an icache line. */
> - .rep 7
> - nop
> - .endr
> - /* Put this code here to avoid wasting more space with pre-padding. */
> -.Lhit_limit:
> - mov len, limit
> +#define synd x3
> +#define shift x4
> +#define tmp x4
> +#define cntrem x5
> +
> +#define qdata q0
> +#define vdata v0
> +#define vhas_chr v1
> +#define vend v2
> +#define dend d2
> +
> +/*
> + Core algorithm:
> + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
> + four bits per byte using the shrn instruction. A count trailing zeros then
> + identifies the first zero byte. */
> +
> +ENTRY (strnlen)
> + PTR_ARG (0)
> + SIZE_ARG (1)
> + bic src, srcin, 15
> + cbz cntin, L(nomatch)
> + ld1 {vdata.16b}, [src]
> + cmeq vhas_chr.16b, vdata.16b, 0
> + lsl shift, srcin, 2
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> + fmov synd, dend
> + lsr synd, synd, shift
> + cbz synd, L(start_loop)
> +L(finish):
> + rbit synd, synd
> + clz synd, synd
> + lsr result, synd, 2
> + cmp cntin, result
> + csel result, cntin, result, ls
> ret
>
> -def_fn strnlen
> - cbz limit, .Lhit_limit
> - mov zeroones, #REP8_01
> - bic src, srcin, #15
> - ands tmp1, srcin, #15
> - b.ne .Lmisaligned
> - /* Calculate the number of full and partial words -1. */
> - sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
> - lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
> -
> - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> - can be done in parallel across the entire word. */
> - /* The inner loop deals with two Dwords at a time. This has a
> - slightly higher start-up cost, but we should win quite quickly,
> - especially on cores with a high number of issue slots per
> - cycle, as we get much better parallelism out of the operations. */
> -
> - /* Start of critial section -- keep to one 64Byte cache line. */
> -.Lloop:
> - ldp data1, data2, [src], #16
> -.Lrealigned:
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, #REP8_7f
> - bic has_nul1, tmp1, tmp2
> - bic has_nul2, tmp3, tmp4
> - subs limit_wd, limit_wd, #1
> - orr tmp1, has_nul1, has_nul2
> - ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
> - b.eq .Lloop
> - /* End of critical section -- keep to one 64Byte cache line. */
> -
> - orr tmp1, has_nul1, has_nul2
> - cbz tmp1, .Lhit_limit /* No null in final Qword. */
> -
> - /* We know there's a null in the final Qword. The easiest thing
> - to do now is work out the length of the string and return
> - MIN (len, limit). */
> -
> - sub len, src, srcin
> - cbz has_nul1, .Lnul_in_data2
> -#ifdef __AARCH64EB__
> - mov data2, data1
> -#endif
> - sub len, len, #8
> - mov has_nul2, has_nul1
> -.Lnul_in_data2:
> -#ifdef __AARCH64EB__
> - /* For big-endian, carry propagation (if the final byte in the
> - string is 0x01) means we cannot use has_nul directly. The
> - easiest way to get the correct byte is to byte-swap the data
> - and calculate the syndrome a second time. */
> - rev data2, data2
> - sub tmp1, data2, zeroones
> - orr tmp2, data2, #REP8_7f
> - bic has_nul2, tmp1, tmp2
> -#endif
> - sub len, len, #8
> - rev has_nul2, has_nul2
> - clz pos, has_nul2
> - add len, len, pos, lsr #3 /* Bits to bytes. */
> - cmp len, limit
> - csel len, len, limit, ls /* Return the lower value. */
> +L(nomatch):
> + mov result, cntin
> ret
>
> -.Lmisaligned:
> - /* Deal with a partial first word.
> - We're doing two things in parallel here;
> - 1) Calculate the number of words (but avoiding overflow if
> - limit is near ULONG_MAX) - to do this we need to work out
> - limit + tmp1 - 1 as a 65-bit value before shifting it;
> - 2) Load and mask the initial data words - we force the bytes
> - before the ones we are interested in to 0xff - this ensures
> - early bytes will not hit any zero detection. */
> - sub limit_wd, limit, #1
> - neg tmp4, tmp1
> - cmp tmp1, #8
> -
> - and tmp3, limit_wd, #15
> - lsr limit_wd, limit_wd, #4
> - mov tmp2, #~0
> -
> - ldp data1, data2, [src], #16
> - lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
> - add tmp3, tmp3, tmp1
> -
> -#ifdef __AARCH64EB__
> - /* Big-endian. Early bytes are at MSB. */
> - lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
> -#else
> - /* Little-endian. Early bytes are at LSB. */
> - lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
> +L(start_loop):
> + sub tmp, src, srcin
> + add tmp, tmp, 17
> + subs cntrem, cntin, tmp
> + b.lo L(nomatch)
> +
> + /* Make sure that it won't overread by a 16-byte chunk */
> + tbz cntrem, 4, L(loop32_2)
> + sub src, src, 16
> + .p2align 5
> +L(loop32):
> + ldr qdata, [src, 32]!
> + cmeq vhas_chr.16b, vdata.16b, 0
> + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + fmov synd, dend
> + cbnz synd, L(end)
> +L(loop32_2):
> + ldr qdata, [src, 16]
> + subs cntrem, cntrem, 32
> + cmeq vhas_chr.16b, vdata.16b, 0
> + b.lo L(end_2)
> + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + fmov synd, dend
> + cbz synd, L(loop32)
> +L(end_2):
> + add src, src, 16
> +L(end):
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> + sub result, src, srcin
> + fmov synd, dend
> +#ifndef __AARCH64EB__
> + rbit synd, synd
> #endif
> - add limit_wd, limit_wd, tmp3, lsr #4
> -
> - orr data1, data1, tmp2
> - orr data2a, data2, tmp2
> -
> - csinv data1, data1, xzr, le
> - csel data2, data2, data2a, le
> - b .Lrealigned
> - .size strnlen, . - .Lstart /* Include pre-padding in size. */
> + clz synd, synd
> + add result, result, synd, lsr 2
> + cmp cntin, result
> + csel result, cntin, result, ls
> + ret
>
> +END (strnlen)
> #endif
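A scalar model may help when reading the new strnlen tail above: the shrn narrows the per-byte compare masks into a 64-bit syndrome holding one nibble per input byte, so counting trailing zeros and shifting right by two recovers the byte offset of the first NUL. The helper below is only an illustration with names of my own choosing, not code from the patch.

    /* Nibble-per-byte syndrome model: 0xF when p[i] is NUL, 0x0 otherwise,
       so ctz(synd) / 4 is the offset of the first NUL in the 16-byte block,
       mirroring the clz / "synd, lsr 2" sequence in the patch.  */
    static int block16_first_nul(const unsigned char *p)
    {
        unsigned long long synd = 0;
        for (int i = 0; i < 16; i++)
            if (p[i] == 0)
                synd |= 0xFULL << (4 * i);
        return synd ? __builtin_ctzll(synd) / 4 : -1;   /* -1: no NUL */
    }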
> diff --git a/newlib/libc/machine/aarch64/strrchr.S b/newlib/libc/machine/aarch64/strrchr.S
> index d64fc09b1a..b0574228b6 100644
> --- a/newlib/libc/machine/aarch64/strrchr.S
> +++ b/newlib/libc/machine/aarch64/strrchr.S
> @@ -1,32 +1,9 @@
> /*
> - strrchr - find last instance of a character in a string
> -
> - Copyright (c) 2014, ARM Limited
> - All rights Reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the company nor the names of its contributors
> - may be used to endorse or promote products derived from this
> - software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> + * strrchr - find last position of a character in a string.
> + *
> + * Copyright (c) 2014-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strchr-stub.c */
> #else
> @@ -37,6 +14,8 @@
> * Neon Available.
> */
>
> +#include "asmdefs.h"
> +
> /* Arguments and results. */
> #define srcin x0
> #define chrin w1
> @@ -78,17 +57,8 @@
> in the original string a count_trailing_zeros() operation will
> identify exactly which byte is causing the termination, and why. */
>
> -/* Locals and temporaries. */
> -
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -def_fn strrchr
> +ENTRY (strrchr)
> + PTR_ARG (0)
> /* Magic constant 0x40100401 to allow us to identify which lane
> matches the requested byte. Magic constant 0x80200802 used
> similarly for NUL termination. */
> @@ -100,7 +70,7 @@ def_fn strrchr
> mov src_offset, #0
> ands tmp1, srcin, #31
> add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
> - b.eq .Laligned
> + b.eq L(aligned)
>
> /* Input string is not 32-byte aligned. Rather than forcing
> the padding bytes to a safe value, we calculate the syndrome
> @@ -118,45 +88,45 @@ def_fn strrchr
> and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
> addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
> addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
> - addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64
> - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
> - mov nul_match, vhas_nul1.2d[0]
> + addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
> + mov nul_match, vend1.d[0]
> lsl tmp1, tmp1, #1
> mov const_m1, #~0
> - mov chr_match, vhas_chr1.2d[0]
> lsr tmp3, const_m1, tmp1
> + mov chr_match, vend1.d[1]
>
> bic nul_match, nul_match, tmp3 // Mask padding bits.
> bic chr_match, chr_match, tmp3 // Mask padding bits.
> - cbnz nul_match, .Ltail
> + cbnz nul_match, L(tail)
>
> -.Lloop:
> + .p2align 4
> +L(loop):
> cmp chr_match, #0
> csel src_match, src, src_match, ne
> csel src_offset, chr_match, src_offset, ne
> -.Laligned:
> +L(aligned):
> ld1 {vdata1.16b, vdata2.16b}, [src], #32
> - cmeq vhas_nul1.16b, vdata1.16b, #0
> cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
> - cmeq vhas_nul2.16b, vdata2.16b, #0
> cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
> - addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
> + uminp vend1.16b, vdata1.16b, vdata2.16b
> and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
> and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
> + cmeq vend1.16b, vend1.16b, 0
> addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
> - addp vend1.16b, vend1.16b, vend1.16b // 128->64
> - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
> - mov nul_match, vend1.2d[0]
> - mov chr_match, vhas_chr1.2d[0]
> - cbz nul_match, .Lloop
> + addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
> + mov nul_match, vend1.d[0]
> + mov chr_match, vend1.d[1]
> + cbz nul_match, L(loop)
>
> + cmeq vhas_nul1.16b, vdata1.16b, #0
> + cmeq vhas_nul2.16b, vdata2.16b, #0
> and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
> and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
> addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
> addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
> - mov nul_match, vhas_nul1.2d[0]
> + mov nul_match, vhas_nul1.d[0]
>
> -.Ltail:
> +L(tail):
> /* Work out exactly where the string ends. */
> sub tmp4, nul_match, #1
> eor tmp4, tmp4, nul_match
> @@ -178,5 +148,5 @@ def_fn strrchr
>
> ret
>
> - .size strrchr, . - strrchr
> +END (strrchr)
> #endif
Hello Richard,
On 05.10.23 12:37, Richard Earnshaw wrote:
> This is basically ok, but you're removing an existing license and adding
> a new one from Arm; I think you need to copy the new license into
> COPYING.NEWLIB - it's not enough just to have an SPDX identifier, the
> text of the license must be added somewhere as well.
Thanks for the review. I have sent v4 of the patch set, which should fix the
license issue.
new file mode 100644
@@ -0,0 +1,106 @@
+/*
+ * Macros for asm code. AArch64 version.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+/* Branch Target Identification support. */
+#define BTI_C hint 34
+#define BTI_J hint 36
+/* Return address signing support (pac-ret). */
+#define PACIASP hint 25; .cfi_window_save
+#define AUTIASP hint 29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#ifdef __ILP32__
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 2; \
+ .word 4; \
+ .word 12; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .text
+#else
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .text
+#endif
+
+/* If set then the GNU Property Note section will be added to
+ mark objects to support BTI and PAC-RET. */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files. */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .cfi_startproc; \
+ BTI_C;
+
+#define ENTRY(name) ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name) \
+ .global name; \
+ .type name,%function; \
+ name:
+
+#define END(name) \
+ .cfi_endproc; \
+ .size name, .-name;
+
+#define L(l) .L ## l
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n) mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of size arguments as per aapcs64 */
+#define SIZE_ARG(n) mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
+/* Compiler supports SVE instructions */
+#ifndef HAVE_SVE
+# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
+# define HAVE_SVE 1
+# else
+# define HAVE_SVE 0
+# endif
+#endif
+
+#endif
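For readers unfamiliar with the note format, the GNU_PROPERTY macro above hand-assembles an NT_GNU_PROPERTY_TYPE_0 note. A rough C view of the emitted record is sketched below; this is an illustration only (the struct name and #ifdef are mine, the real section is produced by the assembler directives). The two branches of the macro differ only in the note alignment, 4 bytes for ILP32 versus 8 bytes otherwise, and in the descriptor size, which is padded from 12 to 16 bytes in the 8-byte-aligned case.

    #include <stdint.h>

    /* Sketch of the note record emitted by GNU_PROPERTY(type, value).  */
    struct gnu_property_note {
        uint32_t namesz;      /* 4: sizeof "GNU"                        */
        uint32_t descsz;      /* 12 for ILP32, 16 for LP64 (padded)     */
        uint32_t type;        /* 5: NT_GNU_PROPERTY_TYPE_0              */
        char     name[4];     /* "GNU" with NUL terminator              */
        uint32_t prop_type;   /* e.g. FEATURE_1_AND (0xc0000000)        */
        uint32_t prop_size;   /* 4                                      */
        uint32_t prop_value;  /* FEATURE_1_BTI | FEATURE_1_PAC          */
    #ifndef __ILP32__
        uint32_t pad;         /* keeps the descriptor 8-byte aligned    */
    #endif
    };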
@@ -1,31 +1,8 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2014, ARM Limited
- * All rights Reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of the company nor the names of its contributors
- * may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
@@ -37,6 +14,8 @@
* Neon Available.
*/
+#include "asmdefs.h"
+
/* Arguments and results. */
#define srcin x0
#define chrin w1
@@ -70,17 +49,11 @@
* identify exactly which byte has matched.
*/
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-def_fn memchr
+ENTRY (memchr)
+ PTR_ARG (0)
+ SIZE_ARG (2)
/* Do not dereference srcin if no bytes to compare. */
- cbz cntin, .Lzero_length
+ cbz cntin, L(zero_length)
/*
* Magic constant 0x40100401 allows us to identify which lane matches
* the requested byte.
@@ -93,7 +66,7 @@ def_fn memchr
dup vrepmask.4s, wtmp2
ands soff, srcin, #31
and cntrem, cntin, #31
- b.eq .Lloop
+ b.eq L(loop)
/*
* Input string is not 32-byte aligned. We calculate the syndrome
@@ -110,41 +83,41 @@ def_fn memchr
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
addp vend.16b, vend.16b, vend.16b /* 128->64 */
- mov synd, vend.2d[0]
+ mov synd, vend.d[0]
/* Clear the soff*2 lower bits */
lsl tmp, soff, #1
lsr synd, synd, tmp
lsl synd, synd, tmp
/* The first block can also be the last */
- b.ls .Lmasklast
+ b.ls L(masklast)
/* Have we found something already? */
- cbnz synd, .Ltail
+ cbnz synd, L(tail)
-.Lloop:
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
subs cntin, cntin, #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
/* If we're out of data we finish regardless of the result */
- b.ls .Lend
+ b.ls L(end)
/* Use a fast check for the termination condition */
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend.2d, vend.2d, vend.2d
- mov synd, vend.2d[0]
+ mov synd, vend.d[0]
/* We're not out of data, loop if we haven't found the character */
- cbz synd, .Lloop
+ cbz synd, L(loop)
-.Lend:
+L(end):
/* Termination condition found, let's calculate the syndrome value */
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
addp vend.16b, vend.16b, vend.16b /* 128->64 */
- mov synd, vend.2d[0]
+ mov synd, vend.d[0]
/* Only do the clear for the last possible block */
- b.hi .Ltail
+ b.hs L(tail)
-.Lmasklast:
+L(masklast):
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
add tmp, cntrem, soff
and tmp, tmp, #31
@@ -153,7 +126,7 @@ def_fn memchr
lsl synd, synd, tmp
lsr synd, synd, tmp
-.Ltail:
+L(tail):
/* Count the trailing zeros using bit reversing */
rbit synd, synd
/* Compensate the last post-increment */
@@ -168,9 +141,9 @@ def_fn memchr
csel result, xzr, result, eq
ret
-.Lzero_length:
+L(zero_length):
mov result, #0
ret
- .size memchr, . - memchr
+END (memchr)
#endif
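The 0x40100401 constant and the addp narrowing in the memchr hunk compress the two 16-byte compare results into a 64-bit syndrome carrying two bits per input byte, which is why the trailing-zero count is halved at the end. A scalar model of that idea follows; the helper name is mine and not taken from the patch.

    /* Two-bits-per-byte syndrome model: bit 2*i is set when byte i of the
       32-byte block equals c, so ctz(synd) / 2 is the index of the first
       match, as the rbit/clz/"lsr #1" tail computes.  */
    static int block32_find(const unsigned char *p, unsigned char c)
    {
        unsigned long long synd = 0;
        for (int i = 0; i < 32; i++)
            if (p[i] == c)
                synd |= 1ULL << (2 * i);
        return synd ? __builtin_ctzll(synd) / 2 : -1;   /* -1: no match */
    }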
@@ -1,57 +1,7 @@
/* memcmp - compare memory
-
- Copyright (c) 2018 Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
-/*
- * Copyright (c) 2017 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
*
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
@@ -60,103 +10,79 @@
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*/
-#define L(l) .L ## l
-
-/* Parameters and result. */
-#define src1 x0
-#define src2 x1
-#define limit x2
-#define result w0
-
-/* Internal variables. */
-#define data1 x3
-#define data1w w3
-#define data1h x4
-#define data2 x5
-#define data2w w5
-#define data2h x6
-#define tmp1 x7
-#define tmp2 x8
-
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-def_fn memcmp p2align=6
- subs limit, limit, 8
- b.lo L(less8)
-
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- cmp data1, data2
- b.ne L(return)
-
- subs limit, limit, 8
- b.gt L(more16)
-
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
- b L(return)
-
-L(more16):
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- cmp data1, data2
- bne L(return)
-
- /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
- strings. */
- subs limit, limit, 16
+#include "asmdefs.h"
+
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result w0
+
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define data3 x5
+#define data3w w5
+#define data4 x6
+#define data4w w6
+#define tmp x6
+#define src1end x7
+#define src2end x8
+
+
+ENTRY (memcmp)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+
+ cmp limit, 16
+ b.lo L(less16)
+ ldp data1, data3, [src1]
+ ldp data2, data4, [src2]
+ ccmp data1, data2, 0, ne
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+
+ add src1end, src1, limit
+ add src2end, src2, limit
+ cmp limit, 32
b.ls L(last_bytes)
+ cmp limit, 160
+ b.hs L(loop_align)
+ sub limit, limit, 32
- /* We overlap loads between 0-32 bytes at either side of SRC1 when we
- try to align, so limit it only to strings larger than 128 bytes. */
- cmp limit, 96
- b.ls L(loop16)
-
- /* Align src1 and adjust src2 with bytes not yet done. */
- and tmp1, src1, 15
- add limit, limit, tmp1
- sub src1, src1, tmp1
- sub src2, src2, tmp1
-
- /* Loop performing 16 bytes per iteration using aligned src1.
- Limit is pre-decremented by 16 and must be larger than zero.
- Exit if <= 16 bytes left to do or if the data is not equal. */
.p2align 4
-L(loop16):
- ldp data1, data1h, [src1], 16
- ldp data2, data2h, [src2], 16
- subs limit, limit, 16
- ccmp data1, data2, 0, hi
- ccmp data1h, data2h, 0, eq
- b.eq L(loop16)
-
+L(loop32):
+ ldp data1, data3, [src1, 16]
+ ldp data2, data4, [src2, 16]
cmp data1, data2
- bne L(return)
- mov data1, data1h
- mov data2, data2h
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+ cmp limit, 16
+ b.ls L(last_bytes)
+
+ ldp data1, data3, [src1, 32]
+ ldp data2, data4, [src2, 32]
cmp data1, data2
- bne L(return)
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+ add src1, src1, 32
+ add src2, src2, 32
+L(last64):
+ subs limit, limit, 32
+ b.hi L(loop32)
/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
- add src1, src1, limit
- add src2, src2, limit
- ldp data1, data1h, [src1]
- ldp data2, data2h, [src2]
- cmp data1, data2
- bne L(return)
- mov data1, data1h
- mov data2, data2h
+ ldp data1, data3, [src1end, -16]
+ ldp data2, data4, [src2end, -16]
+L(return2):
cmp data1, data2
+ csel data1, data1, data3, ne
+ csel data2, data2, data4, ne
/* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
@@ -164,33 +90,106 @@ L(return):
rev data1, data1
rev data2, data2
#endif
- cmp data1, data2
-L(ret_eq):
+ cmp data1, data2
cset result, ne
cneg result, result, lo
ret
.p2align 4
- /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less16):
+ add src1end, src1, limit
+ add src2end, src2, limit
+ tbz limit, 3, L(less8)
+ ldr data1, [src1]
+ ldr data2, [src2]
+ ldr data3, [src1end, -8]
+ ldr data4, [src2end, -8]
+ b L(return2)
+
+ .p2align 4
L(less8):
- adds limit, limit, 4
- b.lo L(less4)
- ldr data1w, [src1], 4
- ldr data2w, [src2], 4
+ tbz limit, 2, L(less4)
+ ldr data1w, [src1]
+ ldr data2w, [src2]
+ ldr data3w, [src1end, -4]
+ ldr data4w, [src2end, -4]
+ b L(return2)
+
+L(less4):
+ tbz limit, 1, L(less2)
+ ldrh data1w, [src1]
+ ldrh data2w, [src2]
cmp data1w, data2w
b.ne L(return)
- sub limit, limit, 4
-L(less4):
- adds limit, limit, 4
- beq L(ret_eq)
-L(byte_loop):
- ldrb data1w, [src1], 1
- ldrb data2w, [src2], 1
- subs limit, limit, 1
- ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
- b.eq L(byte_loop)
+L(less2):
+ mov result, 0
+ tbz limit, 0, L(return_zero)
+ ldrb data1w, [src1end, -1]
+ ldrb data2w, [src2end, -1]
sub result, data1w, data2w
+L(return_zero):
+ ret
+
+L(loop_align):
+ ldp data1, data3, [src1, 16]
+ ldp data2, data4, [src2, 16]
+ cmp data1, data2
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+
+ /* Align src2 and adjust src1, src2 and limit. */
+ and tmp, src2, 15
+ sub tmp, tmp, 16
+ sub src2, src2, tmp
+ add limit, limit, tmp
+ sub src1, src1, tmp
+ sub limit, limit, 64 + 16
+
+ .p2align 4
+L(loop64):
+ ldr q0, [src1, 16]
+ ldr q1, [src2, 16]
+ subs limit, limit, 64
+ ldr q2, [src1, 32]
+ ldr q3, [src2, 32]
+ eor v0.16b, v0.16b, v1.16b
+ eor v1.16b, v2.16b, v3.16b
+ ldr q2, [src1, 48]
+ ldr q3, [src2, 48]
+ umaxp v0.16b, v0.16b, v1.16b
+ ldr q4, [src1, 64]!
+ ldr q5, [src2, 64]!
+ eor v1.16b, v2.16b, v3.16b
+ eor v2.16b, v4.16b, v5.16b
+ umaxp v1.16b, v1.16b, v2.16b
+ umaxp v0.16b, v0.16b, v1.16b
+ umaxp v0.16b, v0.16b, v0.16b
+ fmov tmp, d0
+ ccmp tmp, 0, 0, hi
+ b.eq L(loop64)
+
+ /* If equal, process last 1-64 bytes using scalar loop. */
+ add limit, limit, 64 + 16
+ cbz tmp, L(last64)
+
+ /* Determine the 8-byte aligned offset of the first difference. */
+#ifdef __AARCH64EB__
+ rev16 tmp, tmp
+#endif
+ rev tmp, tmp
+ clz tmp, tmp
+ bic tmp, tmp, 7
+ sub tmp, tmp, 48
+ ldr data1, [src1, tmp]
+ ldr data2, [src2, tmp]
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ mov result, 1
+ cmp data1, data2
+ cneg result, result, lo
ret
- .size memcmp, . - memcmp
+END (memcmp)
#endif
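The L(return) tail in the memcmp hunk byte-reverses the differing 8-byte chunks on little-endian so that the first differing byte in memory order decides the result, then maps the comparison to -1/0/1 with cset/cneg. A C sketch of the same step (illustrative only; the function name is mine):

    #include <stdint.h>

    /* Compare two 8-byte chunks in memory order and return -1, 0 or 1.  */
    static int chunk_cmp(uint64_t data1, uint64_t data2)
    {
    #ifndef __AARCH64EB__
        data1 = __builtin_bswap64(data1);   /* rev data1 */
        data2 = __builtin_bswap64(data2);   /* rev data2 */
    #endif
        if (data1 == data2)
            return 0;                       /* cset result, ne -> 0 */
        return data1 < data2 ? -1 : 1;      /* cneg result, result, lo */
    }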
@@ -1,55 +1,8 @@
-/* Copyright (c) 2012-2013, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
/*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
+ * memcpy - copy memory area
*
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -61,6 +14,7 @@
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See memcpy-stub.c */
#else
+#include "asmdefs.h"
#define dstin x0
#define src x1
@@ -71,122 +25,139 @@
#define A_l x6
#define A_lw w6
#define A_h x7
-#define A_hw w7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
+#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
-#define tmp1 x9
-
-#define L(l) .L ## l
-
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use an unrolled loop
- processing 64 bytes per iteration.
- Small and medium copies read all data before writing, allowing any
- kind of overlap, and memmove tailcalls memcpy for these cases as
- well as non-overlapping copies.
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
+#define tmp1 x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
*/
-def_fn memcpy p2align=6
- prfm PLDL1KEEP, [src]
+ENTRY_ALIAS (memmove)
+ENTRY (memcpy)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
add srcend, src, count
add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
- cmp count, 96
+ cmp count, 128
b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
- /* Medium copies: 17..96 bytes. */
- sub tmp1, count, 1
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
ldp A_l, A_h, [src]
- tbnz tmp1, 6, L(copy96)
ldp D_l, D_h, [srcend, -16]
- tbz tmp1, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
- .p2align 4
- /* Small copies: 0..16 bytes. */
+ /* Copy 8-15 bytes. */
L(copy16):
- cmp count, 8
- b.lo 1f
+ tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
- .p2align 4
-1:
- tbz count, 2, 1f
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
+ ldr B_lw, [srcend, -4]
str A_lw, [dstin]
- str A_hw, [dstend, -4]
+ str B_lw, [dstend, -4]
ret
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
+ ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
.p2align 4
- /* Copy 64..96 bytes. Copy 64 bytes from the start and
- 32 bytes from the end. */
-L(copy96):
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [src, 32]
- ldp D_l, D_h, [src, 48]
- ldp E_l, E_h, [srcend, -32]
- ldp F_l, F_h, [srcend, -16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin, 32]
- stp D_l, D_h, [dstin, 48]
- stp E_l, E_h, [dstend, -32]
- stp F_l, F_h, [dstend, -16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret
- /* Align DST to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
.p2align 4
+ /* Copy more than 128 bytes. */
L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldp D_l, D_h, [src]
and tmp1, dstin, 15
bic dst, dstin, 15
- ldp D_l, D_h, [src]
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
@@ -195,8 +166,9 @@ L(copy_long):
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls 2f
-1:
+ b.ls L(copy64_from_end)
+
+L(loop64):
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
@@ -206,12 +178,10 @@ L(copy_long):
stp D_l, D_h, [dst, 64]!
ldp D_l, D_h, [src, 64]!
subs count, count, 64
- b.hi 1b
+ b.hi L(loop64)
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-2:
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]
@@ -226,5 +196,51 @@ L(copy_long):
stp C_l, C_h, [dstend, -16]
ret
- .size memcpy, . - memcpy
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+ ret
+
+END (memcpy)
#endif
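The small and medium copy paths in the memcpy hunk read from both ends of the buffer before storing anything and simply let the stores overlap, which avoids any per-byte tail handling. A minimal C sketch of the 16..32 byte case (the helper name is mine; the real routine does this with two ldp/stp pairs):

    #include <stdint.h>
    #include <string.h>

    /* Copy 16..32 bytes: 16 bytes from the start plus 16 bytes from the
       end.  The stores overlap when count < 32, which is harmless because
       every load has already been performed.  */
    static void copy16_32(unsigned char *dst, const unsigned char *src,
                          size_t count)
    {
        uint64_t a_l, a_h, d_l, d_h;
        memcpy(&a_l, src, 8);                  /* ldp A_l, A_h, [src]         */
        memcpy(&a_h, src + 8, 8);
        memcpy(&d_l, src + count - 16, 8);     /* ldp D_l, D_h, [srcend, -16] */
        memcpy(&d_h, src + count - 8, 8);
        memcpy(dst, &a_l, 8);                  /* stp A_l, A_h, [dstin]       */
        memcpy(dst + 8, &a_h, 8);
        memcpy(dst + count - 16, &d_l, 8);     /* stp D_l, D_h, [dstend, -16] */
        memcpy(dst + count - 8, &d_h, 8);
    }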
@@ -1,66 +1,20 @@
-/* Copyright (c) 2012-2013, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
/*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
+ * memset - fill memory with a constant byte
*
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See memset-stub.c */
#else
+#include "asmdefs.h"
#define dstin x0
#define val x1
@@ -68,24 +22,11 @@
#define count x2
#define dst x3
#define dstend x4
-#define tmp1 x5
-#define tmp1w w5
-#define tmp2 x6
-#define tmp2w w6
-#define zva_len x7
-#define zva_lenw w7
-
-#define L(l) .L ## l
+#define zva_val x5
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-def_fn memset p2align=6
+ENTRY (memset)
+ PTR_ARG (0)
+ SIZE_ARG (2)
dup v0.16B, valw
add dstend, dstin, count
@@ -101,7 +42,7 @@ def_fn memset p2align=6
str val, [dstin]
str val, [dstend, -8]
ret
- nop
+ .p2align 4
1: tbz count, 2, 2f
str valw, [dstin]
str valw, [dstend, -4]
@@ -131,110 +72,49 @@ L(set96):
stp q0, q0, [dstend, -32]
ret
- .p2align 3
- nop
+ .p2align 4
L(set_long):
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-
- .p2align 3
-L(try_zva):
- mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
- and tmp1w, tmp1w, 15
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
-
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
-L(zva_64):
+ cmp count, 160
+ ccmp valw, 0, 0, hs
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
str q0, [dst, 16]
stp q0, q0, [dst, 32]
bic dst, dst, 63
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
- nop
-1: dc zva, dst
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
add dst, dst, 64
+ dc zva, dst
subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
+ b.hi L(zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
- .p2align 3
-L(zva_128):
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
- b.ne L(zva_other)
-
- str q0, [dst, 16]
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub dst, dst, 16 /* Dst is biased by -32. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- bic dst, dst, 127
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
- subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-L(zva_other):
- mov tmp2w, 4
- lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
-
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write. */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- sub dst, dst, 32 /* Bias dst for tail loop. */
- b L(tail64)
-
- .size memset, . - memset
+END (memset)
#endif
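The new memset only takes the DC ZVA path for zeroing when DCZID_EL0 reports an enabled 64-byte zeroing block; the check works because bits [3:0] encode log2 of the block size in 4-byte words and bit 4 is the prohibit flag. An AArch64-only illustration of that probe (my own helper, not part of the patch):

    #include <stdint.h>

    /* Returns non-zero when DC ZVA is usable with a 64-byte block, i.e.
       (dczid & 31) == 4, matching the check guarded by SKIP_ZVA_CHECK.  */
    static int zva_64_usable(void)
    {
        uint64_t dczid;
        __asm__ volatile ("mrs %0, dczid_el0" : "=r" (dczid));
        return (dczid & 31) == 4;
    }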
@@ -1,34 +1,10 @@
/*
- stpcpy - copy a string returning pointer to end.
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
- Copyright (c) 2015 ARM Ltd.
- All Rights Reserved.
+#define BUILD_STPCPY 1
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the company nor the names of its contributors
- may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
-/* This is just a wrapper that uses strcpy code with appropriate
- pre-defines. */
-
-#define BUILD_STPCPY
#include "strcpy.S"
@@ -1,32 +1,9 @@
/*
- strchr - find a character in a string
-
- Copyright (c) 2014, ARM Limited
- All rights Reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the company nor the names of its contributors
- may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
+ * strchr - find a character in a string
+ *
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See strchr-stub.c */
#else
@@ -37,6 +14,8 @@
* Neon Available.
*/
+#include "asmdefs.h"
+
/* Arguments and results. */
#define srcin x0
#define chrin w1
@@ -74,26 +53,19 @@
/* Locals and temporaries. */
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-def_fn strchr
- /* Magic constant 0x40100401 to allow us to identify which lane
- matches the requested byte. Magic constant 0x80200802 used
- similarly for NUL termination. */
- mov wtmp2, #0x0401
- movk wtmp2, #0x4010, lsl #16
+ENTRY (strchr)
+ PTR_ARG (0)
+ /* Magic constant 0xc0300c03 to allow us to identify which lane
+ matches the requested byte. Even bits are set if the character
+ matches, odd bits if either the char is NUL or matches. */
+ mov wtmp2, 0x0c03
+ movk wtmp2, 0xc030, lsl 16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask_c.4s, wtmp2
ands tmp1, srcin, #31
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
- b.eq .Lloop
+ b.eq L(loop)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
@@ -105,49 +77,42 @@ def_fn strchr
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vend1.16b, vend2.16b // 256->128
mov tmp3, #~0
addp vend1.16b, vend1.16b, vend2.16b // 128->64
lsr tmp1, tmp3, tmp1
- mov tmp3, vend1.2d[0]
+ mov tmp3, vend1.d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
- cbnz tmp1, .Ltail
+ cbnz tmp1, L(tail)
-.Lloop:
+ .p2align 4
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
- orr vend1.16b, vend1.16b, vend2.16b
- addp vend1.2d, vend1.2d, vend1.2d
- mov tmp1, vend1.2d[0]
- cbz tmp1, .Lloop
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
+ mov tmp1, vend1.d[0]
+ cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
addp vend1.16b, vend1.16b, vend2.16b // 256->128
addp vend1.16b, vend1.16b, vend2.16b // 128->64
-
- mov tmp1, vend1.2d[0]
-.Ltail:
+ mov tmp1, vend1.d[0]
+L(tail):
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
@@ -160,5 +125,5 @@ def_fn strchr
csel result, result, xzr, eq
ret
- .size strchr, . - strchr
+END (strchr)
#endif
@@ -1,32 +1,9 @@
/*
- strchrnul - find a character or nul in a string
-
- Copyright (c) 2014, ARM Limited
- All rights Reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the company nor the names of its contributors
- may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See strchrnul-stub.c */
#else
@@ -37,6 +14,8 @@
* Neon Available.
*/
+#include "asmdefs.h"
+
/* Arguments and results. */
#define srcin x0
#define chrin w1
@@ -70,15 +49,8 @@
/* Locals and temporaries. */
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-def_fn strchrnul
+ENTRY (strchrnul)
+ PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the termination condition. */
mov wtmp2, #0x0401
@@ -87,7 +59,7 @@ def_fn strchrnul
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask.4s, wtmp2
ands tmp1, srcin, #31
- b.eq .Lloop
+ b.eq L(loop)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
@@ -95,47 +67,43 @@ def_fn strchrnul
syndrome that are related to the padding. */
ld1 {vdata1.16b, vdata2.16b}, [src], #32
neg tmp1, tmp1
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
- orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
mov tmp3, #~0
addp vend1.16b, vend1.16b, vend1.16b // 128->64
lsr tmp1, tmp3, tmp1
- mov tmp3, vend1.2d[0]
+ mov tmp3, vend1.d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
- cbnz tmp1, .Ltail
+ cbnz tmp1, L(tail)
-.Lloop:
+ .p2align 4
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
- orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
- addp vend1.2d, vend1.2d, vend1.2d
- mov tmp1, vend1.2d[0]
- cbz tmp1, .Lloop
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
+ mov tmp1, vend1.d[0]
+ cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend1.16b, vend1.16b, vend1.16b // 128->64
- mov tmp1, vend1.2d[0]
-.Ltail:
+ mov tmp1, vend1.d[0]
+L(tail):
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
@@ -145,5 +113,5 @@ def_fn strchrnul
add result, src, tmp1, lsr #1
ret
- .size strchrnul, . - strchrnul
+END (strchrnul)
#endif
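The cmhs instructions in the strchrnul hunk fold the NUL test into the character compare: with a per-byte match mask of 0xff or 0x00, an unsigned "mask >= data" test is true exactly when the byte matches or is NUL. A byte-level model (name and types are mine):

    /* match >= byte holds iff byte == c (match is 0xff) or byte == 0.  */
    static int is_terminator(unsigned char byte, unsigned char c)
    {
        unsigned char match = (byte == c) ? 0xff : 0x00;
        return match >= byte;
    }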
@@ -1,202 +1,192 @@
-/* Copyright (c) 2012-2018, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
-/* Assumptions:
+/*
+ * strcmp - compare two strings
*
- * ARMv8-a, AArch64
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See strcmp-stub.c */
#else
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64.
+ * MTE compatible.
+ */
-#define L(label) .L ## label
+#include "asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-/* Parameters and result. */
#define src1 x0
#define src2 x1
#define result x0
-/* Internal variables. */
#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
#define has_nul x4
#define diff x5
+#define off1 x5
#define syndrome x6
-#define tmp1 x7
-#define tmp2 x8
-#define tmp3 x9
-#define zeroones x10
-#define pos x11
-
- /* Start of performance-critical section -- one 64B cache line. */
-def_fn strcmp p2align=6
- eor tmp1, src1, src2
- mov zeroones, #REP8_01
- tst tmp1, #7
+#define tmp x6
+#define data3 x7
+#define zeroones x8
+#define shift x9
+#define off2 x10
+
+/* On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes. */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word.
+ Since carry propagation makes 0x1 bytes before a NUL byte appear
+ NUL too in big-endian, byte-reverse the data before the NUL check. */
+
+
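The zero-byte identity stated in the comment above is easy to check in isolation; the small program below (an aside, not part of the patch) evaluates (X - REP8_01) & ~(X | REP8_7f) and shows that it is non-zero exactly when some byte of X is zero.

    #include <stdint.h>
    #include <stdio.h>

    /* Non-zero iff some byte of x is zero: without a zero byte there are
       no borrows, and for a non-zero byte b the top bits of (b - 1) and
       ~b are never both set.  */
    static uint64_t has_zero_byte(uint64_t x)
    {
        return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
    }

    int main(void)
    {
        printf("%#llx\n", (unsigned long long) has_zero_byte(0x6162636465666768ULL)); /* 0 */
        printf("%#llx\n", (unsigned long long) has_zero_byte(0x6162630065666768ULL)); /* != 0 */
        return 0;
    }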
+ENTRY (strcmp)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ sub off2, src2, src1
+ mov zeroones, REP8_01
+ and tmp, src1, 7
+ tst off2, 7
b.ne L(misaligned8)
- ands tmp1, src1, #7
- b.ne L(mutual_align)
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
+ cbnz tmp, L(mutual_align)
+
+ .p2align 4
+
L(loop_aligned):
- ldr data1, [src1], #8
- ldr data2, [src2], #8
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
L(start_realigned):
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev tmp, data1
+ sub has_nul, tmp, zeroones
+ orr tmp, tmp, REP8_7f
+#else
+ sub has_nul, data1, zeroones
+ orr tmp, data1, REP8_7f
+#endif
+ bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_aligned)
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
orr syndrome, diff, has_nul
- cbz syndrome, L(loop_aligned)
- /* End of performance-critical section -- one 64B cache line. */
-
L(end):
-#ifndef __AARCH64EB__
+#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
- clz pos, syndrome
rev data2, data2
- lsl data1, data1, pos
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- ret
-#else
- /* For big-endian we cannot use the trick with the syndrome value
- as carry-propagation can corrupt the upper bits if the trailing
- bytes in the string contain 0x01. */
- /* However, if there is no NUL byte in the dword, we can generate
- the result directly. We can't just subtract the bytes as the
- MSB might be significant. */
- cbnz has_nul, 1f
- cmp data1, data2
- cset result, ne
- cneg result, result, lo
- ret
-1:
- /* Re-compute the NUL-byte detection, using a byte-reversed value. */
- rev tmp3, data1
- sub tmp1, tmp3, zeroones
- orr tmp2, tmp3, #REP8_7f
- bic has_nul, tmp1, tmp2
- rev has_nul, has_nul
- orr syndrome, diff, has_nul
- clz pos, syndrome
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
+#endif
+ clz shift, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
- lsl data1, data1, pos
- lsl data2, data2, pos
+ lsl data1, data1, shift
+ lsl data2, data2, shift
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
+ lsr data1, data1, 56
+ sub result, data1, data2, lsr 56
ret
-#endif
+
+ .p2align 4
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
- the bytes that preceed the start point. */
- bic src1, src1, #7
- bic src2, src2, #7
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- ldr data1, [src1], #8
- neg tmp1, tmp1 /* Bits to alignment -64. */
- ldr data2, [src2], #8
- mov tmp2, #~0
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#endif
- orr data1, data1, tmp2
- orr data2, data2, tmp2
+ the bytes that precede the start point. */
+ bic src1, src1, 7
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+ neg shift, src2, lsl 3 /* Bits to alignment -64. */
+ mov tmp, -1
+ LS_FW tmp, tmp, shift
+ orr data1, data1, tmp
+ orr data2, data2, tmp
b L(start_realigned)
L(misaligned8):
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
- checking to make sure that we don't access beyond page boundary in
- SRC2. */
- tst src1, #7
- b.eq L(loop_misaligned)
+ checking to make sure that we don't access beyond the end of SRC2. */
+ cbz tmp, L(src1_aligned)
L(do_misaligned):
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- cmp data1w, #1
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ cmp data1w, 0
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.ne L(done)
- tst src1, #7
+ tst src1, 7
b.ne L(do_misaligned)
-L(loop_misaligned):
- /* Test if we are within the last dword of the end of a 4K page. If
- yes then jump back to the misaligned loop to copy a byte at a time. */
- and tmp1, src2, #0xff8
- eor tmp1, tmp1, #0xff8
- cbz tmp1, L(do_misaligned)
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+L(src1_aligned):
+ neg shift, src2, lsl 3
+ bic src2, src2, 7
+ ldr data3, [src2], 8
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ lsr tmp, zeroones, shift
+ orr data3, data3, tmp
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ bics has_nul, has_nul, tmp
+ b.ne L(tail)
+
+ sub off1, src2, src1
+
+ .p2align 4
+
+L(loop_unaligned):
+ ldr data3, [src1, off1]
+ ldr data2, [src1, off2]
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ ldr data1, [src1], 8
+ bics has_nul, has_nul, tmp
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_unaligned)
+
+ lsl tmp, has_nul, shift
+#ifdef __AARCH64EB__
+ rev tmp, tmp
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, tmp
+ cbnz syndrome, L(end)
+L(tail):
+ ldr data1, [src1]
+ neg shift, shift
+ lsr data2, data3, shift
+ lsr has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+ rev data2, data2
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
orr syndrome, diff, has_nul
- cbz syndrome, L(loop_misaligned)
b L(end)
L(done):
sub result, data1, data2
ret
- .size strcmp, .-strcmp
+END (strcmp)
#endif
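
For reviewers unfamiliar with the word-at-a-time trick cited in the strcmp comment above, here is a rough little-endian C model of the per-word NUL test and of the combined "syndrome" value. It is an illustration only, not part of the patch; the names are mine, and the big-endian rev-before-check caveat from the comment is deliberately left out.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff the 8-byte word contains a zero byte:
   (X - 0x01..01) & ~(X | 0x7f..7f) sets bit 7 of every byte that is 0.  */
static uint64_t
has_nul_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

int
main (void)
{
  const char a[8] = { 'a', 'b', 'c', 'd', 'e', 'f', 0, 'x' };
  const char b[8] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h' };
  uint64_t d1, d2;
  memcpy (&d1, a, 8);
  memcpy (&d2, b, 8);
  /* Non-zero if a difference or a NUL was seen; on little-endian the
     lowest set byte marks which event came first.  */
  uint64_t syndrome = (d1 ^ d2) | has_nul_byte (d1);
  printf ("syndrome = %016" PRIx64 "\n", syndrome);
  return 0;
}
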
@@ -1,341 +1,160 @@
/*
- strcpy/stpcpy - copy a string returning pointer to start/end.
-
- Copyright (c) 2013, 2014, 2015 ARM Ltd.
- All Rights Reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the company nor the names of its contributors
- may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See strchr-stub.c */
#else
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
-/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
+#include "asmdefs.h"
- To test the page crossing code path more thoroughly, compile with
- -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
- entry path. This option is not intended for production use. */
-
-/* Arguments and results. */
#define dstin x0
#define srcin x1
+#define result x0
-/* Locals and temporaries. */
#define src x2
#define dst x3
-#define data1 x4
-#define data1w w4
-#define data2 x5
-#define data2w w5
-#define has_nul1 x6
-#define has_nul2 x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define tmp4 x11
-#define zeroones x12
-#define data1a x13
-#define data2a x14
-#define pos x15
-#define len x16
-#define to_align x17
+#define len x4
+#define synd x4
+#define tmp x5
+#define shift x5
+#define data1 x6
+#define dataw1 w6
+#define data2 x7
+#define dataw2 w7
+
+#define dataq q0
+#define vdata v0
+#define vhas_nul v1
+#define vend v2
+#define dend d2
+#define dataq2 q1
#ifdef BUILD_STPCPY
-#define STRCPY stpcpy
+# define STRCPY stpcpy
+# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
-#define STRCPY strcpy
+# define STRCPY strcpy
+# define IFSTPCPY(X,...)
#endif
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
- /* AArch64 systems have a minimum page size of 4k. We can do a quick
- page size check for crossing this boundary on entry and if we
- do not, then we can short-circuit much of the entry code. We
- expect early page-crossing strings to be rare (probability of
- 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
- predictable, even with random strings.
-
- We don't bother checking for larger page sizes, the cost of setting
- up the correct page size is just not worth the extra gain from
- a small reduction in the cases taking the slow path. Note that
- we only care about whether the first fetch, which may be
- misaligned, crosses a page boundary - after that we move to aligned
- fetches for the remainder of the string. */
-
-#ifdef STRCPY_TEST_PAGE_CROSS
- /* Make everything that isn't Qword aligned look like a page cross. */
-#define MIN_PAGE_P2 4
-#else
-#define MIN_PAGE_P2 12
-#endif
-
-#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
-
-def_fn STRCPY p2align=6
- /* For moderately short strings, the fastest way to do the copy is to
- calculate the length of the string in the same way as strlen, then
- essentially do a memcpy of the result. This avoids the need for
- multiple byte copies and further means that by the time we
- reach the bulk copy loop we know we can always use DWord
- accesses. We expect strcpy to rarely be called repeatedly
- with the same source string, so branch prediction is likely to
- always be difficult - we mitigate against this by preferring
- conditional select operations over branches whenever this is
- feasible. */
- and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
- mov zeroones, #REP8_01
- and to_align, srcin, #15
- cmp tmp2, #(MIN_PAGE_SIZE - 16)
- neg tmp1, to_align
- /* The first fetch will straddle a (possible) page boundary iff
- srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
- aligned string will never fail the page align check, so will
- always take the fast path. */
- b.gt .Lpage_cross
-
-.Lpage_cross_ok:
- ldp data1, data2, [srcin]
-#ifdef __AARCH64EB__
- /* Because we expect the end to be found within 16 characters
- (profiling shows this is the most common case), it's worth
- swapping the bytes now to save having to recalculate the
- termination syndrome later. We preserve data1 and data2
- so that we can re-use the values later on. */
- rev tmp2, data1
- sub tmp1, tmp2, zeroones
- orr tmp2, tmp2, #REP8_7f
- bics has_nul1, tmp1, tmp2
- b.ne .Lfp_le8
- rev tmp4, data2
- sub tmp3, tmp4, zeroones
- orr tmp4, tmp4, #REP8_7f
-#else
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bics has_nul1, tmp1, tmp2
- b.ne .Lfp_le8
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
+/*
+ Core algorithm:
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
+
+ENTRY (STRCPY)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ bic src, srcin, 15
+ ld1 {vdata.16b}, [src]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ shrn vend.8b, vhas_nul.8h, 4
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbnz synd, L(tail)
+
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ shrn vend.8b, vhas_nul.8h, 4
+ fmov synd, dend
+ cbz synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- bics has_nul2, tmp3, tmp4
- b.eq .Lbulk_entry
+ sub tmp, src, srcin
+ clz len, synd
+ add len, tmp, len, lsr 2
+ tbz len, 4, L(less16)
+ sub tmp, len, 15
+ ldr dataq, [srcin]
+ ldr dataq2, [srcin, tmp]
+ str dataq, [dstin]
+ str dataq2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
- /* The string is short (<=16 bytes). We don't know exactly how
- short though, yet. Work out the exact length so that we can
- quickly select the optimal copy strategy. */
-.Lfp_gt8:
- rev has_nul2, has_nul2
- clz pos, has_nul2
- mov tmp2, #56
- add dst, dstin, pos, lsr #3 /* Bits to bytes. */
- sub pos, tmp2, pos
-#ifdef __AARCH64EB__
- lsr data2, data2, pos
-#else
- lsl data2, data2, pos
-#endif
- str data2, [dst, #1]
+L(tail):
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 2
+L(less16):
+ tbz len, 3, L(less8)
+ sub tmp, len, 7
+ ldr data1, [srcin]
+ ldr data2, [srcin, tmp]
str data1, [dstin]
-#ifdef BUILD_STPCPY
- add dstin, dst, #8
-#endif
+ str data2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
ret
-.Lfp_le8:
- rev has_nul1, has_nul1
- clz pos, has_nul1
- add dst, dstin, pos, lsr #3 /* Bits to bytes. */
- subs tmp2, pos, #24 /* Pos in bits. */
- b.lt .Lfp_lt4
-#ifdef __AARCH64EB__
- mov tmp2, #56
- sub pos, tmp2, pos
- lsr data2, data1, pos
- lsr data1, data1, #32
-#else
- lsr data2, data1, tmp2
-#endif
- /* 4->7 bytes to copy. */
- str data2w, [dst, #-3]
- str data1w, [dstin]
-#ifdef BUILD_STPCPY
- mov dstin, dst
-#endif
- ret
-.Lfp_lt4:
- cbz pos, .Lfp_lt2
- /* 2->3 bytes to copy. */
-#ifdef __AARCH64EB__
- lsr data1, data1, #48
-#endif
- strh data1w, [dstin]
- /* Fall-through, one byte (max) to go. */
-.Lfp_lt2:
- /* Null-terminated string. Last character must be zero! */
- strb wzr, [dst]
-#ifdef BUILD_STPCPY
- mov dstin, dst
-#endif
+ .p2align 4
+L(less8):
+ subs tmp, len, 3
+ b.lo L(less4)
+ ldr dataw1, [srcin]
+ ldr dataw2, [srcin, tmp]
+ str dataw1, [dstin]
+ str dataw2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
ret
- .p2align 6
- /* Aligning here ensures that the entry code and main loop all lies
- within one 64-byte cache line. */
-.Lbulk_entry:
- sub to_align, to_align, #16
- stp data1, data2, [dstin]
- sub src, srcin, to_align
- sub dst, dstin, to_align
- b .Lentry_no_page_cross
-
- /* The inner loop deals with two Dwords at a time. This has a
- slightly higher start-up cost, but we should win quite quickly,
- especially on cores with a high number of issue slots per
- cycle, as we get much better parallelism out of the operations. */
-.Lmain_loop:
- stp data1, data2, [dst], #16
-.Lentry_no_page_cross:
- ldp data1, data2, [src], #16
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq .Lmain_loop
-
- /* Since we know we are copying at least 16 bytes, the fastest way
- to deal with the tail is to determine the location of the
- trailing NUL, then (re)copy the 16 bytes leading up to that. */
- cmp has_nul1, #0
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- csel data1, data1, data2, ne
- rev data1, data1
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bic has_nul1, tmp1, tmp2
-#else
- csel has_nul1, has_nul1, has_nul2, ne
-#endif
- rev has_nul1, has_nul1
- clz pos, has_nul1
- add tmp1, pos, #72
- add pos, pos, #8
- csel pos, pos, tmp1, ne
- add src, src, pos, lsr #3
- add dst, dst, pos, lsr #3
- ldp data1, data2, [src, #-32]
- stp data1, data2, [dst, #-16]
-#ifdef BUILD_STPCPY
- sub dstin, dst, #1
-#endif
+L(less4):
+ cbz len, L(zerobyte)
+ ldrh dataw1, [srcin]
+ strh dataw1, [dstin]
+L(zerobyte):
+ strb wzr, [dstin, len]
+ IFSTPCPY (add result, dstin, len)
ret
-.Lpage_cross:
- bic src, srcin, #15
- /* Start by loading two words at [srcin & ~15], then forcing the
- bytes that precede srcin to 0xff. This means they never look
- like termination bytes. */
- ldp data1, data2, [src]
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- tst to_align, #7
- csetm tmp2, ne
-#ifdef __AARCH64EB__
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#else
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+ .p2align 4
+L(start_loop):
+ sub tmp, srcin, dstin
+ ldr dataq2, [srcin]
+ sub dst, src, tmp
+ str dataq2, [dstin]
+L(loop):
+ str dataq, [dst], 32
+ ldr dataq, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbnz synd, L(loopend)
+ str dataq, [dst, -16]
+ ldr dataq, [src, 32]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop)
+ add dst, dst, 16
+L(loopend):
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ fmov synd, dend
+ sub dst, dst, 31
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
- cmp to_align, #8
- csinv data1, data1, xzr, lt
- csel data2, data2, data2a, lt
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq .Lpage_cross_ok
- /* We now need to make data1 and data2 look like they've been
- loaded directly from srcin. Do a rotate on the 128-bit value. */
- lsl tmp1, to_align, #3 /* Bytes->bits. */
- neg tmp2, to_align, lsl #3
-#ifdef __AARCH64EB__
- lsl data1a, data1, tmp1
- lsr tmp4, data2, tmp2
- lsl data2, data2, tmp1
- orr tmp4, tmp4, data1a
- cmp to_align, #8
- csel data1, tmp4, data2, lt
- rev tmp2, data1
- rev tmp4, data2
- sub tmp1, tmp2, zeroones
- orr tmp2, tmp2, #REP8_7f
- sub tmp3, tmp4, zeroones
- orr tmp4, tmp4, #REP8_7f
-#else
- lsr data1a, data1, tmp1
- lsl tmp4, data2, tmp2
- lsr data2, data2, tmp1
- orr tmp4, tmp4, data1a
- cmp to_align, #8
- csel data1, tmp4, data2, lt
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
-#endif
- bic has_nul1, tmp1, tmp2
- cbnz has_nul1, .Lfp_le8
- bic has_nul2, tmp3, tmp4
- b .Lfp_gt8
+ clz len, synd
+ lsr len, len, 2
+ add dst, dst, len
+ ldr dataq, [dst, tmp]
+ str dataq, [dst]
+ IFSTPCPY (add result, dst, 15)
+ ret
- .size STRCPY, . - STRCPY
+END (STRCPY)
#endif
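
The strcpy/stpcpy comment above describes the core nibble-mask technique: cmeq against zero, shrn by 4 to get four bits per byte, then rbit/clz (equivalently a count-trailing-zeros) to find the matching byte. A sketch of that idea with ACLE NEON intrinsics, assuming an AArch64 target with <arm_neon.h> and a little-endian view; the helper name and the "16 bytes readable at p" precondition are illustrative, not taken from the patch.

#include <arm_neon.h>
#include <stdint.h>

/* Index of the first NUL byte in the 16-byte chunk at p, or 16 if none.  */
static inline unsigned
first_nul_in_chunk (const uint8_t *p)
{
  uint8x16_t data = vld1q_u8 (p);
  uint8x16_t cmp  = vceqq_u8 (data, vdupq_n_u8 (0));   /* 0xff per NUL byte */
  /* Narrow the 16 bytes of 0x00/0xff into a 64-bit mask, 4 bits per byte.  */
  uint8x8_t  nib  = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  uint64_t   synd = vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
  if (synd == 0)
    return 16;
  /* Each input byte occupies 4 bits of the mask.  */
  return (unsigned) (__builtin_ctzll (synd) >> 2);
}
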
@@ -1,115 +1,92 @@
-/* Copyright (c) 2013-2015, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
+/*
+ * strlen - calculate the length of a string.
+ *
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See strlen-stub.c */
#else
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
*/
-/* To test the page crossing code path more thoroughly, compile with
- -DTEST_PAGE_CROSS - this will force all calls through the slower
- entry path. This option is not intended for production use. */
-
-/* Arguments and results. */
-#define srcin x0
-#define len x0
-
-/* Locals and temporaries. */
-#define src x1
-#define data1 x2
-#define data2 x3
-#define has_nul1 x4
-#define has_nul2 x5
-#define tmp1 x4
-#define tmp2 x5
-#define tmp3 x6
-#define tmp4 x7
-#define zeroones x8
-
-#define L(l) .L ## l
-
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. A faster check
- (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
- false hits for characters 129..255. */
+#include "asmdefs.h"
+
+#define srcin x0
+#define len x0
+
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define maskv v0
+#define maskd d0
+#define dataq1 q1
+#define dataq2 q2
+#define datav1 v1
+#define datav2 v2
+#define tmp x2
+#define tmpw w2
+#define synd x3
+#define syndw w3
+#define shift x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+ (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+ byte is zero, and can be done in parallel across the entire word. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
+
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
#ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 15
+# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif
- /* Since strings are short on average, we check the first 16 bytes
- of the string for a NUL character. In order to do an unaligned ldp
- safely we have to do a page cross check first. If there is a NUL
- byte we calculate the length from the 2 8-byte words using
- conditional select to reduce branch mispredictions (it is unlikely
- strlen will be repeatedly called on strings with the same length).
-
- If the string is longer than 16 bytes, we align src so don't need
- further page cross checks, and process 32 bytes per iteration
- using the fast NUL check. If we encounter non-ASCII characters,
- fallback to a second loop using the full NUL check.
-
- If the page cross check fails, we read 16 bytes from an aligned
- address, remove any characters before the string, and continue
- in the main loop using aligned loads. Since strings crossing a
- page in the first 16 bytes are rare (probability of
- 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
- AArch64 systems have a minimum page size of 4k. We don't bother
- checking for larger page sizes - the cost of setting up the correct
- page size is just not worth the extra gain from a small reduction in
- the cases taking the slow path. Note that we only care about
- whether the first fetch, which may be misaligned, crosses a page
- boundary. */
-
-def_fn strlen p2align=6
+/* Core algorithm:
+
+ Since strings are short on average, we check the first 32 bytes of the
+ string for a NUL character without aligning the string. In order to use
+ unaligned loads safely we must do a page cross check first.
+
+ If there is a NUL byte we calculate the length from the 2 8-byte words
+ using conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 32 bytes, align src so we don't need further
+ page cross checks, and process 32 bytes per iteration using a fast SIMD
+ loop.
+
+ If the page cross check fails, we read 32 bytes from an aligned address,
+ and ignore any characters before the string. If it contains a NUL
+ character, return the length, if not, continue in the main loop. */
+
+ENTRY (strlen)
+ PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
- mov zeroones, REP8_01
- cmp tmp1, MIN_PAGE_SIZE - 16
- b.gt L(page_cross)
+ cmp tmp1, MIN_PAGE_SIZE - 32
+ b.hi L(page_cross)
+
+ /* Look for a NUL byte in the first 16 bytes. */
ldp data1, data2, [srcin]
+ mov zeroones, REP8_01
+
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly.
@@ -125,114 +102,96 @@ def_fn strlen p2align=6
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(main_loop_entry)
+ b.eq L(bytes16_31)
- /* Enter with C = has_nul1 == 0. */
+ /* Find the exact offset of the first NUL byte in the first 16 bytes
+ from the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev has_nul1, has_nul1
- clz tmp1, has_nul1
csel len, xzr, len, cc
+ clz tmp1, has_nul1
add len, len, tmp1, lsr 3
ret
- /* The inner loop processes 32 bytes per iteration and uses the fast
- NUL check. If we encounter non-ASCII characters, use a second
- loop with the accurate NUL check. */
- .p2align 4
-L(main_loop_entry):
- bic src, srcin, 15
- sub src, src, 16
-L(main_loop):
- ldp data1, data2, [src, 32]!
-.Lpage_cross_entry:
- sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- bne 1f
- ldp data1, data2, [src, 16]
+ /* Look for a NUL byte at offset 16..31 in the string. */
+L(bytes16_31):
+ ldp data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- beq L(main_loop)
- add src, src, 16
-1:
- /* The fast check failed, so do the slower, accurate NUL check. */
orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop)
+ b.eq L(loop_entry)
- /* Enter with C = has_nul1 == 0. */
-L(tail):
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul1/2 directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- csel data1, data1, data2, cc
- rev data1, data1
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- bic has_nul1, tmp1, tmp2
-#else
+ /* Find the exact offset of the first NUL byte at offset 16..31 from
+ the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
-#endif
- sub len, src, srcin
+ mov len, 24
rev has_nul1, has_nul1
- add tmp2, len, 8
+ mov tmp3, 16
clz tmp1, has_nul1
- csel len, len, tmp2, cc
+ csel len, tmp3, len, cc
add len, len, tmp1, lsr 3
ret
-L(nonascii_loop):
- ldp data1, data2, [src, 16]!
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- bne L(tail)
- ldp data1, data2, [src, 16]!
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop)
- b L(tail)
+ nop
+L(loop_entry):
+ bic src, srcin, 31
+
+ .p2align 5
+L(loop):
+ ldp dataq1, dataq2, [src, 32]!
+ uminp maskv.16b, datav1.16b, datav2.16b
+ uminp maskv.16b, maskv.16b, maskv.16b
+ cmeq maskv.8b, maskv.8b, 0
+ fmov synd, maskd
+ cbz synd, L(loop)
+
+ /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
+ cmeq maskv.16b, datav1.16b, 0
+ sub len, src, srcin
+ cbnz syndw, 1f
+ cmeq maskv.16b, datav2.16b, 0
+ add len, len, 16
+1:
+ /* Generate a bitmask and compute correct byte offset. */
+ shrn maskv.8b, maskv.8h, 4
+ fmov synd, maskd
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz tmp, synd
+ add len, len, tmp, lsr 2
+ ret
- /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
- srcin to 0x7f, so we ignore any NUL bytes before the string.
- Then continue in the aligned loop. */
L(page_cross):
- bic src, srcin, 15
- ldp data1, data2, [src]
- lsl tmp1, srcin, 3
- mov tmp4, -1
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
-#endif
- orr tmp1, tmp1, REP8_80
- orn data1, data1, tmp1
- orn tmp2, data2, tmp1
- tst srcin, 8
- csel data1, data1, tmp4, eq
- csel data2, data2, tmp2, eq
- b L(page_cross_entry)
-
- .size strlen, . - strlen
+ bic src, srcin, 31
+ mov tmpw, 0x0c03
+ movk tmpw, 0xc030, lsl 16
+ ld1 {datav1.16b, datav2.16b}, [src]
+ dup maskv.4s, tmpw
+ cmeq datav1.16b, datav1.16b, 0
+ cmeq datav2.16b, datav2.16b, 0
+ and datav1.16b, datav1.16b, maskv.16b
+ and datav2.16b, datav2.16b, maskv.16b
+ addp maskv.16b, datav1.16b, datav2.16b
+ addp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+ lsl shift, srcin, 1
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 1
+ ret
+
+END (strlen)
#endif
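
The new strlen loop checks 32 bytes per iteration by reducing two 16-byte vectors with uminp before a single compare, as described in the "Core algorithm" comment. A hedged intrinsics sketch of just that check; the helper name and the "32 aligned, readable bytes at p" assumption are mine, and the exact-offset computation that follows a hit is omitted here.

#include <arm_neon.h>
#include <stdint.h>

/* Non-zero iff the 32 bytes at p contain a NUL byte; mirrors the
   ldp/uminp/uminp/cmeq/fmov sequence in the main loop.  */
static inline int
chunk32_has_nul (const uint8_t *p)
{
  uint8x16_t d1 = vld1q_u8 (p);
  uint8x16_t d2 = vld1q_u8 (p + 16);
  uint8x16_t m  = vpminq_u8 (d1, d2);   /* pairwise min, 32 -> 16 bytes */
  m = vpminq_u8 (m, m);                 /* 16 -> 8 significant bytes */
  uint8x8_t  z  = vceq_u8 (vget_low_u8 (m), vdup_n_u8 (0));
  return vget_lane_u64 (vreinterpret_u64_u8 (z), 0) != 0;
}
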
@@ -1,49 +1,23 @@
-/* Copyright (c) 2013, 2018, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See strcmp-stub.c */
#else
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
*/
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
+#include "asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
/* Parameters and result. */
#define src1 x0
@@ -64,86 +38,91 @@
#define tmp3 x10
#define zeroones x11
#define pos x12
-#define limit_wd x13
-#define mask x14
-#define endloop x15
+#define mask x13
+#define endloop x14
#define count mask
+#define offset pos
+#define neg_offset x15
+
+/* Define endian dependent shift operations.
+ On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes.
+ LS_BK means shifting towards later bytes.
+ */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
- .text
- .p2align 6
- .rep 7
- nop /* Pad so that the loop below fits a cache line. */
- .endr
-def_fn strncmp
- cbz limit, .Lret0
+ENTRY (strncmp)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
and count, src1, #7
- b.ne .Lmisaligned8
- cbnz count, .Lmutual_align
- /* Calculate the number of full and partial words -1. */
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
- /* Start of performance-critical section -- one 64B cache line. */
-.Lloop_aligned:
+ .p2align 4
+L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
-.Lstart_realigned:
- subs limit_wd, limit_wd, #1
+L(start_realigned):
+ subs limit, limit, #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, pl /* Last Dword or differences. */
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
- b.eq .Lloop_aligned
- /* End of performance-critical section -- one 64B cache line. */
+ b.eq L(loop_aligned)
+ /* End of main loop */
- /* Not reached the limit, must have found the end or a diff. */
- tbz limit_wd, #63, .Lnot_limit
-
- /* Limit % 8 == 0 => all bytes significant. */
- ands limit, limit, #7
- b.eq .Lnot_limit
-
- lsl limit, limit, #3 /* Bits -> bytes. */
- mov mask, #~0
-#ifdef __AARCH64EB__
- lsr mask, mask, limit
-#else
- lsl mask, mask, limit
-#endif
- bic data1, data1, mask
- bic data2, data2, mask
-
- /* Make sure that the NUL byte is marked in the syndrome. */
- orr has_nul, has_nul, mask
-
-.Lnot_limit:
+L(full_check):
+#ifndef __AARCH64EB__
orr syndrome, diff, has_nul
-
-#ifndef __AARCH64EB__
+ add limit, limit, 8 /* Rewind limit to before last subs. */
+L(syndrome_check):
+ /* Limit was reached. Check if the NUL byte or the difference
+ is before the limit. */
rev syndrome, syndrome
rev data1, data1
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
+ cmp limit, pos, lsr #3
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
+ csel result, result, xzr, hi
ret
#else
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit, #63, L(not_limit)
+ add tmp1, limit, 8
+ cbz limit, L(not_limit)
+
+ lsl limit, tmp1, #3 /* Bits -> bytes. */
+ mov mask, #~0
+ lsr mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+L(not_limit):
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
@@ -164,10 +143,11 @@ def_fn strncmp
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
+L(end_quick):
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
@@ -177,7 +157,7 @@ def_fn strncmp
ret
#endif
-.Lmutual_align:
+L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point.
@@ -189,102 +169,143 @@ def_fn strncmp
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
-#endif
- and tmp3, limit_wd, #7
- lsr limit_wd, limit_wd, #3
- /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
- add limit, limit, count
- add tmp3, tmp3, count
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
+ /* Adjust the limit and ensure it doesn't overflow. */
+ adds limit, limit, count
+ csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
- add limit_wd, limit_wd, tmp3, lsr #3
- b .Lstart_realigned
+ b L(start_realigned)
- .p2align 6
+ .p2align 4
/* Don't bother with dwords for up to 16 bytes. */
-.Lmisaligned8:
+L(misaligned8):
cmp limit, #16
- b.hs .Ltry_misaligned_words
+ b.hs L(try_misaligned_words)
-.Lbyte_loop:
+L(byte_loop):
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Lbyte_loop
-.Ldone:
+ b.eq L(byte_loop)
+L(done):
sub result, data1, data2
ret
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
-.Ltry_misaligned_words:
- lsr limit_wd, limit, #3
- cbz count, .Ldo_misaligned
+L(try_misaligned_words):
+ cbz count, L(src1_aligned)
neg count, count
and count, count, #7
sub limit, limit, count
- lsr limit_wd, limit, #3
-.Lpage_end_loop:
+L(page_end_loop):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.ne .Ldone
+ b.ne L(done)
subs count, count, #1
- b.hi .Lpage_end_loop
+ b.hi L(page_end_loop)
+
+ /* The following diagram explains the comparison of misaligned strings.
+ The bytes are shown in natural order. For little-endian, it is
+ reversed in the registers. The "x" bytes are before the string.
+ The "|" separates data that is loaded at one time.
+ src1 | a a a a a a a a | b b b c c c c c | . . .
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
-.Ldo_misaligned:
- /* Prepare ourselves for the next page crossing. Unlike the aligned
- loop, we fetch 1 less dword because we risk crossing bounds on
- SRC2. */
- mov count, #8
- subs limit_wd, limit_wd, #1
- b.lo .Ldone_loop
-.Lloop_misaligned:
- and tmp2, src2, #0xff8
- eor tmp2, tmp2, #0xff8
- cbz tmp2, .Lpage_end_loop
+ After shifting in each step, the data looks like this:
+ STEP_A STEP_B STEP_C
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
+ The bytes with "0" are eliminated from the syndrome via mask.
+
+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+ time from SRC2. The comparison happens in 3 steps. After each step
+ the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+ /* Calculate offset from 8 byte alignment to string start in bits. No
+ need to mask offset since shifts are ignoring upper bits. */
+ lsl offset, src2, #3
+ bic src2, src2, #0xf
+ mov mask, -1
+ neg neg_offset, offset
ldr data1, [src1], #8
- ldr data2, [src2], #8
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp diff, #0, #0, eq
- b.ne .Lnot_limit
- subs limit_wd, limit_wd, #1
- b.pl .Lloop_misaligned
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK mask, mask, neg_offset
+ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
+ /* Skip the first compare if data in tmp1 is irrelevant. */
+ tbnz offset, 6, L(misaligned_mid_loop)
-.Ldone_loop:
- /* We found a difference or a NULL before the limit was reached. */
- and limit, limit, #7
- cbz limit, .Lnot_limit
- /* Read the last word. */
- sub src1, src1, 8
- sub src2, src2, 8
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
+L(loop_misaligned):
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+ LS_FW data2, tmp1, offset
+ LS_BK tmp1, tmp2, neg_offset
+ subs limit, limit, #8
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
+ sub has_nul, data1, zeroones
eor diff, data1, data2 /* Non-zero if differences found. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp diff, #0, #0, eq
- b.ne .Lnot_limit
+ orr tmp3, data1, #REP8_7f
+ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
+ orr tmp3, endloop, has_nul
+ cbnz tmp3, L(full_check)
+
+ ldr data1, [src1], #8
+L(misaligned_mid_loop):
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
+ LS_FW data2, tmp2, offset
+#ifdef __AARCH64EB__
+ /* For big-endian we do a byte reverse to avoid carry-propagation
+ problem described above. This way we can reuse the has_nul in the
+ next step and also use syndrome value trick at the end. */
+ rev tmp3, data1
+ #define data1_fixed tmp3
+#else
+ #define data1_fixed data1
+#endif
+ sub has_nul, data1_fixed, zeroones
+ orr tmp3, data1_fixed, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ cmp limit, neg_offset, lsr #3
+ orr syndrome, diff, has_nul
+ bic syndrome, syndrome, mask /* Ignore later bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
+ ldp tmp1, tmp2, [src2], #16
+ cmp limit, #8
+ LS_BK data2, tmp1, neg_offset
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ orr syndrome, diff, has_nul
+ and syndrome, syndrome, mask /* Ignore earlier bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ ldr data1, [src1], #8
+ sub limit, limit, #8
+ b L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+ clz pos, syndrome
+ cmp pos, limit, lsl #3
+ b.lo L(end_quick)
+#endif
-.Lret0:
+L(ret0):
mov result, #0
ret
- .size strncmp, . - strncmp
+END(strncmp)
#endif
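
In the strncmp hunk, STEP_A of the misaligned loop rebuilds 8 bytes of SRC2 from two aligned loads using LS_FW/LS_BK. On little-endian (LS_FW = lsr, LS_BK = lsl) the combination boils down to the small C sketch below; it assumes the misalignment offset is in the 8..56-bit range the code can actually reach, and the names simply follow the register comments in the diff.

#include <stdint.h>

/* offset = 8 * (src2 % 8) bits.  tmp1/tmp2 are two consecutive aligned
   8-byte words loaded from SRC2, as with the ldp in the loop.  */
static inline uint64_t
combine_src2 (uint64_t tmp1, uint64_t tmp2, unsigned offset)
{
  return (tmp1 >> offset) | (tmp2 << (64 - offset));
}
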
@@ -1,187 +1,105 @@
-/* strnlen - calculate the length of a string with limit.
-
- Copyright (c) 2013, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See strlen-stub.c */
#else
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
-/* Arguments and results. */
+#include "asmdefs.h"
+
#define srcin x0
-#define len x0
-#define limit x1
+#define cntin x1
+#define result x0
-/* Locals and temporaries. */
#define src x2
-#define data1 x3
-#define data2 x4
-#define data2a x5
-#define has_nul1 x6
-#define has_nul2 x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define tmp4 x11
-#define zeroones x12
-#define pos x13
-#define limit_wd x14
-
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
- .text
- .p2align 6
-.Lstart:
- /* Pre-pad to ensure critical loop begins an icache line. */
- .rep 7
- nop
- .endr
- /* Put this code here to avoid wasting more space with pre-padding. */
-.Lhit_limit:
- mov len, limit
+#define synd x3
+#define shift x4
+#define tmp x4
+#define cntrem x5
+
+#define qdata q0
+#define vdata v0
+#define vhas_chr v1
+#define vend v2
+#define dend d2
+
+/*
+ Core algorithm:
+ Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+ four bits per byte using the shrn instruction. A count trailing zeros then
+ identifies the first zero byte. */
+
+ENTRY (strnlen)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+ bic src, srcin, 15
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src]
+ cmeq vhas_chr.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(start_loop)
+L(finish):
+ rbit synd, synd
+ clz synd, synd
+ lsr result, synd, 2
+ cmp cntin, result
+ csel result, cntin, result, ls
ret
-def_fn strnlen
- cbz limit, .Lhit_limit
- mov zeroones, #REP8_01
- bic src, srcin, #15
- ands tmp1, srcin, #15
- b.ne .Lmisaligned
- /* Calculate the number of full and partial words -1. */
- sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
- /* The inner loop deals with two Dwords at a time. This has a
- slightly higher start-up cost, but we should win quite quickly,
- especially on cores with a high number of issue slots per
- cycle, as we get much better parallelism out of the operations. */
-
- /* Start of critial section -- keep to one 64Byte cache line. */
-.Lloop:
- ldp data1, data2, [src], #16
-.Lrealigned:
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- subs limit_wd, limit_wd, #1
- orr tmp1, has_nul1, has_nul2
- ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
- b.eq .Lloop
- /* End of critical section -- keep to one 64Byte cache line. */
-
- orr tmp1, has_nul1, has_nul2
- cbz tmp1, .Lhit_limit /* No null in final Qword. */
-
- /* We know there's a null in the final Qword. The easiest thing
- to do now is work out the length of the string and return
- MIN (len, limit). */
-
- sub len, src, srcin
- cbz has_nul1, .Lnul_in_data2
-#ifdef __AARCH64EB__
- mov data2, data1
-#endif
- sub len, len, #8
- mov has_nul2, has_nul1
-.Lnul_in_data2:
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- rev data2, data2
- sub tmp1, data2, zeroones
- orr tmp2, data2, #REP8_7f
- bic has_nul2, tmp1, tmp2
-#endif
- sub len, len, #8
- rev has_nul2, has_nul2
- clz pos, has_nul2
- add len, len, pos, lsr #3 /* Bits to bytes. */
- cmp len, limit
- csel len, len, limit, ls /* Return the lower value. */
+L(nomatch):
+ mov result, cntin
ret
-.Lmisaligned:
- /* Deal with a partial first word.
- We're doing two things in parallel here;
- 1) Calculate the number of words (but avoiding overflow if
- limit is near ULONG_MAX) - to do this we need to work out
- limit + tmp1 - 1 as a 65-bit value before shifting it;
- 2) Load and mask the initial data words - we force the bytes
- before the ones we are interested in to 0xff - this ensures
- early bytes will not hit any zero detection. */
- sub limit_wd, limit, #1
- neg tmp4, tmp1
- cmp tmp1, #8
-
- and tmp3, limit_wd, #15
- lsr limit_wd, limit_wd, #4
- mov tmp2, #~0
-
- ldp data1, data2, [src], #16
- lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
- add tmp3, tmp3, tmp1
-
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
+L(start_loop):
+ sub tmp, src, srcin
+ add tmp, tmp, 17
+ subs cntrem, cntin, tmp
+ b.lo L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ tbz cntrem, 4, L(loop32_2)
+ sub src, src, 16
+ .p2align 5
+L(loop32):
+ ldr qdata, [src, 32]!
+ cmeq vhas_chr.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+L(loop32_2):
+ ldr qdata, [src, 16]
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, 0
+ b.lo L(end_2)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+L(end_2):
+ add src, src, 16
+L(end):
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
+ sub result, src, srcin
+ fmov synd, dend
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- add limit_wd, limit_wd, tmp3, lsr #4
-
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
-
- csinv data1, data1, xzr, le
- csel data2, data2, data2a, le
- b .Lrealigned
- .size strnlen, . - .Lstart /* Include pre-padding in size. */
+ clz synd, synd
+ add result, result, synd, lsr 2
+ cmp cntin, result
+ csel result, cntin, result, ls
+ ret
+END (strnlen)
#endif
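
For strnlen, the first-chunk code shifts the 4-bits-per-byte syndrome right by 4 * (srcin % 16) so that bytes before the string start cannot produce a match; AArch64 shifts use only the low 6 bits of the count, which is why the assembly can write "lsl shift, srcin, 2" without masking. A little-endian C model of that step and of the final limit clamp, with illustrative function names that are not in the patch:

#include <stddef.h>
#include <stdint.h>

/* Drop syndrome nibbles belonging to bytes before the string start
   (mirrors "lsl shift, srcin, 2" / "lsr synd, synd, shift").  */
static inline uint64_t
discard_leading_bytes (uint64_t synd, uintptr_t srcin)
{
  return synd >> ((srcin & 15) * 4);
}

/* Final clamp, as done by the cmp/csel pair before each ret.  */
static inline size_t
clamp_to_limit (size_t nul_offset, size_t cntin)
{
  return nul_offset < cntin ? nul_offset : cntin;
}
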
@@ -1,32 +1,9 @@
/*
- strrchr - find last instance of a character in a string
-
- Copyright (c) 2014, ARM Limited
- All rights Reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the company nor the names of its contributors
- may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
+ * strrchr - find last position of a character in a string.
+ *
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See strchr-stub.c */
#else
@@ -37,6 +14,8 @@
* Neon Available.
*/
+#include "asmdefs.h"
+
/* Arguments and results. */
#define srcin x0
#define chrin w1
@@ -78,17 +57,8 @@
in the original string a count_trailing_zeros() operation will
identify exactly which byte is causing the termination, and why. */
-/* Locals and temporaries. */
-
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-def_fn strrchr
+ENTRY (strrchr)
+ PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the requested byte. Magic constant 0x80200802 used
similarly for NUL termination. */
@@ -100,7 +70,7 @@ def_fn strrchr
mov src_offset, #0
ands tmp1, srcin, #31
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
- b.eq .Laligned
+ b.eq L(aligned)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
@@ -118,45 +88,45 @@ def_fn strrchr
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
- addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64
- addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
- mov nul_match, vhas_nul1.2d[0]
+ addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
lsl tmp1, tmp1, #1
mov const_m1, #~0
- mov chr_match, vhas_chr1.2d[0]
lsr tmp3, const_m1, tmp1
+ mov chr_match, vend1.d[1]
bic nul_match, nul_match, tmp3 // Mask padding bits.
bic chr_match, chr_match, tmp3 // Mask padding bits.
- cbnz nul_match, .Ltail
+ cbnz nul_match, L(tail)
-.Lloop:
+ .p2align 4
+L(loop):
cmp chr_match, #0
csel src_match, src, src_match, ne
csel src_offset, chr_match, src_offset, ne
-.Laligned:
+L(aligned):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ uminp vend1.16b, vdata1.16b, vdata2.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ cmeq vend1.16b, vend1.16b, 0
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
- addp vend1.16b, vend1.16b, vend1.16b // 128->64
- addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
- mov nul_match, vend1.2d[0]
- mov chr_match, vhas_chr1.2d[0]
- cbz nul_match, .Lloop
+ addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
+ mov chr_match, vend1.d[1]
+ cbz nul_match, L(loop)
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_nul2.16b, vdata2.16b, #0
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
- mov nul_match, vhas_nul1.2d[0]
+ mov nul_match, vhas_nul1.d[0]
-.Ltail:
+L(tail):
/* Work out exactly where the string ends. */
sub tmp4, nul_match, #1
eor tmp4, tmp4, nul_match
@@ -178,5 +148,5 @@ def_fn strrchr
ret
- .size strrchr, . - strrchr
+END (strrchr)
#endif
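
The strrchr code keeps the magic constants 0x40100401/0x80200802 from the old version. As an illustration only, simplified to a single mask and assuming an AArch64 target with <arm_neon.h>: ANDing the 0x00/0xff compare results with the repeating 0x01,0x04,0x10,0x40 byte pattern and then doing two pairwise adds compresses 32 compare bytes into a 64-bit syndrome with two bit positions reserved per input byte, so clz/ctz can recover exact byte positions without collisions.

#include <arm_neon.h>
#include <stdint.h>

/* cmp1/cmp2 are cmeq results (0x00 or 0xff per byte) for two 16-byte
   chunks; the return value is the per-byte match syndrome.  */
static inline uint64_t
match_syndrome (uint8x16_t cmp1, uint8x16_t cmp2)
{
  uint8x16_t repmask = vreinterpretq_u8_u32 (vdupq_n_u32 (0x40100401));
  uint8x16_t m1 = vandq_u8 (cmp1, repmask);
  uint8x16_t m2 = vandq_u8 (cmp2, repmask);
  uint8x16_t r  = vpaddq_u8 (m1, m2);   /* 256 -> 128 bits */
  r = vpaddq_u8 (r, r);                 /* 128 -> 64 bits (duplicated) */
  return vgetq_lane_u64 (vreinterpretq_u64_u8 (r), 0);
}
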