From patchwork Tue Sep 12 10:05:06 2023
X-Patchwork-Submitter: Sebastian Huber
X-Patchwork-Id: 75721
From: Sebastian Huber
To: newlib@sourceware.org
Cc: Szabolcs Nagy
Subject: [PATCH v3 1/2] aarch64: Sync with ARM-software/optimized-routines
Date: Tue, 12 Sep 2023 12:05:06 +0200
Message-Id: <20230912100507.33946-2-sebastian.huber@embedded-brains.de>
In-Reply-To: <20230912100507.33946-1-sebastian.huber@embedded-brains.de>
References: <20230912100507.33946-1-sebastian.huber@embedded-brains.de>
List-Id: Newlib mailing list

Update AArch64 assembly string routines from:

https://github.com/ARM-software/optimized-routines

commit
0cf84f26b6b8dcad8287fe30a4dcc1fdabd06560 Author: Sebastian Huber Date: Thu Jul 27 17:14:57 2023 +0200 string: Fix corrupt GNU_PROPERTY_TYPE (5) size For ELF32 the notes alignment is 4 and not 8. --- newlib/libc/machine/aarch64/asmdefs.h | 106 ++++++ newlib/libc/machine/aarch64/memchr.S | 73 ++-- newlib/libc/machine/aarch64/memcmp.S | 311 +++++++++-------- newlib/libc/machine/aarch64/memcpy.S | 272 ++++++++------- newlib/libc/machine/aarch64/memset.S | 194 ++--------- newlib/libc/machine/aarch64/stpcpy.S | 36 +- newlib/libc/machine/aarch64/strchr.S | 107 ++---- newlib/libc/machine/aarch64/strchrnul.S | 90 ++--- newlib/libc/machine/aarch64/strcmp.S | 282 ++++++++------- newlib/libc/machine/aarch64/strcpy.S | 437 +++++++----------------- newlib/libc/machine/aarch64/strlen.S | 319 ++++++++--------- newlib/libc/machine/aarch64/strncmp.S | 323 ++++++++++-------- newlib/libc/machine/aarch64/strnlen.S | 256 +++++--------- newlib/libc/machine/aarch64/strrchr.S | 86 ++--- 14 files changed, 1226 insertions(+), 1666 deletions(-) create mode 100644 newlib/libc/machine/aarch64/asmdefs.h diff --git a/newlib/libc/machine/aarch64/asmdefs.h b/newlib/libc/machine/aarch64/asmdefs.h new file mode 100644 index 0000000000..131b95e1fe --- /dev/null +++ b/newlib/libc/machine/aarch64/asmdefs.h @@ -0,0 +1,106 @@ +/* + * Macros for asm code. AArch64 version. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Branch Target Identitication support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). */ +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#ifdef __ILP32__ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 2; \ + .word 4; \ + .word 12; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .text +#else +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text +#endif + +/* If set then the GNU Property Note section will be added to + mark objects to support BTI and PAC-RET. */ +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. 
*/ +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; \ + BTI_C; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; + +#define L(l) .L ## l + +#ifdef __ILP32__ + /* Sanitize padding bits of pointer arguments as per aapcs64 */ +#define PTR_ARG(n) mov w##n, w##n +#else +#define PTR_ARG(n) +#endif + +#ifdef __ILP32__ + /* Sanitize padding bits of size arguments as per aapcs64 */ +#define SIZE_ARG(n) mov w##n, w##n +#else +#define SIZE_ARG(n) +#endif + +/* Compiler supports SVE instructions */ +#ifndef HAVE_SVE +# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) +# define HAVE_SVE 1 +# else +# define HAVE_SVE 0 +# endif +#endif + +#endif diff --git a/newlib/libc/machine/aarch64/memchr.S b/newlib/libc/machine/aarch64/memchr.S index 53f5d6bc0e..a0f305e0fc 100644 --- a/newlib/libc/machine/aarch64/memchr.S +++ b/newlib/libc/machine/aarch64/memchr.S @@ -1,31 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2014, ARM Limited - * All rights Reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the company nor the names of its contributors - * may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) @@ -37,6 +14,8 @@ * Neon Available. */ +#include "asmdefs.h" + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -70,17 +49,11 @@ * identify exactly which byte has matched. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn memchr +ENTRY (memchr) + PTR_ARG (0) + SIZE_ARG (2) /* Do not dereference srcin if no bytes to compare. */ - cbz cntin, .Lzero_length + cbz cntin, L(zero_length) /* * Magic constant 0x40100401 allows us to identify which lane matches * the requested byte. 
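The magic constant referenced above packs two syndrome bits per input byte: the cmeq results are masked with the repeating byte pattern 0x01/0x04/0x10/0x40 and reduced with two addp instructions, so counting the trailing zeros of the resulting 64-bit syndrome and halving gives the index of the first matching byte. A rough stand-alone sketch of the same idea in C with Advanced SIMD intrinsics follows; it covers only a single 32-byte block, ignores the alignment and tail handling done in the assembly, and the helper name is purely illustrative:

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative only: find byte c in one 32-byte block at p.
   Returns the index of the first match, or -1 if there is none.  */
static ptrdiff_t
match_in_block (const uint8_t *p, uint8_t c)
{
  uint8x16_t repchr = vdupq_n_u8 (c);
  /* 0x01,0x04,0x10,0x40,... so each byte owns two syndrome bits.  */
  uint8x16_t repmask = vreinterpretq_u8_u32 (vdupq_n_u32 (0x40100401));
  uint8x16_t m1 = vandq_u8 (vceqq_u8 (vld1q_u8 (p), repchr), repmask);
  uint8x16_t m2 = vandq_u8 (vceqq_u8 (vld1q_u8 (p + 16), repchr), repmask);
  uint8x16_t red = vpaddq_u8 (m1, m2);     /* 256 bits -> 128 bits */
  red = vpaddq_u8 (red, red);              /* 128 bits -> 64 bits  */
  uint64_t synd = vgetq_lane_u64 (vreinterpretq_u64_u8 (red), 0);
  if (synd == 0)
    return -1;
  /* rbit+clz in the assembly equals count-trailing-zeros here; two
     syndrome bits per input byte, hence the shift by one.  */
  return (ptrdiff_t) (__builtin_ctzll (synd) >> 1);
}
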
@@ -93,7 +66,7 @@ def_fn memchr dup vrepmask.4s, wtmp2 ands soff, srcin, #31 and cntrem, cntin, #31 - b.eq .Lloop + b.eq L(loop) /* * Input string is not 32-byte aligned. We calculate the syndrome @@ -110,41 +83,41 @@ def_fn memchr and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ addp vend.16b, vend.16b, vend.16b /* 128->64 */ - mov synd, vend.2d[0] + mov synd, vend.d[0] /* Clear the soff*2 lower bits */ lsl tmp, soff, #1 lsr synd, synd, tmp lsl synd, synd, tmp /* The first block can also be the last */ - b.ls .Lmasklast + b.ls L(masklast) /* Have we found something already? */ - cbnz synd, .Ltail + cbnz synd, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 subs cntin, cntin, #32 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b /* If we're out of data we finish regardless of the result */ - b.ls .Lend + b.ls L(end) /* Use a fast check for the termination condition */ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b addp vend.2d, vend.2d, vend.2d - mov synd, vend.2d[0] + mov synd, vend.d[0] /* We're not out of data, loop if we haven't found the character */ - cbz synd, .Lloop + cbz synd, L(loop) -.Lend: +L(end): /* Termination condition found, let's calculate the syndrome value */ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ addp vend.16b, vend.16b, vend.16b /* 128->64 */ - mov synd, vend.2d[0] + mov synd, vend.d[0] /* Only do the clear for the last possible block */ - b.hi .Ltail + b.hs L(tail) -.Lmasklast: +L(masklast): /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ add tmp, cntrem, soff and tmp, tmp, #31 @@ -153,7 +126,7 @@ def_fn memchr lsl synd, synd, tmp lsr synd, synd, tmp -.Ltail: +L(tail): /* Count the trailing zeros using bit reversing */ rbit synd, synd /* Compensate the last post-increment */ @@ -168,9 +141,9 @@ def_fn memchr csel result, xzr, result, eq ret -.Lzero_length: +L(zero_length): mov result, #0 ret - .size memchr, . - memchr +END (memchr) #endif diff --git a/newlib/libc/machine/aarch64/memcmp.S b/newlib/libc/machine/aarch64/memcmp.S index 605d99365e..18874d3215 100644 --- a/newlib/libc/machine/aarch64/memcmp.S +++ b/newlib/libc/machine/aarch64/memcmp.S @@ -1,57 +1,7 @@ /* memcmp - compare memory - - Copyright (c) 2018 Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* - * Copyright (c) 2017 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) @@ -60,103 +10,79 @@ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. */ -#define L(l) .L ## l - -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result w0 - -/* Internal variables. */ -#define data1 x3 -#define data1w w3 -#define data1h x4 -#define data2 x5 -#define data2w w5 -#define data2h x6 -#define tmp1 x7 -#define tmp2 x8 - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn memcmp p2align=6 - subs limit, limit, 8 - b.lo L(less8) - - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - b.ne L(return) - - subs limit, limit, 8 - b.gt L(more16) - - ldr data1, [src1, limit] - ldr data2, [src2, limit] - b L(return) - -L(more16): - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - bne L(return) - - /* Jump directly to comparing the last 16 bytes for 32 byte (or less) - strings. 
*/ - subs limit, limit, 16 +#include "asmdefs.h" + +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 + +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define data3 x5 +#define data3w w5 +#define data4 x6 +#define data4w w6 +#define tmp x6 +#define src1end x7 +#define src2end x8 + + +ENTRY (memcmp) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + cmp limit, 16 + b.lo L(less16) + ldp data1, data3, [src1] + ldp data2, data4, [src2] + ccmp data1, data2, 0, ne + ccmp data3, data4, 0, eq + b.ne L(return2) + + add src1end, src1, limit + add src2end, src2, limit + cmp limit, 32 b.ls L(last_bytes) + cmp limit, 160 + b.hs L(loop_align) + sub limit, limit, 32 - /* We overlap loads between 0-32 bytes at either side of SRC1 when we - try to align, so limit it only to strings larger than 128 bytes. */ - cmp limit, 96 - b.ls L(loop16) - - /* Align src1 and adjust src2 with bytes not yet done. */ - and tmp1, src1, 15 - add limit, limit, tmp1 - sub src1, src1, tmp1 - sub src2, src2, tmp1 - - /* Loop performing 16 bytes per iteration using aligned src1. - Limit is pre-decremented by 16 and must be larger than zero. - Exit if <= 16 bytes left to do or if the data is not equal. */ .p2align 4 -L(loop16): - ldp data1, data1h, [src1], 16 - ldp data2, data2h, [src2], 16 - subs limit, limit, 16 - ccmp data1, data2, 0, hi - ccmp data1h, data2h, 0, eq - b.eq L(loop16) - +L(loop32): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ccmp data3, data4, 0, eq + b.ne L(return2) + cmp limit, 16 + b.ls L(last_bytes) + + ldp data1, data3, [src1, 32] + ldp data2, data4, [src2, 32] cmp data1, data2 - bne L(return) + ccmp data3, data4, 0, eq + b.ne L(return2) + add src1, src1, 32 + add src2, src2, 32 +L(last64): + subs limit, limit, 32 + b.hi L(loop32) /* Compare last 1-16 bytes using unaligned access. */ L(last_bytes): - add src1, src1, limit - add src2, src2, limit - ldp data1, data1h, [src1] - ldp data2, data2h, [src2] - cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ldp data1, data3, [src1end, -16] + ldp data2, data4, [src2end, -16] +L(return2): cmp data1, data2 + csel data1, data1, data3, ne + csel data2, data2, data4, ne /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): @@ -164,33 +90,106 @@ L(return): rev data1, data1 rev data2, data2 #endif - cmp data1, data2 -L(ret_eq): + cmp data1, data2 cset result, ne cneg result, result, lo ret .p2align 4 - /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less16): + add src1end, src1, limit + add src2end, src2, limit + tbz limit, 3, L(less8) + ldr data1, [src1] + ldr data2, [src2] + ldr data3, [src1end, -8] + ldr data4, [src2end, -8] + b L(return2) + + .p2align 4 L(less8): - adds limit, limit, 4 - b.lo L(less4) - ldr data1w, [src1], 4 - ldr data2w, [src2], 4 + tbz limit, 2, L(less4) + ldr data1w, [src1] + ldr data2w, [src2] + ldr data3w, [src1end, -4] + ldr data4w, [src2end, -4] + b L(return2) + +L(less4): + tbz limit, 1, L(less2) + ldrh data1w, [src1] + ldrh data2w, [src2] cmp data1w, data2w b.ne L(return) - sub limit, limit, 4 -L(less4): - adds limit, limit, 4 - beq L(ret_eq) -L(byte_loop): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - subs limit, limit, 1 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. 
*/ - b.eq L(byte_loop) +L(less2): + mov result, 0 + tbz limit, 0, L(return_zero) + ldrb data1w, [src1end, -1] + ldrb data2w, [src2end, -1] sub result, data1w, data2w +L(return_zero): + ret + +L(loop_align): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] + cmp data1, data2 + ccmp data3, data4, 0, eq + b.ne L(return2) + + /* Align src2 and adjust src1, src2 and limit. */ + and tmp, src2, 15 + sub tmp, tmp, 16 + sub src2, src2, tmp + add limit, limit, tmp + sub src1, src1, tmp + sub limit, limit, 64 + 16 + + .p2align 4 +L(loop64): + ldr q0, [src1, 16] + ldr q1, [src2, 16] + subs limit, limit, 64 + ldr q2, [src1, 32] + ldr q3, [src2, 32] + eor v0.16b, v0.16b, v1.16b + eor v1.16b, v2.16b, v3.16b + ldr q2, [src1, 48] + ldr q3, [src2, 48] + umaxp v0.16b, v0.16b, v1.16b + ldr q4, [src1, 64]! + ldr q5, [src2, 64]! + eor v1.16b, v2.16b, v3.16b + eor v2.16b, v4.16b, v5.16b + umaxp v1.16b, v1.16b, v2.16b + umaxp v0.16b, v0.16b, v1.16b + umaxp v0.16b, v0.16b, v0.16b + fmov tmp, d0 + ccmp tmp, 0, 0, hi + b.eq L(loop64) + + /* If equal, process last 1-64 bytes using scalar loop. */ + add limit, limit, 64 + 16 + cbz tmp, L(last64) + + /* Determine the 8-byte aligned offset of the first difference. */ +#ifdef __AARCH64EB__ + rev16 tmp, tmp +#endif + rev tmp, tmp + clz tmp, tmp + bic tmp, tmp, 7 + sub tmp, tmp, 48 + ldr data1, [src1, tmp] + ldr data2, [src2, tmp] +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + mov result, 1 + cmp data1, data2 + cneg result, result, lo ret - .size memcmp, . - memcmp +END (memcmp) #endif diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S index 463bad0a18..248e7843a2 100644 --- a/newlib/libc/machine/aarch64/memcpy.S +++ b/newlib/libc/machine/aarch64/memcpy.S @@ -1,55 +1,8 @@ -/* Copyright (c) 2012-2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - /* - * Copyright (c) 2015 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. + * memcpy - copy memory area * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -61,6 +14,7 @@ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See memcpy-stub.c */ #else +#include "asmdefs.h" #define dstin x0 #define src x1 @@ -71,122 +25,139 @@ #define A_l x6 #define A_lw w6 #define A_h x7 -#define A_hw w7 #define B_l x8 #define B_lw w8 #define B_h x9 #define C_l x10 +#define C_lw w10 #define C_h x11 #define D_l x12 #define D_h x13 -#define E_l src -#define E_h count -#define F_l srcend -#define F_h dst -#define tmp1 x9 - -#define L(l) .L ## l - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -/* Copies are split into 3 main cases: small copies of up to 16 bytes, - medium copies of 17..96 bytes which are fully unrolled. Large copies - of more than 96 bytes align the destination and use an unrolled loop - processing 64 bytes per iteration. - Small and medium copies read all data before writing, allowing any - kind of overlap, and memmove tailcalls memcpy for these cases as - well as non-overlapping copies. +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend +#define tmp1 x14 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. 
*/ -def_fn memcpy p2align=6 - prfm PLDL1KEEP, [src] +ENTRY_ALIAS (memmove) +ENTRY (memcpy) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) add srcend, src, count add dstend, dstin, count - cmp count, 16 - b.ls L(copy16) - cmp count, 96 + cmp count, 128 b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) - /* Medium copies: 17..96 bytes. */ - sub tmp1, count, 1 + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) ldp A_l, A_h, [src] - tbnz tmp1, 6, L(copy96) ldp D_l, D_h, [srcend, -16] - tbz tmp1, 5, 1f - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] -1: stp A_l, A_h, [dstin] stp D_l, D_h, [dstend, -16] ret - .p2align 4 - /* Small copies: 0..16 bytes. */ + /* Copy 8-15 bytes. */ L(copy16): - cmp count, 8 - b.lo 1f + tbz count, 3, L(copy8) ldr A_l, [src] ldr A_h, [srcend, -8] str A_l, [dstin] str A_h, [dstend, -8] ret - .p2align 4 -1: - tbz count, 2, 1f + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) ldr A_lw, [src] - ldr A_hw, [srcend, -4] + ldr B_lw, [srcend, -4] str A_lw, [dstin] - str A_hw, [dstend, -4] + str B_lw, [dstend, -4] ret - /* Copy 0..3 bytes. Use a branchless sequence that copies the same - byte 3 times if count==1, or the 2nd byte twice if count==2. */ -1: - cbz count, 2f + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) lsr tmp1, count, 1 ldrb A_lw, [src] - ldrb A_hw, [srcend, -1] + ldrb C_lw, [srcend, -1] ldrb B_lw, [src, tmp1] strb A_lw, [dstin] strb B_lw, [dstin, tmp1] - strb A_hw, [dstend, -1] -2: ret + strb C_lw, [dstend, -1] +L(copy0): + ret .p2align 4 - /* Copy 64..96 bytes. Copy 64 bytes from the start and - 32 bytes from the end. */ -L(copy96): + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [src, 32] - ldp D_l, D_h, [src, 48] - ldp E_l, E_h, [srcend, -32] - ldp F_l, F_h, [srcend, -16] + ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] + cmp count, 64 + b.hi L(copy128) stp A_l, A_h, [dstin] stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstin, 32] - stp D_l, D_h, [dstin, 48] - stp E_l, E_h, [dstend, -32] - stp F_l, F_h, [dstend, -16] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] ret - /* Align DST to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 96 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend, -64] + ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret .p2align 4 + /* Copy more than 128 bytes. */ L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp D_l, D_h, [src] and tmp1, dstin, 15 bic dst, dstin, 15 - ldp D_l, D_h, [src] sub src, src, tmp1 add count, count, tmp1 /* Count is now 16 too large. */ ldp A_l, A_h, [src, 16] @@ -195,8 +166,9 @@ L(copy_long): ldp C_l, C_h, [src, 48] ldp D_l, D_h, [src, 64]! 
subs count, count, 128 + 16 /* Test and readjust count. */ - b.ls 2f -1: + b.ls L(copy64_from_end) + +L(loop64): stp A_l, A_h, [dst, 16] ldp A_l, A_h, [src, 16] stp B_l, B_h, [dst, 32] @@ -206,12 +178,10 @@ L(copy_long): stp D_l, D_h, [dst, 64]! ldp D_l, D_h, [src, 64]! subs count, count, 64 - b.hi 1b + b.hi L(loop64) - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the end even if - there is just 1 byte left. */ -2: + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): ldp E_l, E_h, [srcend, -64] stp A_l, A_h, [dst, 16] ldp A_l, A_h, [srcend, -48] @@ -226,5 +196,51 @@ L(copy_long): stp C_l, C_h, [dstend, -16] ret - .size memcpy, . - memcpy + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp D_l, D_h, [srcend, -16] + and tmp1, dstend, 15 + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret + +END (memcpy) #endif diff --git a/newlib/libc/machine/aarch64/memset.S b/newlib/libc/machine/aarch64/memset.S index 103e3f8bb0..ca76439a91 100644 --- a/newlib/libc/machine/aarch64/memset.S +++ b/newlib/libc/machine/aarch64/memset.S @@ -1,66 +1,20 @@ -/* Copyright (c) 2012-2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - /* - * Copyright (c) 2015 ARM Ltd - * All rights reserved. + * memset - fill memory with a constant byte * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. * */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See memset-stub.c */ #else +#include "asmdefs.h" #define dstin x0 #define val x1 @@ -68,24 +22,11 @@ #define count x2 #define dst x3 #define dstend x4 -#define tmp1 x5 -#define tmp1w w5 -#define tmp2 x6 -#define tmp2w w6 -#define zva_len x7 -#define zva_lenw w7 - -#define L(l) .L ## l +#define zva_val x5 - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn memset p2align=6 +ENTRY (memset) + PTR_ARG (0) + SIZE_ARG (2) dup v0.16B, valw add dstend, dstin, count @@ -101,7 +42,7 @@ def_fn memset p2align=6 str val, [dstin] str val, [dstend, -8] ret - nop + .p2align 4 1: tbz count, 2, 2f str valw, [dstin] str valw, [dstend, -4] @@ -131,110 +72,49 @@ L(set96): stp q0, q0, [dstend, -32] ret - .p2align 3 - nop + .p2align 4 L(set_long): and valw, valw, 255 bic dst, dstin, 15 str q0, [dstin] - cmp count, 256 - ccmp valw, 0, 0, cs - b.eq L(try_zva) -L(no_zva): - sub count, dstend, dst /* Count is 16 too large. */ - sub dst, dst, 16 /* Dst is biased by -32. */ - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ -1: stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64]! 
-L(tail64): - subs count, count, 64 - b.hi 1b -2: stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret - - .p2align 3 -L(try_zva): - mrs tmp1, dczid_el0 - tbnz tmp1w, 4, L(no_zva) - and tmp1w, tmp1w, 15 - cmp tmp1w, 4 /* ZVA size is 64 bytes. */ - b.ne L(zva_128) - - /* Write the first and last 64 byte aligned block using stp rather - than using DC ZVA. This is faster on some cores. - */ -L(zva_64): + cmp count, 160 + ccmp valw, 0, 0, hs + b.ne L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif str q0, [dst, 16] stp q0, q0, [dst, 32] bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ - add dst, dst, 128 - nop -1: dc zva, dst + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): add dst, dst, 64 + dc zva, dst subs count, count, 64 - b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] + b.hi L(zva_loop) stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret - .p2align 3 -L(zva_128): - cmp tmp1w, 5 /* ZVA size is 128 bytes. */ - b.ne L(zva_other) - - str q0, [dst, 16] +L(no_zva): + sub count, dstend, dst /* Count is 16 too large. */ + sub dst, dst, 16 /* Dst is biased by -32. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +L(no_zva_loop): stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - bic dst, dst, 127 - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+128 /* Adjust count and bias for loop. */ - add dst, dst, 128 -1: dc zva, dst - add dst, dst, 128 - subs count, count, 128 - b.hi 1b - stp q0, q0, [dstend, -128] - stp q0, q0, [dstend, -96] + stp q0, q0, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret -L(zva_other): - mov tmp2w, 4 - lsl zva_lenw, tmp2w, tmp1w - add tmp1, zva_len, 64 /* Max alignment bytes written. */ - cmp count, tmp1 - blo L(no_zva) - - sub tmp2, zva_len, 1 - add tmp1, dst, zva_len - add dst, dst, 16 - subs count, tmp1, dst /* Actual alignment bytes to write. */ - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ - beq 2f -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] - subs count, count, 64 - b.hi 1b -2: mov dst, tmp1 - sub count, dstend, tmp1 /* Remaining bytes to write. */ - subs count, count, zva_len - b.lo 4f -3: dc zva, dst - add dst, dst, zva_len - subs count, count, zva_len - b.hs 3b -4: add count, count, zva_len - sub dst, dst, 32 /* Bias dst for tail loop. */ - b L(tail64) - - .size memset, . - memset +END (memset) #endif diff --git a/newlib/libc/machine/aarch64/stpcpy.S b/newlib/libc/machine/aarch64/stpcpy.S index 696b45889f..155c68d75a 100644 --- a/newlib/libc/machine/aarch64/stpcpy.S +++ b/newlib/libc/machine/aarch64/stpcpy.S @@ -1,34 +1,10 @@ /* - stpcpy - copy a string returning pointer to end. + * stpcpy - copy a string returning pointer to end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ - Copyright (c) 2015 ARM Ltd. - All Rights Reserved. 
+#define BUILD_STPCPY 1 - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* This is just a wrapper that uses strcpy code with appropriate - pre-defines. */ - -#define BUILD_STPCPY #include "strcpy.S" diff --git a/newlib/libc/machine/aarch64/strchr.S b/newlib/libc/machine/aarch64/strchr.S index 2448dbc7d5..500d9aff29 100644 --- a/newlib/libc/machine/aarch64/strchr.S +++ b/newlib/libc/machine/aarch64/strchr.S @@ -1,32 +1,9 @@ /* - strchr - find a character in a string - - Copyright (c) 2014, ARM Limited - All rights Reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - + * strchr - find a character in a string + * + * Copyright (c) 2014-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strchr-stub.c */ #else @@ -37,6 +14,8 @@ * Neon Available. */ +#include "asmdefs.h" + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -74,26 +53,19 @@ /* Locals and temporaries. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn strchr - /* Magic constant 0x40100401 to allow us to identify which lane - matches the requested byte. Magic constant 0x80200802 used - similarly for NUL termination. */ - mov wtmp2, #0x0401 - movk wtmp2, #0x4010, lsl #16 +ENTRY (strchr) + PTR_ARG (0) + /* Magic constant 0xc0300c03 to allow us to identify which lane + matches the requested byte. Even bits are set if the character + matches, odd bits if either the char is NUL or matches. */ + mov wtmp2, 0x0c03 + movk wtmp2, 0xc030, lsl 16 dup vrepchr.16b, chrin bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ dup vrepmask_c.4s, wtmp2 ands tmp1, srcin, #31 add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -105,49 +77,42 @@ def_fn strchr cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b lsl tmp1, tmp1, #1 addp vend1.16b, vend1.16b, vend2.16b // 256->128 mov tmp3, #~0 addp vend1.16b, vend1.16b, vend2.16b // 128->64 lsr tmp1, tmp3, tmp1 - mov tmp3, vend1.2d[0] + mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: + .p2align 4 +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - /* Use a fast check for the termination condition. */ - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b - orr vend1.16b, vend1.16b, vend2.16b - addp vend1.2d, vend1.2d, vend1.2d - mov tmp1, vend1.2d[0] - cbz tmp1, .Lloop + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b + umaxp vend1.16b, vend1.16b, vend1.16b + mov tmp1, vend1.d[0] + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. 
*/ - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b addp vend1.16b, vend1.16b, vend2.16b // 256->128 addp vend1.16b, vend1.16b, vend2.16b // 128->64 - - mov tmp1, vend1.2d[0] -.Ltail: + mov tmp1, vend1.d[0] +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ @@ -160,5 +125,5 @@ def_fn strchr csel result, result, xzr, eq ret - .size strchr, . - strchr +END (strchr) #endif diff --git a/newlib/libc/machine/aarch64/strchrnul.S b/newlib/libc/machine/aarch64/strchrnul.S index a0ac13b7f4..ceaf4dca17 100644 --- a/newlib/libc/machine/aarch64/strchrnul.S +++ b/newlib/libc/machine/aarch64/strchrnul.S @@ -1,32 +1,9 @@ /* - strchrnul - find a character or nul in a string - - Copyright (c) 2014, ARM Limited - All rights Reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - + * strchrnul - find a character or nul in a string + * + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strchrnul-stub.c */ #else @@ -37,6 +14,8 @@ * Neon Available. */ +#include "asmdefs.h" + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -70,15 +49,8 @@ /* Locals and temporaries. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn strchrnul +ENTRY (strchrnul) + PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the termination condition. */ mov wtmp2, #0x0401 @@ -87,7 +59,7 @@ def_fn strchrnul bic src, srcin, #31 /* Work with aligned 32-byte hunks. 
*/ dup vrepmask.4s, wtmp2 ands tmp1, srcin, #31 - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -95,47 +67,43 @@ def_fn strchrnul syndrome that are related to the padding. */ ld1 {vdata1.16b, vdata2.16b}, [src], #32 neg tmp1, tmp1 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b - orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b lsl tmp1, tmp1, #1 addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 mov tmp3, #~0 addp vend1.16b, vend1.16b, vend1.16b // 128->64 lsr tmp1, tmp3, tmp1 - mov tmp3, vend1.2d[0] + mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: + .p2align 4 +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - /* Use a fast check for the termination condition. */ - orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b - orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b - addp vend1.2d, vend1.2d, vend1.2d - mov tmp1, vend1.2d[0] - cbz tmp1, .Lloop + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b + umaxp vend1.16b, vend1.16b, vend1.16b + mov tmp1, vend1.d[0] + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 addp vend1.16b, vend1.16b, vend1.16b // 128->64 - mov tmp1, vend1.2d[0] -.Ltail: + mov tmp1, vend1.d[0] +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ @@ -145,5 +113,5 @@ def_fn strchrnul add result, src, tmp1, lsr #1 ret - .size strchrnul, . - strchrnul +END (strchrnul) #endif diff --git a/newlib/libc/machine/aarch64/strcmp.S b/newlib/libc/machine/aarch64/strcmp.S index e2bef2d49d..691a1760ee 100644 --- a/newlib/libc/machine/aarch64/strcmp.S +++ b/newlib/libc/machine/aarch64/strcmp.S @@ -1,202 +1,192 @@ -/* Copyright (c) 2012-2018, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* Assumptions: +/* + * strcmp - compare two strings * - * ARMv8-a, AArch64 + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strcmp-stub.c */ #else - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm +/* Assumptions: + * + * ARMv8-a, AArch64. + * MTE compatible. + */ -#define L(label) .L ## label +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 -/* Parameters and result. */ #define src1 x0 #define src2 x1 #define result x0 -/* Internal variables. */ #define data1 x2 #define data1w w2 #define data2 x3 #define data2w w3 #define has_nul x4 #define diff x5 +#define off1 x5 #define syndrome x6 -#define tmp1 x7 -#define tmp2 x8 -#define tmp3 x9 -#define zeroones x10 -#define pos x11 - - /* Start of performance-critical section -- one 64B cache line. */ -def_fn strcmp p2align=6 - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + + +ENTRY (strcmp) + PTR_ARG (0) + PTR_ARG (1) + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ + cbnz tmp, L(mutual_align) + + .p2align 4 + L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 + ldr data2, [src1, off2] + ldr data1, [src1], 8 L(start_realigned): - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. 
*/ +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - L(end): -#ifndef __AARCH64EB__ +#ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, syndrome rev data2, data2 - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#else - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ - lsl data1, data1, pos - lsl data2, data2, pos + lsl data1, data1, shift + lsl data2, data2, shift /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 ret -#endif + + .p2align 4 L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off - the bytes that preceed the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr data1, data1, tmp2 - orr data2, data2, tmp2 + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. 
*/ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp b L(start_realigned) L(misaligned8): /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond page boundary in - SRC2. */ - tst src1, #7 - b.eq L(loop_misaligned) + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) L(do_misaligned): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ b.ne L(done) - tst src1, #7 + tst src1, 7 b.ne L(do_misaligned) -L(loop_misaligned): - /* Test if we are within the last dword of the end of a 4K page. If - yes then jump back to the misaligned loop to copy a byte at a time. */ - and tmp1, src2, #0xff8 - eor tmp1, tmp1, #0xff8 - cbz tmp1, L(do_misaligned) - ldr data1, [src1], #8 - ldr data2, [src2], #8 - - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_misaligned) b L(end) L(done): sub result, data1, data2 ret - .size strcmp, .-strcmp +END (strcmp) #endif diff --git a/newlib/libc/machine/aarch64/strcpy.S b/newlib/libc/machine/aarch64/strcpy.S index e5405f2535..57c46f3908 100644 --- a/newlib/libc/machine/aarch64/strcpy.S +++ b/newlib/libc/machine/aarch64/strcpy.S @@ -1,341 +1,160 @@ /* - strcpy/stpcpy - copy a string returning pointer to start/end. - - Copyright (c) 2013, 2014, 2015 ARM Ltd. - All Rights Reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - + * strcpy/stpcpy - copy a string returning pointer to start/end. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strchr-stub.c */ #else /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ -/* To build as stpcpy, define BUILD_STPCPY before compiling this file. +#include "asmdefs.h" - To test the page crossing code path more thoroughly, compile with - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ #define dstin x0 #define srcin x1 +#define result x0 -/* Locals and temporaries. */ #define src x2 #define dst x3 -#define data1 x4 -#define data1w w4 -#define data2 x5 -#define data2w w5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define data1a x13 -#define data2a x14 -#define pos x15 -#define len x16 -#define to_align x17 +#define len x4 +#define synd x4 +#define tmp x5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vend v2 +#define dend d2 +#define dataq2 q1 #ifdef BUILD_STPCPY -#define STRCPY stpcpy +# define STRCPY stpcpy +# define IFSTPCPY(X,...) X,__VA_ARGS__ #else -#define STRCPY strcpy +# define STRCPY strcpy +# define IFSTPCPY(X,...) #endif - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - /* AArch64 systems have a minimum page size of 4k. We can do a quick - page size check for crossing this boundary on entry and if we - do not, then we can short-circuit much of the entry code. We - expect early page-crossing strings to be rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite - predictable, even with random strings. - - We don't bother checking for larger page sizes, the cost of setting - up the correct page size is just not worth the extra gain from - a small reduction in the cases taking the slow path. Note that - we only care about whether the first fetch, which may be - misaligned, crosses a page boundary - after that we move to aligned - fetches for the remainder of the string. 
*/ - -#ifdef STRCPY_TEST_PAGE_CROSS - /* Make everything that isn't Qword aligned look like a page cross. */ -#define MIN_PAGE_P2 4 -#else -#define MIN_PAGE_P2 12 -#endif - -#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) - -def_fn STRCPY p2align=6 - /* For moderately short strings, the fastest way to do the copy is to - calculate the length of the string in the same way as strlen, then - essentially do a memcpy of the result. This avoids the need for - multiple byte copies and further means that by the time we - reach the bulk copy loop we know we can always use DWord - accesses. We expect strcpy to rarely be called repeatedly - with the same source string, so branch prediction is likely to - always be difficult - we mitigate against this by preferring - conditional select operations over branches whenever this is - feasible. */ - and tmp2, srcin, #(MIN_PAGE_SIZE - 1) - mov zeroones, #REP8_01 - and to_align, srcin, #15 - cmp tmp2, #(MIN_PAGE_SIZE - 16) - neg tmp1, to_align - /* The first fetch will straddle a (possible) page boundary iff - srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte - aligned string will never fail the page align check, so will - always take the fast path. */ - b.gt .Lpage_cross - -.Lpage_cross_ok: - ldp data1, data2, [srcin] -#ifdef __AARCH64EB__ - /* Because we expect the end to be found within 16 characters - (profiling shows this is the most common case), it's worth - swapping the bytes now to save having to recalculate the - termination syndrome later. We preserve data1 and data2 - so that we can re-use the values later on. */ - rev tmp2, data1 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne .Lfp_le8 - rev tmp4, data2 - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne .Lfp_le8 - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ + +ENTRY (STRCPY) + PTR_ARG (0) + PTR_ARG (1) + bic src, srcin, 15 + ld1 {vdata.16b}, [src] + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + lsr synd, synd, shift + cbnz synd, L(tail) + + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + cbz synd, L(start_loop) + +#ifndef __AARCH64EB__ + rbit synd, synd #endif - bics has_nul2, tmp3, tmp4 - b.eq .Lbulk_entry + sub tmp, src, srcin + clz len, synd + add len, tmp, len, lsr 2 + tbz len, 4, L(less16) + sub tmp, len, 15 + ldr dataq, [srcin] + ldr dataq2, [srcin, tmp] + str dataq, [dstin] + str dataq2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret - /* The string is short (<=16 bytes). We don't know exactly how - short though, yet. Work out the exact length so that we can - quickly select the optimal copy strategy. */ -.Lfp_gt8: - rev has_nul2, has_nul2 - clz pos, has_nul2 - mov tmp2, #56 - add dst, dstin, pos, lsr #3 /* Bits to bytes. 
*/ - sub pos, tmp2, pos -#ifdef __AARCH64EB__ - lsr data2, data2, pos -#else - lsl data2, data2, pos -#endif - str data2, [dst, #1] +L(tail): + rbit synd, synd + clz len, synd + lsr len, len, 2 +L(less16): + tbz len, 3, L(less8) + sub tmp, len, 7 + ldr data1, [srcin] + ldr data2, [srcin, tmp] str data1, [dstin] -#ifdef BUILD_STPCPY - add dstin, dst, #8 -#endif + str data2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -.Lfp_le8: - rev has_nul1, has_nul1 - clz pos, has_nul1 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - subs tmp2, pos, #24 /* Pos in bits. */ - b.lt .Lfp_lt4 -#ifdef __AARCH64EB__ - mov tmp2, #56 - sub pos, tmp2, pos - lsr data2, data1, pos - lsr data1, data1, #32 -#else - lsr data2, data1, tmp2 -#endif - /* 4->7 bytes to copy. */ - str data2w, [dst, #-3] - str data1w, [dstin] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif - ret -.Lfp_lt4: - cbz pos, .Lfp_lt2 - /* 2->3 bytes to copy. */ -#ifdef __AARCH64EB__ - lsr data1, data1, #48 -#endif - strh data1w, [dstin] - /* Fall-through, one byte (max) to go. */ -.Lfp_lt2: - /* Null-terminated string. Last character must be zero! */ - strb wzr, [dst] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif + .p2align 4 +L(less8): + subs tmp, len, 3 + b.lo L(less4) + ldr dataw1, [srcin] + ldr dataw2, [srcin, tmp] + str dataw1, [dstin] + str dataw2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret - .p2align 6 - /* Aligning here ensures that the entry code and main loop all lies - within one 64-byte cache line. */ -.Lbulk_entry: - sub to_align, to_align, #16 - stp data1, data2, [dstin] - sub src, srcin, to_align - sub dst, dstin, to_align - b .Lentry_no_page_cross - - /* The inner loop deals with two Dwords at a time. This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ -.Lmain_loop: - stp data1, data2, [dst], #16 -.Lentry_no_page_cross: - ldp data1, data2, [src], #16 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lmain_loop - - /* Since we know we are copying at least 16 bytes, the fastest way - to deal with the tail is to determine the location of the - trailing NUL, then (re)copy the 16 bytes leading up to that. */ - cmp has_nul1, #0 -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, ne - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul1, tmp1, tmp2 -#else - csel has_nul1, has_nul1, has_nul2, ne -#endif - rev has_nul1, has_nul1 - clz pos, has_nul1 - add tmp1, pos, #72 - add pos, pos, #8 - csel pos, pos, tmp1, ne - add src, src, pos, lsr #3 - add dst, dst, pos, lsr #3 - ldp data1, data2, [src, #-32] - stp data1, data2, [dst, #-16] -#ifdef BUILD_STPCPY - sub dstin, dst, #1 -#endif +L(less4): + cbz len, L(zerobyte) + ldrh dataw1, [srcin] + strh dataw1, [dstin] +L(zerobyte): + strb wzr, [dstin, len] + IFSTPCPY (add result, dstin, len) ret -.Lpage_cross: - bic src, srcin, #15 - /* Start by loading two words at [srcin & ~15], then forcing the - bytes that precede srcin to 0xff. This means they never look - like termination bytes. 
*/ - ldp data1, data2, [src] - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - tst to_align, #7 - csetm tmp2, ne -#ifdef __AARCH64EB__ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ + .p2align 4 +L(start_loop): + sub tmp, srcin, dstin + ldr dataq2, [srcin] + sub dst, src, tmp + str dataq2, [dstin] +L(loop): + str dataq, [dst], 32 + ldr dataq, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loopend) + str dataq, [dst, -16] + ldr dataq, [src, 32]! + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + add dst, dst, 16 +L(loopend): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + fmov synd, dend + sub dst, dst, 31 +#ifndef __AARCH64EB__ + rbit synd, synd #endif - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - cmp to_align, #8 - csinv data1, data1, xzr, lt - csel data2, data2, data2a, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lpage_cross_ok - /* We now need to make data1 and data2 look like they've been - loaded directly from srcin. Do a rotate on the 128-bit value. */ - lsl tmp1, to_align, #3 /* Bytes->bits. */ - neg tmp2, to_align, lsl #3 -#ifdef __AARCH64EB__ - lsl data1a, data1, tmp1 - lsr tmp4, data2, tmp2 - lsl data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - rev tmp2, data1 - rev tmp4, data2 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - lsr data1a, data1, tmp1 - lsl tmp4, data2, tmp2 - lsr data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f -#endif - bic has_nul1, tmp1, tmp2 - cbnz has_nul1, .Lfp_le8 - bic has_nul2, tmp3, tmp4 - b .Lfp_gt8 + clz len, synd + lsr len, len, 2 + add dst, dst, len + ldr dataq, [dst, tmp] + str dataq, [dst] + IFSTPCPY (add result, dst, 15) + ret - .size STRCPY, . - STRCPY +END (STRCPY) #endif diff --git a/newlib/libc/machine/aarch64/strlen.S b/newlib/libc/machine/aarch64/strlen.S index 872d136ef4..68a6f357cf 100644 --- a/newlib/libc/machine/aarch64/strlen.S +++ b/newlib/libc/machine/aarch64/strlen.S @@ -1,115 +1,92 @@ -/* Copyright (c) 2013-2015, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - +/* + * strlen - calculate the length of a string. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strlen-stub.c */ #else /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * Not MTE compatible. */ -/* To test the page crossing code path more thoroughly, compile with - -DTEST_PAGE_CROSS - this will force all calls through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ -#define srcin x0 -#define len x0 - -/* Locals and temporaries. */ -#define src x1 -#define data1 x2 -#define data2 x3 -#define has_nul1 x4 -#define has_nul2 x5 -#define tmp1 x4 -#define tmp2 x5 -#define tmp3 x6 -#define tmp4 x7 -#define zeroones x8 - -#define L(l) .L ## l - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. A faster check - (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives - false hits for characters 129..255. */ +#include "asmdefs.h" + +#define srcin x0 +#define len x0 + +#define src x1 +#define data1 x2 +#define data2 x3 +#define has_nul1 x4 +#define has_nul2 x5 +#define tmp1 x4 +#define tmp2 x5 +#define tmp3 x6 +#define tmp4 x7 +#define zeroones x8 + +#define maskv v0 +#define maskd d0 +#define dataq1 q1 +#define dataq2 q2 +#define datav1 v1 +#define datav2 v2 +#define tmp x2 +#define tmpw w2 +#define synd x3 +#define syndw w3 +#define shift x4 + +/* For the first 32 bytes, NUL detection works on the principle that + (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a + byte is zero, and can be done in parallel across the entire word. */ #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 + +/* To test the page crossing code path more thoroughly, compile with + -DTEST_PAGE_CROSS - this will force all calls through the slower + entry path. This option is not intended for production use. */ #ifdef TEST_PAGE_CROSS -# define MIN_PAGE_SIZE 15 +# define MIN_PAGE_SIZE 32 #else # define MIN_PAGE_SIZE 4096 #endif - /* Since strings are short on average, we check the first 16 bytes - of the string for a NUL character. In order to do an unaligned ldp - safely we have to do a page cross check first. If there is a NUL - byte we calculate the length from the 2 8-byte words using - conditional select to reduce branch mispredictions (it is unlikely - strlen will be repeatedly called on strings with the same length). - - If the string is longer than 16 bytes, we align src so don't need - further page cross checks, and process 32 bytes per iteration - using the fast NUL check. 
If we encounter non-ASCII characters, - fallback to a second loop using the full NUL check. - - If the page cross check fails, we read 16 bytes from an aligned - address, remove any characters before the string, and continue - in the main loop using aligned loads. Since strings crossing a - page in the first 16 bytes are rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. - - AArch64 systems have a minimum page size of 4k. We don't bother - checking for larger page sizes - the cost of setting up the correct - page size is just not worth the extra gain from a small reduction in - the cases taking the slow path. Note that we only care about - whether the first fetch, which may be misaligned, crosses a page - boundary. */ - -def_fn strlen p2align=6 +/* Core algorithm: + + Since strings are short on average, we check the first 32 bytes of the + string for a NUL character without aligning the string. In order to use + unaligned loads safely we must do a page cross check first. + + If there is a NUL byte we calculate the length from the 2 8-byte words + using conditional select to reduce branch mispredictions (it is unlikely + strlen will be repeatedly called on strings with the same length). + + If the string is longer than 32 bytes, align src so we don't need further + page cross checks, and process 32 bytes per iteration using a fast SIMD + loop. + + If the page cross check fails, we read 32 bytes from an aligned address, + and ignore any characters before the string. If it contains a NUL + character, return the length, if not, continue in the main loop. */ + +ENTRY (strlen) + PTR_ARG (0) and tmp1, srcin, MIN_PAGE_SIZE - 1 - mov zeroones, REP8_01 - cmp tmp1, MIN_PAGE_SIZE - 16 - b.gt L(page_cross) + cmp tmp1, MIN_PAGE_SIZE - 32 + b.hi L(page_cross) + + /* Look for a NUL byte in the first 16 bytes. */ ldp data1, data2, [srcin] + mov zeroones, REP8_01 + #ifdef __AARCH64EB__ /* For big-endian, carry propagation (if the final byte in the string is 0x01) means we cannot use has_nul1/2 directly. @@ -125,114 +102,96 @@ def_fn strlen p2align=6 bics has_nul1, tmp1, tmp2 bic has_nul2, tmp3, tmp4 ccmp has_nul2, 0, 0, eq - beq L(main_loop_entry) + b.eq L(bytes16_31) - /* Enter with C = has_nul1 == 0. */ + /* Find the exact offset of the first NUL byte in the first 16 bytes + from the string start. Enter with C = has_nul1 == 0. */ csel has_nul1, has_nul1, has_nul2, cc mov len, 8 rev has_nul1, has_nul1 - clz tmp1, has_nul1 csel len, xzr, len, cc + clz tmp1, has_nul1 add len, len, tmp1, lsr 3 ret - /* The inner loop processes 32 bytes per iteration and uses the fast - NUL check. If we encounter non-ASCII characters, use a second - loop with the accurate NUL check. */ - .p2align 4 -L(main_loop_entry): - bic src, srcin, 15 - sub src, src, 16 -L(main_loop): - ldp data1, data2, [src, 32]! -.Lpage_cross_entry: - sub tmp1, data1, zeroones - sub tmp3, data2, zeroones - orr tmp2, tmp1, tmp3 - tst tmp2, zeroones, lsl 7 - bne 1f - ldp data1, data2, [src, 16] + /* Look for a NUL byte at offset 16..31 in the string. */ +L(bytes16_31): + ldp data1, data2, [srcin, 16] +#ifdef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif sub tmp1, data1, zeroones - sub tmp3, data2, zeroones - orr tmp2, tmp1, tmp3 - tst tmp2, zeroones, lsl 7 - beq L(main_loop) - add src, src, 16 -1: - /* The fast check failed, so do the slower, accurate NUL check. 
*/ orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones orr tmp4, data2, REP8_7f bics has_nul1, tmp1, tmp2 bic has_nul2, tmp3, tmp4 ccmp has_nul2, 0, 0, eq - beq L(nonascii_loop) + b.eq L(loop_entry) - /* Enter with C = has_nul1 == 0. */ -L(tail): -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul1/2 directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, cc - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - bic has_nul1, tmp1, tmp2 -#else + /* Find the exact offset of the first NUL byte at offset 16..31 from + the string start. Enter with C = has_nul1 == 0. */ csel has_nul1, has_nul1, has_nul2, cc -#endif - sub len, src, srcin + mov len, 24 rev has_nul1, has_nul1 - add tmp2, len, 8 + mov tmp3, 16 clz tmp1, has_nul1 - csel len, len, tmp2, cc + csel len, tmp3, len, cc add len, len, tmp1, lsr 3 ret -L(nonascii_loop): - ldp data1, data2, [src, 16]! - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - bne L(tail) - ldp data1, data2, [src, 16]! - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - beq L(nonascii_loop) - b L(tail) + nop +L(loop_entry): + bic src, srcin, 31 + + .p2align 5 +L(loop): + ldp dataq1, dataq2, [src, 32]! + uminp maskv.16b, datav1.16b, datav2.16b + uminp maskv.16b, maskv.16b, maskv.16b + cmeq maskv.8b, maskv.8b, 0 + fmov synd, maskd + cbz synd, L(loop) + + /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ + cmeq maskv.16b, datav1.16b, 0 + sub len, src, srcin + cbnz syndw, 1f + cmeq maskv.16b, datav2.16b, 0 + add len, len, 16 +1: + /* Generate a bitmask and compute correct byte offset. */ + shrn maskv.8b, maskv.8h, 4 + fmov synd, maskd +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz tmp, synd + add len, len, tmp, lsr 2 + ret - /* Load 16 bytes from [srcin & ~15] and force the bytes that precede - srcin to 0x7f, so we ignore any NUL bytes before the string. - Then continue in the aligned loop. */ L(page_cross): - bic src, srcin, 15 - ldp data1, data2, [src] - lsl tmp1, srcin, 3 - mov tmp4, -1 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr tmp1, tmp1, REP8_80 - orn data1, data1, tmp1 - orn tmp2, data2, tmp1 - tst srcin, 8 - csel data1, data1, tmp4, eq - csel data2, data2, tmp2, eq - b L(page_cross_entry) - - .size strlen, . 
- strlen + bic src, srcin, 31 + mov tmpw, 0x0c03 + movk tmpw, 0xc030, lsl 16 + ld1 {datav1.16b, datav2.16b}, [src] + dup maskv.4s, tmpw + cmeq datav1.16b, datav1.16b, 0 + cmeq datav2.16b, datav2.16b, 0 + and datav1.16b, datav1.16b, maskv.16b + and datav2.16b, datav2.16b, maskv.16b + addp maskv.16b, datav1.16b, datav2.16b + addp maskv.16b, maskv.16b, maskv.16b + fmov synd, maskd + lsl shift, srcin, 1 + lsr synd, synd, shift + cbz synd, L(loop) + + rbit synd, synd + clz len, synd + lsr len, len, 1 + ret + +END (strlen) #endif diff --git a/newlib/libc/machine/aarch64/strncmp.S b/newlib/libc/machine/aarch64/strncmp.S index ffdabc2607..373695503d 100644 --- a/newlib/libc/machine/aarch64/strncmp.S +++ b/newlib/libc/machine/aarch64/strncmp.S @@ -1,49 +1,23 @@ -/* Copyright (c) 2013, 2018, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - +/* + * strncmp - compare two strings + * + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strcmp-stub.c */ #else /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 /* Parameters and result. */ #define src1 x0 @@ -64,86 +38,91 @@ #define tmp3 x10 #define zeroones x11 #define pos x12 -#define limit_wd x13 -#define mask x14 -#define endloop x15 +#define mask x13 +#define endloop x14 #define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. + */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif - .text - .p2align 6 - .rep 7 - nop /* Pad so that the loop below fits a cache line. 
*/ - .endr -def_fn strncmp - cbz limit, .Lret0 +ENTRY (strncmp) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 and count, src1, #7 - b.ne .Lmisaligned8 - cbnz count, .Lmutual_align - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ + b.ne L(misaligned8) + cbnz count, L(mutual_align) /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ - /* Start of performance-critical section -- one 64B cache line. */ -.Lloop_aligned: + .p2align 4 +L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned: - subs limit_wd, limit_wd, #1 +L(start_realigned): + subs limit, limit, #8 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, pl /* Last Dword or differences. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq - b.eq .Lloop_aligned - /* End of performance-critical section -- one 64B cache line. */ + b.eq L(loop_aligned) + /* End of main loop */ - /* Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, .Lnot_limit - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq .Lnot_limit - - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -.Lnot_limit: +L(full_check): +#ifndef __AARCH64EB__ orr syndrome, diff, has_nul - -#ifndef __AARCH64EB__ + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ clz pos, syndrome rev data2, data2 lsl data1, data1, pos + cmp limit, pos, lsr #3 lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ lsr data1, data1, #56 sub result, data1, data2, lsr #56 + csel result, result, xzr, hi ret #else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): /* For big-endian we cannot use the trick with the syndrome value as carry-propagation can corrupt the upper bits if the trailing bytes in the string contain 0x01. */ @@ -164,10 +143,11 @@ def_fn strncmp rev has_nul, has_nul orr syndrome, diff, has_nul clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. 
+ /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ +L(end_quick): lsl data1, data1, pos lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then @@ -177,7 +157,7 @@ def_fn strncmp ret #endif -.Lmutual_align: +L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off the bytes that precede the start point. @@ -189,102 +169,143 @@ def_fn strncmp neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#endif - and tmp3, limit_wd, #7 - lsr limit_wd, limit_wd, #3 - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, count - add tmp3, tmp3, count + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo orr data1, data1, tmp2 orr data2, data2, tmp2 - add limit_wd, limit_wd, tmp3, lsr #3 - b .Lstart_realigned + b L(start_realigned) - .p2align 6 + .p2align 4 /* Don't bother with dwords for up to 16 bytes. */ -.Lmisaligned8: +L(misaligned8): cmp limit, #16 - b.hs .Ltry_misaligned_words + b.hs L(try_misaligned_words) -.Lbyte_loop: +L(byte_loop): /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 subs limit, limit, #1 ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Lbyte_loop -.Ldone: + b.eq L(byte_loop) +L(done): sub result, data1, data2 ret /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ -.Ltry_misaligned_words: - lsr limit_wd, limit, #3 - cbz count, .Ldo_misaligned +L(try_misaligned_words): + cbz count, L(src1_aligned) neg count, count and count, count, #7 sub limit, limit, count - lsr limit_wd, limit, #3 -.Lpage_end_loop: +L(page_end_loop): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 cmp data1w, #1 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.ne .Ldone + b.ne L(done) subs count, count, #1 - b.hi .Lpage_end_loop + b.hi L(page_end_loop) + + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . -.Ldo_misaligned: - /* Prepare ourselves for the next page crossing. Unlike the aligned - loop, we fetch 1 less dword because we risk crossing bounds on - SRC2. */ - mov count, #8 - subs limit_wd, limit_wd, #1 - b.lo .Ldone_loop -.Lloop_misaligned: - and tmp2, src2, #0xff8 - eor tmp2, tmp2, #0xff8 - cbz tmp2, .Lpage_end_loop + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. 
This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset ldr data1, [src1], #8 - ldr data2, [src2], #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne .Lnot_limit - subs limit_wd, limit_wd, #1 - b.pl .Lloop_misaligned + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) -.Ldone_loop: - /* We found a difference or a NULL before the limit was reached. */ - and limit, limit, #7 - cbz limit, .Lnot_limit - /* Read the last word. */ - sub src1, src1, 8 - sub src2, src2, 8 - ldr data1, [src1, limit] - ldr data2, [src2, limit] - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne .Lnot_limit + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif -.Lret0: +L(ret0): mov result, #0 ret - .size strncmp, . 
- strncmp +END(strncmp) #endif diff --git a/newlib/libc/machine/aarch64/strnlen.S b/newlib/libc/machine/aarch64/strnlen.S index c255c3f7c6..091002e0b0 100644 --- a/newlib/libc/machine/aarch64/strnlen.S +++ b/newlib/libc/machine/aarch64/strnlen.S @@ -1,187 +1,105 @@ -/* strnlen - calculate the length of a string with limit. - - Copyright (c) 2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - +/* + * strnlen - calculate the length of a string with limit. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strlen-stub.c */ #else /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ -/* Arguments and results. */ +#include "asmdefs.h" + #define srcin x0 -#define len x0 -#define limit x1 +#define cntin x1 +#define result x0 -/* Locals and temporaries. */ #define src x2 -#define data1 x3 -#define data2 x4 -#define data2a x5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define pos x13 -#define limit_wd x14 - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - .text - .p2align 6 -.Lstart: - /* Pre-pad to ensure critical loop begins an icache line. */ - .rep 7 - nop - .endr - /* Put this code here to avoid wasting more space with pre-padding. */ -.Lhit_limit: - mov len, limit +#define synd x3 +#define shift x4 +#define tmp x4 +#define cntrem x5 + +#define qdata q0 +#define vdata v0 +#define vhas_chr v1 +#define vend v2 +#define dend d2 + +/* + Core algorithm: + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. 
*/ + +ENTRY (strnlen) + PTR_ARG (0) + SIZE_ARG (1) + bic src, srcin, 15 + cbz cntin, L(nomatch) + ld1 {vdata.16b}, [src] + cmeq vhas_chr.16b, vdata.16b, 0 + lsl shift, srcin, 2 + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift + cbz synd, L(start_loop) +L(finish): + rbit synd, synd + clz synd, synd + lsr result, synd, 2 + cmp cntin, result + csel result, cntin, result, ls ret -def_fn strnlen - cbz limit, .Lhit_limit - mov zeroones, #REP8_01 - bic src, srcin, #15 - ands tmp1, srcin, #15 - b.ne .Lmisaligned - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - /* The inner loop deals with two Dwords at a time. This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ - - /* Start of critial section -- keep to one 64Byte cache line. */ -.Lloop: - ldp data1, data2, [src], #16 -.Lrealigned: - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - subs limit_wd, limit_wd, #1 - orr tmp1, has_nul1, has_nul2 - ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ - b.eq .Lloop - /* End of critical section -- keep to one 64Byte cache line. */ - - orr tmp1, has_nul1, has_nul2 - cbz tmp1, .Lhit_limit /* No null in final Qword. */ - - /* We know there's a null in the final Qword. The easiest thing - to do now is work out the length of the string and return - MIN (len, limit). */ - - sub len, src, srcin - cbz has_nul1, .Lnul_in_data2 -#ifdef __AARCH64EB__ - mov data2, data1 -#endif - sub len, len, #8 - mov has_nul2, has_nul1 -.Lnul_in_data2: -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - rev data2, data2 - sub tmp1, data2, zeroones - orr tmp2, data2, #REP8_7f - bic has_nul2, tmp1, tmp2 -#endif - sub len, len, #8 - rev has_nul2, has_nul2 - clz pos, has_nul2 - add len, len, pos, lsr #3 /* Bits to bytes. */ - cmp len, limit - csel len, len, limit, ls /* Return the lower value. */ +L(nomatch): + mov result, cntin ret -.Lmisaligned: - /* Deal with a partial first word. - We're doing two things in parallel here; - 1) Calculate the number of words (but avoiding overflow if - limit is near ULONG_MAX) - to do this we need to work out - limit + tmp1 - 1 as a 65-bit value before shifting it; - 2) Load and mask the initial data words - we force the bytes - before the ones we are interested in to 0xff - this ensures - early bytes will not hit any zero detection. */ - sub limit_wd, limit, #1 - neg tmp4, tmp1 - cmp tmp1, #8 - - and tmp3, limit_wd, #15 - lsr limit_wd, limit_wd, #4 - mov tmp2, #~0 - - ldp data1, data2, [src], #16 - lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ - add tmp3, tmp3, tmp1 - -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). 
*/ +L(start_loop): + sub tmp, src, srcin + add tmp, tmp, 17 + subs cntrem, cntin, tmp + b.lo L(nomatch) + + /* Make sure that it won't overread by a 16-byte chunk */ + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 + .p2align 5 +L(loop32): + ldr qdata, [src, 32]! + cmeq vhas_chr.16b, vdata.16b, 0 + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbnz synd, L(end) +L(loop32_2): + ldr qdata, [src, 16] + subs cntrem, cntrem, 32 + cmeq vhas_chr.16b, vdata.16b, 0 + b.lo L(end_2) + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbz synd, L(loop32) +L(end_2): + add src, src, 16 +L(end): + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + sub result, src, srcin + fmov synd, dend +#ifndef __AARCH64EB__ + rbit synd, synd #endif - add limit_wd, limit_wd, tmp3, lsr #4 - - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - - csinv data1, data1, xzr, le - csel data2, data2, data2a, le - b .Lrealigned - .size strnlen, . - .Lstart /* Include pre-padding in size. */ + clz synd, synd + add result, result, synd, lsr 2 + cmp cntin, result + csel result, cntin, result, ls + ret +END (strnlen) #endif diff --git a/newlib/libc/machine/aarch64/strrchr.S b/newlib/libc/machine/aarch64/strrchr.S index d64fc09b1a..b0574228b6 100644 --- a/newlib/libc/machine/aarch64/strrchr.S +++ b/newlib/libc/machine/aarch64/strrchr.S @@ -1,32 +1,9 @@ /* - strrchr - find last instance of a character in a string - - Copyright (c) 2014, ARM Limited - All rights Reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - + * strrchr - find last position of a character in a string. + * + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strchr-stub.c */ #else @@ -37,6 +14,8 @@ * Neon Available. */ +#include "asmdefs.h" + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -78,17 +57,8 @@ in the original string a count_trailing_zeros() operation will identify exactly which byte is causing the termination, and why. */ -/* Locals and temporaries. 
*/ - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn strrchr +ENTRY (strrchr) + PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the requested byte. Magic constant 0x80200802 used similarly for NUL termination. */ @@ -100,7 +70,7 @@ def_fn strrchr mov src_offset, #0 ands tmp1, srcin, #31 add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ - b.eq .Laligned + b.eq L(aligned) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -118,45 +88,45 @@ def_fn strrchr and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 - addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64 - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 - mov nul_match, vhas_nul1.2d[0] + addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] lsl tmp1, tmp1, #1 mov const_m1, #~0 - mov chr_match, vhas_chr1.2d[0] lsr tmp3, const_m1, tmp1 + mov chr_match, vend1.d[1] bic nul_match, nul_match, tmp3 // Mask padding bits. bic chr_match, chr_match, tmp3 // Mask padding bits. - cbnz nul_match, .Ltail + cbnz nul_match, L(tail) -.Lloop: + .p2align 4 +L(loop): cmp chr_match, #0 csel src_match, src, src_match, ne csel src_offset, chr_match, src_offset, ne -.Laligned: +L(aligned): ld1 {vdata1.16b, vdata2.16b}, [src], #32 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 + uminp vend1.16b, vdata1.16b, vdata2.16b and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + cmeq vend1.16b, vend1.16b, 0 addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 - addp vend1.16b, vend1.16b, vend1.16b // 128->64 - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 - mov nul_match, vend1.2d[0] - mov chr_match, vhas_chr1.2d[0] - cbz nul_match, .Lloop + addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] + mov chr_match, vend1.d[1] + cbz nul_match, L(loop) + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_nul2.16b, vdata2.16b, #0 and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b - mov nul_match, vhas_nul1.2d[0] + mov nul_match, vhas_nul1.d[0] -.Ltail: +L(tail): /* Work out exactly where the string ends. */ sub tmp4, nul_match, #1 eor tmp4, tmp4, nul_match @@ -178,5 +148,5 @@ def_fn strrchr ret - .size strrchr, . 
- strrchr +END (strrchr) #endif From patchwork Tue Sep 12 10:05:07 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Sebastian Huber X-Patchwork-Id: 75720 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 932A03857718 for ; Tue, 12 Sep 2023 10:05:29 +0000 (GMT) X-Original-To: newlib@sourceware.org Delivered-To: newlib@sourceware.org Received: from dedi548.your-server.de (dedi548.your-server.de [85.10.215.148]) by sourceware.org (Postfix) with ESMTPS id DBECC385842C for ; Tue, 12 Sep 2023 10:05:13 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org DBECC385842C Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=embedded-brains.de Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=embedded-brains.de Received: from sslproxy05.your-server.de ([78.46.172.2]) by dedi548.your-server.de with esmtpsa (TLS1.3) tls TLS_AES_256_GCM_SHA384 (Exim 4.94.2) (envelope-from ) id 1qg0Ge-000Edz-Nn; Tue, 12 Sep 2023 12:05:12 +0200 Received: from [82.100.198.138] (helo=mail.embedded-brains.de) by sslproxy05.your-server.de with esmtpsa (TLSv1.3:TLS_AES_256_GCM_SHA384:256) (Exim 4.92) (envelope-from ) id 1qg0Ge-0005uU-Je; Tue, 12 Sep 2023 12:05:12 +0200 Received: from localhost (localhost [127.0.0.1]) by mail.embedded-brains.de (Postfix) with ESMTP id 41CD34801B7; Tue, 12 Sep 2023 12:05:12 +0200 (CEST) Received: from mail.embedded-brains.de ([127.0.0.1]) by localhost (zimbra.eb.localhost [127.0.0.1]) (amavis, port 10032) with ESMTP id 7c26RXUsoAfA; Tue, 12 Sep 2023 12:05:11 +0200 (CEST) Received: from localhost (localhost [127.0.0.1]) by mail.embedded-brains.de (Postfix) with ESMTP id 918134801E4; Tue, 12 Sep 2023 12:05:11 +0200 (CEST) X-Virus-Scanned: amavis at zimbra.eb.localhost Received: from mail.embedded-brains.de ([127.0.0.1]) by localhost (zimbra.eb.localhost [127.0.0.1]) (amavis, port 10026) with ESMTP id KF1BXeYbeWVc; Tue, 12 Sep 2023 12:05:11 +0200 (CEST) Received: from zimbra.eb.localhost (unknown [192.168.96.242]) by mail.embedded-brains.de (Postfix) with ESMTPSA id 56FAF4801B7; Tue, 12 Sep 2023 12:05:11 +0200 (CEST) From: Sebastian Huber To: newlib@sourceware.org Cc: Szabolcs Nagy Subject: [PATCH v3 2/2] aarch64: Import memrchr.S Date: Tue, 12 Sep 2023 12:05:07 +0200 Message-Id: <20230912100507.33946-3-sebastian.huber@embedded-brains.de> X-Mailer: git-send-email 2.35.3 In-Reply-To: <20230912100507.33946-1-sebastian.huber@embedded-brains.de> References: <20230912100507.33946-1-sebastian.huber@embedded-brains.de> MIME-Version: 1.0 X-Authenticated-Sender: smtp-embedded@poldi-networks.de X-Virus-Scanned: Clear (ClamAV 0.103.8/27029/Tue Sep 12 09:38:51 2023) X-Spam-Status: No, score=-11.4 required=5.0 tests=BAYES_00, GIT_PATCH_0, KAM_DMARC_STATUS, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: newlib@sourceware.org X-Mailman-Version: 2.1.30 Precedence: list List-Id: Newlib mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: newlib-bounces+patchwork=sourceware.org@sourceware.org Sender: "Newlib" Import memrchr.S for AArch64 from: https://github.com/ARM-software/optimized-routines commit 0cf84f26b6b8dcad8287fe30a4dcc1fdabd06560 Author: Sebastian Huber Date: Thu Jul 27 
17:14:57 2023 +0200 string: Fix corrupt GNU_PROPERTY_TYPE (5) size For ELF32 the notes alignment is 4 and not 8. --- newlib/Makefile.in | 40 +++++++ newlib/libc/machine/aarch64/Makefile.inc | 2 + newlib/libc/machine/aarch64/memrchr-stub.c | 11 ++ newlib/libc/machine/aarch64/memrchr.S | 115 +++++++++++++++++++++ 4 files changed, 168 insertions(+) create mode 100644 newlib/libc/machine/aarch64/memrchr-stub.c create mode 100644 newlib/libc/machine/aarch64/memrchr.S diff --git a/newlib/Makefile.in b/newlib/Makefile.in index c3052acb93..028dd99f60 100644 --- a/newlib/Makefile.in +++ b/newlib/Makefile.in @@ -596,6 +596,8 @@ check_PROGRAMS = @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memcpy.S \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memmove-stub.c \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memmove.S \ +@HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memrchr-stub.c \ +@HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memrchr.S \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memset-stub.c \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memset.S \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/rawmemchr.S \ @@ -1847,6 +1849,8 @@ am__objects_51 = libc/ssp/libc_a-chk_fail.$(OBJEXT) \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memcpy.$(OBJEXT) \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memmove-stub.$(OBJEXT) \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memmove.$(OBJEXT) \ +@HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memrchr-stub.$(OBJEXT) \ +@HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memrchr.$(OBJEXT) \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memset-stub.$(OBJEXT) \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memset.$(OBJEXT) \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-rawmemchr.$(OBJEXT) \ @@ -8024,6 +8028,12 @@ libc/machine/aarch64/libc_a-memmove-stub.$(OBJEXT): \ libc/machine/aarch64/libc_a-memmove.$(OBJEXT): \ libc/machine/aarch64/$(am__dirstamp) \ libc/machine/aarch64/$(DEPDIR)/$(am__dirstamp) +libc/machine/aarch64/libc_a-memrchr-stub.$(OBJEXT): \ + libc/machine/aarch64/$(am__dirstamp) \ + libc/machine/aarch64/$(DEPDIR)/$(am__dirstamp) +libc/machine/aarch64/libc_a-memrchr.$(OBJEXT): \ + libc/machine/aarch64/$(am__dirstamp) \ + libc/machine/aarch64/$(DEPDIR)/$(am__dirstamp) libc/machine/aarch64/libc_a-memset-stub.$(OBJEXT): \ libc/machine/aarch64/$(am__dirstamp) \ libc/machine/aarch64/$(DEPDIR)/$(am__dirstamp) @@ -12730,6 +12740,8 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memcpy.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memmove-stub.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memmove.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memset-stub.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memset.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-rawmemchr-stub.Po@am__quote@ @@ -16711,6 +16723,20 @@ libc/machine/aarch64/libc_a-memmove.obj: libc/machine/aarch64/memmove.S @AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) 
$(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -c -o libc/machine/aarch64/libc_a-memmove.obj `if test -f 'libc/machine/aarch64/memmove.S'; then $(CYGPATH_W) 'libc/machine/aarch64/memmove.S'; else $(CYGPATH_W) '$(srcdir)/libc/machine/aarch64/memmove.S'; fi` +libc/machine/aarch64/libc_a-memrchr.o: libc/machine/aarch64/memrchr.S +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -MT libc/machine/aarch64/libc_a-memrchr.o -MD -MP -MF libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Tpo -c -o libc/machine/aarch64/libc_a-memrchr.o `test -f 'libc/machine/aarch64/memrchr.S' || echo '$(srcdir)/'`libc/machine/aarch64/memrchr.S +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Tpo libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Po +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='libc/machine/aarch64/memrchr.S' object='libc/machine/aarch64/libc_a-memrchr.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -c -o libc/machine/aarch64/libc_a-memrchr.o `test -f 'libc/machine/aarch64/memrchr.S' || echo '$(srcdir)/'`libc/machine/aarch64/memrchr.S + +libc/machine/aarch64/libc_a-memrchr.obj: libc/machine/aarch64/memrchr.S +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -MT libc/machine/aarch64/libc_a-memrchr.obj -MD -MP -MF libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Tpo -c -o libc/machine/aarch64/libc_a-memrchr.obj `if test -f 'libc/machine/aarch64/memrchr.S'; then $(CYGPATH_W) 'libc/machine/aarch64/memrchr.S'; else $(CYGPATH_W) '$(srcdir)/libc/machine/aarch64/memrchr.S'; fi` +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Tpo libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Po +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='libc/machine/aarch64/memrchr.S' object='libc/machine/aarch64/libc_a-memrchr.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -c -o libc/machine/aarch64/libc_a-memrchr.obj `if test -f 'libc/machine/aarch64/memrchr.S'; then $(CYGPATH_W) 'libc/machine/aarch64/memrchr.S'; else $(CYGPATH_W) '$(srcdir)/libc/machine/aarch64/memrchr.S'; fi` + libc/machine/aarch64/libc_a-memset.o: libc/machine/aarch64/memset.S @am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -MT libc/machine/aarch64/libc_a-memset.o -MD -MP -MF libc/machine/aarch64/$(DEPDIR)/libc_a-memset.Tpo -c -o libc/machine/aarch64/libc_a-memset.o `test -f 'libc/machine/aarch64/memset.S' || echo '$(srcdir)/'`libc/machine/aarch64/memset.S @am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) libc/machine/aarch64/$(DEPDIR)/libc_a-memset.Tpo libc/machine/aarch64/$(DEPDIR)/libc_a-memset.Po @@ -32981,6 +33007,20 @@ 
libc/machine/aarch64/libc_a-memmove-stub.obj: libc/machine/aarch64/memmove-stub. @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -c -o libc/machine/aarch64/libc_a-memmove-stub.obj `if test -f 'libc/machine/aarch64/memmove-stub.c'; then $(CYGPATH_W) 'libc/machine/aarch64/memmove-stub.c'; else $(CYGPATH_W) '$(srcdir)/libc/machine/aarch64/memmove-stub.c'; fi` +libc/machine/aarch64/libc_a-memrchr-stub.o: libc/machine/aarch64/memrchr-stub.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -MT libc/machine/aarch64/libc_a-memrchr-stub.o -MD -MP -MF libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Tpo -c -o libc/machine/aarch64/libc_a-memrchr-stub.o `test -f 'libc/machine/aarch64/memrchr-stub.c' || echo '$(srcdir)/'`libc/machine/aarch64/memrchr-stub.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Tpo libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='libc/machine/aarch64/memrchr-stub.c' object='libc/machine/aarch64/libc_a-memrchr-stub.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -c -o libc/machine/aarch64/libc_a-memrchr-stub.o `test -f 'libc/machine/aarch64/memrchr-stub.c' || echo '$(srcdir)/'`libc/machine/aarch64/memrchr-stub.c + +libc/machine/aarch64/libc_a-memrchr-stub.obj: libc/machine/aarch64/memrchr-stub.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -MT libc/machine/aarch64/libc_a-memrchr-stub.obj -MD -MP -MF libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Tpo -c -o libc/machine/aarch64/libc_a-memrchr-stub.obj `if test -f 'libc/machine/aarch64/memrchr-stub.c'; then $(CYGPATH_W) 'libc/machine/aarch64/memrchr-stub.c'; else $(CYGPATH_W) '$(srcdir)/libc/machine/aarch64/memrchr-stub.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Tpo libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='libc/machine/aarch64/memrchr-stub.c' object='libc/machine/aarch64/libc_a-memrchr-stub.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -c -o libc/machine/aarch64/libc_a-memrchr-stub.obj `if test -f 'libc/machine/aarch64/memrchr-stub.c'; then $(CYGPATH_W) 'libc/machine/aarch64/memrchr-stub.c'; else $(CYGPATH_W) '$(srcdir)/libc/machine/aarch64/memrchr-stub.c'; fi` + libc/machine/aarch64/libc_a-memset-stub.o: libc/machine/aarch64/memset-stub.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -MT libc/machine/aarch64/libc_a-memset-stub.o -MD -MP -MF libc/machine/aarch64/$(DEPDIR)/libc_a-memset-stub.Tpo -c -o libc/machine/aarch64/libc_a-memset-stub.o `test -f 'libc/machine/aarch64/memset-stub.c' || echo 
'$(srcdir)/'`libc/machine/aarch64/memset-stub.c
@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) libc/machine/aarch64/$(DEPDIR)/libc_a-memset-stub.Tpo libc/machine/aarch64/$(DEPDIR)/libc_a-memset-stub.Po
diff --git a/newlib/libc/machine/aarch64/Makefile.inc b/newlib/libc/machine/aarch64/Makefile.inc
index 063a2a84ae..c749b0d575 100644
--- a/newlib/libc/machine/aarch64/Makefile.inc
+++ b/newlib/libc/machine/aarch64/Makefile.inc
@@ -7,6 +7,8 @@ libc_a_SOURCES += \
 	%D%/memcpy.S \
 	%D%/memmove-stub.c \
 	%D%/memmove.S \
+	%D%/memrchr-stub.c \
+	%D%/memrchr.S \
 	%D%/memset-stub.c \
 	%D%/memset.S \
 	%D%/rawmemchr.S \
diff --git a/newlib/libc/machine/aarch64/memrchr-stub.c b/newlib/libc/machine/aarch64/memrchr-stub.c
new file mode 100644
index 0000000000..48f13bedc8
--- /dev/null
+++ b/newlib/libc/machine/aarch64/memrchr-stub.c
@@ -0,0 +1,11 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (C) 2023 embedded brains GmbH & Co. KG
+ */
+
+#if defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)
+#include "../../string/memrchr.c"
+#else
+/* See memrchr.S */
+#endif
diff --git a/newlib/libc/machine/aarch64/memrchr.S b/newlib/libc/machine/aarch64/memrchr.S
new file mode 100644
index 0000000000..ba9915cc3d
--- /dev/null
+++ b/newlib/libc/machine/aarch64/memrchr.S
@@ -0,0 +1,115 @@
+/*
+ * memrchr - find last character in a memory zone.
+ *
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#if defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)
+/* See memrchr-stub.c */
+#else
+#include "asmdefs.h"
+
+#define srcin		x0
+#define chrin		w1
+#define cntin		x2
+#define result		x0
+
+#define src		x3
+#define cntrem		x4
+#define synd		x5
+#define shift		x6
+#define tmp		x7
+#define end		x8
+#define endm1		x9
+
+#define vrepchr		v0
+#define qdata		q1
+#define vdata		v1
+#define vhas_chr	v2
+#define vend		v3
+#define dend		d3
+
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
+
+ENTRY (memrchr)
+	PTR_ARG (0)
+	add	end, srcin, cntin
+	sub	endm1, end, 1
+	bic	src, endm1, 15
+	cbz	cntin, L(nomatch)
+	ld1	{vdata.16b}, [src]
+	dup	vrepchr.16b, chrin
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	neg	shift, end, lsl 2
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
+	fmov	synd, dend
+	lsl	synd, synd, shift
+	cbz	synd, L(start_loop)
+
+	clz	synd, synd
+	sub	result, endm1, synd, lsr 2
+	cmp	cntin, synd, lsr 2
+	csel	result, result, xzr, hi
+	ret
+
+	nop
+L(start_loop):
+	subs	cntrem, src, srcin
+	b.ls	L(nomatch)
+
+	/* Make sure that it won't overread by a 16-byte chunk */
+	sub	cntrem, cntrem, 1
+	tbz	cntrem, 4, L(loop32_2)
+	add	src, src, 16
+
+	.p2align 5
+L(loop32):
+	ldr	qdata, [src, -32]!
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	fmov	synd, dend
+	cbnz	synd, L(end)
+
+L(loop32_2):
+	ldr	qdata, [src, -16]
+	subs	cntrem, cntrem, 32
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	b.lo	L(end_2)
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	fmov	synd, dend
+	cbz	synd, L(loop32)
+L(end_2):
+	sub	src, src, 16
+L(end):
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
+	fmov	synd, dend
+
+	add	tmp, src, 15
+#ifdef __AARCH64EB__
+	rbit	synd, synd
+#endif
+	clz	synd, synd
+	sub	tmp, tmp, synd, lsr 2
+	cmp	tmp, srcin
+	csel	result, tmp, xzr, hs
+	ret
+
+L(nomatch):
+	mov	result, 0
+	ret
+
+END (memrchr)
+#endif
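
As background for the strrchr.S rework in patch 1/2: the magic constant
0x40100401 repeats the byte pattern 0x01,0x04,0x10,0x40, so AND-ing it with
the 0x00/0xff cmeq results and folding the 32 comparison bytes with two
rounds of addp leaves one flag per input byte, two bits apart, in a 64-bit
syndrome (0x80200802 does the same for the NUL test, one bit higher). The
scalar sketch below is illustrative only and not part of either patch; the
helper name syndrome32 is made up here, and it assumes the usual
little-endian lane view of the vectors.

#include <stdint.h>

/* Model of the 32-byte character syndrome built by strrchr.S:
   bit 2*i of the result is set iff chunk[i] == c.  */
static uint64_t
syndrome32 (const unsigned char *chunk, unsigned char c)
{
  static const unsigned char repmask[4] = { 0x01, 0x04, 0x10, 0x40 };
  uint64_t synd = 0;
  int i;

  for (i = 0; i < 32; i++)
    if (chunk[i] == c)
      /* cmeq gives 0xff for this byte; the AND with vrepmask_c keeps one
	 bit, and the two addp rounds fold four such bytes into one result
	 byte, i.e. a shift by 8 * (i / 4).  */
      synd |= (uint64_t) repmask[i % 4] << (8 * (i / 4));
  return synd;
}

The tail code then works purely on such 64-bit words: it keeps only the
syndrome bits that precede the first NUL and picks the highest remaining
character match.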
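
Likewise for the imported memrchr.S, the "Core algorithm" comment can be read
with a scalar model in mind: cmeq turns each byte of a 16-byte chunk into
0x00 or 0xff, shrn #4 keeps one nibble per byte, and a count-leading-zeros on
the resulting 64-bit mask identifies the last matching byte. The sketch below
is illustrative only and not part of the patch; the helper name
last_match_in_chunk is made up here, and GCC's __builtin_clzll stands in for
the clz instruction.

#include <stdint.h>

/* Model of one 16-byte step of memrchr.S: returns the highest index i
   with chunk[i] == c, or -1 if the chunk contains no match.  */
static int
last_match_in_chunk (const unsigned char *chunk, unsigned char c)
{
  uint64_t synd = 0;
  int i;

  for (i = 0; i < 16; i++)
    if (chunk[i] == c)
      synd |= (uint64_t) 0xf << (4 * i);	/* cmeq + shrn #4 */
  if (synd == 0)
    return -1;
  /* The highest set nibble marks the highest matching byte.  */
  return 15 - __builtin_clzll (synd) / 4;
}

The assembly gets the same effect without a byte loop; the first-chunk shift
(lsl synd, synd, shift) and the final compare against srcin discard matches
that fall outside the [srcin, srcin + cntin) window.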