From patchwork Thu Dec 25 16:08:56 2025 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Pincheng Wang X-Patchwork-Id: 127081 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from vm01.sourceware.org (localhost [127.0.0.1]) by sourceware.org (Postfix) with ESMTP id F336D4BA2E32 for ; Thu, 25 Dec 2025 16:09:42 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org F336D4BA2E32 X-Original-To: newlib@sourceware.org Delivered-To: newlib@sourceware.org Received: from cstnet.cn (smtp84.cstnet.cn [159.226.251.84]) by sourceware.org (Postfix) with ESMTPS id 525634BA2E05 for ; Thu, 25 Dec 2025 16:09:03 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 525634BA2E05 Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=isrc.iscas.ac.cn Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=isrc.iscas.ac.cn ARC-Filter: OpenARC Filter v1.0.0 sourceware.org 525634BA2E05 Authentication-Results: server2.sourceware.org; arc=none smtp.remote-ip=159.226.251.84 ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1766678944; cv=none; b=eDNZ+w7UD9WvcZHZZObbmvh1x+w2KsH5riVbmqIE6KaFvQ+4nSF0e490pSFkxd+Q5sUobHuUNQukLd4OYjus9JWv8y1uBNjtLY970Jab1H/hdNeLL8zix2vJZSSHq2lUCRf4aZdqS33NDDi2MHTlnSt2puUBSRo1QeY2j/4EbZM= ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1766678944; c=relaxed/simple; bh=gPS6pf2RXXoHM1XC08eL1qB9PvIqA1UuxrbTv1VUe6o=; h=From:To:Subject:Date:Message-Id:MIME-Version; b=WZFIK9/OfMGE+ZdEWQtjvhpFeWTbDP1fGgECmRCSRC1kNRP3aYjlmGSAEaV3ZMlYRbmtkCxBqu9QxN9igmq4QwqH4w0T0josYAd2JqUXxhmGW8DyTupk+yG9BhN+8Mp2BZ7+fiztP5cHwC9niNtfbAOVt4rO7uiREi2Aflrxr04= ARC-Authentication-Results: i=1; server2.sourceware.org DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 525634BA2E05 Received: from ROG.lan (unknown [120.227.57.105]) by APP-05 (Coremail) with SMTP id zQCowAAHnA+aYU1pzB7sAQ--.14266S3; Fri, 26 Dec 2025 00:08:58 +0800 (CST) From: Pincheng Wang To: newlib@sourceware.org Cc: pincheng.plct@isrc.iscas.ac.cn Subject: [PATCH v2 1/1] riscv: add vectorized memset, memcpy and memmove Date: Fri, 26 Dec 2025 00:08:56 +0800 Message-Id: <20251225160856.16010-2-pincheng.plct@isrc.iscas.ac.cn> X-Mailer: git-send-email 2.39.5 In-Reply-To: <20251225160856.16010-1-pincheng.plct@isrc.iscas.ac.cn> References: <20251225160856.16010-1-pincheng.plct@isrc.iscas.ac.cn> MIME-Version: 1.0 X-CM-TRANSID: zQCowAAHnA+aYU1pzB7sAQ--.14266S3 X-Coremail-Antispam: 1UD129KBjvJXoW3Xw43AF18Jr17XrW3Xw4DXFb_yoWxKryxpF 4UGFZFyw1ftrn3XrZ2q3WFvwsxXry8WFy3GFW3ZayUJFZxGa95GFZ0ya1ay3Wvqrs29w4f uw1fAr15Zw4rA3JanT9S1TB71UUUUU7qnTZGkaVYY2UrUUUUjbIjqfuFe4nvWSU5nxnvy2 9KBjDU0xBIdaVrnRJUUU9214x267AKxVWUJVW8JwAFc2x0x2IEx4CE42xK8VAvwI8IcIk0 rVWrJVCq3wAFIxvE14AKwVWUJVWUGwA2048vs2IY020E87I2jVAFwI0_Jr4l82xGYIkIc2 x26xkF7I0E14v26r18M28lY4IEw2IIxxk0rwA2F7IY1VAKz4vEj48ve4kI8wA2z4x0Y4vE 2Ix0cI8IcVAFwI0_Jr0_JF4l84ACjcxK6xIIjxv20xvEc7CjxVAFwI0_Jr0_Gr1l84ACjc xK6I8E87Iv67AKxVWUJVW8JwA2z4x0Y4vEx4A2jsIEc7CjxVAFwI0_Jr0_Gr1le2I262IY c4CY6c8Ij28IcVAaY2xG8wAqx4xG64xvF2IEw4CE5I8CrVC2j2WlYx0E2Ix0cI8IcVAFwI 0_Jr0_Jr4lYx0Ex4A2jsIE14v26r1j6r4UMcvjeVCFs4IE7xkEbVWUJVW8JwACjcxG0xvY 0x0EwIxGrwACjI8F5VA0II8E6IAqYI8I648v4I1l42xK82IYc2Ij64vIr41l4I8I3I0E4I kC6x0Yz7v_Jr0_Gr1lx2IqxVAqx4xG67AKxVWUJVWUGwC20s026x8GjcxK67AKxVWUGVWU WwC2zVAF1VAY17CE14v26r1Y6r17MIIYrxkI7VAKI48JMIIF0xvE2Ix0cI8IcVAFwI0_Jr 0_JF4lIxAIcVC0I7IYx2IY6xkF7I0E14v26r1j6r4UMIIF0xvE42xK8VAvwI8IcIk0rVWU JVWUCwCI42IY6I8E87Iv67AKxVWUJVW8JwCI42IY6I8E87Iv6xkF7I0E14v26r1j6r4UYx BIdaVFxhVjvjDU0xZFpf9x0JUqfO7UUUUU= X-Originating-IP: [120.227.57.105] X-CM-SenderInfo: pslquxhhqjh1xofwqxxvufhxpvfd2hldfou0/ X-Spam-Status: No, score=-10.6 required=5.0 tests=BAYES_00, GIT_PATCH_0, KAM_DMARC_STATUS, RCVD_IN_DNSWL_BLOCKED, RCVD_IN_VALIDITY_RPBL_BLOCKED, RCVD_IN_VALIDITY_SAFE_BLOCKED, SCC_5_SHORT_WORD_LINES, SPF_HELO_PASS, SPF_PASS, TXREP, URIBL_BLOCKED autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on sourceware.org X-BeenThere: newlib@sourceware.org X-Mailman-Version: 2.1.30 Precedence: list List-Id: Newlib mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: newlib-bounces~patchwork=sourceware.org@sourceware.org The vector implementations use m8 register grouping and process data in vector-length chunks, providing significant performance improvements on RVV-capable hardware. Use conditional compilation to fallback to scalar implementations when __riscv_vector is not available, maintaining compatibility with non-vector RISC-V systems. Signed-off-by: Pincheng Wang --- newlib/libc/machine/riscv/memcpy-asm.S | 54 +++++++++++++- newlib/libc/machine/riscv/memcpy.c | 2 +- newlib/libc/machine/riscv/memmove-asm.S | 95 ++++++++++++++++++++++++- newlib/libc/machine/riscv/memmove.c | 2 +- newlib/libc/machine/riscv/memset.S | 21 +++++- 5 files changed, 169 insertions(+), 5 deletions(-) diff --git a/newlib/libc/machine/riscv/memcpy-asm.S b/newlib/libc/machine/riscv/memcpy-asm.S index 2771285f9..a5f085f42 100644 --- a/newlib/libc/machine/riscv/memcpy-asm.S +++ b/newlib/libc/machine/riscv/memcpy-asm.S @@ -9,7 +9,59 @@ http://www.opensource.org/licenses. */ -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) +#if defined(__riscv_vector) +.text +.global memcpy +.type memcpy, @function +.option push +.option arch, +zve32x +memcpy: + mv t0, a0 /* t0 = running dst */ + mv t1, a1 /* t1 = running src */ + beqz a2, .Ldone /* if n == 0, return */ + + /* Align dst to SZREG: skip when __riscv_misaligned_fast, else align */ +#ifndef __riscv_misaligned_fast + /* process small data directly with vectors, no alignment optimization */ + li t3, 32 + bltu a2, t3, .Lbulk_copy +#if __riscv_xlen == 64 + andi t2, t0, 7 /* t2 = dst & 7 */ + beqz t2, .Lbulk_copy /* already aligned to 8 bytes */ + li t4, 8 + sub t2, t4, t2 /* pad = 8 - (dst & 7) */ +#else + andi t2, t0, 3 /* t2 = dst & 3 */ + beqz t2, .Lbulk_copy /* already aligned to 4 bytes */ + li t4, 4 + sub t2, t4, t2 /* pad = 4 - (dst & 3) */ +#endif + /* copy prologue using vectors */ + vsetvli t3, t2, e8, m8, ta, ma + vle8.v v0, (t1) + vse8.v v0, (t0) + add t0, t0, t3 + add t1, t1, t3 + sub a2, a2, t3 + beqz a2, .Ldone +#endif + +.Lbulk_copy: + vsetvli t2, a2, e8, m8, ta, ma + vle8.v v0, (t1) + vse8.v v0, (t0) + add t0, t0, t2 + add t1, t1, t2 + sub a2, a2, t2 + bnez a2, .Lbulk_copy + /* fallthrough */ + +.Ldone: + ret + + .size memcpy, .-memcpy + .option pop +#elif defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) .text .global memcpy .type memcpy, @function diff --git a/newlib/libc/machine/riscv/memcpy.c b/newlib/libc/machine/riscv/memcpy.c index a27e0ecb1..7fa0ff804 100644 --- a/newlib/libc/machine/riscv/memcpy.c +++ b/newlib/libc/machine/riscv/memcpy.c @@ -10,7 +10,7 @@ http://www.opensource.org/licenses. */ -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) || defined(__riscv_vector) // memcpy defined in memcpy-asm.S #else diff --git a/newlib/libc/machine/riscv/memmove-asm.S b/newlib/libc/machine/riscv/memmove-asm.S index 061472ca2..fea1741d2 100644 --- a/newlib/libc/machine/riscv/memmove-asm.S +++ b/newlib/libc/machine/riscv/memmove-asm.S @@ -9,7 +9,100 @@ http://www.opensource.org/licenses. */ -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) +#if defined(__riscv_vector) +.text +.global memmove +.type memmove, @function +.option push +.option arch, +zve32x +memmove: + beqz a2, .Ldone_move /* n == 0 */ + beq a0, a1, .Ldone_move /* dst == src */ + + /* overlap check */ + bgeu a1, a0, .Lforward_move /* src >= dst then forward move */ + sub t2, a0, a1 /* t2 = dst - src */ + bgeu t2, a2, .Lforward_move /* no overlap then forward move */ + + /* backward move */ + add t0, a0, a2 /* running dst_end */ + add t1, a1, a2 /* running src_end */ + /* Align dst_end to SZREG: skip when __riscv_misaligned_fast, else align */ +#ifndef __riscv_misaligned_fast + /* process small data directly with vectors, no alignment optimization */ + li t3, 32 + bltu a2, t3, .Lbackward_loop + +#if __riscv_xlen == 64 + andi t2, t0, 7 /* misalignment = dst_end & 7 */ +#else + andi t2, t0, 3 /* misalignment = dst_end & 3 */ +#endif + beqz t2, .Lbackward_aligned /* already aligned */ + /* copy tail bytes to reach aligned dst_end */ + vsetvli t3, t2, e8, m8, ta, ma + sub t0, t0, t3 + sub t1, t1, t3 + vle8.v v0, (t1) + vse8.v v0, (t0) + sub a2, a2, t3 +.Lbackward_aligned: +#endif +.Lbackward_loop: + vsetvli t3, a2, e8, m8, ta, ma /* t3 = vl (bytes) */ + sub t0, t0, t3 + sub t1, t1, t3 + vle8.v v0, (t1) + vse8.v v0, (t0) + sub a2, a2, t3 + bnez a2, .Lbackward_loop + ret + + /* forward move, same as memcpy */ +.Lforward_move: + mv t0, a0 /* running dst */ + mv t1, a1 /* running src */ + /* Align dst to SZREG: skip when __riscv_misaligned_fast, else align */ +#ifndef __riscv_misaligned_fast + /* process small data directly with vectors, no alignment optimization */ + li t3, 32 + bltu a2, t3, .Lforward_loop + +#if __riscv_xlen == 64 + andi t2, t0, 7 /* t2 = dst & 7 */ + beqz t2, .Lforward_aligned /* already aligned to 8 bytes */ + li t4, 8 + sub t2, t4, t2 /* pad = 8 - (dst & 7) */ +#else + andi t2, t0, 3 /* t2 = dst & 3 */ + beqz t2, .Lforward_aligned /* already aligned to 4 bytes */ + li t4, 4 + sub t2, t4, t2 /* pad = 4 - (dst & 3) */ +#endif + /* copy prologue using vectors */ + vsetvli t3, t2, e8, m8, ta, ma + vle8.v v0, (t1) + vse8.v v0, (t0) + add t0, t0, t3 + add t1, t1, t3 + sub a2, a2, t3 +.Lforward_aligned: +#endif +.Lforward_loop: + vsetvli t3, a2, e8, m8, ta, ma + vle8.v v0, (t1) + vse8.v v0, (t0) + add t0, t0, t3 + add t1, t1, t3 + sub a2, a2, t3 + bnez a2, .Lforward_loop + /* fallthrough */ + +.Ldone_move: + ret + .size memmove, .-memmove + .option pop +#elif defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) .text .global memmove .type memmove, @function diff --git a/newlib/libc/machine/riscv/memmove.c b/newlib/libc/machine/riscv/memmove.c index 209a75c69..691774e2e 100644 --- a/newlib/libc/machine/riscv/memmove.c +++ b/newlib/libc/machine/riscv/memmove.c @@ -10,7 +10,7 @@ http://www.opensource.org/licenses. */ -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) || defined(__riscv_vector) /* memmove defined in memmove-asm.S */ #else diff --git a/newlib/libc/machine/riscv/memset.S b/newlib/libc/machine/riscv/memset.S index 533f66758..9ade879f8 100644 --- a/newlib/libc/machine/riscv/memset.S +++ b/newlib/libc/machine/riscv/memset.S @@ -50,7 +50,26 @@ memset: -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) +#if defined(__riscv_vector) +.option push +.option arch, +zve32x + mv t0, a0 /* running dst; keep a0 as return */ + beqz a2, .Ldone_set /* n == 0 then return */ + + /* Broadcast fill byte once. */ + vsetvli t1, zero, e8, m8, ta, ma + vmv.v.x v0, a1 + +.Lbulk_set: + vsetvli t1, a2, e8, m8, ta, ma /* t1 = vl (bytes) */ + vse8.v v0, (t0) + add t0, t0, t1 + sub a2, a2, t1 + bnez a2, .Lbulk_set +.Ldone_set: + ret +.option pop +#elif defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) mv a3, a0 beqz a2, .Ldone