[2/2] riscv: Add and use alignment-ignorant memcpy

Message ID 20230206194819.1679472-3-evan@rivosinc.com
State Superseded
Headers
Series RISC-V: ifunced memcpy using new kernel hwprobe interface |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Evan Green Feb. 6, 2023, 7:48 p.m. UTC
  For CPU implementations that can perform unaligned accesses with little
or no performance penalty, create a memcpy implementation that does not
bother aligning buffers. It copies in blocks of 16 integer registers,
then one integer register at a time, and falls back to a bytewise copy
for the remainder.

Signed-off-by: Evan Green <evan@rivosinc.com>

---


---
 sysdeps/riscv/memcopy.h                       |  28 +++++
 sysdeps/riscv/memcpy.c                        |  65 +++++++++++
 sysdeps/riscv/memcpy_noalignment.S            | 103 ++++++++++++++++++
 sysdeps/unix/sysv/linux/riscv/Makefile        |   4 +
 .../unix/sysv/linux/riscv/memcpy-generic.c    |  24 ++++
 5 files changed, 224 insertions(+)
 create mode 100644 sysdeps/riscv/memcopy.h
 create mode 100644 sysdeps/riscv/memcpy.c
 create mode 100644 sysdeps/riscv/memcpy_noalignment.S
 create mode 100644 sysdeps/unix/sysv/linux/riscv/memcpy-generic.c
  

Comments

Richard Henderson Feb. 6, 2023, 10:05 p.m. UTC | #1
On 2/6/23 09:48, Evan Green wrote:
> +	/* Remainder is smaller than a page, compute native word count */
> +	beqz a2, 6f
> +	andi a5, a2, ~(SZREG-1)
> +	andi a2, a2, (SZREG-1)
> +	add a3, a1, a5
> +	/* Jump directly to byte copy if no words. */
> +	beqz a5, 4f
> +
> +3:
> +	/* Use single native register copy */
> +	REG_L a4, 0(a1)
> +	addi a1, a1, SZREG
> +	REG_S a4, 0(t6)
> +	addi t6, t6, SZREG
> +	bltu a1, a3, 3b
> +
> +	/* Jump directly out if no more bytes */
> +	beqz a2, 6f
> +
> +4:
> +	/* Copy the last few individual bytes */
> +	add a3, a1, a2
> +5:
> +	lb a4, 0(a1)
> +	addi a1, a1, 1
> +	sb a4, 0(t6)
> +	addi t6, t6, 1
> +	bltu a1, a3, 5b
> +6:
> +	ret

If you know there are at least SZREG bytes in the range, you can avoid the byte loop by 
copying the last word unaligned.  That may copy some bytes twice, but that's ok too. 
Similarly, you can redundantly copy a few bytes at the beginning to align the destination 
(there's usually some cost for unaligned stores, even if it's generally "fast").

For memcpy < SZREG, you don't need a loop; just test the final few bits of len.
Have a look at the tricks in sysdeps/x86_64/multiarch/memmove-ssse3.S for ideas.


r~
  
Evan Green Feb. 9, 2023, 9:04 p.m. UTC | #2
On Mon, Feb 6, 2023 at 2:06 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 2/6/23 09:48, Evan Green wrote:
> > +     /* Remainder is smaller than a page, compute native word count */
> > +     beqz a2, 6f
> > +     andi a5, a2, ~(SZREG-1)
> > +     andi a2, a2, (SZREG-1)
> > +     add a3, a1, a5
> > +     /* Jump directly to byte copy if no words. */
> > +     beqz a5, 4f
> > +
> > +3:
> > +     /* Use single native register copy */
> > +     REG_L a4, 0(a1)
> > +     addi a1, a1, SZREG
> > +     REG_S a4, 0(t6)
> > +     addi t6, t6, SZREG
> > +     bltu a1, a3, 3b
> > +
> > +     /* Jump directly out if no more bytes */
> > +     beqz a2, 6f
> > +
> > +4:
> > +     /* Copy the last few individual bytes */
> > +     add a3, a1, a2
> > +5:
> > +     lb a4, 0(a1)
> > +     addi a1, a1, 1
> > +     sb a4, 0(t6)
> > +     addi t6, t6, 1
> > +     bltu a1, a3, 5b
> > +6:
> > +     ret
>
> If you know there are at least SZREG bytes in the range, you can avoid the byte loop by
> copying the last word unaligned.  That may copy some bytes twice, but that's ok too.
> Similarly, you can redundantly copy a few bytes at the beginning to align the destination
> (there's usually some cost for unaligned stores, even if it's generally "fast").
>
> For memcpy < SZREG, you don't need a loop; just test the final few bits of len.
> Have a look at the tricks in sysdeps/x86_64/multiarch/memmove-ssse3.S for ideas.

Thanks! I haven't gone too deeply into the fine tuning of this
routine, I think you're right there are probably tweaks to be made for
optimal gains. These are good suggestions, though I might save them
for a subsequent patch.
-Evan
  

Patch

diff --git a/sysdeps/riscv/memcopy.h b/sysdeps/riscv/memcopy.h
new file mode 100644
index 0000000000..21f6081b5f
--- /dev/null
+++ b/sysdeps/riscv/memcopy.h
@@ -0,0 +1,28 @@ 
+/* memcopy.h -- definitions for memory copy functions. RISC-V version.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/generic/memcopy.h>
+
+/*
+ * Redefine the generic memcpy implementation to __memcpy_generic, so
+ * the memcpy ifunc can select between generic and special versions.
+ * In rtld, don't bother with all the ifunciness.
+ */
+#if IS_IN (libc)
+#define MEMCPY __memcpy_generic
+#endif
diff --git a/sysdeps/riscv/memcpy.c b/sysdeps/riscv/memcpy.c
new file mode 100644
index 0000000000..1ba25ef976
--- /dev/null
+++ b/sysdeps/riscv/memcpy.c
@@ -0,0 +1,65 @@ 
+/* Multiple versions of memcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+/* Redefine memcpy so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memcpy
+# define memcpy __redirect_memcpy
+# include <string.h>
+#include <ifunc-init.h>
+#include <sys/hwprobe.h>
+
+#define INIT_ARCH()
+
+extern __typeof (__redirect_memcpy) __libc_memcpy;
+
+extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_noalignment attribute_hidden;
+/* Pick __memcpy_noalignment if hwprobe reports fast misaligned access.  */
+static inline __typeof (__redirect_memcpy) *
+select_memcpy_ifunc (void)
+{
+  INIT_ARCH ();
+
+  struct riscv_hwprobe pair;
+  /* A non-zero return from the probe call selects the generic copy.  */
+  pair.key = RISCV_HWPROBE_KEY_CPUPERF_0;
+  if (__riscv_hwprobe(&pair, 1, 0, NULL, 0) != 0)
+    return __memcpy_generic;
+  /* NOTE(review): masked with FAST itself; confirm no wider field mask.  */
+  if ((pair.key > 0) &&
+      (pair.value & RISCV_HWPROBE_MISALIGNED_FAST) ==
+       RISCV_HWPROBE_MISALIGNED_FAST)
+    return __memcpy_noalignment;
+
+  return __memcpy_generic;  /* Any other reported value: generic copy.  */
+}
+
+libc_ifunc (__libc_memcpy, select_memcpy_ifunc ());
+
+# undef memcpy
+strong_alias (__libc_memcpy, memcpy);
+# ifdef SHARED
+__hidden_ver1 (memcpy, __GI_memcpy, __redirect_memcpy)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memcpy);
+# endif
+
+#endif
+
diff --git a/sysdeps/riscv/memcpy_noalignment.S b/sysdeps/riscv/memcpy_noalignment.S
new file mode 100644
index 0000000000..fe1d9213c4
--- /dev/null
+++ b/sysdeps/riscv/memcpy_noalignment.S
@@ -0,0 +1,103 @@ 
+/* memcpy for RISC-V, ignoring buffer alignment
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+/* void *memcpy(void *dst, const void *src, size_t n): dst=a0, src=a1, n=a2 */
+ENTRY (__memcpy_noalignment)
+	move t6, a0  /* t6 = store cursor; a0 is left untouched as the return value */
+
+	/* a4 = count rounded down to a whole number of 16*SZREG-byte "pages" */
+	andi a4, a2, ~((16*SZREG)-1)
+	beqz a4, 2f  /* No full page: skip the block loop, a2 is still the count */
+	add a3, a1, a4  /* a3 = source address at which the block loop stops */
+1:
+	/* Copy one "page" per iteration: 10 registers first, then the other 6 */
+	REG_L a4,       0(a1)
+	REG_L a5,   SZREG(a1)
+	REG_L a6, 2*SZREG(a1)
+	REG_L a7, 3*SZREG(a1)
+	REG_L t0, 4*SZREG(a1)
+	REG_L t1, 5*SZREG(a1)
+	REG_L t2, 6*SZREG(a1)
+	REG_L t3, 7*SZREG(a1)
+	REG_L t4, 8*SZREG(a1)
+	REG_L t5, 9*SZREG(a1)
+	REG_S a4,       0(t6)
+	REG_S a5,   SZREG(t6)
+	REG_S a6, 2*SZREG(t6)
+	REG_S a7, 3*SZREG(t6)
+	REG_S t0, 4*SZREG(t6)
+	REG_S t1, 5*SZREG(t6)
+	REG_S t2, 6*SZREG(t6)
+	REG_S t3, 7*SZREG(t6)
+	REG_S t4, 8*SZREG(t6)
+	REG_S t5, 9*SZREG(t6)
+	REG_L a4, 10*SZREG(a1)
+	REG_L a5, 11*SZREG(a1)
+	REG_L a6, 12*SZREG(a1)
+	REG_L a7, 13*SZREG(a1)
+	REG_L t0, 14*SZREG(a1)
+	REG_L t1, 15*SZREG(a1)
+	addi a1, a1, 16*SZREG  /* Source advanced once all 16 loads are issued */
+	REG_S a4, 10*SZREG(t6)
+	REG_S a5, 11*SZREG(t6)
+	REG_S a6, 12*SZREG(t6)
+	REG_S a7, 13*SZREG(t6)
+	REG_S t0, 14*SZREG(t6)
+	REG_S t1, 15*SZREG(t6)
+	addi t6, t6, 16*SZREG
+	bltu a1, a3, 1b
+	andi a2, a2, (16*SZREG)-1  /* a2 = bytes left over after the pages */
+
+2:
+	/* Less than a page remains: a5 = bytes in whole words, a2 = tail bytes */
+	beqz a2, 6f  /* Nothing left to copy */
+	andi a5, a2, ~(SZREG-1)
+	andi a2, a2, (SZREG-1)
+	add a3, a1, a5  /* a3 = source address at which the word loop stops */
+	/* Jump directly to byte copy if no words. */
+	beqz a5, 4f
+
+3:
+	/* Copy one native register (SZREG bytes) per iteration */
+	REG_L a4, 0(a1)
+	addi a1, a1, SZREG
+	REG_S a4, 0(t6)
+	addi t6, t6, SZREG
+	bltu a1, a3, 3b
+
+	/* Jump directly out if no more bytes */
+	beqz a2, 6f
+
+4:
+	/* Copy the last few individual bytes; a3 = source end of the byte loop */
+	add a3, a1, a2
+5:
+	lb a4, 0(a1)  /* Sign extension is harmless: sb stores only the low byte */
+	addi a1, a1, 1
+	sb a4, 0(t6)
+	addi t6, t6, 1
+	bltu a1, a3, 5b
+6:
+	ret
+
+END (__memcpy_noalignment)
+
+hidden_def (__memcpy_noalignment)
diff --git a/sysdeps/unix/sysv/linux/riscv/Makefile b/sysdeps/unix/sysv/linux/riscv/Makefile
index 45cc29e40d..aa9ea443d6 100644
--- a/sysdeps/unix/sysv/linux/riscv/Makefile
+++ b/sysdeps/unix/sysv/linux/riscv/Makefile
@@ -7,6 +7,10 @@  ifeq ($(subdir),stdlib)
 gen-as-const-headers += ucontext_i.sym
 endif
 
+ifeq ($(subdir),string)
+sysdep_routines += memcpy memcpy-generic memcpy_noalignment
+endif
+
 abi-variants := ilp32 ilp32d lp64 lp64d
 
 ifeq (,$(filter $(default-abi),$(abi-variants)))
diff --git a/sysdeps/unix/sysv/linux/riscv/memcpy-generic.c b/sysdeps/unix/sysv/linux/riscv/memcpy-generic.c
new file mode 100644
index 0000000000..0abe03f7f5
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/riscv/memcpy-generic.c
@@ -0,0 +1,24 @@ 
+/* Re-include the default memcpy implementation.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+extern __typeof (memcpy) __memcpy_generic;
+hidden_proto(__memcpy_generic)
+
+#include <string/memcpy.c>