[6/6] newlib: introduce HAVE_HW_UNALIGNED_ACCESS option
Commit Message
Some hardware may perform better when copying unaligned
word-sized memory compared to byte-by-byte copying.
---
newlib/configure.host | 6 ++++++
newlib/libc/machine/riscv/memcpy.c | 4 ++++
newlib/libc/machine/riscv/strcmp.S | 5 ++++-
newlib/libc/machine/riscv/strcpy.c | 2 ++
newlib/libc/string/local.h | 11 ++++++++++-
5 files changed, 26 insertions(+), 2 deletions(-)
--
2.43.0
Comments
Shouldn't this be controlled by __riscv_misaligned_fast according to https://github.com/riscv-non-isa/riscv-c-api-doc/blob/main/src/c-api.adoc#preprocessor-definitions
> -----Original Message-----
> From: Alexey Lapshin <alexey.lapshin@espressif.com>
> Sent: Monday, 27 January 2025 11:52
> To: newlib@sourceware.org
> Cc: Alexey Gerenkov <alexey.gerenkov@espressif.com>; Ivan Grokhotkov
> <ivan@espressif.com>
> Subject: [PATCH 6/6] newlib: introduce HAVE_HW_UNALIGNED_ACCESS
> option
>
> Some hardware may perform better when copying unaligned word-sized
> memory compared to byte-by-byte copying.
> ---
> newlib/configure.host | 6 ++++++
> newlib/libc/machine/riscv/memcpy.c | 4 ++++
> newlib/libc/machine/riscv/strcmp.S | 5 ++++-
> newlib/libc/machine/riscv/strcpy.c | 2 ++
> newlib/libc/string/local.h | 11 ++++++++++-
> 5 files changed, 26 insertions(+), 2 deletions(-)
>
> diff --git a/newlib/configure.host b/newlib/configure.host index
> ff2e51275..0adb1aced 100644
> --- a/newlib/configure.host
> +++ b/newlib/configure.host
> @@ -454,6 +454,12 @@ case "${host}" in
> newlib_cflags="${newlib_cflags} -D_NO_GETPASS -D_NO_SIGSET -
> D_NO_WORDEXP -D_NO_POPEN -D_NO_POSIX_SPAWN"
> newlib_cflags="${newlib_cflags} -DHAVE_FCNTL -DHAVE_BLKSIZE -
> DHAVE_OPENDIR -DHAVE_RENAME"
> newlib_cflags="${newlib_cflags} -DGETREENT_PROVIDED -
> DSIGNAL_PROVIDED"
> + newlib_cflags="${newlib_cflags} -D__ESP__"
> + case "${host_cpu}" in
> + riscv*)
> + newlib_cflags="${newlib_cflags} -
> DHAVE_HW_UNALIGNED_ACCESS"
> + ;;
> + esac
> ;;
> a29k-*-*)
> sys_dir=a29khif
> diff --git a/newlib/libc/machine/riscv/memcpy.c
> b/newlib/libc/machine/riscv/memcpy.c
> index 4098f3ab1..f2d335b33 100644
> --- a/newlib/libc/machine/riscv/memcpy.c
> +++ b/newlib/libc/machine/riscv/memcpy.c
> @@ -33,8 +33,12 @@ memcpy(void *__restrict aa, const void *__restrict bb,
> size_t n)
> const char *b = (const char *)bb;
> char *end = a + n;
> uintptr_t msk = sizeof (long) - 1;
> +#ifdef HAVE_HW_UNALIGNED_ACCESS
> + if (n < sizeof (long))
> +#else
> if (unlikely ((((uintptr_t)a & msk) != ((uintptr_t)b & msk))
> || n < sizeof (long)))
> +#endif // HAVE_HW_UNALIGNED_ACCESS
> {
> small:
> if (__builtin_expect (a < end, 1)) diff --git
> a/newlib/libc/machine/riscv/strcmp.S b/newlib/libc/machine/riscv/strcmp.S
> index 9af9ca1f3..bb0be9afd 100644
> --- a/newlib/libc/machine/riscv/strcmp.S
> +++ b/newlib/libc/machine/riscv/strcmp.S
> @@ -30,10 +30,13 @@ strcmp:
>
> .size strcmp, .-strcmp
> #else
> - or a4, a0, a1
> li t2, -1
> +
> +#ifndef HAVE_HW_UNALIGNED_ACCESS
> + or a4, a0, a1
> and a4, a4, SZREG-1
> bnez a4, .Lmisaligned
> +#endif
>
> #if SZREG == 4
> li a5, 0x7f7f7f7f
> diff --git a/newlib/libc/machine/riscv/strcpy.c
> b/newlib/libc/machine/riscv/strcpy.c
> index 6d802fa8e..7d7554615 100644
> --- a/newlib/libc/machine/riscv/strcpy.c
> +++ b/newlib/libc/machine/riscv/strcpy.c
> @@ -17,8 +17,10 @@ char *strcpy(char *dst, const char *src)
> char *dst0 = dst;
>
> #if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
> +#ifndef HAVE_HW_UNALIGNED_ACCESS
> int misaligned = ((uintptr_t)dst | (uintptr_t)src) & (sizeof (long) - 1);
> if (__builtin_expect(!misaligned, 1))
> +#endif
> {
> long *ldst = (long *)dst;
> const long *lsrc = (const long *)src; diff --git a/newlib/libc/string/local.h
> b/newlib/libc/string/local.h index fb8e6c65c..5eb34319a 100644
> --- a/newlib/libc/string/local.h
> +++ b/newlib/libc/string/local.h
> @@ -17,12 +17,21 @@ int __wcwidth (wint_t); # define
> __inhibit_loop_to_libcall #endif
>
> -/* Nonzero if X is not aligned on a "long" boundary. */
> +/* Nonzero if X is not aligned on a "long" boundary.
> + * This macro is used to skip a few bytes to find an aligned pointer.
> + * It's better to keep it as is even if HAVE_HW_UNALIGNED_ACCESS is
> +enabled,
> + * to avoid small performance penalties (if they are not zero). */
> #define UNALIGNED_X(X) ((long)X & (sizeof (long) - 1))
>
> +#ifdef HAVE_HW_UNALIGNED_ACCESS
> +/* Hardware performs unaligned operations with little
> + * to no penalty compared to byte-to-byte copy. */ #define
> +UNALIGNED_X_Y(X, Y) (0) #else // HAVE_HW_UNALIGNED_ACCESS
> /* Nonzero if either X or Y is not aligned on a "long" boundary. */ #define
> UNALIGNED_X_Y(X, Y) \
> (((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1)))
> +#endif // HAVE_HW_UNALIGNED_ACCESS
>
> /* How many bytes are copied each iteration of the word copy loop. */
> #define LITTLE_BLOCK_SIZE (sizeof (long))
> --
> 2.43.0
Thanks for highlighting that! But unfortunately this was added too recently in GCC (14.1.0).
Moving to approach with __riscv_misaligned_fast will not work for older GCC.
Also, I'm not sure if this will be applicable to other compilers as well
https://github.com/gcc-mirror/gcc/commit/6e23440b5df4011bbe1dbee74d47641125dd7d16
On Mon, 2025-01-27 at 10:58 +0000, Christian Herber (OSS) wrote:
> Shouldn't this be controlled by __riscv_misaligned_fast according to https://github.com/riscv-non-isa/riscv-c-api-doc/blob/main/src/c-api.adoc#preprocessor-definitions
>
>
Clang has also adopted this: https://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20240610/587759.html
I would suggest to work out a solution that works for older and newer versions of compilers.
IMHO, for older versions, this macro could be simply defined with -D.
@@ -454,6 +454,12 @@ case "${host}" in
newlib_cflags="${newlib_cflags} -D_NO_GETPASS -D_NO_SIGSET -D_NO_WORDEXP -D_NO_POPEN -D_NO_POSIX_SPAWN"
newlib_cflags="${newlib_cflags} -DHAVE_FCNTL -DHAVE_BLKSIZE -DHAVE_OPENDIR -DHAVE_RENAME"
newlib_cflags="${newlib_cflags} -DGETREENT_PROVIDED -DSIGNAL_PROVIDED"
+ newlib_cflags="${newlib_cflags} -D__ESP__"
+ case "${host_cpu}" in
+ riscv*)
+ newlib_cflags="${newlib_cflags} -DHAVE_HW_UNALIGNED_ACCESS"
+ ;;
+ esac
;;
a29k-*-*)
sys_dir=a29khif
@@ -33,8 +33,12 @@ memcpy(void *__restrict aa, const void *__restrict bb, size_t n)
const char *b = (const char *)bb;
char *end = a + n;
uintptr_t msk = sizeof (long) - 1;
+#ifdef HAVE_HW_UNALIGNED_ACCESS
+ if (n < sizeof (long))
+#else
if (unlikely ((((uintptr_t)a & msk) != ((uintptr_t)b & msk))
|| n < sizeof (long)))
+#endif // HAVE_HW_UNALIGNED_ACCESS
{
small:
if (__builtin_expect (a < end, 1))
@@ -30,10 +30,13 @@ strcmp:
.size strcmp, .-strcmp
#else
- or a4, a0, a1
li t2, -1
+
+#ifndef HAVE_HW_UNALIGNED_ACCESS
+ or a4, a0, a1
and a4, a4, SZREG-1
bnez a4, .Lmisaligned
+#endif
#if SZREG == 4
li a5, 0x7f7f7f7f
@@ -17,8 +17,10 @@ char *strcpy(char *dst, const char *src)
char *dst0 = dst;
#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
+#ifndef HAVE_HW_UNALIGNED_ACCESS
int misaligned = ((uintptr_t)dst | (uintptr_t)src) & (sizeof (long) - 1);
if (__builtin_expect(!misaligned, 1))
+#endif
{
long *ldst = (long *)dst;
const long *lsrc = (const long *)src;
@@ -17,12 +17,21 @@ int __wcwidth (wint_t);
# define __inhibit_loop_to_libcall
#endif
-/* Nonzero if X is not aligned on a "long" boundary. */
+/* Nonzero if X is not aligned on a "long" boundary.
+ * This macro is used to skip a few bytes to find an aligned pointer.
+ * It's better to keep it as is even if HAVE_HW_UNALIGNED_ACCESS is enabled,
+ * to avoid small performance penalties (if they are not zero). */
#define UNALIGNED_X(X) ((long)X & (sizeof (long) - 1))
+#ifdef HAVE_HW_UNALIGNED_ACCESS
+/* Hardware performs unaligned operations with little
+ * to no penalty compared to byte-to-byte copy. */
+#define UNALIGNED_X_Y(X, Y) (0)
+#else // HAVE_HW_UNALIGNED_ACCESS
/* Nonzero if either X or Y is not aligned on a "long" boundary. */
#define UNALIGNED_X_Y(X, Y) \
(((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1)))
+#endif // HAVE_HW_UNALIGNED_ACCESS
/* How many bytes are copied each iteration of the word copy loop. */
#define LITTLE_BLOCK_SIZE (sizeof (long))