[v2,2/3] newlib: riscv: Optimize memchr() and memrchr()

Message ID 569ea557b808a897dd75334f51d9fc827f13490b.1746628687.git.ericsalem@gmail.com
State New
Headers
Series newlib: riscv: Add and optimize memchr() and memrchr() functions |

Commit Message

Eric Salem May 7, 2025, 2:46 p.m. UTC
  The RISC-V Zbb, Zbkb, and Zilsd extensions provide instructions
optimized for bit and load/store operations. Use them when available for
the RISC-V port.

Reviewed-by: Christian Herber <christian.herber@oss.nxp.com>
Signed-off-by: Eric Salem <ericsalem@gmail.com>
---
 newlib/libc/machine/riscv/memchr.c    | 137 ++++++++++++++++------
 newlib/libc/machine/riscv/memrchr.c   | 159 +++++++++++++++++++-------
 newlib/libc/machine/riscv/rv_string.h |  45 +++++++-
 newlib/libc/machine/riscv/xlenint.h   |   7 ++
 4 files changed, 265 insertions(+), 83 deletions(-)
  

Patch

diff --git a/newlib/libc/machine/riscv/memchr.c b/newlib/libc/machine/riscv/memchr.c
index 5c08c12813fe..f2824540a96b 100644
--- a/newlib/libc/machine/riscv/memchr.c
+++ b/newlib/libc/machine/riscv/memchr.c
@@ -29,10 +29,17 @@  QUICKREF
 	memchr ansi pure
 */
 
-#include <_ansi.h>
-#include <string.h>
-#include <limits.h>
-#include "../../string/local.h"
+#include <sys/asm.h>
+#include <stddef.h>
+#include "rv_string.h"
+
+// Move size
+#if __riscv_zilsd
+#define MV_SZ           8
+#else
+#define MV_SZ           SZREG
+#endif
+
 
 void *
 memchr (const void *src_void,
@@ -43,47 +50,101 @@  memchr (const void *src_void,
   unsigned char d = c;
 
 #if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
-  unsigned long *asrc;
-  unsigned long  mask;
-  unsigned int i;
+  size_t align = (uintptr_t) src & (MV_SZ - 1);
 
-  while (UNALIGNED_X(src))
+  if (align)
     {
-      if (!length--)
-        return NULL;
-      if (*src == d)
-        return (void *) src;
-      src++;
-    }
+      align = MV_SZ - align;
 
-  if (!TOO_SMALL_LITTLE_BLOCK(length))
-    {
-      /* If we get this far, we know that length is large and src is
-         word-aligned. */
-      /* The fast code reads the source one word at a time and only
-         performs the bytewise search on word-sized segments if they
-         contain the search character, which is detected by XORing
-         the word-sized segment with a word-sized block of the search
-         character and then detecting for the presence of NUL in the
-         result.  */
-      asrc = (unsigned long *) src;
-      mask = d << 8 | d;
-      mask = mask << 16 | mask;
-      for (i = 32; i < sizeof(mask) * 8; i <<= 1)
-        mask = (mask << i) | mask;
-
-      while (!TOO_SMALL_LITTLE_BLOCK(length))
+      if (length < align) align = length;
+
+      switch (align)
         {
-          if (DETECT_CHAR(*asrc, mask))
-            break;
-          length -= LITTLE_BLOCK_SIZE;
-          asrc++;
+#if MV_SZ == 8
+          case 7:
+            if (*src++ == d) return (void *) (src - 1);
+          case 6:
+            if (*src++ == d) return (void *) (src - 1);
+          case 5:
+            if (*src++ == d) return (void *) (src - 1);
+          case 4:
+            if (*src++ == d) return (void *) (src - 1);
+#endif /* MV_SZ == 8 */
+          case 3:
+            if (*src++ == d) return (void *) (src - 1);
+          case 2:
+            if (*src++ == d) return (void *) (src - 1);
+          case 1:
+            if (*src++ == d) return (void *) (src - 1);
         }
 
-      /* If there are fewer than LITTLE_BLOCK_SIZE characters left,
-         then we resort to the bytewise loop.  */
+      length -= align;
+    }
+
+  const unsigned char *end_addr = src + (length & ~(MV_SZ - 1));
 
-      src = (unsigned char *) asrc;
+  if (src < end_addr)
+    {
+      uintxlen_t mask = __libc_splat_byte(d);
+      uintlslen_t val;
+
+      do
+        {
+#if __riscv_zilsd
+          asm volatile ("ld %0, 0(%1)"
+                        : "=R" (val)
+                        : "r" (src)
+          );
+#else /* not riscv_zilsd */
+          val = *(uintxlen_t*) src;
+#endif /* __riscv_zilsd */
+          uintxlen_t word1 = val ^ mask;
+
+          if (__libc_detect_null(word1))
+            {
+#if __riscv_zbb
+              word1 = ~__LIBC_RISCV_ZBB_ORC_B(word1);
+              word1 = __LIBC_RISCV_ZBB_CNT_Z(word1);
+
+              return (void *) (src + (word1 >> 3));
+#else /* not __riscv_zbb */
+              if (*src++ == d) return (void *) (src - 1);
+              if (*src++ == d) return (void *) (src - 1);
+              if (*src++ == d) return (void *) (src - 1);
+#if __riscv_xlen == 64
+              if (*src++ == d) return (void *) (src - 1);
+              if (*src++ == d) return (void *) (src - 1);
+              if (*src++ == d) return (void *) (src - 1);
+              if (*src++ == d) return (void *) (src - 1);
+#endif /* __riscv_xlen == 64 */
+              return (void *) src;
+#endif /* __riscv_zbb */
+            }
+#if __riscv_zilsd
+          uintxlen_t word2 = (val >> 32);
+          word2 ^= mask;
+
+          if (__libc_detect_null(word2))
+            {
+              src += MV_SZ / 2;
+#if __riscv_zbb
+              word2 = ~__LIBC_RISCV_ZBB_ORC_B(word2);
+              word2 = __LIBC_RISCV_ZBB_CNT_Z(word2);
+
+              return (void *) (src + (word2 >> 3));
+#else /* not __riscv_zbb */
+              if (*src++ == d) return (void *) (src - 1);
+              if (*src++ == d) return (void *) (src - 1);
+              if (*src++ == d) return (void *) (src - 1);
+              return (void *) src;
+#endif /* __riscv_zbb */
+            }
+#endif /* __riscv_zilsd */
+
+          src += MV_SZ;
+        } while (src < end_addr);
+
+      length &= MV_SZ - 1;
     }
 
 #endif /* not PREFER_SIZE_OVER_SPEED */
diff --git a/newlib/libc/machine/riscv/memrchr.c b/newlib/libc/machine/riscv/memrchr.c
index 8d15ccb780ec..84f94b0c1bd3 100644
--- a/newlib/libc/machine/riscv/memrchr.c
+++ b/newlib/libc/machine/riscv/memrchr.c
@@ -29,61 +29,142 @@  QUICKREF
 	memrchr
 */
 
-#include <_ansi.h>
-#include <string.h>
-#include <limits.h>
-#include "../../string/local.h"
+#include <sys/asm.h>
+#include <stddef.h>
+#include "rv_string.h"
+
+// Move size
+#if __riscv_zilsd
+#define MV_SZ           8
+
+// Offset is only 4 bytes for Zilsd/Zclsd since each register is 32 bits
+#define OFFSET          4
+#else
+#define MV_SZ           SZREG
+#define OFFSET          SZREG
+#endif
+
 
 void *
 memrchr (const void *src_void,
 	int c,
 	size_t length)
 {
-  const unsigned char *src = (const unsigned char *) src_void + length - 1;
+  const unsigned char *src = (const unsigned char *) src_void;
   unsigned char d = c;
 
+  if (length) src += length - 1;
+
 #if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
-  unsigned long *asrc;
-  unsigned long  mask;
-  unsigned int i;
 
-  while (UNALIGNED_X(src))
-    {
-      if (!length--)
-        return NULL;
-      if (*src == d)
-        return (void *) src;
-      src--;
-    }
+  /*
+    We add one to the address because even if an address is already aligned,
+    when loading words the bytes preceding this address are read, so check
+    the single byte.
 
-  if (!TOO_SMALL_LITTLE_BLOCK(length))
+    If the address has all the least significant bits set equaling MV_SZ - 1,
+    and has a length of at least MV_SZ, we can read a word starting from
+    src & ~(MV_SZ - 1) because no alignment is actually required
+  */
+  size_t align = (uintptr_t) (src + 1) & (MV_SZ - 1);
+
+  if (align)
     {
-      /* If we get this far, we know that length is large and src is
-         word-aligned. */
-      /* The fast code reads the source one word at a time and only
-         performs the bytewise search on word-sized segments if they
-         contain the search character, which is detected by XORing
-         the word-sized segment with a word-sized block of the search
-         character and then detecting for the presence of NUL in the
-         result.  */
-      asrc = (unsigned long *) (src - LITTLE_BLOCK_SIZE + 1);
-      mask = d << 8 | d;
-      mask = mask << 16 | mask;
-      for (i = 32; i < sizeof(mask) * 8; i <<= 1)
-        mask = (mask << i) | mask;
-
-      while (!TOO_SMALL_LITTLE_BLOCK(length))
+      if (length < align) align = length;
+
+      switch (align)
         {
-          if (DETECT_CHAR(*asrc, mask))
-            break;
-          length -= LITTLE_BLOCK_SIZE;
-          asrc--;
+#if MV_SZ == 8
+          case 7:
+            if (*src-- == d) return (void *) (src + 1);
+          case 6:
+            if (*src-- == d) return (void *) (src + 1);
+          case 5:
+            if (*src-- == d) return (void *) (src + 1);
+          case 4:
+            if (*src-- == d) return (void *) (src + 1);
+#endif /* MV_SZ == 8 */
+          case 3:
+            if (*src-- == d) return (void *) (src + 1);
+          case 2:
+            if (*src-- == d) return (void *) (src + 1);
+          case 1:
+            if (*src-- == d) return (void *) (src + 1);
         }
 
-      /* If there are fewer than LITTLE_BLOCK_SIZE characters left,
-         then we resort to the bytewise loop.  */
+      length -= align;
+    }
+
+  const unsigned char *end_addr = src - (length & ~(MV_SZ - 1));
 
-      src = (unsigned char *) asrc + LITTLE_BLOCK_SIZE - 1;
+  if (src > end_addr)
+    {
+      src -= MV_SZ - 1;
+
+      uintxlen_t mask = __libc_splat_byte(d);
+      uintlslen_t val;
+
+      do
+        {
+#if __riscv_zilsd
+          asm volatile ("ld %0, 0(%1)"
+                        : "=R" (val)
+                        : "r" (src)
+          );
+#else /* not riscv_zilsd */
+          val = *(uintxlen_t*) src;
+#endif /* __riscv_zilsd */
+
+#if __riscv_zilsd
+          uintxlen_t word2 = val >> 32;
+          word2 ^= mask;
+
+          if (__libc_detect_null(word2))
+            {
+#if __riscv_zbb
+              src += OFFSET;
+              word2 = ~__LIBC_RISCV_ZBB_ORC_B(word2);
+              word2 = __LIBC_RISCV_ZBB_CNT_Z_REV(word2);
+
+              return (void *) (src + OFFSET - 1 - (word2 >> 3));
+#else /* not __riscv_zbb */
+              src += MV_SZ - 1;
+              if (*src-- == d) return (void *) (src + 1);
+              if (*src-- == d) return (void *) (src + 1);
+              if (*src-- == d) return (void *) (src + 1);
+              return (void *) src;
+#endif /* __riscv_zbb */
+            }
+#endif /* __riscv_zilsd */
+          uintxlen_t word1 = val ^ mask;
+
+          if (__libc_detect_null(word1))
+            {
+#if __riscv_zbb
+              word1 = ~__LIBC_RISCV_ZBB_ORC_B(word1);
+              word1 = __LIBC_RISCV_ZBB_CNT_Z_REV(word1);
+
+              return (void *) (src + OFFSET - 1 - (word1 >> 3));
+#else /* not __riscv_zbb */
+              src += OFFSET - 1;
+              if (*src-- == d) return (void *) (src + 1);
+              if (*src-- == d) return (void *) (src + 1);
+              if (*src-- == d) return (void *) (src + 1);
+#if __riscv_xlen == 64
+              if (*src-- == d) return (void *) (src + 1);
+              if (*src-- == d) return (void *) (src + 1);
+              if (*src-- == d) return (void *) (src + 1);
+              if (*src-- == d) return (void *) (src + 1);
+#endif /* __riscv_xlen == 64 */
+              return (void *) src;
+#endif /* __riscv_zbb */
+            }
+
+          src -= MV_SZ;
+        } while (src > end_addr);
+
+      length &= MV_SZ - 1;
+      src = end_addr;
     }
 
 #endif /* not PREFER_SIZE_OVER_SPEED */
diff --git a/newlib/libc/machine/riscv/rv_string.h b/newlib/libc/machine/riscv/rv_string.h
index 7754303064c9..dc2a26daf1b2 100644
--- a/newlib/libc/machine/riscv/rv_string.h
+++ b/newlib/libc/machine/riscv/rv_string.h
@@ -20,20 +20,24 @@ 
 
   // Determine which intrinsics to use based on XLEN and endianness
   #if __riscv_xlen == 64
-    #define __LIBC_RISCV_ZBB_ORC_B(x)       __riscv_orc_b_64(x)
+    #define __LIBC_RISCV_ZBB_ORC_B(x)         __riscv_orc_b_64(x)
 
     #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-      #define __LIBC_RISCV_ZBB_CNT_Z(x)     __riscv_ctz_64(x)
+      #define __LIBC_RISCV_ZBB_CNT_Z(x)       __riscv_ctz_64(x)
+      #define __LIBC_RISCV_ZBB_CNT_Z_REV(x)   __riscv_clz_64(x)
     #else
-      #define __LIBC_RISCV_ZBB_CNT_Z(x)     __riscv_clz_64(x)
+      #define __LIBC_RISCV_ZBB_CNT_Z(x)       __riscv_clz_64(x)
+      #define __LIBC_RISCV_ZBB_CNT_Z_REV(x)   __riscv_ctz_64(x)
     #endif
   #else
-    #define __LIBC_RISCV_ZBB_ORC_B(x)       __riscv_orc_b_32(x)
+    #define __LIBC_RISCV_ZBB_ORC_B(x)         __riscv_orc_b_32(x)
 
     #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-      #define __LIBC_RISCV_ZBB_CNT_Z(x)     __riscv_ctz_32(x)
+      #define __LIBC_RISCV_ZBB_CNT_Z(x)       __riscv_ctz_32(x)
+      #define __LIBC_RISCV_ZBB_CNT_Z_REV(x)   __riscv_clz_32(x)
     #else
-      #define __LIBC_RISCV_ZBB_CNT_Z(x)     __riscv_clz_32(x)
+      #define __LIBC_RISCV_ZBB_CNT_Z(x)       __riscv_clz_32(x)
+      #define __LIBC_RISCV_ZBB_CNT_Z_REV(x)   __riscv_ctz_32(x)
     #endif
   #endif
 #endif
@@ -121,4 +125,33 @@  static __inline char *__libc_strcpy(char *dst, const char *src, bool ret_start)
 }
 
 
+static __inline uintxlen_t __libc_splat_byte(unsigned char c)
+{
+  uintxlen_t val;
+
+#if __riscv_zbkb
+  asm volatile ("packh %0, %1, %1"
+                : "=r" (val)
+                : "r" (c)
+      );
+#if __riscv_xlen == 64
+  asm volatile ("packw %0, %0, %0"
+                : "+r" (val)
+  );
+#endif /* __riscv_xlen == 64 */
+  asm volatile ("pack %0, %0, %0"
+                : "+r" (val)
+  );
+#else /* not __riscv_zbkb */
+  val = (c << 8) | c;
+  val = (val << 16) | val;
+#if __riscv_xlen == 64
+  val = (val << 32) | val;
+#endif /* __riscv_xlen == 64 */
+#endif /* __riscv_zbkb */
+
+  return val;
+}
+
+
 #endif /* _RV_STRING_H */
diff --git a/newlib/libc/machine/riscv/xlenint.h b/newlib/libc/machine/riscv/xlenint.h
index 86363a80655f..2d444ff9b80e 100644
--- a/newlib/libc/machine/riscv/xlenint.h
+++ b/newlib/libc/machine/riscv/xlenint.h
@@ -11,4 +11,11 @@  typedef uint32_t uintxlen_t;
 # error __riscv_xlen must equal 32 or 64
 #endif
 
+/* Load/Store length */
+#if __riscv_zilsd
+typedef uint64_t uintlslen_t;
+#else
+typedef uintxlen_t uintlslen_t;
+#endif
+
 #endif /* _XLENINT_H */