[v2,2/3] newlib: riscv: Optimize memchr() and memrchr()
Commit Message
The RISC-V Zbb, Zbkb, and Zilsd extensions provide instructions
optimized for bit and load/store operations. Use them when available for
the RISC-V port.
Reviewed-by: Christian Herber <christian.herber@oss.nxp.com>
Signed-off-by: Eric Salem <ericsalem@gmail.com>
---
newlib/libc/machine/riscv/memchr.c | 137 ++++++++++++++++------
newlib/libc/machine/riscv/memrchr.c | 159 +++++++++++++++++++-------
newlib/libc/machine/riscv/rv_string.h | 45 +++++++-
newlib/libc/machine/riscv/xlenint.h | 7 ++
4 files changed, 265 insertions(+), 83 deletions(-)
@@ -29,10 +29,17 @@ QUICKREF
memchr ansi pure
*/
-#include <_ansi.h>
-#include <string.h>
-#include <limits.h>
-#include "../../string/local.h"
+#include <sys/asm.h>
+#include <stddef.h>
+#include "rv_string.h"
+
+// Move size
+#if __riscv_zilsd
+#define MV_SZ 8
+#else
+#define MV_SZ SZREG
+#endif
+
void *
memchr (const void *src_void,
@@ -43,47 +50,101 @@ memchr (const void *src_void,
unsigned char d = c;
#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
- unsigned long *asrc;
- unsigned long mask;
- unsigned int i;
+ size_t align = (uintptr_t) src & (MV_SZ - 1);
- while (UNALIGNED_X(src))
+ if (align)
{
- if (!length--)
- return NULL;
- if (*src == d)
- return (void *) src;
- src++;
- }
+ align = MV_SZ - align;
- if (!TOO_SMALL_LITTLE_BLOCK(length))
- {
- /* If we get this far, we know that length is large and src is
- word-aligned. */
- /* The fast code reads the source one word at a time and only
- performs the bytewise search on word-sized segments if they
- contain the search character, which is detected by XORing
- the word-sized segment with a word-sized block of the search
- character and then detecting for the presence of NUL in the
- result. */
- asrc = (unsigned long *) src;
- mask = d << 8 | d;
- mask = mask << 16 | mask;
- for (i = 32; i < sizeof(mask) * 8; i <<= 1)
- mask = (mask << i) | mask;
-
- while (!TOO_SMALL_LITTLE_BLOCK(length))
+ if (length < align) align = length;
+
+ switch (align)
{
- if (DETECT_CHAR(*asrc, mask))
- break;
- length -= LITTLE_BLOCK_SIZE;
- asrc++;
+#if MV_SZ == 8
+ case 7:
+ if (*src++ == d) return (void *) (src - 1);
+ case 6:
+ if (*src++ == d) return (void *) (src - 1);
+ case 5:
+ if (*src++ == d) return (void *) (src - 1);
+ case 4:
+ if (*src++ == d) return (void *) (src - 1);
+#endif /* MV_SZ == 8 */
+ case 3:
+ if (*src++ == d) return (void *) (src - 1);
+ case 2:
+ if (*src++ == d) return (void *) (src - 1);
+ case 1:
+ if (*src++ == d) return (void *) (src - 1);
}
- /* If there are fewer than LITTLE_BLOCK_SIZE characters left,
- then we resort to the bytewise loop. */
+ length -= align;
+ }
+
+ const unsigned char *end_addr = src + (length & ~(MV_SZ - 1));
- src = (unsigned char *) asrc;
+ if (src < end_addr)
+ {
+ uintxlen_t mask = __libc_splat_byte(d);
+ uintlslen_t val;
+
+ do
+ {
+#if __riscv_zilsd
+ asm volatile ("ld %0, 0(%1)"
+ : "=R" (val)
+ : "r" (src)
+ );
+#else /* not riscv_zilsd */
+ val = *(uintxlen_t*) src;
+#endif /* __riscv_zilsd */
+ uintxlen_t word1 = val ^ mask;
+
+ if (__libc_detect_null(word1))
+ {
+#if __riscv_zbb
+ word1 = ~__LIBC_RISCV_ZBB_ORC_B(word1);
+ word1 = __LIBC_RISCV_ZBB_CNT_Z(word1);
+
+ return (void *) (src + (word1 >> 3));
+#else /* not __riscv_zbb */
+ if (*src++ == d) return (void *) (src - 1);
+ if (*src++ == d) return (void *) (src - 1);
+ if (*src++ == d) return (void *) (src - 1);
+#if __riscv_xlen == 64
+ if (*src++ == d) return (void *) (src - 1);
+ if (*src++ == d) return (void *) (src - 1);
+ if (*src++ == d) return (void *) (src - 1);
+ if (*src++ == d) return (void *) (src - 1);
+#endif /* __riscv_xlen == 64 */
+ return (void *) src;
+#endif /* __riscv_zbb */
+ }
+#if __riscv_zilsd
+ uintxlen_t word2 = (val >> 32);
+ word2 ^= mask;
+
+ if (__libc_detect_null(word2))
+ {
+ src += MV_SZ / 2;
+#if __riscv_zbb
+ word2 = ~__LIBC_RISCV_ZBB_ORC_B(word2);
+ word2 = __LIBC_RISCV_ZBB_CNT_Z(word2);
+
+ return (void *) (src + (word2 >> 3));
+#else /* not __riscv_zbb */
+ if (*src++ == d) return (void *) (src - 1);
+ if (*src++ == d) return (void *) (src - 1);
+ if (*src++ == d) return (void *) (src - 1);
+ return (void *) src;
+#endif /* __riscv_zbb */
+ }
+#endif /* __riscv_zilsd */
+
+ src += MV_SZ;
+ } while (src < end_addr);
+
+ length &= MV_SZ - 1;
}
#endif /* not PREFER_SIZE_OVER_SPEED */
@@ -29,61 +29,142 @@ QUICKREF
memrchr
*/
-#include <_ansi.h>
-#include <string.h>
-#include <limits.h>
-#include "../../string/local.h"
+#include <sys/asm.h>
+#include <stddef.h>
+#include "rv_string.h"
+
+// Move size
+#if __riscv_zilsd
+#define MV_SZ 8
+
+// Offset is only 4 bytes for Zilsd/Zclsd since each register is 32 bits
+#define OFFSET 4
+#else
+#define MV_SZ SZREG
+#define OFFSET SZREG
+#endif
+
void *
memrchr (const void *src_void,
int c,
size_t length)
{
- const unsigned char *src = (const unsigned char *) src_void + length - 1;
+ const unsigned char *src = (const unsigned char *) src_void;
unsigned char d = c;
+ if (length) src += length - 1;
+
#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
- unsigned long *asrc;
- unsigned long mask;
- unsigned int i;
- while (UNALIGNED_X(src))
- {
- if (!length--)
- return NULL;
- if (*src == d)
- return (void *) src;
- src--;
- }
+ /*
+ We add one to the address because even if an address is already aligned,
+ when loading words the bytes preceding this address are read, so check
+ the single byte.
- if (!TOO_SMALL_LITTLE_BLOCK(length))
+ If the address has all the least significant bits set equaling MV_SZ - 1,
+ and has a length of at least MV_SZ, we can read a word starting from
+ src & ~(MV_SZ - 1) because no alignment is actually required
+ */
+ size_t align = (uintptr_t) (src + 1) & (MV_SZ - 1);
+
+ if (align)
{
- /* If we get this far, we know that length is large and src is
- word-aligned. */
- /* The fast code reads the source one word at a time and only
- performs the bytewise search on word-sized segments if they
- contain the search character, which is detected by XORing
- the word-sized segment with a word-sized block of the search
- character and then detecting for the presence of NUL in the
- result. */
- asrc = (unsigned long *) (src - LITTLE_BLOCK_SIZE + 1);
- mask = d << 8 | d;
- mask = mask << 16 | mask;
- for (i = 32; i < sizeof(mask) * 8; i <<= 1)
- mask = (mask << i) | mask;
-
- while (!TOO_SMALL_LITTLE_BLOCK(length))
+ if (length < align) align = length;
+
+ switch (align)
{
- if (DETECT_CHAR(*asrc, mask))
- break;
- length -= LITTLE_BLOCK_SIZE;
- asrc--;
+#if MV_SZ == 8
+ case 7:
+ if (*src-- == d) return (void *) (src + 1);
+ case 6:
+ if (*src-- == d) return (void *) (src + 1);
+ case 5:
+ if (*src-- == d) return (void *) (src + 1);
+ case 4:
+ if (*src-- == d) return (void *) (src + 1);
+#endif /* MV_SZ == 8 */
+ case 3:
+ if (*src-- == d) return (void *) (src + 1);
+ case 2:
+ if (*src-- == d) return (void *) (src + 1);
+ case 1:
+ if (*src-- == d) return (void *) (src + 1);
}
- /* If there are fewer than LITTLE_BLOCK_SIZE characters left,
- then we resort to the bytewise loop. */
+ length -= align;
+ }
+
+ const unsigned char *end_addr = src - (length & ~(MV_SZ - 1));
- src = (unsigned char *) asrc + LITTLE_BLOCK_SIZE - 1;
+ if (src > end_addr)
+ {
+ src -= MV_SZ - 1;
+
+ uintxlen_t mask = __libc_splat_byte(d);
+ uintlslen_t val;
+
+ do
+ {
+#if __riscv_zilsd
+ asm volatile ("ld %0, 0(%1)"
+ : "=R" (val)
+ : "r" (src)
+ );
+#else /* not riscv_zilsd */
+ val = *(uintxlen_t*) src;
+#endif /* __riscv_zilsd */
+
+#if __riscv_zilsd
+ uintxlen_t word2 = val >> 32;
+ word2 ^= mask;
+
+ if (__libc_detect_null(word2))
+ {
+#if __riscv_zbb
+ src += OFFSET;
+ word2 = ~__LIBC_RISCV_ZBB_ORC_B(word2);
+ word2 = __LIBC_RISCV_ZBB_CNT_Z_REV(word2);
+
+ return (void *) (src + OFFSET - 1 - (word2 >> 3));
+#else /* not __riscv_zbb */
+ src += MV_SZ - 1;
+ if (*src-- == d) return (void *) (src + 1);
+ if (*src-- == d) return (void *) (src + 1);
+ if (*src-- == d) return (void *) (src + 1);
+ return (void *) src;
+#endif /* __riscv_zbb */
+ }
+#endif /* __riscv_zilsd */
+ uintxlen_t word1 = val ^ mask;
+
+ if (__libc_detect_null(word1))
+ {
+#if __riscv_zbb
+ word1 = ~__LIBC_RISCV_ZBB_ORC_B(word1);
+ word1 = __LIBC_RISCV_ZBB_CNT_Z_REV(word1);
+
+ return (void *) (src + OFFSET - 1 - (word1 >> 3));
+#else /* not __riscv_zbb */
+ src += OFFSET - 1;
+ if (*src-- == d) return (void *) (src + 1);
+ if (*src-- == d) return (void *) (src + 1);
+ if (*src-- == d) return (void *) (src + 1);
+#if __riscv_xlen == 64
+ if (*src-- == d) return (void *) (src + 1);
+ if (*src-- == d) return (void *) (src + 1);
+ if (*src-- == d) return (void *) (src + 1);
+ if (*src-- == d) return (void *) (src + 1);
+#endif /* __riscv_xlen == 64 */
+ return (void *) src;
+#endif /* __riscv_zbb */
+ }
+
+ src -= MV_SZ;
+ } while (src > end_addr);
+
+ length &= MV_SZ - 1;
+ src = end_addr;
}
#endif /* not PREFER_SIZE_OVER_SPEED */
@@ -20,20 +20,24 @@
// Determine which intrinsics to use based on XLEN and endianness
#if __riscv_xlen == 64
- #define __LIBC_RISCV_ZBB_ORC_B(x) __riscv_orc_b_64(x)
+ #define __LIBC_RISCV_ZBB_ORC_B(x) __riscv_orc_b_64(x)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_ctz_64(x)
+ #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_ctz_64(x)
+ #define __LIBC_RISCV_ZBB_CNT_Z_REV(x) __riscv_clz_64(x)
#else
- #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_clz_64(x)
+ #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_clz_64(x)
+ #define __LIBC_RISCV_ZBB_CNT_Z_REV(x) __riscv_ctz_64(x)
#endif
#else
- #define __LIBC_RISCV_ZBB_ORC_B(x) __riscv_orc_b_32(x)
+ #define __LIBC_RISCV_ZBB_ORC_B(x) __riscv_orc_b_32(x)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_ctz_32(x)
+ #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_ctz_32(x)
+ #define __LIBC_RISCV_ZBB_CNT_Z_REV(x) __riscv_clz_32(x)
#else
- #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_clz_32(x)
+ #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_clz_32(x)
+ #define __LIBC_RISCV_ZBB_CNT_Z_REV(x) __riscv_ctz_32(x)
#endif
#endif
#endif
@@ -121,4 +125,33 @@ static __inline char *__libc_strcpy(char *dst, const char *src, bool ret_start)
}
+static __inline uintxlen_t __libc_splat_byte(unsigned char c)
+{
+ uintxlen_t val;
+
+#if __riscv_zbkb
+ asm volatile ("packh %0, %1, %1"
+ : "=r" (val)
+ : "r" (c)
+ );
+#if __riscv_xlen == 64
+ asm volatile ("packw %0, %0, %0"
+ : "+r" (val)
+ );
+#endif /* __riscv_xlen == 64 */
+ asm volatile ("pack %0, %0, %0"
+ : "+r" (val)
+ );
+#else /* not __riscv_zbkb */
+ val = (c << 8) | c;
+ val = (val << 16) | val;
+#if __riscv_xlen == 64
+ val = (val << 32) | val;
+#endif /* __riscv_xlen == 64 */
+#endif /* __riscv_zbkb */
+
+ return val;
+}
+
+
#endif /* _RV_STRING_H */
@@ -11,4 +11,11 @@ typedef uint32_t uintxlen_t;
# error __riscv_xlen must equal 32 or 64
#endif
+/* Load/Store length */
+#if __riscv_zilsd
+typedef uint64_t uintlslen_t;
+#else
+typedef uintxlen_t uintlslen_t;
+#endif
+
#endif /* _XLENINT_H */