I reran bench-strcpy and found no regressions with smaller strings.
The results file is attached.
On 27/06/24 6:28 pm, MAHESH BODAPATI wrote:
> From: Mahesh Bodapati <bmahi496@linux.ibm.com>
>
> This patch modifies the current POWER9 implementation of strcpy and
> stpcpy to optimize it for POWER9/10.
>
> Since no new POWER10 instructions are used, the original POWER9
> strcpy is modified instead of creating a new implementation for POWER10.
>
> The changes also affect stpcpy, which uses the same implementation
> with some additional code before returning.
>
> Improvements compared to POWER9 version:
>
> Use simple comparisons for the first ~512 bytes
> The main loop is good for long strings, but comparing 16B at a time
> is better for shorter strings.  After aligning the address to 16
> bytes, we unroll the loop four times, checking 128 bytes in each
> iteration.  There may be some overlap with the main loop for
> unaligned strings, but this approach is still faster for shorter
> strings.
>
> Loop over 64 bytes at a time for longer strings,
> using 4 consecutive lxv/stxv instructions.
>
> These changes showed an average improvement of 13%.
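
For readers who want the strategy in one place: a rough scalar C model
of the two-phase scheme described above (illustration only; the name
strcpy_model is hypothetical, and the real code below does each step
with 16B vector loads, vcmpequb/vminub NUL checks and vector stores):

  #include <stddef.h>
  #include <string.h>

  /* Scalar model of the two-phase strategy.  Like the assembly, it
     inspects whole chunks, so it may read past the NUL (the real code
     aligns first so such reads never cross a page).  */
  static char *
  strcpy_model (char *dst, const char *src)
  {
    size_t i = 0;
    size_t step = 16;                 /* 16B chunks up front...  */
    for (;; i += step)
      {
        if (i >= 512)
          step = 64;                  /* ...64B blocks past 512B.  */
        const char *nul = memchr (src + i, '\0', step);
        if (nul != NULL)
          {
            memcpy (dst + i, src + i, (size_t) (nul - src) + 1 - i);
            return dst;
          }
        memcpy (dst + i, src + i, step);
      }
  }
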
> ---
> sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 306 ++++++++++++++-----
> 1 file changed, 238 insertions(+), 68 deletions(-)
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> index 603bde1e39..206eeafcd6 100644
> --- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> @@ -42,22 +42,48 @@
>
> if USE_AS_STPCPY is defined.
>
> - The implementation can load bytes past a null terminator, but only
> - up to the next 16B boundary, so it never crosses a page. */
> + This implementation never reads across a page boundary, but may
> + read beyond the NUL terminator. */
>
> -/* Load quadword at addr+offset to vreg, check for null bytes,
> +/* Load 4 quadwords and merge them into one VR so a single compare
> +   can check all 64 bytes for NUL; branch to label if one is found. */
> +#define CHECK_64B(offset,addr,label) \
> + lxv 32+v4,(offset+0)(addr); \
> + lxv 32+v5,(offset+16)(addr); \
> + lxv 32+v6,(offset+32)(addr); \
> + lxv 32+v7,(offset+48)(addr); \
> + vminub v14,v4,v5; \
> + vminub v15,v6,v7; \
> + vminub v16,v14,v15; \
> + vcmpequb. v0,v16,v18; \
> + beq cr6,$+12; \
> + li r7,offset; \
> + b L(label); \
> + stxv 32+v4,(offset+0)(r11); \
> + stxv 32+v5,(offset+16)(r11); \
> + stxv 32+v6,(offset+32)(r11); \
> + stxv 32+v7,(offset+48)(r11)
> +
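
The vminub reduction above is the key trick: the byte-wise minimum of
the four quadwords is zero in some lane exactly when at least one of
them has a zero byte in that lane, so a single vcmpequb against zero
covers all 64 bytes.  A scalar C illustration (hypothetical name,
sketch only):

  #include <stdbool.h>
  #include <stdint.h>

  /* Scalar model of the vminub tree + vcmpequb. NUL test.  */
  static bool
  has_nul_64 (const uint8_t p[64])
  {
    for (int lane = 0; lane < 16; lane++)
      {
        uint8_t m = p[lane];                    /* v4 */
        for (int chunk = 1; chunk < 4; chunk++) /* v5, v6, v7 */
          if (p[chunk * 16 + lane] < m)
            m = p[chunk * 16 + lane];           /* vminub tree */
        if (m == 0)                             /* vcmpequb. with v18 */
          return true;
      }
    return false;
  }
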
> +/* Load quadword at addr+offset to vreg, check for NUL bytes,
> and branch to label if any are found. */
> -#define CHECK16(vreg,offset,addr,label) \
> - lxv vreg+32,offset(addr); \
> - vcmpequb. v6,vreg,v18; \
> +#define CHECK_16B(vreg,offset,addr,label) \
> + lxv vreg+32,offset(addr); \
> + vcmpequb. v15,vreg,v18; \
> bne cr6,L(label);
>
> +/* Store the leading bytes of vreg2, up to and including the NUL
> +   whose position is given by the compare mask in vreg1.  */
> +#define STORE_WITH_LEN(vreg1,vreg2,reg) \
> + vctzlsbb r8,vreg1; \
> + addi r9,r8,1; \
> + sldi r9,r9,56; \
> + stxvl 32+vreg2,reg,r9;
> +
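
A note on the encoding used here: vctzlsbb gives the byte index of the
first NUL (the vcmpequb mask is 0xFF there), and stxvl expects the byte
count in the top 8 bits of its length register, hence the sldi by 56.
A C sketch of the macro under those assumptions (hypothetical name):

  #include <stdint.h>
  #include <string.h>

  /* Model of STORE_WITH_LEN: store the data up to and including the
     NUL, i.e. index+1 bytes.  */
  static void
  store_with_len (uint8_t *dst, const uint8_t data[16],
                  const uint8_t mask[16])
  {
    unsigned idx = 0;                          /* vctzlsbb r8,vreg1 */
    while (idx < 16 && mask[idx] != 0xFF)
      idx++;
    uint64_t r9 = (uint64_t) (idx + 1) << 56;  /* addi; sldi r9,r9,56 */
    memcpy (dst, data, (size_t) (r9 >> 56));   /* stxvl 32+vreg2,reg,r9 */
  }
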
> .machine power9
> ENTRY_TOCLESS (FUNC_NAME, 4)
> CALL_MCOUNT 2
>
> - vspltisb v18,0 /* Zeroes in v18 */
> - vspltisb v19,-1 /* 0xFF bytes in v19 */
> + vspltisb v18,0 /* Zeroes in v18. */
> + vspltisb v19,-1 /* 0xFF bytes in v19. */
>
> /* Next 16B-aligned address. Prepare address for L(loop). */
> addi r5,r4,16
> @@ -70,14 +96,11 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
> lvsr v1,0,r4
> vperm v0,v19,v0,v1
>
> - vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + vcmpequb. v6,v0,v18 /* 0xff if byte is NUL, 0x00 otherwise. */
> beq cr6,L(no_null)
>
> - /* There's a null byte. */
> - vctzlsbb r8,v6 /* Number of trailing zeroes */
> - addi r9,r8,1 /* Add null byte. */
> - sldi r10,r9,56 /* stxvl wants size in top 8 bits. */
> - stxvl 32+v0,r3,r10 /* Partial store */
> + /* There's a NUL byte. */
> + STORE_WITH_LEN(v6,v0,r3)
>
> #ifdef USE_AS_STPCPY
> /* stpcpy returns the dest address plus the size not counting the
> @@ -87,17 +110,22 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
> blr
>
> L(no_null):
> - sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> - stxvl 32+v0,r3,r10 /* Partial store */
> + sldi r10,r8,56 /* stxvl wants size in top 8 bits. */
> + stxvl 32+v0,r3,r10 /* Partial store. */
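
For context, the unaligned head handled above relies on the lvsr/vperm
fixup from the original POWER9 code: the 16B-aligned quadword
containing src is loaded, then permuted so the byte at src lands in
lane 0, and the remaining lanes are filled from v19 (0xFF bytes) so
they can never look like a NUL.  A scalar C model of my reading of it
(sketch only; the exact lane layout is my assumption):

  #include <stdint.h>

  /* Bytes of the aligned quadword that lie beyond the string's part
     of it are replaced by 0xFF, so any NUL found in OUT is genuine.
     The aligned load never crosses a page boundary.  */
  static void
  head_quadword (const uint8_t *src, uint8_t out[16])
  {
    const uint8_t *aligned =
      (const uint8_t *) ((uintptr_t) src & ~(uintptr_t) 15);
    unsigned skew = (unsigned) (src - aligned);  /* 0..15 */
    for (unsigned i = 0; i < 16; i++)            /* vperm v0,v19,v0,v1 */
      out[i] = (skew + i < 16) ? src[i] : 0xFF;
  }
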
>
> +/* The main loop is optimized for longer strings (> 512 bytes),
> + so checking the first bytes in 16B chunks benefits shorter
> + strings a lot. */
> .p2align 4
> -L(loop):
> - CHECK16(v0,0,r5,tail1)
> - CHECK16(v1,16,r5,tail2)
> - CHECK16(v2,32,r5,tail3)
> - CHECK16(v3,48,r5,tail4)
> - CHECK16(v4,64,r5,tail5)
> - CHECK16(v5,80,r5,tail6)
> +L(aligned):
> + CHECK_16B(v0,0,r5,tail1)
> + CHECK_16B(v1,16,r5,tail2)
> + CHECK_16B(v2,32,r5,tail3)
> + CHECK_16B(v3,48,r5,tail4)
> + CHECK_16B(v4,64,r5,tail5)
> + CHECK_16B(v5,80,r5,tail6)
> + CHECK_16B(v6,96,r5,tail7)
> + CHECK_16B(v7,112,r5,tail8)
>
> stxv 32+v0,0(r11)
> stxv 32+v1,16(r11)
> @@ -105,21 +133,146 @@ L(loop):
> stxv 32+v3,48(r11)
> stxv 32+v4,64(r11)
> stxv 32+v5,80(r11)
> + stxv 32+v6,96(r11)
> + stxv 32+v7,112(r11)
> +
> + addi r11,r11,128
> +
> + CHECK_16B(v0,128,r5,tail1)
> + CHECK_16B(v1,128+16,r5,tail2)
> + CHECK_16B(v2,128+32,r5,tail3)
> + CHECK_16B(v3,128+48,r5,tail4)
> + CHECK_16B(v4,128+64,r5,tail5)
> + CHECK_16B(v5,128+80,r5,tail6)
> + CHECK_16B(v6,128+96,r5,tail7)
> + CHECK_16B(v7,128+112,r5,tail8)
> +
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> + stxv 32+v4,64(r11)
> + stxv 32+v5,80(r11)
> + stxv 32+v6,96(r11)
> + stxv 32+v7,112(r11)
> +
> + addi r11,r11,128
> +
> + CHECK_16B(v0,256,r5,tail1)
> + CHECK_16B(v1,256+16,r5,tail2)
> + CHECK_16B(v2,256+32,r5,tail3)
> + CHECK_16B(v3,256+48,r5,tail4)
> + CHECK_16B(v4,256+64,r5,tail5)
> + CHECK_16B(v5,256+80,r5,tail6)
> + CHECK_16B(v6,256+96,r5,tail7)
> + CHECK_16B(v7,256+112,r5,tail8)
> +
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> + stxv 32+v4,64(r11)
> + stxv 32+v5,80(r11)
> + stxv 32+v6,96(r11)
> + stxv 32+v7,112(r11)
> +
> + addi r11,r11,128
> +
> + CHECK_16B(v0,384,r5,tail1)
> + CHECK_16B(v1,384+16,r5,tail2)
> + CHECK_16B(v2,384+32,r5,tail3)
> + CHECK_16B(v3,384+48,r5,tail4)
> + CHECK_16B(v4,384+64,r5,tail5)
> + CHECK_16B(v5,384+80,r5,tail6)
> + CHECK_16B(v6,384+96,r5,tail7)
> + CHECK_16B(v7,384+112,r5,tail8)
> +
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> + stxv 32+v4,64(r11)
> + stxv 32+v5,80(r11)
> + stxv 32+v6,96(r11)
> + stxv 32+v7,112(r11)
> +
> + /* Align src pointer down to a 64B boundary. */
> + addi r5,r4,512
> + clrrdi r5,r5,6
> + subf r7,r4,r5
> + add r11,r3,r7
> +
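
The realignment above merits a gloss: src+512 is rounded down to a
64-byte boundary (clrrdi clears the low 6 bits), which for unaligned
strings may step back over up to 63 bytes that were already copied;
re-copying them is harmless, and dst is advanced by the same distance
so the two pointers stay in lockstep.  In C terms (sketch only,
hypothetical name):

  #include <stdint.h>

  /* Model of the addi/clrrdi/subf/add sequence before the 64B loop.  */
  static void
  align_for_64b_loop (const char *src, char *dst,
                      const char **src_out, char **dst_out)
  {
    uintptr_t r5 = ((uintptr_t) src + 512) & ~(uintptr_t) 63;
    uintptr_t r7 = r5 - (uintptr_t) src;   /* bytes already handled */
    *src_out = (const char *) r5;          /* new src (r5) */
    *dst_out = dst + r7;                   /* new dst (r11 = r3 + r7) */
  }
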
> +/* Switch to a more aggressive approach, checking 64B at a time. */
> + .p2align 5
> +L(strcpy_loop):
> + CHECK_64B(0,r5,tail_64b)
> + CHECK_64B(64,r5,tail_64b)
> + CHECK_64B(128,r5,tail_64b)
> + CHECK_64B(192,r5,tail_64b)
>
> - addi r5,r5,96
> - addi r11,r11,96
> + CHECK_64B(256,r5,tail_64b)
> + CHECK_64B(256+64,r5,tail_64b)
> + CHECK_64B(256+128,r5,tail_64b)
> + CHECK_64B(256+192,r5,tail_64b)
> + addi r5,r5,512
> + addi r11,r11,512
> +
> + b L(strcpy_loop)
> +
> + .p2align 5
> +L(tail_64b):
> + /* OK, we found a NUL byte. Let's look for it in the current 64-byte
> + block and mark it in its corresponding VR. */
> + add r11,r11,r7
> + vcmpequb. v8,v4,v18
> + beq cr6,L(no_null_16B)
> + /* There's a NUL byte. */
> + STORE_WITH_LEN(v8,v4,r11)
> +#ifdef USE_AS_STPCPY
> + add r3,r11,r8
> +#endif
> + blr
> +
> +L(no_null_16B):
> + stxv 32+v4,0(r11)
> + vcmpequb. v8,v5,v18
> + beq cr6,L(no_null_32B)
> + /* There's a NUL byte. */
> + addi r11,r11,16
> + STORE_WITH_LEN(v8,v5,r11)
> +#ifdef USE_AS_STPCPY
> + add r3,r11,r8
> +#endif
> + blr
> +
> +L(no_null_32B):
> + stxv 32+v5,16(r11)
> + vcmpequb. v8,v6,v18
> + beq cr6,L(no_null_48B)
> + /* There's a NUL byte. */
> + addi r11,r11,32
> + STORE_WITH_LEN(v8,v6,r11)
> +#ifdef USE_AS_STPCPY
> + add r3,r11,r8
> +#endif
> + blr
>
> - b L(loop)
> +L(no_null_48B):
> + stxv 32+v6,32(r11)
> +	vcmpequb. v8,v7,v18
> + /* There's a NUL byte. */
> + addi r11,r11,48
> + STORE_WITH_LEN(v8,v7,r11)
> +#ifdef USE_AS_STPCPY
> + add r3,r11,r8
> +#endif
> + blr
>
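
Summarizing the L(tail_64b) chain above in C (sketch only, hypothetical
name): each quadword of the 64-byte block known to contain a NUL is
stored in full until the one holding the NUL, which gets a partial
store; the address of the stored NUL is what the stpcpy variant
returns.

  #include <stddef.h>
  #include <string.h>

  /* Model of L(tail_64b)..L(no_null_48B).  BLOCK is the 64-byte
     chunk that CHECK_64B flagged; it must contain a NUL.  Returns
     the address of the copied NUL, as stpcpy would.  */
  static char *
  tail_64b_model (char *dst, const char block[64])
  {
    for (int q = 0; q < 4; q++)                 /* v4, v5, v6, v7 */
      {
        const char *nul = memchr (block + q * 16, '\0', 16);
        if (nul != NULL)
          {
            size_t part = (size_t) (nul - (block + q * 16)) + 1;
            memcpy (dst + q * 16, block + q * 16, part);   /* stxvl */
            return dst + q * 16 + part - 1;
          }
        memcpy (dst + q * 16, block + q * 16, 16);         /* stxv */
      }
    return NULL;  /* Unreachable: BLOCK contains a NUL.  */
  }
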
> .p2align 4
> L(tail1):
> - vctzlsbb r8,v6 /* Number of trailing zeroes */
> - addi r9,r8,1 /* Add null terminator */
> - sldi r9,r9,56 /* stxvl wants size in top 8 bits */
> - stxvl 32+v0,r11,r9 /* Partial store */
> + /* There's a NUL byte. */
> + STORE_WITH_LEN(v15,v0,r11)
> #ifdef USE_AS_STPCPY
> - /* stpcpy returns the dest address plus the size not counting the
> - final '\0'. */
> add r3,r11,r8
> #endif
> blr
> @@ -127,11 +280,9 @@ L(tail1):
> .p2align 4
> L(tail2):
> stxv 32+v0,0(r11)
> - vctzlsbb r8,v6
> - addi r9,r8,1
> - sldi r9,r9,56
> - addi r11,r11,16
> - stxvl 32+v1,r11,r9
> + /* There's a NUL byte. */
> + addi r11,r11,16
> + STORE_WITH_LEN(v15,v1,r11)
> #ifdef USE_AS_STPCPY
> add r3,r11,r8
> #endif
> @@ -141,11 +292,8 @@ L(tail2):
> L(tail3):
> stxv 32+v0,0(r11)
> stxv 32+v1,16(r11)
> - vctzlsbb r8,v6
> - addi r9,r8,1
> - sldi r9,r9,56
> - addi r11,r11,32
> - stxvl 32+v2,r11,r9
> + addi r11,r11,32
> + STORE_WITH_LEN(v15,v2,r11)
> #ifdef USE_AS_STPCPY
> add r3,r11,r8
> #endif
> @@ -156,11 +304,8 @@ L(tail4):
> stxv 32+v0,0(r11)
> stxv 32+v1,16(r11)
> stxv 32+v2,32(r11)
> - vctzlsbb r8,v6
> - addi r9,r8,1
> - sldi r9,r9,56
> - addi r11,r11,48
> - stxvl 32+v3,r11,r9
> + addi r11,r11,48
> + STORE_WITH_LEN(v15,v3,r11)
> #ifdef USE_AS_STPCPY
> add r3,r11,r8
> #endif
> @@ -168,34 +313,59 @@ L(tail4):
>
> .p2align 4
> L(tail5):
> - stxv 32+v0,0(r11)
> - stxv 32+v1,16(r11)
> - stxv 32+v2,32(r11)
> - stxv 32+v3,48(r11)
> - vctzlsbb r8,v6
> - addi r9,r8,1
> - sldi r9,r9,56
> - addi r11,r11,64
> - stxvl 32+v4,r11,r9
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> + addi r11,r11,64
> + STORE_WITH_LEN(v15,v4,r11)
> #ifdef USE_AS_STPCPY
> - add r3,r11,r8
> + add r3,r11,r8
> #endif
> blr
>
> .p2align 4
> L(tail6):
> - stxv 32+v0,0(r11)
> - stxv 32+v1,16(r11)
> - stxv 32+v2,32(r11)
> - stxv 32+v3,48(r11)
> - stxv 32+v4,64(r11)
> - vctzlsbb r8,v6
> - addi r9,r8,1
> - sldi r9,r9,56
> - addi r11,r11,80
> - stxvl 32+v5,r11,r9
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> + stxv 32+v4,64(r11)
> + addi r11,r11,80
> + STORE_WITH_LEN(v15,v5,r11)
> #ifdef USE_AS_STPCPY
> - add r3,r11,r8
> + add r3,r11,r8
> +#endif
> + blr
> +
> + .p2align 4
> +L(tail7):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> + stxv 32+v4,64(r11)
> + stxv 32+v5,80(r11)
> + addi r11,r11,96
> + STORE_WITH_LEN(v15,v6,r11)
> +#ifdef USE_AS_STPCPY
> + add r3,r11,r8
> +#endif
> + blr
> +
> + .p2align 4
> +L(tail8):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> + stxv 32+v4,64(r11)
> + stxv 32+v5,80(r11)
> + stxv 32+v6,96(r11)
> + addi r11,r11,112
> + STORE_WITH_LEN(v15,v7,r11)
> +#ifdef USE_AS_STPCPY
> + add r3,r11,r8
> #endif
> blr
>
strcpy_power9/10 (new, % improvement)  strcpy_power9_old_version (old)
align1=1, align2=0, len=1: 1.21 ( 4.30%) 1.26
align1=0, align2=1, len=1: 1.21 ( 4.14%) 1.26
align1=0, align2=0, len=1: 1.21 ( 4.35%) 1.26
align1=1, align2=1, len=1: 1.21 ( 4.38%) 1.26
align1=1, align2=0, len=2: 1.21 ( 4.16%) 1.26
align1=0, align2=1, len=2: 1.21 ( 4.29%) 1.26
align1=0, align2=0, len=2: 1.21 ( 9.76%) 1.34
align1=1, align2=1, len=2: 1.21 ( 9.52%) 1.33
align1=1, align2=0, len=4: 1.21 ( 9.74%) 1.34
align1=0, align2=1, len=4: 1.21 ( 9.53%) 1.33
align1=0, align2=0, len=4: 1.21 ( 9.62%) 1.33
align1=1, align2=1, len=4: 1.21 ( 9.63%) 1.33
align1=1, align2=0, len=8: 1.21 ( 9.75%) 1.34
align1=0, align2=1, len=8: 1.21 ( 9.54%) 1.33
align1=0, align2=0, len=8: 1.20 ( 9.75%) 1.34
align1=1, align2=1, len=8: 1.21 ( 9.38%) 1.33
align1=1, align2=0, len=16: 1.39 ( -0.03%) 1.39
align1=0, align2=1, len=16: 1.39 ( -0.08%) 1.39
align1=0, align2=0, len=16: 1.40 ( -0.64%) 1.39
align1=1, align2=1, len=16: 1.39 ( -0.29%) 1.39
align1=1, align2=0, len=32: 1.55 ( 0.61%) 1.56
align1=0, align2=1, len=32: 1.55 ( -0.35%) 1.55
align1=0, align2=0, len=32: 1.55 ( -0.31%) 1.55
align1=1, align2=1, len=32: 1.54 ( 0.73%) 1.55
align1=1, align2=0, len=64: 1.71 ( 4.39%) 1.79
align1=0, align2=1, len=64: 1.74 ( 3.44%) 1.80
align1=0, align2=0, len=64: 1.72 ( 4.98%) 1.80
align1=1, align2=1, len=64: 1.80 ( -0.06%) 1.80
align1=1, align2=0, len=128: 1.99 ( 14.46%) 2.32
align1=0, align2=1, len=128: 1.99 ( 13.95%) 2.32
align1=0, align2=0, len=128: 1.98 ( 19.49%) 2.46
align1=1, align2=1, len=128: 1.98 ( 19.33%) 2.45
align1=1, align2=0, len=256: 3.09 ( 3.48%) 3.20
align1=0, align2=1, len=256: 3.10 ( 3.57%) 3.22
align1=0, align2=0, len=256: 3.09 ( 3.86%) 3.22
align1=1, align2=1, len=256: 3.10 ( 3.48%) 3.21
align1=1, align2=0, len=512: 5.33 ( 8.99%) 5.86
align1=0, align2=1, len=512: 5.32 ( 9.84%) 5.90
align1=0, align2=0, len=512: 5.26 ( 10.61%) 5.89
align1=1, align2=1, len=512: 5.26 ( 9.48%) 5.81
align1=1, align2=0, len=1024: 9.89 ( 11.99%) 11.24
align1=0, align2=1, len=1024: 9.80 ( 12.02%) 11.14
align1=0, align2=0, len=1024: 9.67 ( 13.61%) 11.19
align1=1, align2=1, len=1024: 9.67 ( 12.79%) 11.09
align1=1, align2=0, len=2048: 18.05 ( 16.76%) 21.68
align1=0, align2=1, len=2048: 17.92 ( 17.24%) 21.66
align1=0, align2=0, len=2048: 17.40 ( 19.23%) 21.54
align1=1, align2=1, len=2048: 17.40 ( 19.22%) 21.54
align1=1, align2=0, len=4096: 42.17 ( 10.47%) 47.11
align1=0, align2=1, len=4096: 42.36 ( 10.06%) 47.09
align1=0, align2=0, len=4096: 41.75 ( 10.97%) 46.89
align1=1, align2=1, len=4096: 42.52 ( 9.34%) 46.90
align1=1, align2=0, len=8192: 76.01 ( 14.94%) 89.36
align1=0, align2=1, len=8192: 74.84 ( 16.41%) 89.54
align1=0, align2=0, len=8192: 72.47 ( 18.92%) 89.39
align1=1, align2=1, len=8192: 70.79 ( 20.86%) 89.45
align1=1, align2=0, len=16384: 136.33 ( 21.25%) 173.13
align1=0, align2=1, len=16384: 137.39 ( 20.27%) 172.33
align1=0, align2=0, len=16384: 135.00 ( 21.86%) 172.76
align1=1, align2=1, len=16384: 133.84 ( 22.54%) 172.78
align1=1, align2=0, len=32768: 263.05 ( 22.94%) 341.35
align1=0, align2=1, len=32768: 264.12 ( 22.70%) 341.68
align1=0, align2=0, len=32768: 258.43 ( 24.32%) 341.48
align1=1, align2=1, len=32768: 255.88 ( 25.04%) 341.38