On Fri, Jul 03, 2015 at 03:38:24PM +0200, Stefan Liebler wrote:
> This patch provides optimized versions of stpcpy and wcpcpy with the z13
> vector instructions.
> +
> + /* Find zero in 16byte aligned loop. */
> +.Lloop2:
> + vst %v18,0(%r5,%r2) /* Store previous part without zero to dst. */
> + aghi %r5,16
> +.Lloop1:
> + vl %v16,0(%r5,%r3) /* Load s. */
> + vfenezbs %v17,%v16,%v16 /* Find element not equal with zero search. */
> + je .Lfound_v16 /* Jump away if zero was found. */
> + vl %v18,16(%r5,%r3) /* Load next part of s. */
> + vst %v16,0(%r5,%r2) /* Store previous part without zero to dst. */
> + aghi %r5,16
> + vfenezbs %v17,%v18,%v18
> + je .Lfound_v18
> + vl %v16,16(%r5,%r3)
> + vst %v18,0(%r5,%r2)
> + aghi %r5,16
> + vfenezbs %v17,%v16,%v16
> + je .Lfound_v16
> + vl %v18,16(%r5,%r3)
> + vst %v16,0(%r5,%r2)
> + aghi %r5,16
> + vfenezbs %v17,%v18,%v18
> + jo .Lloop2 /* No zero found -> loop. */
> +
Here you could improve performance by using a separate exit path for each
of the four checks, so that src and dest are incremented by 64 once at the
end of the loop. That allows simpler addressing, which may help with
out-of-order execution. If code size is a concern, then the following
pattern looks promising:
while (1)
{
if (has_zero(src))
goto add0;
if (has_zero(src+16))
goto add16;
if (has_zero(src+32))
goto add32;
if (has_zero(src+48))
goto add48;
src+=64;
dest+=64;
}
add48:
src+=16;
dest+=16;
add32:
src+=16;
dest+=16;
add16:
src+=16;
dest+=16;
add0:
> +.Lfound_v18:
> + vlr %v16,%v18
> +.Lfound_v16:
> + la %r3,0(%r5,%r2)
> + vlgvb %r1,%v17,7 /* Load byte index of zero. */
> + vstl %v16,%r1,0(%r3) /* Copy characters including zero. */
> + algr %r5,%r1
> + la %r2,0(%r5,%r2) /* Return pointer to zero. */
> + br %r14
@@ -36,7 +36,7 @@ string-bench := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
strcoll
-wcsmbs-bench := wcslen wcsnlen wcscpy
+wcsmbs-bench := wcslen wcsnlen wcscpy wcpcpy
string-bench-all := $(string-bench) ${wcsmbs-bench}
# We have to generate locales
@@ -18,19 +18,34 @@
#define STRCPY_RESULT(dst, len) ((dst) + (len))
#define TEST_MAIN
-#define TEST_NAME "stpcpy"
+#ifndef WIDE
+# define TEST_NAME "stpcpy"
+#else
+# define TEST_NAME "wcpcpy"
+#endif /* WIDE */
#include "bench-string.h"
-
-char *simple_stpcpy (char *, const char *);
-
-IMPL (simple_stpcpy, 0)
-IMPL (stpcpy, 1)
-
-char *
-simple_stpcpy (char *dst, const char *src)
+#ifndef WIDE
+# define CHAR char
+# define SIMPLE_STPCPY simple_stpcpy
+# define STPCPY stpcpy
+#else
+# include <wchar.h>
+# define CHAR wchar_t
+# define SIMPLE_STPCPY simple_wcpcpy
+# define STPCPY wcpcpy
+#endif /* WIDE */
+
+CHAR *SIMPLE_STPCPY (CHAR *, const CHAR *);
+
+IMPL (SIMPLE_STPCPY, 0)
+IMPL (STPCPY, 1)
+
+CHAR *
+SIMPLE_STPCPY (CHAR *dst, const CHAR *src)
{
while ((*dst++ = *src++) != '\0');
return dst - 1;
}
+#undef CHAR
#include "bench-strcpy.c"
new file mode 100644
@@ -0,0 +1,20 @@
+/* Measure wcpcpy functions.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include "bench-stpcpy.c"
@@ -29,6 +29,12 @@
# define __stpcpy stpcpy
#endif
+#ifdef STPCPY
+extern __typeof (__stpcpy) STPCPY;
+# undef __stpcpy
+# define __stpcpy STPCPY
+#endif
+
/* Copy SRC to DEST, returning the address of the terminating '\0' in DEST. */
char *
__stpcpy (dest, src)
@@ -1,4 +1,4 @@
-/* Test and measure stpcpy functions.
+/* Test stpcpy functions.
Copyright (C) 1999-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Written by Jakub Jelinek <jakub@redhat.com>, 1999.
@@ -19,19 +19,34 @@
#define STRCPY_RESULT(dst, len) ((dst) + (len))
#define TEST_MAIN
-#define TEST_NAME "stpcpy"
+#ifndef WIDE
+# define TEST_NAME "stpcpy"
+#else
+# define TEST_NAME "wcpcpy"
+#endif /* !WIDE */
#include "test-string.h"
-
-char *simple_stpcpy (char *, const char *);
-
-IMPL (simple_stpcpy, 0)
-IMPL (stpcpy, 1)
-
-char *
-simple_stpcpy (char *dst, const char *src)
+#ifndef WIDE
+# define CHAR char
+# define SIMPLE_STPCPY simple_stpcpy
+# define STPCPY stpcpy
+#else
+# include <wchar.h>
+# define CHAR wchar_t
+# define SIMPLE_STPCPY simple_wcpcpy
+# define STPCPY wcpcpy
+#endif /* !WIDE */
+
+CHAR *SIMPLE_STPCPY (CHAR *, const CHAR *);
+
+IMPL (SIMPLE_STPCPY, 0)
+IMPL (STPCPY, 1)
+
+CHAR *
+SIMPLE_STPCPY (CHAR *dst, const CHAR *src)
{
while ((*dst++ = *src++) != '\0');
return dst - 1;
}
+#undef CHAR
#include "test-strcpy.c"
@@ -1,11 +1,13 @@
ifeq ($(subdir),string)
sysdep_routines += strlen strlen-vx strlen-c \
strnlen strnlen-vx strnlen-c \
- strcpy strcpy-vx
+ strcpy strcpy-vx \
+ stpcpy stpcpy-vx stpcpy-c
endif
ifeq ($(subdir),wcsmbs)
sysdep_routines += wcslen wcslen-vx wcslen-c \
wcsnlen wcsnlen-vx wcsnlen-c \
- wcscpy wcscpy-vx wcscpy-c
+ wcscpy wcscpy-vx wcscpy-c \
+ wcpcpy wcpcpy-vx wcpcpy-c
endif
@@ -88,6 +88,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_VX_IMPL (strcpy);
IFUNC_VX_IMPL (wcscpy);
+ IFUNC_VX_IMPL (stpcpy);
+ IFUNC_VX_IMPL (wcpcpy);
+
#endif /* HAVE_S390_VX_ASM_SUPPORT */
return i;
new file mode 100644
@@ -0,0 +1,34 @@
+/* Default stpcpy implementation for S/390.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+# define STPCPY __stpcpy_c
+# undef libc_hidden_def
+# undef weak_alias
+# undef libc_hidden_builtin_def
+# ifdef SHARED
+# define libc_hidden_def(name) \
+ __hidden_ver1 (__stpcpy_c, __GI___stpcpy, __stpcpy_c);
+# define libc_hidden_builtin_def(name) \
+ strong_alias (__stpcpy_c, __stpcpy_c_1); \
+ __hidden_ver1 (__stpcpy_c_1, __GI_stpcpy, __stpcpy_c_1);
+# endif /* SHARED */
+
+
+# include <string/stpcpy.c>
+#endif /* HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) */
new file mode 100644
@@ -0,0 +1,100 @@
+/* Vector optimized 32/64 bit S/390 version of stpcpy.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+
+# include "sysdep.h"
+# include "asm-syntax.h"
+
+ .text
+
+/* char * stpcpy (char *dest, const char *src)
+ Copy string src to dest returning a pointer to its end.
+
+ Register usage:
+ -r1=tmp
+ -r2=dest and return value
+ -r3=src
+ -r4=tmp
+ -r5=current_len
+ -v16=part of src
+ -v17=index of zero
+ -v18=part of src
+*/
+ENTRY(__stpcpy_vx)
+ .machine "z13"
+ .machinemode "zarch_nohighgprs"
+
+ vlbb %v16,0(%r3),6 /* Load s until next 4k-byte boundary. */
+ lcbb %r1,0(%r3),6 /* Get bytes to 4k-byte boundary or 16. */
+
+ vfenezb %v17,%v16,%v16 /* Find element not equal with zero search. */
+ vlgvb %r5,%v17,7 /* Load zero index or 16 if not found. */
+ clrjl %r5,%r1,.Lfound_align /* If found zero within loaded bytes,
+ copy bytes before and return. */
+
+ /* Align s to 16 byte. */
+ risbgn %r4,%r3,60,128+63,0 /* %r4 = bits 60-63 of %r3 'and' 15. */
+ lghi %r5,15 /* current_len = 15. */
+ slr %r5,%r4 /* Compute highest index to 16byte boundary. */
+
+ vstl %v16,%r5,0(%r2) /* Copy loaded characters - no zero. */
+ ahi %r5,1 /* Start loop at next character. */
+
+ j .Lloop1
+
+.Lfound_align:
+ vstl %v16,%r5,0(%r2) /* Copy characters including zero. */
+ la %r2,0(%r5,%r2) /* Return pointer to zero. */
+ br %r14
+
+ /* Find zero in 16byte aligned loop. */
+.Lloop2:
+ vst %v18,0(%r5,%r2) /* Store previous part without zero to dst. */
+ aghi %r5,16
+.Lloop1:
+ vl %v16,0(%r5,%r3) /* Load s. */
+ vfenezbs %v17,%v16,%v16 /* Find element not equal with zero search. */
+ je .Lfound_v16 /* Jump away if zero was found. */
+ vl %v18,16(%r5,%r3) /* Load next part of s. */
+ vst %v16,0(%r5,%r2) /* Store previous part without zero to dst. */
+ aghi %r5,16
+ vfenezbs %v17,%v18,%v18
+ je .Lfound_v18
+ vl %v16,16(%r5,%r3)
+ vst %v18,0(%r5,%r2)
+ aghi %r5,16
+ vfenezbs %v17,%v16,%v16
+ je .Lfound_v16
+ vl %v18,16(%r5,%r3)
+ vst %v16,0(%r5,%r2)
+ aghi %r5,16
+ vfenezbs %v17,%v18,%v18
+ jo .Lloop2 /* No zero found -> loop. */
+
+.Lfound_v18:
+ vlr %v16,%v18
+.Lfound_v16:
+ la %r3,0(%r5,%r2)
+ vlgvb %r1,%v17,7 /* Load byte index of zero. */
+ vstl %v16,%r1,0(%r3) /* Copy characters including zero. */
+ algr %r5,%r1
+ la %r2,0(%r5,%r2) /* Return pointer to zero. */
+ br %r14
+END(__stpcpy_vx)
+#endif /* HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) */
new file mode 100644
@@ -0,0 +1,30 @@
+/* Multiple versions of stpcpy.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+# define NO_MEMPCPY_STPCPY_REDIRECT
+# include <string.h>
+# include <ifunc-resolve.h>
+
+s390_vx_libc_ifunc (__stpcpy)
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_builtin_def (stpcpy)
+
+#else
+# include <string/stpcpy.c>
+#endif /* !(defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)) */
new file mode 100644
@@ -0,0 +1,25 @@
+/* Default wcpcpy implementation for S/390.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+# define WCPCPY __wcpcpy_c
+
+# include <wchar.h>
+extern __typeof (__wcpcpy) __wcpcpy_c;
+# include <wcsmbs/wcpcpy.c>
+#endif
new file mode 100644
@@ -0,0 +1,108 @@
+/* Vector optimized 32/64 bit S/390 version of wcpcpy.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+
+# include "sysdep.h"
+# include "asm-syntax.h"
+
+ .text
+
+/* wchar_t * wcpcpy (wchar_t *dest, const wchar_t *src)
+ Copy string src to dest returning a pointer to its end.
+
+ Register usage:
+ -r0=border-len for switching to vector-instructions
+ -r1=tmp
+ -r2=dest and return value
+ -r3=src
+ -r4=tmp
+ -r5=current_len
+ -v16=part of src
+ -v17=index of zero
+ -v18=part of src
+*/
+ENTRY(__wcpcpy_vx)
+ .machine "z13"
+ .machinemode "zarch_nohighgprs"
+
+ vlbb %v16,0(%r3),6 /* Load s until next 4k-byte boundary. */
+ lcbb %r1,0(%r3),6 /* Get bytes to 4k-byte boundary or 16. */
+
+ tmll %r3,3 /* Test if s is 4-byte aligned? */
+ jne .Lfallback /* And use common-code variant if not. */
+
+ vfenezf %v17,%v16,%v16 /* Find element not equal with zero search. */
+ vlgvb %r5,%v17,7 /* Load zero index or 16 if not found. */
+ clrjl %r5,%r1,.Lfound_align /* If found zero within loaded bytes,
+ copy bytes before and return. */
+
+ /* Align s to 16 byte. */
+ risbgn %r4,%r3,60,128+63,0 /* %r4 = bits 60-63 of %r3 'and' 15. */
+ lghi %r5,15 /* current_len = 15. */
+ slr %r5,%r4 /* Compute highest index to 16byte boundary. */
+
+ vstl %v16,%r5,0(%r2) /* Copy loaded characters - no zero. */
+ ahi %r5,1 /* Start loop at next character. */
+
+ j .Lloop1
+
+ .Lfound_align:
+ aghi %r5,3 /* Also copy remaining bytes of zero. */
+ vstl %v16,%r5,0(%r2) /* Copy characters including zero. */
+ lay %r2,-3(%r5,%r2) /* Return pointer to zero. */
+ br %r14
+
+ /* Find zero in 16byte aligned loop. */
+.Lloop2:
+ vst %v18,0(%r5,%r2) /* Store previous part without zero to dst. */
+ aghi %r5,16
+.Lloop1:
+ vl %v16,0(%r5,%r3) /* Load s. */
+ vfenezfs %v17,%v16,%v16 /* Find element not equal with zero search. */
+ je .Lfound_v16 /* Jump away if zero was found. */
+ vl %v18,16(%r5,%r3) /* Load next part of s. */
+ vst %v16,0(%r5,%r2) /* Store previous part without zero to dst. */
+ aghi %r5,16
+ vfenezfs %v17,%v18,%v18
+ je .Lfound_v18
+ vl %v16,16(%r5,%r3)
+ vst %v18,0(%r5,%r2)
+ aghi %r5,16
+ vfenezfs %v17,%v16,%v16
+ je .Lfound_v16
+ vl %v18,16(%r5,%r3)
+ vst %v16,0(%r5,%r2)
+ aghi %r5,16
+ vfenezfs %v17,%v18,%v18
+ jo .Lloop2 /* No zero found -> loop. */
+
+.Lfound_v18:
+ vlr %v16,%v18
+.Lfound_v16:
+ la %r3,0(%r5,%r2)
+ vlgvb %r1,%v17,7 /* Load byte index of zero. */
+ aghi %r1,3 /* Also copy remaining bytes of zero. */
+ vstl %v16,%r1,0(%r3) /* Copy characters including zero. */
+ algr %r5,%r1
+ lay %r2,-3(%r5,%r2) /* Return pointer to zero. */
+ br %r14
+.Lfallback:
+ jg __wcpcpy_c
+END(__wcpcpy_vx)
+#endif /* HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) */
new file mode 100644
@@ -0,0 +1,28 @@
+/* Multiple versions of wcpcpy.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+# include <wchar.h>
+# include <ifunc-resolve.h>
+
+s390_vx_libc_ifunc (__wcpcpy)
+weak_alias (__wcpcpy, wcpcpy)
+
+#else
+# include <wcsmbs/wcpcpy.c>
+#endif /* !(defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)) */
@@ -42,7 +42,8 @@ routines := wcscat wcschr wcscmp wcscpy wcscspn wcsdup wcslen wcsncat \
isoc99_swscanf isoc99_vswscanf \
mbrtoc16 c16rtomb
-strop-tests := wcscmp wcsncmp wmemcmp wcslen wcschr wcsrchr wcscpy wcsnlen
+strop-tests := wcscmp wcsncmp wmemcmp wcslen wcschr wcsrchr wcscpy wcsnlen \
+ wcpcpy
tests := tst-wcstof wcsmbs-tst1 tst-wcsnlen tst-btowc tst-mbrtowc \
tst-wcrtomb tst-wcpncpy tst-mbsrtowcs tst-wchar-h tst-mbrtowc2 \
tst-c16c32-1 wcsatcliff $(addprefix test-,$(strop-tests))
new file mode 100644
@@ -0,0 +1,20 @@
+/* Test wcpcpy functions.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include "../string/test-stpcpy.c"
@@ -21,7 +21,9 @@
#define __need_ptrdiff_t
#include <stddef.h>
-
+#ifdef WCPCPY
+# define __wcpcpy WCPCPY
+#endif
/* Copy SRC to DEST, returning the address of the terminating L'\0' in
DEST. */
wchar_t *
@@ -43,4 +45,6 @@ __wcpcpy (dest, src)
return wcp;
}
+#ifndef WCPCPY
weak_alias (__wcpcpy, wcpcpy)
+#endif