Hi,
With the new optimized strcpy for POWER8 [1], this patch adds an optimized
strcat which uses it along with the default implementation at strings/.
I see good improvements over the POWER7 version on a POWER8 machine, especially
for unaligned cases (which the new strcpy aims to optimize). Benchtest
results are included below.
Tested on powerpc64 and powerpc64le.
[1] https://sourceware.org/ml/libc-alpha/2014-12/msg00878.html
--
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
strcat-power8 object.
* sysdeps/powerpc/powerpc64/multiarch/strcat.c (strcat): Add
__strcat_power8 implementation.
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __strcat_power8 implementation.
* sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c: New file:
optimized strcat for power8.
--
simple_strcat __strcat_power8 __strcat_power7 __strcat_ppc
Length 0/ 0, alignment 0/ 0: 2.53125 8.21875 8.46875 11.8281
Length 0/ 0, alignment 0/ 0: 1.95312 7.73438 9.01562 11.8281
Length 0/ 0, alignment 0/ 0: 1.9375 9.03125 8.6875 11.8594
Length 0/ 0, alignment 0/ 0: 1.9375 8.5 8.5625 11.8438
Length 1/ 1, alignment 0/ 0: 3.45312 8.15625 9.82812 8.1875
Length 1/ 1, alignment 0/ 0: 2.95312 8.15625 9.73438 8.04688
Length 1/ 1, alignment 0/ 1: 2.85938 7.79688 9.3125 8.10938
Length 1/ 1, alignment 1/ 0: 2.8125 7.70312 10.625 8.07812
Length 2/ 2, alignment 0/ 0: 4.29688 8.79688 10.9219 8.71875
Length 2/ 2, alignment 0/ 0: 3.92188 8.82812 10.875 8.65625
Length 2/ 2, alignment 0/ 2: 3.6875 8.23438 10.9375 12.1875
Length 2/ 2, alignment 2/ 0: 3.79688 8.57812 11.75 8.625
Length 3/ 3, alignment 0/ 0: 4.84375 8.8125 11.9844 9.45312
Length 3/ 3, alignment 0/ 0: 4.5 8.25 12.2812 9.39062
Length 3/ 3, alignment 0/ 3: 4.4375 8.125 10.1875 9.29688
Length 3/ 3, alignment 3/ 0: 4.40625 7.76562 12.6094 9.375
Length 4/ 4, alignment 0/ 0: 5.625 8.40625 12.6719 11.5469
Length 4/ 4, alignment 0/ 0: 5.39062 7.8125 12.5156 11.4219
Length 4/ 4, alignment 0/ 4: 5.32812 8.01562 12.5312 10.1406
Length 4/ 4, alignment 4/ 0: 5.25 8.26562 13.1406 11.5781
Length 5/ 5, alignment 0/ 0: 6.375 8.4375 12.3594 9.875
Length 5/ 5, alignment 0/ 0: 6.26562 8.51562 12.3438 10
Length 5/ 5, alignment 0/ 5: 6.07812 9.01562 12.9844 10.2969
Length 5/ 5, alignment 5/ 0: 6.01562 8.39062 12.6875 9.875
Length 6/ 6, alignment 0/ 0: 7.29688 8.65625 11.6406 10.3438
Length 6/ 6, alignment 0/ 0: 6.82812 8.125 10.7188 10.4375
Length 6/ 6, alignment 0/ 6: 6.92188 8.3125 12.7188 12.1094
Length 6/ 6, alignment 6/ 0: 6.85938 7.98438 12.0312 10.4219
Length 7/ 7, alignment 0/ 0: 8.15625 8.17188 11.5 10.6562
Length 7/ 7, alignment 0/ 0: 7.67188 9.01562 11.125 10.75
Length 7/ 7, alignment 0/ 7: 7.65625 7.65625 11.9219 11.0625
Length 7/ 7, alignment 7/ 0: 7.5625 8.70312 11.3438 10.5469
Length 8/ 8, alignment 0/ 0: 8.89062 9.03125 8.35938 11.9219
Length 8/ 8, alignment 0/ 0: 8.67188 8.40625 8.42188 11.9219
Length 8/ 8, alignment 0/ 0: 8.5625 8.39062 8.25 11.9219
Length 8/ 8, alignment 0/ 0: 8.59375 8.29688 8.34375 12.0312
Length 9/ 9, alignment 0/ 0: 10.0312 8.3125 11.0312 11.4844
Length 9/ 9, alignment 0/ 0: 9.4375 8.39062 10.6094 11.6094
Length 9/ 9, alignment 0/ 1: 9.42188 8.51562 11.0781 11.5312
Length 9/ 9, alignment 1/ 0: 9.45312 8.15625 15.2188 11.6094
Length 10/ 10, alignment 0/ 0: 10.8281 8.89062 10.9844 11.9375
Length 10/ 10, alignment 0/ 0: 10.5781 8.5 11.1719 11.8594
Length 10/ 10, alignment 0/ 2: 10.5156 8.17188 11 13.1406
Length 10/ 10, alignment 2/ 0: 10.4844 8.39062 13.5469 11.7656
Length 11/ 11, alignment 0/ 0: 16.7188 8.28125 11.5938 12.0938
Length 11/ 11, alignment 0/ 0: 16.6406 8.26562 11.4531 12.1094
Length 11/ 11, alignment 0/ 3: 16.5312 7.60938 11.9844 12.1875
Length 11/ 11, alignment 3/ 0: 16.75 8.34375 13.0938 12.3438
Length 12/ 12, alignment 0/ 0: 18.6719 8.3125 11.2969 12.9531
Length 12/ 12, alignment 0/ 0: 18.2656 8.10938 11.125 12.9375
Length 12/ 12, alignment 0/ 4: 18.2188 9.57812 13.625 10.875
Length 12/ 12, alignment 4/ 0: 18.1875 8.51562 10.2969 13
Length 13/ 13, alignment 0/ 0: 19.2812 8.46875 12.875 12.7969
Length 13/ 13, alignment 0/ 0: 19.0938 8.23438 12.25 12.6875
Length 13/ 13, alignment 0/ 5: 19.2188 9.4375 17.0781 13.9531
Length 13/ 13, alignment 5/ 0: 18.9531 8.46875 11.9062 12.9844
Length 14/ 14, alignment 0/ 0: 20.6875 8.48438 13.1875 13.25
Length 14/ 14, alignment 0/ 0: 20.5 7.92188 12.75 13.3125
Length 14/ 14, alignment 0/ 6: 20.3281 9.26562 14.9844 14.25
Length 14/ 14, alignment 6/ 0: 20.9062 8.40625 11.125 13.2031
Length 15/ 15, alignment 0/ 0: 21.7969 8.32812 13.4688 13.3125
Length 15/ 15, alignment 0/ 0: 21.5312 8.65625 13.2344 13.4219
Length 15/ 15, alignment 0/ 7: 21.6719 9.34375 13.5938 14.3125
Length 15/ 15, alignment 7/ 0: 21.6562 8.01562 10.5625 13.5156
Length 16/ 16, alignment 0/ 0: 22.9844 8.70312 11.1094 13.4375
Length 16/ 16, alignment 7/ 2: 23.1562 9.92188 18.7344 14.8281
Length 16/ 4, alignment 0/ 0: 10.9531 9.34375 10.625 13.9375
Length 16/ 4, alignment 7/ 2: 10.5 9.51562 14.625 13.6719
Length 32/ 32, alignment 0/ 0: 44.3594 9.54688 11 14.6406
Length 32/ 32, alignment 6/ 4: 44.125 9.28125 16.3438 20.75
Length 32/ 8, alignment 0/ 0: 22.4219 9.07812 9.65625 13.6562
Length 32/ 8, alignment 6/ 4: 22.1562 9.54688 15.3594 19.7188
Length 64/ 64, alignment 0/ 0: 80.1875 12.375 10.9219 16.2969
Length 64/ 64, alignment 5/ 6: 79.8906 10.6094 18.9062 33.2656
Length 64/ 16, alignment 0/ 0: 45.4531 10.0781 10.9844 15.5781
Length 64/ 16, alignment 5/ 6: 45.9219 9.39062 18.0781 33.2812
Length 128/ 128, alignment 0/ 0: 149.188 15.5312 13.9531 21.3281
Length 128/ 128, alignment 4/ 0: 148.797 15.2812 22.8438 27.3906
Length 128/ 32, alignment 0/ 0: 82.0312 12.7656 13.8906 19.6875
Length 128/ 32, alignment 4/ 0: 81.75 12.2344 21.3438 26.2656
Length 256/ 256, alignment 0/ 0: 288.547 26.0938 25.5156 30.9062
Length 256/ 256, alignment 3/ 2: 287.625 25.6875 38.5 117.781
Length 256/ 64, alignment 0/ 0: 154.344 21.0156 20.9688 25.8594
Length 256/ 64, alignment 3/ 2: 153.969 20.8438 34.2188 109.375
Length 512/ 512, alignment 0/ 0: 566 45.4062 48.1562 54.2031
Length 512/ 512, alignment 2/ 4: 566.125 44.6719 62.7188 224.078
Length 512/ 128, alignment 0/ 0: 298.453 31.1094 33.9062 40.4375
Length 512/ 128, alignment 2/ 4: 299.234 30.875 47.9531 210.5
Length 1024/1024, alignment 0/ 0: 1123.06 75.2344 86.3125 94.7031
Length 1024/1024, alignment 1/ 6: 1125.52 74.6094 109.438 436.594
Length 1024/ 256, alignment 0/ 0: 587.828 50.6875 62.7344 70.3125
Length 1024/ 256, alignment 1/ 6: 587.422 50.0469 85.7656 411.891
Length 16/ 1, alignment 1/ 2: 9.65625 9.60938 11.6875 13.5469
Length 16/ 1, alignment 2/ 1: 9.28125 9.4375 14.25 13.5156
Length 16/ 10, alignment 1/ 1: 13.2656 9.17188 11.375 13.875
Length 16/ 10, alignment 1/ 1: 12.9062 8.9375 11.4062 13.8438
Length 32/ 1, alignment 2/ 4: 19.5625 10.0312 13.4844 19.8906
Length 32/ 1, alignment 4/ 2: 19.2969 9.60938 15.875 19.8438
Length 32/ 10, alignment 2/ 2: 23.3281 8.34375 12.5156 19.8125
Length 32/ 10, alignment 2/ 2: 22.9688 8.8125 12 19.6875
Length 64/ 1, alignment 3/ 6: 32.4688 10.0938 17.7812 32.7656
Length 64/ 1, alignment 6/ 3: 32.1094 9.48438 17.6875 32.4219
Length 64/ 10, alignment 3/ 3: 36.0156 9.6875 19.2031 32.4375
Length 64/ 10, alignment 3/ 3: 35.9844 9.42188 18.7656 32.3594
Length 128/ 1, alignment 4/ 0: 57.0625 11.6719 23.8125 57.75
Length 128/ 1, alignment 0/ 4: 57.1719 11.6562 20.5156 58.0156
Length 128/ 10, alignment 4/ 4: 61.6562 11.5625 22.3438 57.9375
Length 128/ 10, alignment 4/ 4: 62.0781 12.4688 22 58.0781
Length 256/ 1, alignment 5/ 2: 106.906 19.2188 32.3281 108.594
Length 256/ 1, alignment 2/ 5: 106.922 19.6562 30.0312 108.594
Length 256/ 10, alignment 5/ 5: 112.891 18.5938 30.9219 108.828
Length 256/ 10, alignment 5/ 5: 113.578 18.5469 30.9219 108.5
Length 512/ 1, alignment 6/ 4: 207.109 26.75 49.25 208.562
Length 512/ 1, alignment 4/ 6: 206.531 27.375 47.5312 208.703
Length 512/ 10, alignment 6/ 6: 216.219 27.8594 47.0938 209
Length 512/ 10, alignment 6/ 6: 215.688 28 46.6406 209.578
Length 1024/ 1, alignment 7/ 6: 406.062 42.6719 62.2812 408.266
Length 1024/ 1, alignment 6/ 7: 407.625 43.7344 79.0469 407.922
Length 1024/ 10, alignment 7/ 7: 420.484 43.7969 82.6719 408.297
Length 1024/ 10, alignment 7/ 7: 420.781 43.6719 81.9844 408.875
@@ -18,8 +18,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
strncpy-power7 strncpy-ppc64 \
stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
- strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
- bcopy-ppc64
+ strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
+ memmove-ppc64 bcopy-ppc64
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
@@ -303,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c. */
IFUNC_IMPL (i, name, strcat,
IFUNC_IMPL_ADD (array, i, strcat,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strcat_power8)
+ IFUNC_IMPL_ADD (array, i, strcat,
hwcap & PPC_FEATURE_HAS_VSX,
__strcat_power7)
IFUNC_IMPL_ADD (array, i, strcat, 1,
new file mode 100644
@@ -0,0 +1,30 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+
+#define STRCAT __strcat_power8
+
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+extern typeof (strcpy) __strcpy_power8;
+extern typeof (strlen) __strlen_power7;
+
+#define strcpy __strcpy_power8
+#define strlen __strlen_power7
+#include <sysdeps/powerpc/strcat.c>
@@ -23,9 +23,12 @@
extern __typeof (strcat) __strcat_ppc attribute_hidden;
extern __typeof (strcat) __strcat_power7 attribute_hidden;
+extern __typeof (strcat) __strcat_power8 attribute_hidden;
libc_ifunc (strcat,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcat_power7
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strcat_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strcat_power7
: __strcat_ppc);
#endif