wcsrchr-sse2 can't use `pminud`, which can speed up the main loop:
len, align, pos, seek, max_char, freq, New Time / Old Time
256, 1, 64, 23, 1273, 1, 1.082
256, 1, 64, 23, 2147483647, 1, 1.076
256, 15, 64, 23, 1273, 1, 1.061
256, 15, 64, 23, 2147483647, 1, 1.075
256, 2, 64, 23, 1273, 1, 1.108
256, 2, 64, 23, 2147483647, 1, 1.109
256, 30, 64, 23, 1273, 1, 1.072
256, 30, 64, 23, 2147483647, 1, 1.077
256, 3, 64, 23, 1273, 1, 1.108
256, 3, 64, 23, 2147483647, 1, 1.103
256, 45, 64, 23, 1273, 1, 1.076
256, 45, 64, 23, 2147483647, 1, 1.079
256, 4, 64, 23, 1273, 1, 1.119
256, 4, 64, 23, 2147483647, 1, 1.112
256, 60, 64, 23, 1273, 1, 1.117
256, 60, 64, 23, 2147483647, 1, 1.112
256, 5, 64, 23, 1273, 1, 1.21
256, 5, 64, 23, 2147483647, 1, 1.194
256, 75, 64, 23, 1273, 1, 1.055
256, 75, 64, 23, 2147483647, 1, 1.045
256, 6, 64, 23, 1273, 1, 1.264
256, 6, 64, 23, 2147483647, 1, 1.3
256, 90, 64, 23, 1273, 1, 1.022
256, 90, 64, 23, 2147483647, 1, 1.026
256, 7, 64, 23, 1273, 1, 1.316
256, 7, 64, 23, 2147483647, 1, 1.325
Overall this leads to a 5% performance improvement in the benchmark
suite.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
sysdeps/x86_64/multiarch/Makefile | 1 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 +++
sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S | 21 +++++++++++++++++++++
sysdeps/x86_64/multiarch/wcsrchr.c | 3 ++-
4 files changed, 27 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S
@@ -154,6 +154,7 @@ sysdep_routines += \
wcsrchr-avx2-rtm \
wcsrchr-evex \
wcsrchr-sse2 \
+ wcsrchr-sse4_1 \
wmemchr-avx2 \
wmemchr-avx2-rtm \
wmemchr-evex \
@@ -685,6 +685,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsrchr_evex)
+ IFUNC_IMPL_ADD (array, i, wcsrchr,
+ CPU_FEATURE_USABLE (SSE4_1),
+ __wcsrchr_sse4_1)
IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
/* Support sysdeps/x86_64/multiarch/wcscmp.c. */
new file mode 100644
@@ -0,0 +1,21 @@
+/* wcsrchr optimized with SSE4.1.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define USE_AS_WCSRCHR 1
+#define STRRCHR __wcsrchr_sse4_1
+#include "../strrchr.S"
@@ -23,7 +23,8 @@
# undef wcsrchr
# define SYMBOL_NAME wcsrchr
-# include "ifunc-avx2.h"
+
+# include "ifunc-wcslen.h"
libc_ifunc_redirected (__redirect_wcsrchr, wcsrchr, IFUNC_SELECTOR ());
#endif