[v1,3/5] x86: Add wcsrchr optimized with SSE4_1 in wcsrchr-sse4_1.S

Message ID 20220421031410.2142238-3-goldstein.w.n@gmail.com
State Superseded
Headers
Series [v1,1/5] benchtests: Improve bench-strrchr |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Noah Goldstein April 21, 2022, 3:14 a.m. UTC
  wcsrchr-sse2 can't use `pminud`, which can speed up the main loop:

len, align, pos, seek,   max_char, freq, New Time / Old Time
256,     1,  64,   23,       1273,    1,               1.082
256,     1,  64,   23, 2147483647,    1,               1.076
256,    15,  64,   23,       1273,    1,               1.061
256,    15,  64,   23, 2147483647,    1,               1.075
256,     2,  64,   23,       1273,    1,               1.108
256,     2,  64,   23, 2147483647,    1,               1.109
256,    30,  64,   23,       1273,    1,               1.072
256,    30,  64,   23, 2147483647,    1,               1.077
256,     3,  64,   23,       1273,    1,               1.108
256,     3,  64,   23, 2147483647,    1,               1.103
256,    45,  64,   23,       1273,    1,               1.076
256,    45,  64,   23, 2147483647,    1,               1.079
256,     4,  64,   23,       1273,    1,               1.119
256,     4,  64,   23, 2147483647,    1,               1.112
256,    60,  64,   23,       1273,    1,               1.117
256,    60,  64,   23, 2147483647,    1,               1.112
256,     5,  64,   23,       1273,    1,                1.21
256,     5,  64,   23, 2147483647,    1,               1.194
256,    75,  64,   23,       1273,    1,               1.055
256,    75,  64,   23, 2147483647,    1,               1.045
256,     6,  64,   23,       1273,    1,               1.264
256,     6,  64,   23, 2147483647,    1,                 1.3
256,    90,  64,   23,       1273,    1,               1.022
256,    90,  64,   23, 2147483647,    1,               1.026
256,     7,  64,   23,       1273,    1,               1.316
256,     7,  64,   23, 2147483647,    1,               1.325

Overall this leads to a 5% performance improvement in the benchmark
suite.

Full xcheck passes on x86_64 with and without multiarch enabled.
---
 sysdeps/x86_64/multiarch/Makefile          |  1 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  3 +++
 sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S  | 21 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/wcsrchr.c         |  3 ++-
 4 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S
  

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 0400ea332b..5ad7bc8c25 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -154,6 +154,7 @@  sysdep_routines += \
   wcsrchr-avx2-rtm \
   wcsrchr-evex \
   wcsrchr-sse2 \
+  wcsrchr-sse4_1 \
   wmemchr-avx2 \
   wmemchr-avx2-rtm \
   wmemchr-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a8afcf81bb..1cbb6938c8 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -685,6 +685,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsrchr_evex)
+	      IFUNC_IMPL_ADD (array, i, wcsrchr,
+			      CPU_FEATURE_USABLE (SSE4_1),
+			      __wcsrchr_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wcscmp.c.  */
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S b/sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S
new file mode 100644
index 0000000000..34b92d28eb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse4_1.S
@@ -0,0 +1,21 @@ 
+/* wcsrchr optimized with SSE4.1.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define USE_AS_WCSRCHR	1
+#define STRRCHR	__wcsrchr_sse4_1
+#include "../strrchr.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr.c b/sysdeps/x86_64/multiarch/wcsrchr.c
index 8b30c06f2e..eb18038eec 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr.c
+++ b/sysdeps/x86_64/multiarch/wcsrchr.c
@@ -23,7 +23,8 @@ 
 # undef wcsrchr
 
 # define SYMBOL_NAME wcsrchr
-# include "ifunc-avx2.h"
+
+# include "ifunc-wcslen.h"
 
 libc_ifunc_redirected (__redirect_wcsrchr, wcsrchr, IFUNC_SELECTOR ());
 #endif