[v3] sysdeps/x86_64/multiarch/memmem-avx2.c: add memmem-avx2.c

Message ID 20240110054808.1915609-1-tirtajames45@gmail.com
State Superseded
Headers
Series [v3] sysdeps/x86_64/multiarch/memmem-avx2.c: add memmem-avx2.c |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch fail Patch failed to apply to master at the time it was sent
redhat-pt-bot/TryBot-32bit fail Patch series failed to apply

Commit Message

James Tirta Halim Jan. 10, 2024, 5:48 a.m. UTC
  Timings (Core i3-1115G4):
basic_memmem twoway_memmem __memmem_avx512 __memmem_avx2
__memmem_generic
Total:
6.80124e+06 1.06087e+06 219483 345385 768041
Average:
25958.9 4049.11 837.721 1318.26 2931.45

Changes in v1:
1. Add memmem-avx2.c

Changes in v2:
1. Add avx512 support with a generic header file
2. Use __memcmpeq instead of memcmp
3. Remove scalar loop
4. Fix unsafe unaligned load

Changes in v3:
1. Avoid checking for alignment to the start of the page, since that case is rare
2. Use __memcmpeq instead of __memcmpeq_avx2 (referencing __memcmpeq_avx2
directly produces undefined-reference link errors).
3. Add memmem.c (needs review)
4. Add __memcmpeq_avx2 and __memcmpeq_avx512 to ifunc-impl-list.c (needs
review)
5. Add libc_hidden_builtin_def and MEMMEM to memmem.c (needs review)
6. Add memmem-avx2 and memmem-avx512 to Makefile and their appropriate
CFLAGS

Passes make check

---
 string/memmem.c                               |   7 +-
 sysdeps/x86_64/multiarch/Makefile             |   5 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  11 +
 sysdeps/x86_64/multiarch/memmem-avx2.c        |   4 +
 sysdeps/x86_64/multiarch/memmem-avx512.c      |  19 ++
 .../x86_64/multiarch/memmem-vectorized-avx.h  | 223 ++++++++++++++++++
 sysdeps/x86_64/multiarch/memmem.c             |  68 ++++++
 7 files changed, 336 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c
 create mode 100644 sysdeps/x86_64/multiarch/memmem-avx512.c
 create mode 100644 sysdeps/x86_64/multiarch/memmem-vectorized-avx.h
 create mode 100644 sysdeps/x86_64/multiarch/memmem.c
  

Comments

Andreas K. Huettel Jan. 11, 2024, 9:52 p.m. UTC | #1
Am Mittwoch, 10. Januar 2024, 06:48:08 CET schrieb James Tirta Halim:
> Timings (Core i3-1115G4):
> basic_memmem twoway_memmem __memmem_avx512 __memmem_avx2
> __memmem_generic
> Total:
> 6.80124e+06 1.06087e+06 219483 345385 768041
> Average:
> 25958.9 4049.11 837.721 1318.26 2931.45
> 

This looks like post-release material to me.
  

Patch

diff --git a/string/memmem.c b/string/memmem.c
index 6badc1c3bd..62654b4bd0 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -32,6 +32,10 @@ 
 
 #undef memmem
 
+#ifndef MEMMEM
+#	define MEMMEM __memmem
+#endif
+
 /* Hash character pairs so a small shift table can be used.  All bits of
    p[0] are included, but not all bits from p[-1].  So if two equal hashes
    match on p[-1], p[0] matches too.  Hash collisions are harmless and result
@@ -50,7 +54,7 @@ 
    The limit also implies worst-case performance is linear.
    Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 void *
-__memmem(const void *haystack, size_t hs_len,
+MEMMEM(const void *haystack, size_t hs_len,
 const void *needle, size_t ne_len)
 {
 	const unsigned char *hs = (const unsigned char *)haystack;
@@ -122,3 +126,4 @@  const void *needle, size_t ne_len)
 libc_hidden_def(__memmem)
 weak_alias(__memmem, memmem)
 libc_hidden_weak(memmem)
+libc_hidden_builtin_def(MEMMEM)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e1e894c963..e20cac2993 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -119,6 +119,8 @@  sysdep_routines += \
   strspn-sse4 \
   strstr-avx512 \
   strstr-sse2-unaligned \
+  memmem-avx2 \
+  memmem-avx512 \
   varshift \
 # sysdep_routines
 
@@ -127,6 +129,9 @@  CFLAGS-strpbrk-sse4.c += -msse4
 CFLAGS-strspn-sse4.c += -msse4
 
 CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
+
+CFLAGS-memmem-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
+CFLAGS-memmem-avx2.c += -mavx2 -O3
 endif
 
 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 5427ff1907..2e29e9ee19 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -799,6 +799,17 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_generic))
 
+  /* Support sysdeps/x86_64/multiarch/memmem.c.  */
+  IFUNC_IMPL (i, name, memmem,
+              IFUNC_IMPL_ADD (array, i, memmem,
+                              (CPU_FEATURE_USABLE (AVX512VL)
+                               && CPU_FEATURE_USABLE (AVX512BW)
+                               && CPU_FEATURE_USABLE (AVX512DQ)
+                               && CPU_FEATURE_USABLE (BMI2)),
+                              __memmem_avx512)
+              IFUNC_IMPL_ADD (array, i, memmem, (CPU_FEATURE_USABLE (AVX2)), __memmem_avx2)
+	      IFUNC_IMPL_ADD (array, i, memmem, 1, __memmem_generic))
+
   /* Support sysdeps/x86_64/multiarch/wcschr.c.  */
   IFUNC_IMPL (i, name, wcschr,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c
new file mode 100644
index 0000000000..364d7cad1c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmem-avx2.c
@@ -0,0 +1,4 @@ 
/* AVX2 memmem: uses the default 256-bit primitives from the shared
   header; only the comparison routine and the symbol name are set here.  */
#define MEMCMPEQ __memcmpeq
#define FUNC_NAME __memmem_avx2

#include "memmem-vectorized-avx.h"
diff --git a/sysdeps/x86_64/multiarch/memmem-avx512.c b/sysdeps/x86_64/multiarch/memmem-avx512.c
new file mode 100644
index 0000000000..4cb54aced7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmem-avx512.c
@@ -0,0 +1,19 @@ 
/* AVX512 memmem: override the shared header's AVX2 defaults with
   512-bit vectors and 64-bit match masks before including it.  */
#define VEC __m512i
#define MASK uint64_t
#define LOAD(x) _mm512_load_si512 (x)
#define LOADU(x) _mm512_loadu_si512 (x)
#define STORE(dst, src) _mm512_store_si512 (dst, src)
#define STOREU(dst, src) _mm512_storeu_si512 (dst, src)
/* AVX512BW provides a native byte-compare-to-mask instruction.  */
#define CMPEQ8_MASK(x, y) _mm512_cmpeq_epi8_mask (x, y)
#define SETZERO(x) _mm512_setzero_si512 (x)
#define SETONE8(x) _mm512_set1_epi8 (x)
#define POPCNT(x) _mm_popcnt_u64 (x)
#define TZCNT(x) _tzcnt_u64 (x)
#define BLSR(x) _blsr_u64 (x)
#define LZCNT(x) _lzcnt_u64 (x)
#define ONES ((MASK) -1)

#define MEMCMPEQ __memcmpeq
#define FUNC_NAME __memmem_avx512

#include "memmem-vectorized-avx.h"
diff --git a/sysdeps/x86_64/multiarch/memmem-vectorized-avx.h b/sysdeps/x86_64/multiarch/memmem-vectorized-avx.h
new file mode 100644
index 0000000000..c31d1cbae2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmem-vectorized-avx.h
@@ -0,0 +1,223 @@ 
+#include <immintrin.h>
+#include <inttypes.h>
+#include <string.h>
+#include <libc-pointer-arith.h>
+
/* Overridable vector primitives for the generic vectorized memmem
   implementation below.  The defaults target AVX2 (__m256i vectors,
   32-bit match masks); memmem-avx512.c overrides them with 512-bit
   equivalents before including this header.  */
#ifndef FUNC_NAME
/* Fixed: the fallback previously expanded to "#define __memmem_avx2"
   (an empty macro) and left FUNC_NAME itself undefined, so including
   this header without an explicit FUNC_NAME did not compile.  */
#  define FUNC_NAME __memmem_avx2
#endif
#ifndef VEC
#  define VEC __m256i
#endif
#ifndef VEC_SIZE
#  define VEC_SIZE sizeof (VEC)
#endif
#ifndef MASK
#  define MASK uint32_t
#endif
#ifndef MASK_SIZE
#  define MASK_SIZE sizeof (MASK)
#endif
#ifndef LOAD
#  define LOAD(x) _mm256_load_si256 (x)
#endif
#ifndef LOADU
#  define LOADU(x) _mm256_loadu_si256 (x)
#endif
#ifndef STORE
#  define STORE(dst, src) _mm256_store_si256 (dst, src)
#endif
#ifndef STOREU
#  define STOREU(dst, src) _mm256_storeu_si256 (dst, src)
#endif
#ifndef CMPEQ8_MASK
/* Byte-equality compare producing a bitmask, one bit per byte lane.  */
#  define CMPEQ8_MASK(x, y) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (x, y))
#endif
#ifndef SETZERO
#  define SETZERO(x) _mm256_setzero_si256 (x)
#endif
#ifndef SETONE8
/* Broadcast one byte to every lane.  */
#  define SETONE8(x) _mm256_set1_epi8 (x)
#endif
#ifndef POPCNT
#  define POPCNT(x) _mm_popcnt_u32 (x)
#endif
#ifndef TZCNT
#  define TZCNT(x) _tzcnt_u32 (x)
#endif
#ifndef BLSR
/* Clear the lowest set bit.  */
#  define BLSR(x) _blsr_u32 (x)
#endif
#ifndef LZCNT
#  define LZCNT(x) _lzcnt_u32 (x)
#endif
#ifndef ONES
#  define ONES ((MASK) -1)
#endif

#ifndef MEMCMPEQ
#  define MEMCMPEQ __memcmpeq
#endif
#ifndef MEMCPY
#  define MEMCPY memcpy
#endif
#ifndef MEMCHR
#  define MEMCHR memchr
#endif
#ifndef PAGE_SIZE
#  define PAGE_SIZE 4096
#endif
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
/* Return a pointer to the byte of NE (length N) with the lowest
   frequency rank.  The rank table was derived from the *.c and *.h
   files in glibc; lower rank means rarer.  The result is used as the
   anchor byte for the vectorized search.  */
static inline void *
find_rarest_byte (const void *ne, size_t n)
{
  /* rarebyte_table[b] is the frequency rank of byte value b.  */
  static const unsigned char rarebyte_table[256]
      = { 0,   1,   13,	 56,  59,  60,	61,  62,  63,  232, 248, 2,   158, 4,
	  5,   6,   7,	 8,   9,   10,	14,  20,  26,  29,  37,	 46,  52,  53,
	  54,  55,  57,	 58,  255, 172, 242, 193, 162, 174, 178, 182, 218, 219,
	  212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223,
	  222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216,
	  196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204,
	  214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246,
	  233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243,
	  228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190,
	  166, 3,   140, 134, 124, 126, 86,  128, 95,  117, 114, 93,  81,  87,
	  132, 96,  112, 97,  103, 82,	139, 89,  98,  88,  119, 74,  156, 115,
	  104, 75,  120, 106, 76,  155, 90,  122, 107, 125, 152, 145, 136, 137,
	  101, 116, 102, 108, 99,  141, 77,  78,  118, 79,  109, 100, 150, 73,
	  94,  72,  121, 151, 113, 135, 110, 105, 83,  91,  11,	 12,  64,  149,
	  146, 111, 65,	 69,  66,  15,	16,  17,  18,  19,  130, 92,  144, 123,
	  21,  22,  23,	 24,  131, 133, 127, 142, 25,  70,  129, 27,  28,  67,
	  153, 84,  143, 138, 147, 157, 148, 68,  71,  30,  31,	 32,  33,  34,
	  35,  36,  154, 38,  39,  40,	41,  42,  80,  43,  44,	 45,  47,  48,
	  85,  49,  50,	 51 };
  const unsigned char *bytes = (const unsigned char *) ne;
  size_t best = 0;
  int best_rank = rarebyte_table[bytes[0]];
  size_t idx;
  /* Scan left to right; on ties the earliest (leftmost) byte wins
     because only a strictly lower rank replaces the current best.  */
  for (idx = 0; idx < n; ++idx)
    {
      int rank = rarebyte_table[bytes[idx]];
      if (rank < best_rank)
	{
	  best = idx;
	  best_rank = rank;
	}
    }
  return (void *) (bytes + best);
}
+
/* Vectorized substring search.  Scans the haystack a vector at a time
   for a two-byte anchor taken from the needle (a statistically rare
   byte and its successor, chosen by find_rarest_byte), then verifies
   each candidate with a vector compare plus MEMCMPEQ.  Returns a
   pointer to the first occurrence of NE in HS, or NULL.  */
void *
FUNC_NAME (const void *hs, size_t hs_len, const void *ne, size_t ne_len)
{
  /* One-byte needles reduce to memchr.  */
  if (ne_len == 1)
    return (void *) MEMCHR (hs, *(unsigned char *) ne, hs_len);
  /* An empty needle matches at the start of the haystack.  */
  if (__glibc_unlikely (ne_len == 0))
    return (void *) hs;
  if (__glibc_unlikely (hs_len < ne_len))
    return NULL;
  VEC hv0, hv1, hv, nv;
  MASK i, hm0, hm1, m, cmpm;
  /* For needles shorter than a vector only the low ne_len compare bits
     are meaningful; matchsh/matchm select them after shifting the
     compare mask to the top.  */
  const unsigned int matchsh = ne_len < VEC_SIZE ? VEC_SIZE - ne_len : 0;
  const MASK matchm = ONES << matchsh;
  const unsigned char *h = (const unsigned char *) hs;
  /* Last position at which a full needle can still start.  */
  const unsigned char *const end = h + hs_len - ne_len;
  const unsigned char *hp;
  /* Offset of the anchor byte within the needle.  */
  size_t shift = PTR_DIFF (find_rarest_byte (ne, ne_len), ne);
  /* The anchor reads needle bytes [shift] and [shift + 1]; back off by
     one when the rarest byte is the last needle byte (ne_len >= 2 is
     guaranteed above, so shift stays in range).  */
  if (shift == ne_len - 1)
    --shift;
  const VEC nv0 = SETONE8 (*((char *) ne + shift));
  const VEC nv1 = SETONE8 (*((char *) ne + shift + 1));
  h += shift;
  /* Load the first needle vector.  The (possibly overreading) vector
     load is used unless the needle ends within VEC_SIZE bytes of a
     page boundary; then fall back to a bounded memcpy.  */
  if (PTR_DIFF (PTR_ALIGN_UP (ne, PAGE_SIZE), ne) >= VEC_SIZE
      || PTR_IS_ALIGNED (ne, PAGE_SIZE) || ne_len >= VEC_SIZE)
    nv = LOADU ((VEC *) ne);
  else
    MEMCPY (&nv, ne, MIN (VEC_SIZE, ne_len));
  /* First iteration runs over the aligned vector containing h: OFF
     masks out bits before h, OFF2 bits past the last valid start.  */
  const unsigned int off = PTR_DIFF (h, PTR_ALIGN_DOWN (h, VEC_SIZE));
  unsigned int off2 = (PTR_DIFF (end, (h - shift)) < VEC_SIZE)
			  ? VEC_SIZE - (unsigned int) (end - (h - shift)) - 1
			  : 0;
  h -= off;
  hv0 = LOAD ((const VEC *) h);
  hm0 = (MASK) CMPEQ8_MASK (hv0, nv0);
  /* Successor-byte matches, shifted down to align with the anchor
     byte's position.  */
  hm1 = (MASK) CMPEQ8_MASK (hv0, nv1) >> 1;
  /* Clear matched bits that are out of bounds. */
  m = (((hm0 & hm1) >> off) << off2) >> off2;
  while (m)
    {
      i = TZCNT (m);
      m = BLSR (m);
      /* Candidate start of the needle in the haystack.  */
      hp = h + off + i - shift;
      if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE)
	{
	  /* A vector load at hp cannot cross a page: verify the first
	     vector in registers, and any remainder with MEMCMPEQ.  */
	  hv = LOADU ((VEC *) hp);
	  cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh;
	  if (cmpm == matchm)
	    if (ne_len <= VEC_SIZE
		|| !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE,
			      ne_len - VEC_SIZE))
	      return (void *) hp;
	}
      else
	{
	  /* Near a page end: avoid the overreading vector load.  */
	  if (!MEMCMPEQ (hp, ne, ne_len))
	    return (void *) hp;
	}
    }
  h += VEC_SIZE - 1;
  /* Main loop: VEC_SIZE candidate positions per iteration, using an
     unaligned load at h and an aligned load at h + 1.  */
  for (; h - shift + VEC_SIZE <= end; h += VEC_SIZE)
    {
      hv0 = LOADU ((const VEC *) h);
      hv1 = LOAD ((const VEC *) (h + 1));
      hm1 = (MASK) CMPEQ8_MASK (hv1, nv1);
      hm0 = (MASK) CMPEQ8_MASK (hv0, nv0);
      m = hm0 & hm1;
      while (m)
	{
	match:
	  /* NOTE(review): this label is also entered by the goto in the
	     tail handling below, with i/m/h set up there; after that
	     entry the surrounding for-loop condition terminates the
	     scan — confirm this control flow is intended.  */
	  i = TZCNT (m);
	  m = BLSR (m);
	  hp = h + i - shift;
	  if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE)
	    {
	      /* Same candidate verification as in the first block.  */
	      hv = LOADU ((VEC *) hp);
	      cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh;
	      if (cmpm == matchm)
		if (ne_len <= VEC_SIZE
		    || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE,
				  ne_len - VEC_SIZE))
		  return (void *) hp;
	    }
	  else
	    {
	      if (!MEMCMPEQ (hp, ne, ne_len))
		return (void *) hp;
	    }
	}
    }
  /* Tail: fewer than VEC_SIZE candidate positions remain.  */
  if (h - shift <= end)
    {
      off2 = VEC_SIZE - (unsigned int) (end - (h - shift)) - 1;
      hv1 = LOAD ((const VEC *) (h + 1));
      if (PTR_DIFF (PTR_ALIGN_UP (h, PAGE_SIZE), h) >= VEC_SIZE)
	{
	  hv0 = LOADU ((const VEC *) h);
	  hm1 = (MASK) CMPEQ8_MASK (hv1, nv1);
	  hm0 = (MASK) CMPEQ8_MASK (hv0, nv0);
	}
      else
	{
	  /* The unaligned load at h could cross a page, so derive hm0
	     from the aligned load at h + 1 instead.  Bit 0 (position h)
	     cannot be computed that way and is forced on; the false
	     candidate is rejected by the full verification above.  */
	  hm1 = (MASK) CMPEQ8_MASK (hv1, nv1);
	  hm0 = 1 | (MASK) CMPEQ8_MASK (hv1, nv0) << 1;
	}
      /* Clear matched bits that are out of bounds. */
      m = ((hm0 & hm1) << off2) >> off2;
      if (m)
	goto match;
    }
  return NULL;
}
diff --git a/sysdeps/x86_64/multiarch/memmem.c b/sysdeps/x86_64/multiarch/memmem.c
new file mode 100644
index 0000000000..29c99e096b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmem.c
@@ -0,0 +1,68 @@ 
+/* Multiple versions of memmem.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2012-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
/* Redefine memmem so that the compiler won't complain about the type
   mismatch with the IFUNC selector in strong_alias, below.  */
#undef  memmem
#define memmem __redirect_memmem
#include <string.h>
#undef  memmem

/* Build the generic C implementation from string/memmem.c under the
   name __memmem_generic; in shared builds route the internal
   __GI_memmem alias to it as well.  */
#define MEMMEM __memmem_generic
#ifdef SHARED
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(name) \
  __hidden_ver1 (__memmem_generic, __GI_memmem, __memmem_generic);
#endif

#include "string/memmem.c"
+
extern __typeof (__redirect_memmem) __memmem_avx2 attribute_hidden;
extern __typeof (__redirect_memmem) __memmem_generic attribute_hidden;
extern __typeof (__redirect_memmem) __memmem_avx512 attribute_hidden;

#define SYMBOL_NAME memmem

#include "init-arch.h"

/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
   ifunc symbol properly.  */
extern __typeof (__redirect_memmem) __libc_memmem;

/* Select the memmem implementation for the running CPU.  NOTE(review):
   the feature tests should mirror the memmem entries in
   ifunc-impl-list.c — keep the two in sync.  */
static inline void *
IFUNC_SELECTOR (void)
{
  const struct cpu_features *cpu_features = __get_cpu_features ();

  /* Use AVX512 only when all required subsets are usable and the CPU
     is not flagged to avoid AVX512 (Prefer_No_AVX512).  */
  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
    return __memmem_avx512;

  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
    return __memmem_avx2;

  /* Fallback: the generic implementation built above.  */
  return __memmem_generic;
}

libc_ifunc_redirected (__redirect_memmem, __libc_memmem, IFUNC_SELECTOR ());
#undef memmem
strong_alias (__libc_memmem, __memmem)