[RFC,v2] Add strcmp, strncmp, memcmp inline implementation.

Message ID 20150525081156.GA10661@domone
State New, archived
Headers

Commit Message

Ondrej Bilka May 25, 2015, 8:11 a.m. UTC
  Sorry for noise,

I wanted to ask if one needs to surround strlen with
__builtin_constant_p to avoid runtime overhead, and when I read patch I
realized that I by mistake included older partial patch. What I intended
is here, please ignore previous one. I know that it had bug when you
have string literal with zero in middle.

So do I need that to be safe? And other comments?

On Mon, May 25, 2015 at 03:58:26AM +0200, Ondřej Bílka wrote:
> Hi,
> 
> I just found that on x64 gcc __builtin_strcmp suck a lot. And by lot I
> mean its around three times slower than libcall by using rep cmpsb which
> even intel manual says that shouldn't be used.
> 
> So I decided to write a strcmp inline that works. As reencoding it as
> gcc pass would be extra effort without any benefit I skipped it.
> 
> This adds x64 specific implementation for constant arguments less than
> 16 bytes. These are quite common as programmer often does checks like 
> 
> if (strcmp (x, "foo"))
> 
> Extending this for 32 bytes and more would be straightforward but
> wouldn't help much as there aren't lot of that large words.
> 
> As same trick could be used for strncmp and memcmp with size <= 16 
> with few extra checks as we exploited alignment of string literals.
> 
> It could be optimized more with cooperation from gcc. A page-cross check
> could be omitted in most cases using dataflow that gcc already does in
> fortify_source. A CROSS_PAGE macro could first check for
> __builtin_valid_range_p(x, x+16) which evaluates to true if gcc can
> prove that x is more than 16 bytes large.
> 
> 
> A possible issue would be introducing sse with string.h. How detect gcc
> -no-sse flag?
> 

 	* sysdeps/x86_64/bits/string.h: New file.
  

Comments

Andrew Pinski May 25, 2015, 8:23 a.m. UTC | #1
On Mon, May 25, 2015 at 4:11 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> Sorry for noise,
>
> I wanted to ask if one needs to surround strlen with
> __builtin_constant_p to avoid runtime overhead, and when I read patch I
> realized that I by mistake included older partial patch. What I intended
> is here, please ignore previous one. I know that it had bug when you
> have string literal with zero in middle.
>
> So do I need that to be safe? And other comments?


Yes what will it take to get this into GCC instead of adding one more
hack to glibc for this?
Sorry but I feel like this would be better in GCC than doing this in glibc.

Thanks,
Andrew

>
> On Mon, May 25, 2015 at 03:58:26AM +0200, Ondřej Bílka wrote:
>> Hi,
>>
>> I just found that on x64 gcc __builtin_strcmp suck a lot. And by lot I
>> mean its around three times slower than libcall by using rep cmpsb which
>> even intel manual says that shouldn't be used.
>>
>> So I decided to write a strcmp inline that works. As reencoding it as
>> gcc pass would be extra effort without any benefit I skipped it.
>>
>> This adds x64 specific implementation for constant arguments less than
>> 16 bytes. These are quite common as programmer often does checks like
>>
>> if (strcmp (x, "foo"))
>>
>> Extending this for 32 bytes and more would be straightforward but
>> wouldn't help much as there aren't lot of that large words.
>>
>> As same trick could be used for strncmp and memcmp with size <= 16
>> with few extra checks as we exploited alignment of string literals.
>>
>> It could be optimized more with cooperation from gcc. A page-cross check
>> could be omitted in most cases using dataflow that gcc already does in
>> fortify_source. A CROSS_PAGE macro could first check for
>> __builtin_valid_range_p(x, x+16) which evaluates to true if gcc can
>> prove that x is more than 16 bytes large.
>>
>>
>> A possible issue would be introducing sse with string.h. How detect gcc
>> -no-sse flag?
>>
>
>         * sysdeps/x86_64/bits/string.h: New file.
>
> diff --git a/sysdeps/x86_64/bits/string.h b/sysdeps/x86_64/bits/string.h
> new file mode 100644
> index 0000000..c4d154b
> --- /dev/null
> +++ b/sysdeps/x86_64/bits/string.h
> @@ -0,0 +1,87 @@
> +/* This file should provide inline versions of string functions.
> +
> +   Surround GCC-specific parts with #ifdef __GNUC__, and use `__extern_inline'.
> +
> +   This file should define __STRING_INLINES if functions are actually defined
> +   as inlines.  */
> +
> +#ifndef _BITS_STRING_H
> +#define _BITS_STRING_H 1
> +
> +/* Define if architecture can access unaligned multi-byte variables.  */
> +#define _STRING_ARCH_unaligned   0
> +
> +#ifdef _USE_GNU
> +# if __GNUC_PREREQ (3, 2)
> +#  define _HAVE_STRING_ARCH_strcmp
> +#  define _HAVE_STRING_ARCH_strncmp
> +#  define _HAVE_STRING_ARCH_memcmp
> +
> +#  include <stdint.h>
> +#  include <emmintrin.h>
> +#  define __LOAD(x) _mm_load_si128 ((__tp_vector *) (x))
> +#  define __LOADU(x) _mm_loadu_si128 ((__tp_vector *) (x))
> +#  define get_mask(x) ((uint64_t) _mm_movemask_epi8 (x))
> +#  define __EQ  _mm_cmpeq_epi8
> +typedef __m128i __tp_vector;
> +typedef uint64_t __tp_mask;
> +
> +#define CROSS_PAGE(p) __builtin_expect (((uintptr_t) s) % 4096               \
> +                                       > 4096 - sizeof (__tp_vector) , 0)
> +
> +static inline __attribute__ ((always_inline))
> +int
> +__memcmp_small_a (char *s, char *c, int n)
> +{
> +  if (CROSS_PAGE (s))
> +    return memcmp (s, c, n);
> +  __tp_mask m = get_mask (__EQ (__LOADU (s), __LOAD (c))) | 1UL << n;
> +  int found = __builtin_ctzl (m);
> +  return s[found] - c[found];
> +}
> +static inline __attribute__ ((always_inline))
> +int
> +__memcmp_small (char *s, char *c, int n)
> +{
> +  if (CROSS_PAGE (s) || CROSS_PAGE (c))
> +    return memcmp (s, c, n);
> +  __tp_mask m = get_mask (__EQ (__LOADU (s), __LOADU (c))) | 1UL << n;
> +  int found = __builtin_ctzl (m);
> +  return s[found] - c[found];
> +}
> +#define __min(x,y) (x < y ? x : y)
> +
> +/* Dereferencing a pointer arg to run sizeof on it fails for the void
> +   pointer case, so we use this instead.
> +   Note that __x is evaluated twice. */
> +#define __string2_1bptr_p(__x) __builtin_constant_p (__x) && \
> +  ((size_t)(const void *)((__x) + 1) - (size_t)(const void *)(__x) == 1)
> +
> +
> +#  define strcmp(s1, s2) \
> +   (__extension__                                              \
> +   (__string2_1bptr_p (s1) && strlen (s1) <= 16                        \
> +    ?  __memcmp_small_a (s1, s2, strlen (s1))                  \
> +    : (__string2_1bptr_p (s2) && strlen (s2) <= 16             \
> +       ? - __memcmp_small_a (s2, s1, strlen (s2))              \
> +       : strcmp (s1, s2))))
> +
> +#  define strncmp(s1, s2, n) \
> +   (__extension__                                              \
> +   (__string2_1bptr_p (s1) && strlen (s1) <= 16                        \
> +    ?  __memcmp_small_a (s1, s2, min (n, strlen (s1)))         \
> +    : (__string2_1bptr_p (s2) && strlen (s2) <= 16             \
> +       ? - __memcmp_small_a (s2, s1, min (n, strlen (s2)))     \
> +       : strncmp (s1, s2, n))))
> +
> +#  define memcmp(s1, s2, n) \
> +   (__extension__                                              \
> +   (__builtin_constant_p (n <= 16) && n <= 16                  \
> +    ?  n == 0 ? 0 : __memcmp_small (s1, s2, n - 1))            \
> +       : memcmp (s1, s2, n))
> +
> +
> +# undef __string2_1bptr_p
> +# endif
> +#endif
  
Ondrej Bilka May 25, 2015, 8:57 a.m. UTC | #2
On Mon, May 25, 2015 at 04:23:52PM +0800, Andrew Pinski wrote:
> On Mon, May 25, 2015 at 4:11 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> > Sorry for noise,
> >
> > I wanted to ask if one needs to surround strlen with
> > __builtin_constant_p to avoid runtime overhead, and when I read patch I
> > realized that I by mistake included older partial patch. What I intended
> > is here, please ignore previous one. I know that it had bug when you
> > have string literal with zero in middle.
> >
> > So do I need that to be safe? And other comments?
> 
> 
> Yes what will it take to get this into GCC instead of adding one more
> hack to glibc for this?
> Sorry but I feel like this would be better in GCC than doing this in glibc.
> 
Sorry but it goes both ways. I could also argue why you should add one
more hack to gcc? See.

Try to convince me. My argument is easier maintainability. Try write a
gcc patch and post it. Then compare these side-by-side how are they easy
to read (I am not big fan of lisp).

Also if you want to rewrite this to gcc you are welcome. You will have
lot of work to do.
  
Joseph Myers May 28, 2015, 4:27 p.m. UTC | #3
On Mon, 25 May 2015, Ondřej Bílka wrote:

> +#  define get_mask(x) ((uint64_t) _mm_movemask_epi8 (x))

Even inside __USE_GNU, it's inappropriate to use names such as get_mask, 
CROSS_PAGE, s, c, n, m, found, always_inline from the user's namespace.
  

Patch

diff --git a/sysdeps/x86_64/bits/string.h b/sysdeps/x86_64/bits/string.h
new file mode 100644
index 0000000..c4d154b
--- /dev/null
+++ b/sysdeps/x86_64/bits/string.h
@@ -0,0 +1,87 @@ 
+/* This file should provide inline versions of string functions.
+
+   Surround GCC-specific parts with #ifdef __GNUC__, and use `__extern_inline'.
+
+   This file should define __STRING_INLINES if functions are actually defined
+   as inlines.  */
+
+#ifndef _BITS_STRING_H
+#define _BITS_STRING_H	1
+
+/* Define if architecture can access unaligned multi-byte variables.  */
+#define _STRING_ARCH_unaligned   0
+
+#ifdef _USE_GNU
+# if __GNUC_PREREQ (3, 2)
+#  define _HAVE_STRING_ARCH_strcmp
+#  define _HAVE_STRING_ARCH_strncmp
+#  define _HAVE_STRING_ARCH_memcmp
+
+#  include <stdint.h>
+#  include <emmintrin.h>
+#  define __LOAD(x) _mm_load_si128 ((__tp_vector *) (x))
+#  define __LOADU(x) _mm_loadu_si128 ((__tp_vector *) (x))
+#  define get_mask(x) ((uint64_t) _mm_movemask_epi8 (x))
+#  define __EQ  _mm_cmpeq_epi8
+typedef __m128i __tp_vector;
+typedef uint64_t __tp_mask;
+
+#define CROSS_PAGE(p) __builtin_expect (((uintptr_t) s) % 4096		      \
+					> 4096 - sizeof (__tp_vector) , 0)
+
+static inline __attribute__ ((always_inline)) 
+int
+__memcmp_small_a (char *s, char *c, int n)
+{
+  if (CROSS_PAGE (s))
+    return memcmp (s, c, n);
+  __tp_mask m = get_mask (__EQ (__LOADU (s), __LOAD (c))) | 1UL << n;
+  int found = __builtin_ctzl (m);
+  return s[found] - c[found];
+}
+static inline __attribute__ ((always_inline)) 
+int
+__memcmp_small (char *s, char *c, int n)
+{
+  if (CROSS_PAGE (s) || CROSS_PAGE (c))
+    return memcmp (s, c, n);
+  __tp_mask m = get_mask (__EQ (__LOADU (s), __LOADU (c))) | 1UL << n;
+  int found = __builtin_ctzl (m);
+  return s[found] - c[found];
+}
+#define __min(x,y) (x < y ? x : y)
+
+/* Dereferencing a pointer arg to run sizeof on it fails for the void
+   pointer case, so we use this instead.
+   Note that __x is evaluated twice. */
+#define __string2_1bptr_p(__x) __builtin_constant_p (__x) && \
+  ((size_t)(const void *)((__x) + 1) - (size_t)(const void *)(__x) == 1)
+
+
+#  define strcmp(s1, s2) \
+   (__extension__						\
+   (__string2_1bptr_p (s1) && strlen (s1) <= 16			\
+    ?  __memcmp_small_a (s1, s2, strlen (s1))			\
+    : (__string2_1bptr_p (s2) && strlen (s2) <= 16   		\
+       ? - __memcmp_small_a (s2, s1, strlen (s2))		\
+       : strcmp (s1, s2))))
+
+#  define strncmp(s1, s2, n) \
+   (__extension__						\
+   (__string2_1bptr_p (s1) && strlen (s1) <= 16			\
+    ?  __memcmp_small_a (s1, s2, min (n, strlen (s1)))		\
+    : (__string2_1bptr_p (s2) && strlen (s2) <= 16   		\
+       ? - __memcmp_small_a (s2, s1, min (n, strlen (s2)))	\
+       : strncmp (s1, s2, n))))
+
+#  define memcmp(s1, s2, n) \
+   (__extension__						\
+   (__builtin_constant_p (n <= 16) && n <= 16			\
+    ?  n == 0 ? 0 : __memcmp_small (s1, s2, n - 1))		\
+       : memcmp (s1, s2, n))
+
+  
+# undef __string2_1bptr_p
+# endif
+#endif