[06/11] Improve generic memrchr

Message ID 20161217065729.28561-7-rth@twiddle.net
State New, archived
Headers

Commit Message

Richard Henderson Dec. 17, 2016, 6:57 a.m. UTC
  * string/memrchr.c: Use haszero.h, whichzero.h.
---
 string/memrchr.c | 180 ++++++++++---------------------------------------------
 1 file changed, 30 insertions(+), 150 deletions(-)
  

Comments

Adhemerval Zanella Dec. 20, 2016, 12:30 p.m. UTC | #1
This implementation has a lot of issues, checking on both x86_64 and i386
(by manually removing the assembly implementation to use the new default
one) I am seeing a lot of issues with string/testers and others. 
Unfortunately the test-memrchr itself does not trigger most of the issues.

On 17/12/2016 04:57, Richard Henderson wrote:
>     * string/memrchr.c: Use haszero.h, whichzero.h.
> ---
>  string/memrchr.c | 180 ++++++++++---------------------------------------------
>  1 file changed, 30 insertions(+), 150 deletions(-)
> 
> diff --git a/string/memrchr.c b/string/memrchr.c
> index b9b0c9e..d45b261 100644
> --- a/string/memrchr.c
> +++ b/string/memrchr.c
> @@ -21,180 +21,60 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>  
> -#include <stdlib.h>
> -
> -#ifdef HAVE_CONFIG_H
> -# include <config.h>
> -#endif
> -
> -#undef __ptr_t
> -#define __ptr_t void *
> -
> -#if defined _LIBC
> -# include <string.h>
> -# include <memcopy.h>
> -#endif
> -
> -#if defined HAVE_LIMITS_H || defined _LIBC
> -# include <limits.h>
> -#endif
> -
> -#define LONG_MAX_32_BITS 2147483647
> -
> -#ifndef LONG_MAX
> -# define LONG_MAX LONG_MAX_32_BITS
> -#endif
> -
> -#include <sys/types.h>
> +#include <string.h>
> +#include <stdint.h>
> +#include <haszero.h>
> +#include <whichzero.h>
>  
>  #undef __memrchr
>  #undef memrchr
>  
> -#ifndef weak_alias
> -# define __memrchr memrchr
> -#endif
> -
>  /* Search no more than N bytes of S for C.  */
> -__ptr_t
> -#ifndef MEMRCHR
> -__memrchr
> -#else
> -MEMRCHR
> -#endif

We can not get rid of this indirections without change the architectures
that uses the default implementation on ifunc (i386 for instance).

> -     (const __ptr_t s, int c_in, size_t n)
> +void *
> +__memrchr (const void *s, int c_in, size_t n)
>  {
> -  const unsigned char *char_ptr;
> +  const unsigned char *char_ptr = (const unsigned char *) s + n;
>    const unsigned long int *longword_ptr;
> -  unsigned long int longword, magic_bits, charmask;
> +  unsigned long int longword, repeated_c;
> +  uintptr_t i, align;
>    unsigned char c;
>  
>    c = (unsigned char) c_in;
>  
>    /* Handle the last few characters by reading one character at a time.
>       Do this until CHAR_PTR is aligned on a longword boundary.  */
> -  for (char_ptr = (const unsigned char *) s + n;
> -       n > 0 && ((unsigned long int) char_ptr
> -		 & (sizeof (longword) - 1)) != 0;
> -       --n)
> +  align = (uintptr_t)char_ptr % sizeof(longword);
> +  for (i = 0; i < align; ++i)
>      if (*--char_ptr == c)
> -      return (__ptr_t) char_ptr;
> +      return (void *) char_ptr;

We need to take care of 'n' while interacting over the string to align
it.  It might the case where 's' is unaligned and 'n' is less than
the aligned value.

>  
> -  /* All these elucidatory comments refer to 4-byte longwords,
> -     but the theory applies equally well to 8-byte longwords.  */
> +  /* Set up a longword, each of whose bytes is C.  */
> +  repeated_c = (-1ul / 0xff) * c;
>  
>    longword_ptr = (const unsigned long int *) char_ptr;
> +  n -= align;
> +  if (__glibc_unlikely(n == 0))
> +    return NULL;
>  
> -  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
> -     the "holes."  Note that there is a hole just to the left of
> -     each byte, with an extra at the end:
> -
> -     bits:  01111110 11111110 11111110 11111111
> -     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
> -
> -     The 1-bits make sure that carries propagate to the next 0-bit.
> -     The 0-bits provide holes for carries to fall into.  */
> -  magic_bits = -1;
> -  magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1;
> -
> -  /* Set up a longword, each of whose bytes is C.  */
> -  charmask = c | (c << 8);
> -  charmask |= charmask << 16;
> -#if LONG_MAX > LONG_MAX_32_BITS
> -  charmask |= charmask << 32;
> -#endif
> -
> -  /* Instead of the traditional loop which tests each character,
> -     we will test a longword at a time.  The tricky part is testing
> -     if *any of the four* bytes in the longword in question are zero.  */
> -  while (n >= sizeof (longword))
> +  /* Loop while we have more than one longword remaining.  */
> +  while (n > sizeof (longword))
>      {
> -      /* We tentatively exit the loop if adding MAGIC_BITS to
> -	 LONGWORD fails to change any of the hole bits of LONGWORD.
> -
> -	 1) Is this safe?  Will it catch all the zero bytes?
> -	 Suppose there is a byte with all zeros.  Any carry bits
> -	 propagating from its left will fall into the hole at its
> -	 least significant bit and stop.  Since there will be no
> -	 carry from its most significant bit, the LSB of the
> -	 byte to the left will be unchanged, and the zero will be
> -	 detected.
> -
> -	 2) Is this worthwhile?  Will it ignore everything except
> -	 zero bytes?  Suppose every byte of LONGWORD has a bit set
> -	 somewhere.  There will be a carry into bit 8.  If bit 8
> -	 is set, this will carry into bit 16.  If bit 8 is clear,
> -	 one of bits 9-15 must be set, so there will be a carry
> -	 into bit 16.  Similarly, there will be a carry into bit
> -	 24.  If one of bits 24-30 is set, there will be a carry
> -	 into bit 31, so all of the hole bits will be changed.
> -
> -	 The one misfire occurs when bits 24-30 are clear and bit
> -	 31 is set; in this case, the hole at bit 31 is not
> -	 changed.  If we had access to the processor carry flag,
> -	 we could close this loophole by putting the fourth hole
> -	 at bit 32!
> -
> -	 So it ignores everything except 128's, when they're aligned
> -	 properly.
> -
> -	 3) But wait!  Aren't we looking for C, not zero?
> -	 Good point.  So what we do is XOR LONGWORD with a longword,
> -	 each of whose bytes is C.  This turns each byte that is C
> -	 into a zero.  */
> -
> -      longword = *--longword_ptr ^ charmask;
> -
> -      /* Add MAGIC_BITS to LONGWORD.  */
> -      if ((((longword + magic_bits)
> -
> -	    /* Set those bits that were unchanged by the addition.  */
> -	    ^ ~longword)
> -
> -	   /* Look at only the hole bits.  If any of the hole bits
> -	      are unchanged, most likely one of the bytes was a
> -	      zero.  */
> -	   & ~magic_bits) != 0)
> -	{
> -	  /* Which of the bytes was C?  If none of them were, it was
> -	     a misfire; continue the search.  */
> -
> -	  const unsigned char *cp = (const unsigned char *) longword_ptr;
> -
> -#if LONG_MAX > 2147483647
> -	  if (cp[7] == c)
> -	    return (__ptr_t) &cp[7];
> -	  if (cp[6] == c)
> -	    return (__ptr_t) &cp[6];
> -	  if (cp[5] == c)
> -	    return (__ptr_t) &cp[5];
> -	  if (cp[4] == c)
> -	    return (__ptr_t) &cp[4];
> -#endif
> -	  if (cp[3] == c)
> -	    return (__ptr_t) &cp[3];
> -	  if (cp[2] == c)
> -	    return (__ptr_t) &cp[2];
> -	  if (cp[1] == c)
> -	    return (__ptr_t) &cp[1];
> -	  if (cp[0] == c)
> -	    return (__ptr_t) cp;
> -	}
> -
> +      longword = *--longword_ptr ^ repeated_c;
> +      if (haszero (longword))
> +	goto found;
>        n -= sizeof (longword);
>      }
>  
> -  char_ptr = (const unsigned char *) longword_ptr;
> -
> -  while (n-- > 0)
> +  /* Since our pointer is aligned, we can always read the last longword.  */
> +  longword = *--longword_ptr ^ repeated_c;
> +  if (haszero (longword))
>      {
> -      if (*--char_ptr == c)
> -	return (__ptr_t) char_ptr;
> + found:
> +      i = whichzero (longword);
> +      if (sizeof (longword) - 1 - i < n)
> +	return (char *) longword_ptr + i;

We need to check longword in reverse word since we are checking for last
occurrence. 

Below it is a workable implementation (on x86_64, I haven't checked on
a BE machine since I do not have access to one). It passes all the string
tests:

--
#ifndef weak_alias
# define __memrchr memrchr
#endif

static inline unsigned int
whichzero_reverse (unsigned long int x)
{ 
  if (sizeof (x) == 4)
    return whichzero (__builtin_bswap32 (x));
  else
    return whichzero (__builtin_bswap64 (x));
}

/* Search no more than N bytes of S for C.  */
void *
#ifndef MEMRCHR
__memrchr
#else
MEMRCHR
#endif
     (const void *s, int c_in, size_t n)
{ 
  const unsigned char *char_ptr = (const unsigned char *) s + n;
  const unsigned long int *longword_ptr;
  unsigned long int longword, repeated_c;
  uintptr_t i;
  unsigned char c;

  c = (unsigned char) c_in;

  /* Align s to size_t reading one character at time.  */
  for (;  
       ((uintptr_t)char_ptr & (sizeof(longword) - 1)) && (n != 0);
       --n)
    if (*--char_ptr == c_in)
      return (void *) char_ptr;

  /* Set up a longword, each of whose bytes is C.  */
  repeated_c = (-1ul / 0xff) * c;

  /* Loop while we have more than one longword remaining.  */
  for (longword_ptr = (const void*)char_ptr;
       n > sizeof(size_t);
       n -= sizeof (size_t))
    {
      longword = *--longword_ptr ^ repeated_c;
      if (haszero (longword))
        goto found;
    }

  longword = *--longword_ptr ^ repeated_c;
  if (haszero (longword))
    {
found:
      i = whichzero_reverse (longword);
      if (i < n)
        return (char*) (longword_ptr) + (sizeof (longword) - 1 - i);
     }
  return NULL;
}
#ifndef MEMRCHR
# ifdef weak_alias
weak_alias (__memrchr, memrchr)
# endif
#endif
  
Richard Henderson Dec. 20, 2016, 5:19 p.m. UTC | #2
On 12/20/2016 04:30 AM, Adhemerval Zanella wrote:
> This implementation has a lot of issues, checking on both x86_64 and i386
> (by manually removing the assembly implementation to use the new default
> one) I am seeing a lot of issues with string/testers and others. 
> Unfortunately the test-memrchr itself does not trigger most of the issues.

I did notice problems by eye when going through the patch set for the second
time.  It is unfortunate that the tester doesn't trigger them.

>> +  align = (uintptr_t)char_ptr % sizeof(longword);
>> +  for (i = 0; i < align; ++i)
>>      if (*--char_ptr == c)
>> -      return (__ptr_t) char_ptr;
>> +      return (void *) char_ptr;
> 
> We need to take care of 'n' while interacting over the string to align
> it.  It might the case where 's' is unaligned and 'n' is less than
> the aligned value.

As for memchr, I had intended

  if (align > n)
    align = n;

>> + found:
>> +      i = whichzero (longword);
>> +      if (sizeof (longword) - 1 - i < n)
>> +	return (char *) longword_ptr + i;
> 
> We need to check longword in reverse word since we are checking for last
> occurrence. 

Well, yes and no.  We check from reverse but want the last match.  So it's
still a forward search.  What's needed is to mask out any match that is
excluded by N.

I currently have

  /* Handle any remaining portion of a word.  */
  if (n > 0)
    {
      word = *--word_ptr ^ repeated_c;

      /* Mask out the garbage bytes.  */
      op_t m = -1;
      if (__BYTE_ORDER == __LITTLE_ENDIAN)
        m <<= n * CHAR_BIT;
      else
        m >>= n * CHAR_BIT;
      word |= ~m;

      if (haszero (word))
        {
        found:
          return (char *) word_ptr + findzero (word);
        }
    }


r~
  
Adhemerval Zanella Dec. 20, 2016, 8:24 p.m. UTC | #3
On 20/12/2016 15:19, Richard Henderson wrote:
> On 12/20/2016 04:30 AM, Adhemerval Zanella wrote:
>> This implementation has a lot of issues, checking on both x86_64 and i386
>> (by manually removing the assembly implementation to use the new default
>> one) I am seeing a lot of issues with string/testers and others. 
>> Unfortunately the test-memrchr itself does not trigger most of the issues.
> 
> I did notice problems by eye when going through the patch set for the second
> time.  It is unfortunate that the tester doesn't trigger them.
> 
>>> +  align = (uintptr_t)char_ptr % sizeof(longword);
>>> +  for (i = 0; i < align; ++i)
>>>      if (*--char_ptr == c)
>>> -      return (__ptr_t) char_ptr;
>>> +      return (void *) char_ptr;
>>
>> We need to take care of 'n' while interacting over the string to align
>> it.  It might the case where 's' is unaligned and 'n' is less than
>> the aligned value.
> 
> As for memchr, I had intended
> 
>   if (align > n)
>     align = n;
> 
>>> + found:
>>> +      i = whichzero (longword);
>>> +      if (sizeof (longword) - 1 - i < n)
>>> +	return (char *) longword_ptr + i;
>>
>> We need to check longword in reverse word since we are checking for last
>> occurrence. 
> 
> Well, yes and no.  We check from reverse but want the last match.  So it's
> still a forward search.  What's needed is to mask out any match that is
> excluded by N.
> 
> I currently have
> 
>   /* Handle any remaining portion of a word.  */
>   if (n > 0)
>     {
>       word = *--word_ptr ^ repeated_c;
> 
>       /* Mask out the garbage bytes.  */
>       op_t m = -1;
>       if (__BYTE_ORDER == __LITTLE_ENDIAN)
>         m <<= n * CHAR_BIT;
>       else
>         m >>= n * CHAR_BIT;
>       word |= ~m;
> 
>       if (haszero (word))
>         {
>         found:
>           return (char *) word_ptr + findzero (word);
>         }
>     }

I do not have a strong preference here, neither I am sure which strategy
would yield better performance for mostly architectures.  I would be usually
one instruction if the architecture supports direct swap instruction (like
x86 and aarch64), but the ones that do not it might generate some long
sequences.  Anyway, In such cases I would go either for code simplicity.
  

Patch

diff --git a/string/memrchr.c b/string/memrchr.c
index b9b0c9e..d45b261 100644
--- a/string/memrchr.c
+++ b/string/memrchr.c
@@ -21,180 +21,60 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <stdlib.h>
-
-#ifdef HAVE_CONFIG_H
-# include <config.h>
-#endif
-
-#undef __ptr_t
-#define __ptr_t void *
-
-#if defined _LIBC
-# include <string.h>
-# include <memcopy.h>
-#endif
-
-#if defined HAVE_LIMITS_H || defined _LIBC
-# include <limits.h>
-#endif
-
-#define LONG_MAX_32_BITS 2147483647
-
-#ifndef LONG_MAX
-# define LONG_MAX LONG_MAX_32_BITS
-#endif
-
-#include <sys/types.h>
+#include <string.h>
+#include <stdint.h>
+#include <haszero.h>
+#include <whichzero.h>
 
 #undef __memrchr
 #undef memrchr
 
-#ifndef weak_alias
-# define __memrchr memrchr
-#endif
-
 /* Search no more than N bytes of S for C.  */
-__ptr_t
-#ifndef MEMRCHR
-__memrchr
-#else
-MEMRCHR
-#endif
-     (const __ptr_t s, int c_in, size_t n)
+void *
+__memrchr (const void *s, int c_in, size_t n)
 {
-  const unsigned char *char_ptr;
+  const unsigned char *char_ptr = (const unsigned char *) s + n;
   const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
+  unsigned long int longword, repeated_c;
+  uintptr_t i, align;
   unsigned char c;
 
   c = (unsigned char) c_in;
 
   /* Handle the last few characters by reading one character at a time.
      Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s + n;
-       n > 0 && ((unsigned long int) char_ptr
-		 & (sizeof (longword) - 1)) != 0;
-       --n)
+  align = (uintptr_t)char_ptr % sizeof(longword);
+  for (i = 0; i < align; ++i)
     if (*--char_ptr == c)
-      return (__ptr_t) char_ptr;
+      return (void *) char_ptr;
 
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
+  /* Set up a longword, each of whose bytes is C.  */
+  repeated_c = (-1ul / 0xff) * c;
 
   longword_ptr = (const unsigned long int *) char_ptr;
+  n -= align;
+  if (__glibc_unlikely(n == 0))
+    return NULL;
 
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  magic_bits = -1;
-  magic_bits = magic_bits / 0xff * 0xfe << 1 >> 1 | 1;
-
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-#if LONG_MAX > LONG_MAX_32_BITS
-  charmask |= charmask << 32;
-#endif
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  while (n >= sizeof (longword))
+  /* Loop while we have more than one longword remaining.  */
+  while (n > sizeof (longword))
     {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *--longword_ptr ^ charmask;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) longword_ptr;
-
-#if LONG_MAX > 2147483647
-	  if (cp[7] == c)
-	    return (__ptr_t) &cp[7];
-	  if (cp[6] == c)
-	    return (__ptr_t) &cp[6];
-	  if (cp[5] == c)
-	    return (__ptr_t) &cp[5];
-	  if (cp[4] == c)
-	    return (__ptr_t) &cp[4];
-#endif
-	  if (cp[3] == c)
-	    return (__ptr_t) &cp[3];
-	  if (cp[2] == c)
-	    return (__ptr_t) &cp[2];
-	  if (cp[1] == c)
-	    return (__ptr_t) &cp[1];
-	  if (cp[0] == c)
-	    return (__ptr_t) cp;
-	}
-
+      longword = *--longword_ptr ^ repeated_c;
+      if (haszero (longword))
+	goto found;
       n -= sizeof (longword);
     }
 
-  char_ptr = (const unsigned char *) longword_ptr;
-
-  while (n-- > 0)
+  /* Since our pointer is aligned, we can always read the last longword.  */
+  longword = *--longword_ptr ^ repeated_c;
+  if (haszero (longword))
     {
-      if (*--char_ptr == c)
-	return (__ptr_t) char_ptr;
+ found:
+      i = whichzero (longword);
+      if (sizeof (longword) - 1 - i < n)
+	return (char *) longword_ptr + i;
     }
 
-  return 0;
+  return NULL;
 }
-#ifndef MEMRCHR
-# ifdef weak_alias
 weak_alias (__memrchr, memrchr)
-# endif
-#endif