Fix x86_64 memchr for large input sizes

Message ID 1481834313-5626-1-git-send-email-adhemerval.zanella@linaro.org
State Dropped
Headers

Commit Message

Adhemerval Zanella Dec. 15, 2016, 8:38 p.m. UTC
  Current optimized memchr for x86_64 does for input arguments pointers
module 64 in range of [49,63] if there is no searchr char in the rest
of 64-byte block a pointer addition which might overflow:

* sysdeps/x86_64/memchr.S

    77          .p2align 4
    78  L(unaligned_no_match):
    79          add     %rcx, %rdx

Add (uintptr_t)s % 16 to n in %rdx.

    80          sub     $16, %rdx
    81          jbe     L(return_null)

This patch fixes by adding a saturated math that sets a maximum pointer
value if it overflows (UINTPTR_MAX).  This is similar of the fix for
powerpc64/power7 version [1] and rely on for test-memchr.c changes.

Checked on x86_64-linux-gnu and powerpc64-linux-gnu.

[1] https://sourceware.org/ml/libc-alpha/2016-12/msg00576.html

	[BZ# 19387]
	* sysdeps/x86_64/memchr.S (memchr): Avoid overflow in pointer
	addition.
	* string/test-memchr.c (do_test): Remove alignment limitation.
	(test_main): Add test that trigger BZ# 19387.
---
 ChangeLog               | 6 ++++++
 string/test-memchr.c    | 9 ++++-----
 sysdeps/x86_64/memchr.S | 8 ++++++++
 3 files changed, 18 insertions(+), 5 deletions(-)
  

Comments

Adhemerval Zanella Dec. 20, 2016, 2:03 p.m. UTC | #1
Ping.

On 15/12/2016 18:38, Adhemerval Zanella wrote:
> Current optimized memchr for x86_64 does for input arguments pointers
> module 64 in range of [49,63] if there is no searchr char in the rest
> of 64-byte block a pointer addition which might overflow:
> 
> * sysdeps/x86_64/memchr.S
> 
>     77          .p2align 4
>     78  L(unaligned_no_match):
>     79          add     %rcx, %rdx
> 
> Add (uintptr_t)s % 16 to n in %rdx.
> 
>     80          sub     $16, %rdx
>     81          jbe     L(return_null)
> 
> This patch fixes by adding a saturated math that sets a maximum pointer
> value if it overflows (UINTPTR_MAX).  This is similar of the fix for
> powerpc64/power7 version [1] and rely on for test-memchr.c changes.
> 
> Checked on x86_64-linux-gnu and powerpc64-linux-gnu.
> 
> [1] https://sourceware.org/ml/libc-alpha/2016-12/msg00576.html
> 
> 	[BZ# 19387]
> 	* sysdeps/x86_64/memchr.S (memchr): Avoid overflow in pointer
> 	addition.
> 	* string/test-memchr.c (do_test): Remove alignment limitation.
> 	(test_main): Add test that trigger BZ# 19387.
> ---
>  ChangeLog               | 6 ++++++
>  string/test-memchr.c    | 9 ++++-----
>  sysdeps/x86_64/memchr.S | 8 ++++++++
>  3 files changed, 18 insertions(+), 5 deletions(-)
> 
> diff --git a/string/test-memchr.c b/string/test-memchr.c
> index 7431386..6901711 100644
> --- a/string/test-memchr.c
> +++ b/string/test-memchr.c
> @@ -76,7 +76,6 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
>    size_t i;
>    CHAR *result;
>  
> -  align &= 7;
>    if ((align + len) * sizeof (CHAR) >= page_size)
>      return;
>  
> @@ -194,12 +193,12 @@ test_main (void)
>        do_test (i, 64, 256, SIZE_MAX, 0);
>      }
>  
> -  for (i = 1; i < 16; ++i)
> +  for (i = 1; i < 64; ++i)
>      {
> -      for (j = 1; j < 16; j++)
> +      for (j = 1; j < 64; j++)
>          {
> -	  do_test (0, 16 - j, 32, SIZE_MAX, 23);
> -	  do_test (i, 16 - j, 32, SIZE_MAX, 23);
> +	  do_test (0, 64 - j, 64, SIZE_MAX, 23);
> +	  do_test (i, 64 - j, 64, SIZE_MAX, 23);
>          }
>      }
>  
> diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
> index 132eacb..b140de1 100644
> --- a/sysdeps/x86_64/memchr.S
> +++ b/sysdeps/x86_64/memchr.S
> @@ -76,7 +76,15 @@ L(crosscache):
>  
>  	.p2align 4
>  L(unaligned_no_match):
> +        /* Calculate the last acceptable address and check for possible
> +           addition overflow by using satured math:
> +           rdx = rcx + rdx
> +           rdx |= -(rdx < x)  */
>  	add	%rcx, %rdx
> +	sbb	%eax, %eax
> +	movslq	%eax, %rax
> +	orq	%rdx, %rax
> +	mov	%rax, %rdx
>  	sub	$16, %rdx
>  	jbe	L(return_null)
>  	add	$16, %rdi
>
  
Richard Henderson Dec. 20, 2016, 5:07 p.m. UTC | #2
On 12/20/2016 06:03 AM, Adhemerval Zanella wrote:
> +	sbb	%eax, %eax
> +	movslq	%eax, %rax

This should be "sbb %rax, %rax" in one step.

> +	orq	%rdx, %rax
> +	mov	%rax, %rdx

Given that the comment says you wanted the result in %rdx,
why not "or %rax, %rdx"?


r~
  
Adhemerval Zanella Dec. 20, 2016, 9:41 p.m. UTC | #3
On 20/12/2016 15:07, Richard Henderson wrote:
> On 12/20/2016 06:03 AM, Adhemerval Zanella wrote:
>> +	sbb	%eax, %eax
>> +	movslq	%eax, %rax
> 
> This should be "sbb %rax, %rax" in one step.
> 
>> +	orq	%rdx, %rax
>> +	mov	%rax, %rdx
> 
> Given that the comment says you wanted the result in %rdx,
> why not "or %rax, %rdx"?

Indeed it seems the 4 instructions can be simplified to just

        sbb     %rax, %rax
        or      %rax, %rdx

Thanks for the advice.
  

Patch

diff --git a/string/test-memchr.c b/string/test-memchr.c
index 7431386..6901711 100644
--- a/string/test-memchr.c
+++ b/string/test-memchr.c
@@ -76,7 +76,6 @@  do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
   size_t i;
   CHAR *result;
 
-  align &= 7;
   if ((align + len) * sizeof (CHAR) >= page_size)
     return;
 
@@ -194,12 +193,12 @@  test_main (void)
       do_test (i, 64, 256, SIZE_MAX, 0);
     }
 
-  for (i = 1; i < 16; ++i)
+  for (i = 1; i < 64; ++i)
     {
-      for (j = 1; j < 16; j++)
+      for (j = 1; j < 64; j++)
         {
-	  do_test (0, 16 - j, 32, SIZE_MAX, 23);
-	  do_test (i, 16 - j, 32, SIZE_MAX, 23);
+	  do_test (0, 64 - j, 64, SIZE_MAX, 23);
+	  do_test (i, 64 - j, 64, SIZE_MAX, 23);
         }
     }
 
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index 132eacb..b140de1 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -76,7 +76,15 @@  L(crosscache):
 
 	.p2align 4
 L(unaligned_no_match):
+        /* Calculate the last acceptable address and check for possible
+           addition overflow by using satured math:
+           rdx = rcx + rdx
+           rdx |= -(rdx < x)  */
 	add	%rcx, %rdx
+	sbb	%eax, %eax
+	movslq	%eax, %rax
+	orq	%rdx, %rax
+	mov	%rax, %rdx
 	sub	$16, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi