Fix i686 memchr overflow calculation (BZ#21182)

Message ID CAMe9rOr0UFKouWki2VwybG=k9dxe8ZoEJu=TzqrRP9TpLDfybA@mail.gmail.com
State New, archived
Headers

Commit Message

H.J. Lu May 19, 2017, 4:14 p.m. UTC
  On Fri, May 19, 2017 at 8:10 AM, Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
> On 19/05/2017 11:45, Adhemerval Zanella wrote:
>>
>>
>> On 19/05/2017 10:44, H.J. Lu wrote:
>>> On Tue, Mar 28, 2017 at 2:47 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>> On Tue, Mar 28, 2017 at 12:22 PM, Adhemerval Zanella
>>>> <adhemerval.zanella@linaro.org> wrote:
>>>>> Ping (with the page_size/2 fix included).
>>>>>
>>>>> On 14/03/2017 14:28, Adhemerval Zanella wrote:
>>>>>> This patch fixes the regression added by 23d2770 for final address
>>>>>> overflow calculation.  The subtraction of the considered size (16)
>>>>>> at line 120 is at wrong place, for sizes less than 16 subsequent
>>>>>> overflow check will not take in consideration an invalid size (since
>>>>>> the subtraction will be negative).  Also, the lea instruction also
>>>>>> does not raise the carry flag (CF) that is used in subsequent jbe
>>>>>> to check for overflow.
>>>>>>
>>>>>> The fix is to follow x86_64 logic from 3daef2c where the overflow
>>>>>> is first check and a sub instruction is issued.  In case of resulting
>>>>>> negative size, CF will be set by the sub instruction and a NULL
>>>>>> result will be returned.  The patch also add similar tests reported
>>>>>> in bug report.
>>>>>>
>>>>>> Checked on i686-linux-gnu and x86_64-linux-gnu.
>>>>>>
>>>>>>       [BZ# 21182]
>>>>>>       * string/test-memchr.c (do_test): Add BZ#21182 checks for address
>>>>>>       near end of a page.
>>>>>>       * sysdeps/i386/i686/multiarch/memchr-sse2.S (__memchr): Fix
>>>>>>       overflow calculation.
>>>>>> ---
>>>>>>  ChangeLog                                 | 7 +++++++
>>>>>>  string/test-memchr.c                      | 6 ++++++
>>>>>>  sysdeps/i386/i686/multiarch/memchr-sse2.S | 2 +-
>>>>>>  3 files changed, 14 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/string/test-memchr.c b/string/test-memchr.c
>>>>>> index d64d10c..87a077b 100644
>>>>>> --- a/string/test-memchr.c
>>>>>> +++ b/string/test-memchr.c
>>>>>> @@ -210,6 +210,12 @@ test_main (void)
>>>>>>        do_test (0, i, i + 1, i + 1, 0);
>>>>>>      }
>>>>>>
>>>>>> +  /* BZ#21182 - wrong overflow calculation for i686 implementation
>>>>>> +     with address near end of the page.  */
>>>>>> +  for (i = 2; i < 16; ++i)
>>>>>> +    /* page_size is in fact getpagesize() * 2.  */
>>>>>> +    do_test (page_size/2 - i, i, i, 1, 0x9B);
>>>>>> +
>>>>>>    do_random_tests ();
>>>>>>    return ret;
>>>>>>  }
>>>>>> diff --git a/sysdeps/i386/i686/multiarch/memchr-sse2.S b/sysdeps/i386/i686/multiarch/memchr-sse2.S
>>>>>> index 910679c..e41f324 100644
>>>>>> --- a/sysdeps/i386/i686/multiarch/memchr-sse2.S
>>>>>> +++ b/sysdeps/i386/i686/multiarch/memchr-sse2.S
>>>>>> @@ -117,7 +117,6 @@ L(crosscache):
>>>>>>
>>>>>>  # ifndef USE_AS_RAWMEMCHR
>>>>>>       jnz     L(match_case2_prolog1)
>>>>>> -     lea     -16(%edx), %edx
>>>>>>          /* Calculate the last acceptable address and check for possible
>>>>>>             addition overflow by using satured math:
>>>>>>             edx = ecx + edx
>>>>>> @@ -125,6 +124,7 @@ L(crosscache):
>>>>>>       add     %ecx, %edx
>>>>>>       sbb     %eax, %eax
>>>>>>       or      %eax, %edx
>>>>>> +     sub     $16, %edx
>>>>>>       jbe     L(return_null)
>>>>>>       lea     16(%edi), %edi
>>>>>>  # else
>>>>>>
>>>>
>>>> Looks good.
>>>>
>>>>
>>>
>>> Just a thought.  Is this approach
>>>
>>>         /* Calculate "edx + ecx - 16" by using "edx - (16 - ecx)"
>>>            instead of "(edx + ecx) - 16" to void possible addition
>>>            overflow.  */
>>>         neg     %ecx
>>>         add     $16, %ecx
>>>         sub     %ecx, %edx
>>>         jbe     L(return_null)
>>>
>>> a little bit better?
>>
>> I do not have a strong preference here, although imho it is simpler to
>> understand why the saturated math would work here and it is the same
>> strategy used on other arch implementations.  May you might outline in
>> comment that ecx is always in the range of [0,64) so '-ecx + 16' can
>> possible underflow.  Is is this change just for micro-optimization?
>>
>
> Sorry, my bad here.  I made some confusion, in fact what I meant was ecx
> was indeed in range [0,16) (is the and instruction for alignment that
> uses 64) and your suggestion *can't* overflow.  I would recommend
> you to just outline the possible value of 'edx' in comments.

Here is a patch for SSE2 memchr.   OK for master?
  

Comments

Adhemerval Zanella May 19, 2017, 5:24 p.m. UTC | #1
On 19/05/2017 13:14, H.J. Lu wrote:
> On Fri, May 19, 2017 at 8:10 AM, Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>> On 19/05/2017 11:45, Adhemerval Zanella wrote:
>>>
>>>
>>> On 19/05/2017 10:44, H.J. Lu wrote:
>>>> On Tue, Mar 28, 2017 at 2:47 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>> On Tue, Mar 28, 2017 at 12:22 PM, Adhemerval Zanella
>>>>> <adhemerval.zanella@linaro.org> wrote:
>>>>>> Ping (with the page_size/2 fix included).
>>>>>>
>>>>>> On 14/03/2017 14:28, Adhemerval Zanella wrote:
>>>>>>> This patch fixes the regression added by 23d2770 for final address
>>>>>>> overflow calculation.  The subtraction of the considered size (16)
>>>>>>> at line 120 is at wrong place, for sizes less than 16 subsequent
>>>>>>> overflow check will not take in consideration an invalid size (since
>>>>>>> the subtraction will be negative).  Also, the lea instruction also
>>>>>>> does not raise the carry flag (CF) that is used in subsequent jbe
>>>>>>> to check for overflow.
>>>>>>>
>>>>>>> The fix is to follow x86_64 logic from 3daef2c where the overflow
>>>>>>> is first check and a sub instruction is issued.  In case of resulting
>>>>>>> negative size, CF will be set by the sub instruction and a NULL
>>>>>>> result will be returned.  The patch also add similar tests reported
>>>>>>> in bug report.
>>>>>>>
>>>>>>> Checked on i686-linux-gnu and x86_64-linux-gnu.
>>>>>>>
>>>>>>>       [BZ# 21182]
>>>>>>>       * string/test-memchr.c (do_test): Add BZ#21182 checks for address
>>>>>>>       near end of a page.
>>>>>>>       * sysdeps/i386/i686/multiarch/memchr-sse2.S (__memchr): Fix
>>>>>>>       overflow calculation.
>>>>>>> ---
>>>>>>>  ChangeLog                                 | 7 +++++++
>>>>>>>  string/test-memchr.c                      | 6 ++++++
>>>>>>>  sysdeps/i386/i686/multiarch/memchr-sse2.S | 2 +-
>>>>>>>  3 files changed, 14 insertions(+), 1 deletion(-)
>>>>>>>
>>>>>>> diff --git a/string/test-memchr.c b/string/test-memchr.c
>>>>>>> index d64d10c..87a077b 100644
>>>>>>> --- a/string/test-memchr.c
>>>>>>> +++ b/string/test-memchr.c
>>>>>>> @@ -210,6 +210,12 @@ test_main (void)
>>>>>>>        do_test (0, i, i + 1, i + 1, 0);
>>>>>>>      }
>>>>>>>
>>>>>>> +  /* BZ#21182 - wrong overflow calculation for i686 implementation
>>>>>>> +     with address near end of the page.  */
>>>>>>> +  for (i = 2; i < 16; ++i)
>>>>>>> +    /* page_size is in fact getpagesize() * 2.  */
>>>>>>> +    do_test (page_size/2 - i, i, i, 1, 0x9B);
>>>>>>> +
>>>>>>>    do_random_tests ();
>>>>>>>    return ret;
>>>>>>>  }
>>>>>>> diff --git a/sysdeps/i386/i686/multiarch/memchr-sse2.S b/sysdeps/i386/i686/multiarch/memchr-sse2.S
>>>>>>> index 910679c..e41f324 100644
>>>>>>> --- a/sysdeps/i386/i686/multiarch/memchr-sse2.S
>>>>>>> +++ b/sysdeps/i386/i686/multiarch/memchr-sse2.S
>>>>>>> @@ -117,7 +117,6 @@ L(crosscache):
>>>>>>>
>>>>>>>  # ifndef USE_AS_RAWMEMCHR
>>>>>>>       jnz     L(match_case2_prolog1)
>>>>>>> -     lea     -16(%edx), %edx
>>>>>>>          /* Calculate the last acceptable address and check for possible
>>>>>>>             addition overflow by using satured math:
>>>>>>>             edx = ecx + edx
>>>>>>> @@ -125,6 +124,7 @@ L(crosscache):
>>>>>>>       add     %ecx, %edx
>>>>>>>       sbb     %eax, %eax
>>>>>>>       or      %eax, %edx
>>>>>>> +     sub     $16, %edx
>>>>>>>       jbe     L(return_null)
>>>>>>>       lea     16(%edi), %edi
>>>>>>>  # else
>>>>>>>
>>>>>
>>>>> Looks good.
>>>>>
>>>>>
>>>>
>>>> Just a thought.  Is this approach
>>>>
>>>>         /* Calculate "edx + ecx - 16" by using "edx - (16 - ecx)"
>>>>            instead of "(edx + ecx) - 16" to void possible addition
>>>>            overflow.  */
>>>>         neg     %ecx
>>>>         add     $16, %ecx
>>>>         sub     %ecx, %edx
>>>>         jbe     L(return_null)
>>>>
>>>> a little bit better?
>>>
>>> I do not have a strong preference here, although imho it is simpler to
>>> understand why the saturated math would work here and it is the same
>>> strategy used on other arch implementations.  May you might outline in
>>> comment that ecx is always in the range of [0,64) so '-ecx + 16' can
>>> possible underflow.  Is is this change just for micro-optimization?
>>>
>>
>> Sorry, my bad here.  I made some confusion, in fact what I meant was ecx
>> was indeed in range [0,16) (is the and instruction for alignment that
>> uses 64) and your suggestion *can't* overflow.  I would recommend
>> you to just outline the possible value of 'edx' in comments.
> 
> Here is a patch for SSE2 memchr.   OK for master?
> 

LGTM, thanks.
  

Patch

From 70c5d56431e54c7ef8dc8b8eb300a343a97ab72d Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 19 May 2017 09:04:23 -0700
Subject: [PATCH] x86: Optimize SSE2 memchr overflow calculation

SSE2 memchr computes "edx + ecx - 16" where ecx is less than 16.  Use
"edx - (16 - ecx)", instead of satured math, to avoid possible addition
overflow.  This replaces

	add	%ecx, %edx
	sbb	%eax, %eax
	or	%eax, %edx
	sub	$16, %edx

with

	neg	%ecx
	add	$16, %ecx
	sub	%ecx, %edx

It is the same for x86_64, except for rcx/rdx, instead of ecx/edx.

	* sysdeps/i386/i686/multiarch/memchr-sse2.S (MEMCHR): Use
	"edx + ecx - 16" to avoid possible addition overflow.
	* sysdeps/x86_64/memchr.S (memchr): Likewise.
---
 sysdeps/i386/i686/multiarch/memchr-sse2.S | 14 ++++++--------
 sysdeps/x86_64/memchr.S                   | 14 ++++++--------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/sysdeps/i386/i686/multiarch/memchr-sse2.S b/sysdeps/i386/i686/multiarch/memchr-sse2.S
index e41f324..172d70d 100644
--- a/sysdeps/i386/i686/multiarch/memchr-sse2.S
+++ b/sysdeps/i386/i686/multiarch/memchr-sse2.S
@@ -117,14 +117,12 @@  L(crosscache):
 
 # ifndef USE_AS_RAWMEMCHR
 	jnz	L(match_case2_prolog1)
-        /* Calculate the last acceptable address and check for possible
-           addition overflow by using satured math:
-           edx = ecx + edx
-           edx |= -(edx < ecx)  */
-	add	%ecx, %edx
-	sbb	%eax, %eax
-	or	%eax, %edx
-	sub	$16, %edx
+        /* "ecx" is less than 16.  Calculate "edx + ecx - 16" by using
+	   "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to void
+	   possible addition overflow.  */
+	neg	%ecx
+	add	$16, %ecx
+	sub	%ecx, %edx
 	jbe	L(return_null)
 	lea	16(%edi), %edi
 # else
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index a205a25..f82e1c5 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -76,14 +76,12 @@  L(crosscache):
 
 	.p2align 4
 L(unaligned_no_match):
-        /* Calculate the last acceptable address and check for possible
-           addition overflow by using satured math:
-           rdx = rcx + rdx
-           rdx |= -(rdx < rcx)  */
-	add	%rcx, %rdx
-	sbb	%rax, %rax
-	or	%rax, %rdx
-	sub	$16, %rdx
+        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
+	   possible addition overflow.  */
+	neg	%rcx
+	add	$16, %rcx
+	sub	%rcx, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
 	sub	$64, %rdx
-- 
2.9.4