On Wed, May 24, 2017 at 7:55 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, May 20, 2017 at 12:58 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Sat, May 20, 2017 at 7:59 AM, Zack Weinberg <zackw@panix.com> wrote:
>>> On Sat, May 20, 2017 at 10:50 AM, H.J. Lu <hongjiu.lu@intel.com> wrote:
>>>> There is no need to use 64-bit registers when only the lower 32 bits
>>>> are non-zero.
>>>
>>> This code is used generically for x86-64, not for a specific
>>> microarchitecture. Is there a reason why this will never cause partial
>>> register stalls, now or in the future?
>>
>> By the x86-64 specification, 32-bit destination registers in these
>> instructions are zero-extended to 64 bits, so there is no partial
>> register stall at all.
>>
>
> Here is the updated patch with one more REX byte removed.
>
> Any other comments?
>
This is the patch I am checking in.
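To make the size win concrete, here are the encodings for two of the
affected instruction pairs (standard x86-64 encodings, shown only as an
illustration; the 0x48 byte is the REX.W prefix the patch drops):

   48 89 f9   mov %rdi,%rcx   # 3 bytes: REX.W selects 64-bit operand size
   89 f9      mov %edi,%ecx   # 2 bytes: 32-bit move; bits 63:32 of %rcx
                              # are cleared by the zero-extension rule
   48 31 c0   xor %rax,%rax   # 3 bytes
   31 c0      xor %eax,%eax   # 2 bytes: same result, %rax ends up 0

Since a write to a 32-bit register always zeroes the upper half of the
full 64-bit register, the shorter forms are equivalent wherever only the
low 32 bits matter.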
From 0b4aae9e15eeb63419dc1df2578b3df50aae7edf Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 18 May 2017 12:22:31 -0700
Subject: [PATCH] x86_64: Remove redundant REX bytes from memchr.S

By the x86-64 specification, 32-bit destination registers are zero-extended
to 64 bits. There is no need to use 64-bit registers when only the lower
32 bits are non-zero.

* sysdeps/x86_64/memchr.S (MEMCHR): Use 32-bit registers for
the lower 32 bits.
---
sysdeps/x86_64/memchr.S | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -31,7 +31,7 @@
.text
ENTRY(MEMCHR)
movd %esi, %xmm1
- mov %rdi, %rcx
+ mov %edi, %ecx
#ifdef USE_AS_WMEMCHR
test %rdx, %rdx
@@ -44,10 +44,10 @@ ENTRY(MEMCHR)
punpcklbw %xmm1, %xmm1
#endif
- and $63, %rcx
+ and $63, %ecx
pshufd $0, %xmm1, %xmm1
- cmp $48, %rcx
+ cmp $48, %ecx
ja L(crosscache)
movdqu (%rdi), %xmm0
@@ -59,7 +59,7 @@ ENTRY(MEMCHR)
sub $16, %rdx
jbe L(return_null)
add $16, %rdi
- and $15, %rcx
+ and $15, %ecx
and $-16, %rdi
add %rcx, %rdx
sub $64, %rdx
@@ -68,7 +68,7 @@ ENTRY(MEMCHR)
.p2align 4
L(crosscache):
- and $15, %rcx
+ and $15, %ecx
and $-16, %rdi
movdqa (%rdi), %xmm0
@@ -162,7 +162,7 @@ L(loop_prolog):
mov %rdi, %rcx
and $-64, %rdi
- and $63, %rcx
+ and $63, %ecx
add %rcx, %rdx
.p2align 4
@@ -214,7 +214,7 @@ L(align64_loop):
.p2align 4
L(exit_loop):
- add $32, %rdx
+ add $32, %edx
jle L(exit_loop_32)
movdqa (%rdi), %xmm0
@@ -234,32 +234,32 @@ L(exit_loop):
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32_1)
- sub $16, %rdx
+ sub $16, %edx
jle L(return_null)
PCMPEQ 48(%rdi), %xmm1
pmovmskb %xmm1, %eax
test %eax, %eax
jnz L(matches48_1)
- xor %rax, %rax
+ xor %eax, %eax
ret
.p2align 4
L(exit_loop_32):
- add $32, %rdx
+ add $32, %edx
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches_1)
- sub $16, %rdx
+ sub $16, %edx
jbe L(return_null)
PCMPEQ 16(%rdi), %xmm1
pmovmskb %xmm1, %eax
test %eax, %eax
jnz L(matches16_1)
- xor %rax, %rax
+ xor %eax, %eax
ret
.p2align 4
@@ -320,7 +320,7 @@ L(matches48_1):
.p2align 4
L(return_null):
- xor %rax, %rax
+ xor %eax, %eax
ret
END(MEMCHR)
--
2.9.4
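
A quick way to verify the change is to compare disassemblies of the old
and new objects, e.g. (the object path here is just an example; it depends
on your build directory):

   objdump -d string/memchr.o > memchr.dis

Diffing the listings from an unpatched and a patched build should show
nothing but the dropped 0x48 (REX.W) prefix bytes and the correspondingly
shorter instructions.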